Skip to content
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
91 changes: 86 additions & 5 deletions backend/onyx/connectors/github/connector.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from github import Repository
from github.GithubException import GithubException
from github.Issue import Issue
from github.NamedUser import NamedUser
from github.PaginatedList import PaginatedList
from github.PullRequest import PullRequest
from github.Requester import Requester
Expand Down Expand Up @@ -219,14 +220,26 @@ def _get_batch_rate_limited(
)


def _get_userinfo(user: NamedUser) -> dict[str, str]:
return {
k: v
for k, v in {
"login": user.login,
"name": user.name,
"email": user.email,
}.items()
if v is not None
}


def _convert_pr_to_document(pull_request: PullRequest) -> Document:
return Document(
id=pull_request.html_url,
sections=[
TextSection(link=pull_request.html_url, text=pull_request.body or "")
],
source=DocumentSource.GITHUB,
semantic_identifier=pull_request.title,
semantic_identifier=f"{pull_request.number}: {pull_request.title}",
# updated_at is UTC time but is timezone unaware, explicitly add UTC
# as there is logic in indexing to prevent wrong timestamped docs
# due to local time discrepancies with UTC
Expand All @@ -236,8 +249,48 @@ def _convert_pr_to_document(pull_request: PullRequest) -> Document:
else None
),
metadata={
"merged": str(pull_request.merged),
"state": pull_request.state,
k: v
for k, v in {
"merged": str(pull_request.merged),
"state": pull_request.state,
"user": (
str(_get_userinfo(pull_request.user)) if pull_request.user else None
),
"assignees": [
str(_get_userinfo(assignee)) for assignee in pull_request.assignees
],
"repo": (
pull_request.base.repo.full_name if pull_request.base else None
),
"num_commits": str(pull_request.commits),
"labels": [label.name for label in pull_request.labels],
"created_at": (
str(pull_request.created_at.replace(tzinfo=timezone.utc))
if pull_request.created_at
else None
),
"updated_at": (
str(pull_request.updated_at.replace(tzinfo=timezone.utc))
if pull_request.updated_at
else None
),
"closed_at": (
str(pull_request.closed_at.replace(tzinfo=timezone.utc))
if pull_request.closed_at
else None
),
"merged_at": (
str(pull_request.merged_at.replace(tzinfo=timezone.utc))
if pull_request.merged_at
else None
),
"merged_by": (
str(_get_userinfo(pull_request.merged_by))
if pull_request.merged_by
else None
),
}.items()
if v is not None
},
)

Expand All @@ -252,11 +305,39 @@ def _convert_issue_to_document(issue: Issue) -> Document:
id=issue.html_url,
sections=[TextSection(link=issue.html_url, text=issue.body or "")],
source=DocumentSource.GITHUB,
semantic_identifier=issue.title,
semantic_identifier=f"{issue.number}: {issue.title}",
# updated_at is UTC time but is timezone unaware
doc_updated_at=issue.updated_at.replace(tzinfo=timezone.utc),
metadata={
"state": issue.state,
k: str(v)
for k, v in {
"state": issue.state,
"user": str(_get_userinfo(issue.user)) if issue.user else None,
"assignees": [
str(_get_userinfo(assignee)) for assignee in issue.assignees
],
"repo": issue.repository.full_name if issue.repository else None,
"labels": [label.name for label in issue.labels],
"created_at": (
issue.created_at.replace(tzinfo=timezone.utc)
if issue.created_at
else None
),
"updated_at": (
issue.updated_at.replace(tzinfo=timezone.utc)
if issue.updated_at
else None
),
"closed_at": (
issue.closed_at.replace(tzinfo=timezone.utc)
if issue.closed_at
else None
),
"closed_by": (
str(_get_userinfo(issue.closed_by)) if issue.closed_by else None
),
}.items()
if v is not None
},
)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -842,7 +842,7 @@ def to_repository_side_effect(
assert all(isinstance(item, Document) for item in outputs[1].items)
assert {
item.semantic_identifier for item in cast(list[Document], outputs[1].items)
} == {"PR 3 Repo 2", "PR 4 Repo 2"}
} == {"3: PR 3 Repo 2", "4: PR 4 Repo 2"}
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This change is here because we add the pr/issue number to the semantic id (seems a little unnecessary here, but normally the title of the issue/pr wouldn't include the number).

This will be important for kg because we want to be able to search for the entity "4: Fixed ABC", when the user asks for PR 4.

cp1 = outputs[1].next_checkpoint
assert (
cp1.has_more
Expand All @@ -869,7 +869,7 @@ def to_repository_side_effect(
assert all(isinstance(item, Document) for item in outputs[3].items)
assert {
item.semantic_identifier for item in cast(list[Document], outputs[3].items)
} == {"PR 1 Repo 1", "PR 2 Repo 1"}
} == {"1: PR 1 Repo 1", "2: PR 2 Repo 1"}
cp3 = outputs[3].next_checkpoint
# This checkpoint is returned early because offset had items. has_more reflects state then.
assert cp3.has_more # still need to do issues
Expand Down