Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
92 changes: 87 additions & 5 deletions backend/onyx/connectors/github/connector.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from github import Repository
from github.GithubException import GithubException
from github.Issue import Issue
from github.NamedUser import NamedUser
from github.PaginatedList import PaginatedList
from github.PullRequest import PullRequest
from github.Requester import Requester
Expand Down Expand Up @@ -219,14 +220,26 @@ def _get_batch_rate_limited(
)


def _get_userinfo(user: NamedUser) -> dict[str, str]:
    """Return the identifying fields of a GitHub user, skipping unset ones.

    Only ``login``, ``name``, and ``email`` are included, and a key is
    omitted entirely when the corresponding attribute is ``None`` (e.g. a
    user with no public email).
    """
    info: dict[str, str] = {}
    if user.login is not None:
        info["login"] = user.login
    if user.name is not None:
        info["name"] = user.name
    if user.email is not None:
        info["email"] = user.email
    return info


def _convert_pr_to_document(pull_request: PullRequest) -> Document:
return Document(
id=pull_request.html_url,
sections=[
TextSection(link=pull_request.html_url, text=pull_request.body or "")
],
source=DocumentSource.GITHUB,
semantic_identifier=pull_request.title,
semantic_identifier=f"{pull_request.number}: {pull_request.title}",
# updated_at is UTC time but is timezone unaware, explicitly add UTC
# as there is logic in indexing to prevent wrong timestamped docs
# due to local time discrepancies with UTC
Expand All @@ -236,8 +249,49 @@ def _convert_pr_to_document(pull_request: PullRequest) -> Document:
else None
),
metadata={
"merged": str(pull_request.merged),
"state": pull_request.state,
k: [str(vi) for vi in v] if isinstance(v, list) else str(v)
for k, v in {
"object_type": "PullRequest",
"id": pull_request.number,
"merged": pull_request.merged,
"state": pull_request.state,
"user": _get_userinfo(pull_request.user) if pull_request.user else None,
"assignees": [
_get_userinfo(assignee) for assignee in pull_request.assignees
],
"repo": (
pull_request.base.repo.full_name if pull_request.base else None
),
"num_commits": str(pull_request.commits),
"num_files_changed": str(pull_request.changed_files),
"labels": [label.name for label in pull_request.labels],
"created_at": (
pull_request.created_at.replace(tzinfo=timezone.utc)
if pull_request.created_at
else None
),
"updated_at": (
pull_request.updated_at.replace(tzinfo=timezone.utc)
if pull_request.updated_at
else None
),
"closed_at": (
pull_request.closed_at.replace(tzinfo=timezone.utc)
if pull_request.closed_at
else None
),
"merged_at": (
pull_request.merged_at.replace(tzinfo=timezone.utc)
if pull_request.merged_at
else None
),
"merged_by": (
_get_userinfo(pull_request.merged_by)
if pull_request.merged_by
else None
),
}.items()
if v is not None
},
)

Expand All @@ -252,11 +306,39 @@ def _convert_issue_to_document(issue: Issue) -> Document:
id=issue.html_url,
sections=[TextSection(link=issue.html_url, text=issue.body or "")],
source=DocumentSource.GITHUB,
semantic_identifier=issue.title,
semantic_identifier=f"{issue.number}: {issue.title}",
# updated_at is UTC time but is timezone unaware
doc_updated_at=issue.updated_at.replace(tzinfo=timezone.utc),
metadata={
"state": issue.state,
k: [str(vi) for vi in v] if isinstance(v, list) else str(v)
for k, v in {
"object_type": "Issue",
"id": issue.number,
"state": issue.state,
"user": _get_userinfo(issue.user) if issue.user else None,
"assignees": [_get_userinfo(assignee) for assignee in issue.assignees],
"repo": issue.repository.full_name if issue.repository else None,
"labels": [label.name for label in issue.labels],
"created_at": (
issue.created_at.replace(tzinfo=timezone.utc)
if issue.created_at
else None
),
"updated_at": (
issue.updated_at.replace(tzinfo=timezone.utc)
if issue.updated_at
else None
),
"closed_at": (
issue.closed_at.replace(tzinfo=timezone.utc)
if issue.closed_at
else None
),
"closed_by": (
_get_userinfo(issue.closed_by) if issue.closed_by else None
),
}.items()
if v is not None
},
)

Expand Down
49 changes: 36 additions & 13 deletions backend/tests/daily/connectors/github/test_github_basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,25 +30,48 @@ def test_github_connector_basic(github_connector: GithubConnector) -> None:
start=0,
end=time.time(),
)
assert len(docs) > 0 # We expect at least one PR to exist
assert len(docs) > 1 # We expect at least one PR and one Issue to exist

# Test the first document's structure
doc = docs[0]
pr_doc = docs[0]
issue_doc = docs[-1]

# Verify basic document properties
assert doc.source == DocumentSource.GITHUB
assert doc.secondary_owners is None
assert doc.from_ingestion_api is False
assert doc.additional_info is None
assert pr_doc.source == DocumentSource.GITHUB
assert pr_doc.secondary_owners is None
assert pr_doc.from_ingestion_api is False
assert pr_doc.additional_info is None

# Verify GitHub-specific properties
assert "github.com" in doc.id # Should be a GitHub URL
assert doc.metadata is not None
assert "state" in doc.metadata
assert "merged" in doc.metadata
assert "github.com" in pr_doc.id # Should be a GitHub URL

# Verify PR-specific properties
assert pr_doc.metadata is not None
assert pr_doc.metadata.get("object_type") == "PullRequest"
assert "id" in pr_doc.metadata
assert "merged" in pr_doc.metadata
assert "state" in pr_doc.metadata
assert "user" in pr_doc.metadata
assert "assignees" in pr_doc.metadata
assert pr_doc.metadata.get("repo") == "onyx-dot-app/documentation"
assert "num_commits" in pr_doc.metadata
assert "num_files_changed" in pr_doc.metadata
assert "labels" in pr_doc.metadata
assert "created_at" in pr_doc.metadata

# Verify Issue-specific properties
assert issue_doc.metadata is not None
assert issue_doc.metadata.get("object_type") == "Issue"
assert "id" in issue_doc.metadata
assert "state" in issue_doc.metadata
assert "user" in issue_doc.metadata
assert "assignees" in issue_doc.metadata
assert issue_doc.metadata.get("repo") == "onyx-dot-app/documentation"
assert "labels" in issue_doc.metadata
assert "created_at" in issue_doc.metadata

# Verify sections
assert len(doc.sections) == 1
section = doc.sections[0]
assert section.link == doc.id # Section link should match document ID
assert len(pr_doc.sections) == 1
section = pr_doc.sections[0]
assert section.link == pr_doc.id # Section link should match document ID
assert isinstance(section.text, str) # Should have some text content
Original file line number Diff line number Diff line change
Expand Up @@ -842,7 +842,7 @@ def to_repository_side_effect(
assert all(isinstance(item, Document) for item in outputs[1].items)
assert {
item.semantic_identifier for item in cast(list[Document], outputs[1].items)
} == {"PR 3 Repo 2", "PR 4 Repo 2"}
} == {"3: PR 3 Repo 2", "4: PR 4 Repo 2"}
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This change is here because we add the PR/issue number to the semantic id (it seems a little unnecessary in this test, but normally the title of the issue/PR wouldn't include the number).

This will be important for the knowledge graph (kg), because we want the entity "4: Fixed ABC" to be searchable when the user asks for PR 4.

cp1 = outputs[1].next_checkpoint
assert (
cp1.has_more
Expand All @@ -869,7 +869,7 @@ def to_repository_side_effect(
assert all(isinstance(item, Document) for item in outputs[3].items)
assert {
item.semantic_identifier for item in cast(list[Document], outputs[3].items)
} == {"PR 1 Repo 1", "PR 2 Repo 1"}
} == {"1: PR 1 Repo 1", "2: PR 2 Repo 1"}
cp3 = outputs[3].next_checkpoint
# This checkpoint is returned early because offset had items. has_more reflects state then.
assert cp3.has_more # still need to do issues
Expand Down