From 8e0623242eb32577a324019464a1e8cfb8363c28 Mon Sep 17 00:00:00 2001 From: Rei Meguro <36625832+Orbital-Web@users.noreply.github.com> Date: Fri, 23 May 2025 16:09:58 -0700 Subject: [PATCH 1/8] feat: updated github metadata --- backend/onyx/connectors/github/connector.py | 49 +++++++++++++++++++-- 1 file changed, 46 insertions(+), 3 deletions(-) diff --git a/backend/onyx/connectors/github/connector.py b/backend/onyx/connectors/github/connector.py index 16c24a76b47..d36e582d1d8 100644 --- a/backend/onyx/connectors/github/connector.py +++ b/backend/onyx/connectors/github/connector.py @@ -236,8 +236,30 @@ def _convert_pr_to_document(pull_request: PullRequest) -> Document: else None ), metadata={ - "merged": str(pull_request.merged), - "state": pull_request.state, + k: str(v) + for k, v in { + "merged": pull_request.merged, + "state": pull_request.state, + "assignee": ( + pull_request.assignee.login if pull_request.assignee else None + ), + "created_at": pull_request.created_at.replace(tzinfo=timezone.utc), + "updated_at": ( + pull_request.updated_at.replace(tzinfo=timezone.utc) + if pull_request.updated_at + else None + ), + "merged_at": ( + pull_request.merged_at.replace(tzinfo=timezone.utc) + if pull_request.merged_at + else None + ), + "merged_by": ( + pull_request.merged_by.login if pull_request.merged_by else None + ), + "url": pull_request.url, + }.items() + if v is not None }, ) @@ -256,7 +278,28 @@ def _convert_issue_to_document(issue: Issue) -> Document: # updated_at is UTC time but is timezone unaware doc_updated_at=issue.updated_at.replace(tzinfo=timezone.utc), metadata={ - "state": issue.state, + k: str(v) + for k, v in { + "state": issue.state, + "assignee": issue.assignee.login if issue.assignee else None, + "created_at": issue.created_at.replace(tzinfo=timezone.utc), + "updated_at": ( + issue.updated_at.replace(tzinfo=timezone.utc) + if issue.updated_at + else None + ), + "closed_at": ( + issue.closed_at.replace(tzinfo=timezone.utc) + if issue.closed_at + else None + ), + "closed_by": issue.closed_by.login if issue.closed_by else None, + "labels": ( + [label.name for label in issue.labels] if issue.labels else [] + ), + "url": issue.url, + }.items() + if v is not None }, ) From c8b4c411dd67e3bf1b8dd81ca6b0dc69f8e31346 Mon Sep 17 00:00:00 2001 From: Rei Meguro <36625832+Orbital-Web@users.noreply.github.com> Date: Fri, 23 May 2025 16:15:47 -0700 Subject: [PATCH 2/8] feat: nullity check --- backend/onyx/connectors/github/connector.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/backend/onyx/connectors/github/connector.py b/backend/onyx/connectors/github/connector.py index d36e582d1d8..1c0574815a4 100644 --- a/backend/onyx/connectors/github/connector.py +++ b/backend/onyx/connectors/github/connector.py @@ -243,7 +243,11 @@ def _convert_pr_to_document(pull_request: PullRequest) -> Document: "assignee": ( pull_request.assignee.login if pull_request.assignee else None ), - "created_at": pull_request.created_at.replace(tzinfo=timezone.utc), + "created_at": ( + pull_request.created_at.replace(tzinfo=timezone.utc) + if pull_request.created_at + else None + ), "updated_at": ( pull_request.updated_at.replace(tzinfo=timezone.utc) if pull_request.updated_at @@ -282,7 +286,11 @@ def _convert_issue_to_document(issue: Issue) -> Document: for k, v in { "state": issue.state, "assignee": issue.assignee.login if issue.assignee else None, - "created_at": issue.created_at.replace(tzinfo=timezone.utc), + "created_at": ( + issue.created_at.replace(tzinfo=timezone.utc) + if issue.created_at + else None + ), "updated_at": ( issue.updated_at.replace(tzinfo=timezone.utc) if issue.updated_at From e85ccce9a8aa6649e132d8613b153f4aead46ca1 Mon Sep 17 00:00:00 2001 From: Rei Meguro <36625832+Orbital-Web@users.noreply.github.com> Date: Thu, 29 May 2025 13:01:12 -0700 Subject: [PATCH 3/8] feat: more metadata --- backend/onyx/connectors/github/connector.py | 46 ++++++++++++++++----- 1 file changed, 35 insertions(+), 11 deletions(-) diff --git a/backend/onyx/connectors/github/connector.py b/backend/onyx/connectors/github/connector.py index 1c0574815a4..77e71e0fe6e 100644 --- a/backend/onyx/connectors/github/connector.py +++ b/backend/onyx/connectors/github/connector.py @@ -226,7 +226,7 @@ def _convert_pr_to_document(pull_request: PullRequest) -> Document: TextSection(link=pull_request.html_url, text=pull_request.body or "") ], source=DocumentSource.GITHUB, - semantic_identifier=pull_request.title, + semantic_identifier=f"{pull_request.number}: {pull_request.title}", # updated_at is UTC time but is timezone unaware, explicitly add UTC # as there is logic in indexing to prevent wrong timestamped docs # due to local time discrepancies with UTC @@ -240,9 +240,20 @@ def _convert_pr_to_document(pull_request: PullRequest) -> Document: for k, v in { "merged": pull_request.merged, "state": pull_request.state, - "assignee": ( - pull_request.assignee.login if pull_request.assignee else None + "user": ( + pull_request.user.email or pull_request.user.login + if pull_request.user + else None + ), + "assignees": [ + assignee.email or assignee.login + for assignee in pull_request.assignees + ], + "repo": ( + pull_request.base.repo.full_name if pull_request.base else None ), + "num_commits": pull_request.commits, + "labels": [label.name for label in pull_request.labels], "created_at": ( pull_request.created_at.replace(tzinfo=timezone.utc) if pull_request.created_at @@ -253,15 +264,22 @@ def _convert_pr_to_document(pull_request: PullRequest) -> Document: if pull_request.updated_at else None ), + "closed_at": ( + pull_request.closed_at.replace(tzinfo=timezone.utc) + if pull_request.closed_at + else None + ), "merged_at": ( pull_request.merged_at.replace(tzinfo=timezone.utc) if pull_request.merged_at else None ), "merged_by": ( - pull_request.merged_by.login if pull_request.merged_by else None + pull_request.merged_by.email or pull_request.merged_by.login + if pull_request.merged_by + else None ), - "url": pull_request.url, + "url": pull_request.html_url, }.items() if v is not None }, @@ -278,14 +296,19 @@ def _convert_issue_to_document(issue: Issue) -> Document: id=issue.html_url, sections=[TextSection(link=issue.html_url, text=issue.body or "")], source=DocumentSource.GITHUB, - semantic_identifier=issue.title, + semantic_identifier=f"{issue.number}: {issue.title}", # updated_at is UTC time but is timezone unaware doc_updated_at=issue.updated_at.replace(tzinfo=timezone.utc), metadata={ k: str(v) for k, v in { "state": issue.state, - "assignee": issue.assignee.login if issue.assignee else None, + "user": (issue.user.email or issue.user.login if issue.user else None), + "assignees": [ + assignee.email or assignee.login for assignee in issue.assignees + ], + "repo": issue.repository.full_name if issue.repository else None, + "labels": [label.name for label in issue.labels], "created_at": ( issue.created_at.replace(tzinfo=timezone.utc) if issue.created_at @@ -301,11 +324,12 @@ def _convert_issue_to_document(issue: Issue) -> Document: if issue.closed_at else None ), - "closed_by": issue.closed_by.login if issue.closed_by else None, - "labels": ( - [label.name for label in issue.labels] if issue.labels else [] + "closed_by": ( + issue.closed_by.email or issue.closed_by.login + if issue.closed_by + else None ), - "url": issue.url, + "url": issue.html_url, }.items() if v is not None }, From dadf4a506b2129f98759f5890543a98440153332 Mon Sep 17 00:00:00 2001 From: Rei Meguro <36625832+Orbital-Web@users.noreply.github.com> Date: Thu, 29 May 2025 14:05:47 -0700 Subject: [PATCH 4/8] feat: userinfo --- backend/onyx/connectors/github/connector.py | 46 +++++++++++-------- .../github/test_github_checkpointing.py | 4 +- 2 files changed, 28 insertions(+), 22 deletions(-) diff --git a/backend/onyx/connectors/github/connector.py b/backend/onyx/connectors/github/connector.py index 77e71e0fe6e..cf0627d5425 100644 --- a/backend/onyx/connectors/github/connector.py +++ b/backend/onyx/connectors/github/connector.py @@ -14,6 +14,7 @@ from github import Repository from github.GithubException import GithubException from github.Issue import Issue +from github.NamedUser import NamedUser from github.PaginatedList import PaginatedList from github.PullRequest import PullRequest from github.Requester import Requester @@ -219,6 +220,18 @@ def _get_batch_rate_limited( ) +def _get_userinfo(user: NamedUser) -> dict[str, str]: + return { + k: v + for k, v in { + "login": user.login, + "name": user.name, + "email": user.email, + }.items() + if v is not None + } + + def _convert_pr_to_document(pull_request: PullRequest) -> Document: return Document( id=pull_request.html_url, @@ -236,50 +249,46 @@ def _convert_pr_to_document(pull_request: PullRequest) -> Document: else None ), metadata={ - k: str(v) + k: v for k, v in { - "merged": pull_request.merged, + "merged": str(pull_request.merged), "state": pull_request.state, "user": ( - pull_request.user.email or pull_request.user.login - if pull_request.user - else None + str(_get_userinfo(pull_request.user)) if pull_request.user else None ), "assignees": [ - assignee.email or assignee.login - for assignee in pull_request.assignees + str(_get_userinfo(assignee)) for assignee in pull_request.assignees ], "repo": ( pull_request.base.repo.full_name if pull_request.base else None ), - "num_commits": pull_request.commits, + "num_commits": str(pull_request.commits), "labels": [label.name for label in pull_request.labels], "created_at": ( - pull_request.created_at.replace(tzinfo=timezone.utc) + str(pull_request.created_at.replace(tzinfo=timezone.utc)) if pull_request.created_at else None ), "updated_at": ( - pull_request.updated_at.replace(tzinfo=timezone.utc) + str(pull_request.updated_at.replace(tzinfo=timezone.utc)) if pull_request.updated_at else None ), "closed_at": ( - pull_request.closed_at.replace(tzinfo=timezone.utc) + str(pull_request.closed_at.replace(tzinfo=timezone.utc)) if pull_request.closed_at else None ), "merged_at": ( - pull_request.merged_at.replace(tzinfo=timezone.utc) + str(pull_request.merged_at.replace(tzinfo=timezone.utc)) if pull_request.merged_at else None ), "merged_by": ( - pull_request.merged_by.email or pull_request.merged_by.login + str(_get_userinfo(pull_request.merged_by)) if pull_request.merged_by else None ), - "url": pull_request.html_url, }.items() if v is not None }, @@ -303,9 +312,9 @@ def _convert_issue_to_document(issue: Issue) -> Document: k: str(v) for k, v in { "state": issue.state, - "user": (issue.user.email or issue.user.login if issue.user else None), + "user": str(_get_userinfo(issue.user)) if issue.user else None, "assignees": [ - assignee.email or assignee.login for assignee in issue.assignees + str(_get_userinfo(assignee)) for assignee in issue.assignees ], "repo": issue.repository.full_name if issue.repository else None, "labels": [label.name for label in issue.labels], @@ -325,11 +334,8 @@ def _convert_issue_to_document(issue: Issue) -> Document: else None ), "closed_by": ( - issue.closed_by.email or issue.closed_by.login - if issue.closed_by - else None + str(_get_userinfo(issue.closed_by)) if issue.closed_by else None ), - "url": issue.html_url, }.items() if v is not None }, diff --git a/backend/tests/unit/onyx/connectors/github/test_github_checkpointing.py b/backend/tests/unit/onyx/connectors/github/test_github_checkpointing.py index 055b3840d30..e79f8f89a7a 100644 --- a/backend/tests/unit/onyx/connectors/github/test_github_checkpointing.py +++ b/backend/tests/unit/onyx/connectors/github/test_github_checkpointing.py @@ -842,7 +842,7 @@ def to_repository_side_effect( assert all(isinstance(item, Document) for item in outputs[1].items) assert { item.semantic_identifier for item in cast(list[Document], outputs[1].items) - } == {"PR 3 Repo 2", "PR 4 Repo 2"} + } == {"3: PR 3 Repo 2", "4: PR 4 Repo 2"} cp1 = outputs[1].next_checkpoint assert ( cp1.has_more @@ -869,7 +869,7 @@ def to_repository_side_effect( assert all(isinstance(item, Document) for item in outputs[3].items) assert { item.semantic_identifier for item in cast(list[Document], outputs[3].items) - } == {"PR 1 Repo 1", "PR 2 Repo 1"} + } == {"1: PR 1 Repo 1", "2: PR 2 Repo 1"} cp3 = outputs[3].next_checkpoint # This checkpoint is returned early because offset had items. has_more reflects state then. assert cp3.has_more # still need to do issues From a4d6b30e70c48f1b8fe58553be2cc048129ac1de Mon Sep 17 00:00:00 2001 From: Rei Meguro <36625832+Orbital-Web@users.noreply.github.com> Date: Thu, 29 May 2025 23:34:12 -0700 Subject: [PATCH 5/8] test: connector test + more metadata --- backend/onyx/connectors/github/connector.py | 4 ++ .../connectors/github/test_github_basic.py | 48 ++++++++++++++----- 2 files changed, 39 insertions(+), 13 deletions(-) diff --git a/backend/onyx/connectors/github/connector.py b/backend/onyx/connectors/github/connector.py index cf0627d5425..714c97eef33 100644 --- a/backend/onyx/connectors/github/connector.py +++ b/backend/onyx/connectors/github/connector.py @@ -251,6 +251,8 @@ def _convert_pr_to_document(pull_request: PullRequest) -> Document: metadata={ k: v for k, v in { + "object_type": "PullRequest", + "id": str(pull_request.number), "merged": str(pull_request.merged), "state": pull_request.state, "user": ( @@ -311,6 +313,8 @@ def _convert_issue_to_document(issue: Issue) -> Document: metadata={ k: str(v) for k, v in { + "object_type": "Issue", + "id": str(issue.number), "state": issue.state, "user": str(_get_userinfo(issue.user)) if issue.user else None, "assignees": [ diff --git a/backend/tests/daily/connectors/github/test_github_basic.py b/backend/tests/daily/connectors/github/test_github_basic.py index 235352cc5ef..e9e1fbdb6db 100644 --- a/backend/tests/daily/connectors/github/test_github_basic.py +++ b/backend/tests/daily/connectors/github/test_github_basic.py @@ -30,25 +30,47 @@ def test_github_connector_basic(github_connector: GithubConnector) -> None: start=0, end=time.time(), ) - assert len(docs) > 0 # We expect at least one PR to exist + assert len(docs) > 1 # We expect at least one PR and one Issue to exist # Test the first document's structure - doc = docs[0] + pr_doc = docs[0] + issue_doc = docs[-1] # Verify basic document properties - assert doc.source == DocumentSource.GITHUB - assert doc.secondary_owners is None - assert doc.from_ingestion_api is False - assert doc.additional_info is None + assert pr_doc.source == DocumentSource.GITHUB + assert pr_doc.secondary_owners is None + assert pr_doc.from_ingestion_api is False + assert pr_doc.additional_info is None # Verify GitHub-specific properties - assert "github.com" in doc.id # Should be a GitHub URL - assert doc.metadata is not None - assert "state" in doc.metadata - assert "merged" in doc.metadata + assert "github.com" in pr_doc.id # Should be a GitHub URL + + # Verify PR-specific properties + assert pr_doc.metadata is not None + assert pr_doc.metadata.get("object_type") == "PullRequest" + assert "id" in pr_doc.metadata + assert "merged" in pr_doc.metadata + assert "state" in pr_doc.metadata + assert "user" in pr_doc.metadata + assert "assignees" in pr_doc.metadata + assert pr_doc.metadata.get("repo") == "onyx-dot-app/documentation" + assert "num_commits" in pr_doc.metadata + assert "labels" in pr_doc.metadata + assert "created_at" in pr_doc.metadata + + # Verify Issue-specific properties + assert issue_doc.metadata is not None + assert issue_doc.metadata.get("object_type") == "Issue" + assert "id" in issue_doc.metadata + assert "state" in issue_doc.metadata + assert "user" in issue_doc.metadata + assert "assignees" in issue_doc.metadata + assert issue_doc.metadata.get("repo") == "onyx-dot-app/documentation" + assert "labels" in issue_doc.metadata + assert "created_at" in issue_doc.metadata # Verify sections - assert len(doc.sections) == 1 - section = doc.sections[0] - assert section.link == doc.id # Section link should match document ID + assert len(pr_doc.sections) == 1 + section = pr_doc.sections[0] + assert section.link == pr_doc.id # Section link should match document ID assert isinstance(section.text, str) # Should have some text content From d0feb17ec443b21c652f7f78476839d59da3a338 Mon Sep 17 00:00:00 2001 From: Rei Meguro <36625832+Orbital-Web@users.noreply.github.com> Date: Fri, 30 May 2025 00:01:24 -0700 Subject: [PATCH 6/8] feat: num files changed --- backend/onyx/connectors/github/connector.py | 1 + backend/tests/daily/connectors/github/test_github_basic.py | 1 + 2 files changed, 2 insertions(+) diff --git a/backend/onyx/connectors/github/connector.py b/backend/onyx/connectors/github/connector.py index 714c97eef33..72a19b7f2a5 100644 --- a/backend/onyx/connectors/github/connector.py +++ b/backend/onyx/connectors/github/connector.py @@ -265,6 +265,7 @@ def _convert_pr_to_document(pull_request: PullRequest) -> Document: pull_request.base.repo.full_name if pull_request.base else None ), "num_commits": str(pull_request.commits), + "num_files_changed": str(pull_request.changed_files), "labels": [label.name for label in pull_request.labels], "created_at": ( str(pull_request.created_at.replace(tzinfo=timezone.utc)) diff --git a/backend/tests/daily/connectors/github/test_github_basic.py b/backend/tests/daily/connectors/github/test_github_basic.py index e9e1fbdb6db..75ad30a1ca8 100644 --- a/backend/tests/daily/connectors/github/test_github_basic.py +++ b/backend/tests/daily/connectors/github/test_github_basic.py @@ -55,6 +55,7 @@ def test_github_connector_basic(github_connector: GithubConnector) -> None: assert "assignees" in pr_doc.metadata assert pr_doc.metadata.get("repo") == "onyx-dot-app/documentation" assert "num_commits" in pr_doc.metadata + assert "num_files_changed" in pr_doc.metadata assert "labels" in pr_doc.metadata assert "created_at" in pr_doc.metadata From b382ab6d1e21666affdade760d3c5d9fd5b71200 Mon Sep 17 00:00:00 2001 From: Rei Meguro <36625832+Orbital-Web@users.noreply.github.com> Date: Tue, 3 Jun 2025 15:09:13 -0700 Subject: [PATCH 7/8] feat str --- backend/onyx/connectors/github/connector.py | 32 +++++++++------------ 1 file changed, 14 insertions(+), 18 deletions(-) diff --git a/backend/onyx/connectors/github/connector.py b/backend/onyx/connectors/github/connector.py index 72a19b7f2a5..fa2ce6335d3 100644 --- a/backend/onyx/connectors/github/connector.py +++ b/backend/onyx/connectors/github/connector.py @@ -249,17 +249,15 @@ def _convert_pr_to_document(pull_request: PullRequest) -> Document: else None ), metadata={ - k: v + k: str(v) for k, v in { "object_type": "PullRequest", - "id": str(pull_request.number), - "merged": str(pull_request.merged), + "id": pull_request.number, + "merged": pull_request.merged, "state": pull_request.state, - "user": ( - str(_get_userinfo(pull_request.user)) if pull_request.user else None - ), + "user": _get_userinfo(pull_request.user) if pull_request.user else None, "assignees": [ - str(_get_userinfo(assignee)) for assignee in pull_request.assignees + _get_userinfo(assignee) for assignee in pull_request.assignees ], "repo": ( pull_request.base.repo.full_name if pull_request.base else None @@ -268,27 +266,27 @@ def _convert_pr_to_document(pull_request: PullRequest) -> Document: "num_files_changed": str(pull_request.changed_files), "labels": [label.name for label in pull_request.labels], "created_at": ( - str(pull_request.created_at.replace(tzinfo=timezone.utc)) + pull_request.created_at.replace(tzinfo=timezone.utc) if pull_request.created_at else None ), "updated_at": ( - str(pull_request.updated_at.replace(tzinfo=timezone.utc)) + pull_request.updated_at.replace(tzinfo=timezone.utc) if pull_request.updated_at else None ), "closed_at": ( - str(pull_request.closed_at.replace(tzinfo=timezone.utc)) + pull_request.closed_at.replace(tzinfo=timezone.utc) if pull_request.closed_at else None ), "merged_at": ( - str(pull_request.merged_at.replace(tzinfo=timezone.utc)) + pull_request.merged_at.replace(tzinfo=timezone.utc) if pull_request.merged_at else None ), "merged_by": ( - str(_get_userinfo(pull_request.merged_by)) + _get_userinfo(pull_request.merged_by) if pull_request.merged_by else None ), @@ -315,12 +313,10 @@ def _convert_issue_to_document(issue: Issue) -> Document: k: str(v) for k, v in { "object_type": "Issue", - "id": str(issue.number), + "id": issue.number, "state": issue.state, - "user": str(_get_userinfo(issue.user)) if issue.user else None, - "assignees": [ - str(_get_userinfo(assignee)) for assignee in issue.assignees - ], + "user": _get_userinfo(issue.user) if issue.user else None, + "assignees": [_get_userinfo(assignee) for assignee in issue.assignees], "repo": issue.repository.full_name if issue.repository else None, "labels": [label.name for label in issue.labels], "created_at": ( @@ -339,7 +335,7 @@ def _convert_issue_to_document(issue: Issue) -> Document: else None ), "closed_by": ( - str(_get_userinfo(issue.closed_by)) if issue.closed_by else None + _get_userinfo(issue.closed_by) if issue.closed_by else None ), }.items() if v is not None From 48791876a98f027ec19b0848572311e03ca0a2f0 Mon Sep 17 00:00:00 2001 From: Rei Meguro <36625832+Orbital-Web@users.noreply.github.com> Date: Tue, 3 Jun 2025 16:35:29 -0700 Subject: [PATCH 8/8] feat: list of str --- backend/onyx/connectors/github/connector.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/backend/onyx/connectors/github/connector.py b/backend/onyx/connectors/github/connector.py index fa2ce6335d3..c3695fdaf51 100644 --- a/backend/onyx/connectors/github/connector.py +++ b/backend/onyx/connectors/github/connector.py @@ -249,7 +249,7 @@ def _convert_pr_to_document(pull_request: PullRequest) -> Document: else None ), metadata={ - k: str(v) + k: [str(vi) for vi in v] if isinstance(v, list) else str(v) for k, v in { "object_type": "PullRequest", "id": pull_request.number, @@ -310,7 +310,7 @@ def _convert_issue_to_document(issue: Issue) -> Document: # updated_at is UTC time but is timezone unaware doc_updated_at=issue.updated_at.replace(tzinfo=timezone.utc), metadata={ - k: str(v) + k: [str(vi) for vi in v] if isinstance(v, list) else str(v) for k, v in { "object_type": "Issue", "id": issue.number,