Commit 9e9610b

Update GitHub Connector metadata (#4769)
* feat: updated github metadata
* feat: nullity check
* feat: more metadata
* feat: userinfo
* test: connector test + more metadata
* feat: num files changed
* feat: str
* feat: list of str
1 parent f447359 commit 9e9610b
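
At a glance, the change funnels PR and Issue fields through one pattern: gather the raw values into a dict, drop anything that is None, then coerce what remains to str or list[str]. Below is a minimal standalone sketch of that pattern; the helper name _normalize_metadata is ours for illustration and does not appear in the commit.

from typing import Any


def _normalize_metadata(raw: dict[str, Any]) -> dict[str, str | list[str]]:
    # Drop None values, stringify scalars, and stringify each list element,
    # mirroring the comprehension added to _convert_pr_to_document below.
    return {
        k: [str(vi) for vi in v] if isinstance(v, list) else str(v)
        for k, v in raw.items()
        if v is not None
    }


print(_normalize_metadata({"id": 42, "labels": ["bug", "infra"], "merged_by": None}))
# -> {'id': '42', 'labels': ['bug', 'infra']}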

File tree

3 files changed: +125 additions, -20 deletions

backend/onyx/connectors/github/connector.py
backend/tests/daily/connectors/github/test_github_basic.py
backend/tests/unit/onyx/connectors/github/test_github_checkpointing.py

backend/onyx/connectors/github/connector.py

Lines changed: 87 additions & 5 deletions
@@ -14,6 +14,7 @@
 from github import Repository
 from github.GithubException import GithubException
 from github.Issue import Issue
+from github.NamedUser import NamedUser
 from github.PaginatedList import PaginatedList
 from github.PullRequest import PullRequest
 from github.Requester import Requester
@@ -219,14 +220,26 @@ def _get_batch_rate_limited(
     )
 
 
+def _get_userinfo(user: NamedUser) -> dict[str, str]:
+    return {
+        k: v
+        for k, v in {
+            "login": user.login,
+            "name": user.name,
+            "email": user.email,
+        }.items()
+        if v is not None
+    }
+
+
 def _convert_pr_to_document(pull_request: PullRequest) -> Document:
     return Document(
         id=pull_request.html_url,
         sections=[
             TextSection(link=pull_request.html_url, text=pull_request.body or "")
         ],
         source=DocumentSource.GITHUB,
-        semantic_identifier=pull_request.title,
+        semantic_identifier=f"{pull_request.number}: {pull_request.title}",
         # updated_at is UTC time but is timezone unaware, explicitly add UTC
         # as there is logic in indexing to prevent wrong timestamped docs
         # due to local time discrepancies with UTC
@@ -236,8 +249,49 @@ def _convert_pr_to_document(pull_request: PullRequest) -> Document:
             else None
         ),
         metadata={
-            "merged": str(pull_request.merged),
-            "state": pull_request.state,
+            k: [str(vi) for vi in v] if isinstance(v, list) else str(v)
+            for k, v in {
+                "object_type": "PullRequest",
+                "id": pull_request.number,
+                "merged": pull_request.merged,
+                "state": pull_request.state,
+                "user": _get_userinfo(pull_request.user) if pull_request.user else None,
+                "assignees": [
+                    _get_userinfo(assignee) for assignee in pull_request.assignees
+                ],
+                "repo": (
+                    pull_request.base.repo.full_name if pull_request.base else None
+                ),
+                "num_commits": str(pull_request.commits),
+                "num_files_changed": str(pull_request.changed_files),
+                "labels": [label.name for label in pull_request.labels],
+                "created_at": (
+                    pull_request.created_at.replace(tzinfo=timezone.utc)
+                    if pull_request.created_at
+                    else None
+                ),
+                "updated_at": (
+                    pull_request.updated_at.replace(tzinfo=timezone.utc)
+                    if pull_request.updated_at
+                    else None
+                ),
+                "closed_at": (
+                    pull_request.closed_at.replace(tzinfo=timezone.utc)
+                    if pull_request.closed_at
+                    else None
+                ),
+                "merged_at": (
+                    pull_request.merged_at.replace(tzinfo=timezone.utc)
+                    if pull_request.merged_at
+                    else None
+                ),
+                "merged_by": (
+                    _get_userinfo(pull_request.merged_by)
+                    if pull_request.merged_by
+                    else None
+                ),
+            }.items()
+            if v is not None
         },
     )

@@ -252,11 +306,39 @@ def _convert_issue_to_document(issue: Issue) -> Document:
         id=issue.html_url,
         sections=[TextSection(link=issue.html_url, text=issue.body or "")],
         source=DocumentSource.GITHUB,
-        semantic_identifier=issue.title,
+        semantic_identifier=f"{issue.number}: {issue.title}",
         # updated_at is UTC time but is timezone unaware
         doc_updated_at=issue.updated_at.replace(tzinfo=timezone.utc),
         metadata={
-            "state": issue.state,
+            k: [str(vi) for vi in v] if isinstance(v, list) else str(v)
+            for k, v in {
+                "object_type": "Issue",
+                "id": issue.number,
+                "state": issue.state,
+                "user": _get_userinfo(issue.user) if issue.user else None,
+                "assignees": [_get_userinfo(assignee) for assignee in issue.assignees],
+                "repo": issue.repository.full_name if issue.repository else None,
+                "labels": [label.name for label in issue.labels],
+                "created_at": (
+                    issue.created_at.replace(tzinfo=timezone.utc)
+                    if issue.created_at
+                    else None
+                ),
+                "updated_at": (
+                    issue.updated_at.replace(tzinfo=timezone.utc)
+                    if issue.updated_at
+                    else None
+                ),
+                "closed_at": (
+                    issue.closed_at.replace(tzinfo=timezone.utc)
+                    if issue.closed_at
+                    else None
+                ),
+                "closed_by": (
+                    _get_userinfo(issue.closed_by) if issue.closed_by else None
+                ),
+            }.items()
+            if v is not None
         },
     )
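
For intuition about the new _get_userinfo helper: it keeps only the non-None user fields, so documents never carry literal "None" strings for missing names or emails. A rough illustration follows, using a SimpleNamespace stand-in instead of a real github.NamedUser; the stand-in and the sample values are assumptions for the example, not repo code.

from types import SimpleNamespace


def _get_userinfo(user) -> dict[str, str]:
    # Same body as the helper added above, minus the NamedUser type hint.
    return {
        k: v
        for k, v in {
            "login": user.login,
            "name": user.name,
            "email": user.email,
        }.items()
        if v is not None
    }


user = SimpleNamespace(login="octocat", name="The Octocat", email=None)
print(_get_userinfo(user))  # {'login': 'octocat', 'name': 'The Octocat'}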

backend/tests/daily/connectors/github/test_github_basic.py

Lines changed: 36 additions & 13 deletions
@@ -30,25 +30,48 @@ def test_github_connector_basic(github_connector: GithubConnector) -> None:
         start=0,
         end=time.time(),
     )
-    assert len(docs) > 0  # We expect at least one PR to exist
+    assert len(docs) > 1  # We expect at least one PR and one Issue to exist
 
     # Test the first document's structure
-    doc = docs[0]
+    pr_doc = docs[0]
+    issue_doc = docs[-1]
 
     # Verify basic document properties
-    assert doc.source == DocumentSource.GITHUB
-    assert doc.secondary_owners is None
-    assert doc.from_ingestion_api is False
-    assert doc.additional_info is None
+    assert pr_doc.source == DocumentSource.GITHUB
+    assert pr_doc.secondary_owners is None
+    assert pr_doc.from_ingestion_api is False
+    assert pr_doc.additional_info is None
 
     # Verify GitHub-specific properties
-    assert "github.com" in doc.id  # Should be a GitHub URL
-    assert doc.metadata is not None
-    assert "state" in doc.metadata
-    assert "merged" in doc.metadata
+    assert "github.com" in pr_doc.id  # Should be a GitHub URL
+
+    # Verify PR-specific properties
+    assert pr_doc.metadata is not None
+    assert pr_doc.metadata.get("object_type") == "PullRequest"
+    assert "id" in pr_doc.metadata
+    assert "merged" in pr_doc.metadata
+    assert "state" in pr_doc.metadata
+    assert "user" in pr_doc.metadata
+    assert "assignees" in pr_doc.metadata
+    assert pr_doc.metadata.get("repo") == "onyx-dot-app/documentation"
+    assert "num_commits" in pr_doc.metadata
+    assert "num_files_changed" in pr_doc.metadata
+    assert "labels" in pr_doc.metadata
+    assert "created_at" in pr_doc.metadata
+
+    # Verify Issue-specific properties
+    assert issue_doc.metadata is not None
+    assert issue_doc.metadata.get("object_type") == "Issue"
+    assert "id" in issue_doc.metadata
+    assert "state" in issue_doc.metadata
+    assert "user" in issue_doc.metadata
+    assert "assignees" in issue_doc.metadata
+    assert issue_doc.metadata.get("repo") == "onyx-dot-app/documentation"
+    assert "labels" in issue_doc.metadata
+    assert "created_at" in issue_doc.metadata
 
     # Verify sections
-    assert len(doc.sections) == 1
-    section = doc.sections[0]
-    assert section.link == doc.id  # Section link should match document ID
+    assert len(pr_doc.sections) == 1
+    section = pr_doc.sections[0]
+    assert section.link == pr_doc.id  # Section link should match document ID
     assert isinstance(section.text, str)  # Should have some text content
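
Because the connector now coerces every metadata value to str or list[str], these tests check for key presence rather than typed values; booleans and counts arrive as strings. A tiny illustration of the assumed resulting shape, with the values invented for the example:

# Assumed shape only: after the connector's str()/list coercion,
# booleans and numbers appear as strings in Document.metadata.
pr_metadata = {"object_type": "PullRequest", "id": "123", "merged": "True"}
assert pr_metadata.get("merged") == "True"  # the string "True", not the bool
assert isinstance(pr_metadata["id"], str)  # numeric ids are stringified too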

backend/tests/unit/onyx/connectors/github/test_github_checkpointing.py

Lines changed: 2 additions & 2 deletions
@@ -842,7 +842,7 @@ def to_repository_side_effect(
     assert all(isinstance(item, Document) for item in outputs[1].items)
     assert {
         item.semantic_identifier for item in cast(list[Document], outputs[1].items)
-    } == {"PR 3 Repo 2", "PR 4 Repo 2"}
+    } == {"3: PR 3 Repo 2", "4: PR 4 Repo 2"}
     cp1 = outputs[1].next_checkpoint
     assert (
         cp1.has_more
@@ -869,7 +869,7 @@ def to_repository_side_effect(
     assert all(isinstance(item, Document) for item in outputs[3].items)
     assert {
         item.semantic_identifier for item in cast(list[Document], outputs[3].items)
-    } == {"PR 1 Repo 1", "PR 2 Repo 1"}
+    } == {"1: PR 1 Repo 1", "2: PR 2 Repo 1"}
     cp3 = outputs[3].next_checkpoint
     # This checkpoint is returned early because offset had items. has_more reflects state then.
     assert cp3.has_more  # still need to do issues
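
These expected sets change only because semantic_identifier is now prefixed with the PR or issue number. As a quick illustration using this very pull request (#4769) as the data point:

# Illustration of the new identifier format, using this PR's own number/title.
number = 4769
title = "Update GitHub Connector metadata"
print(f"{number}: {title}")  # 4769: Update GitHub Connector metadata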
