-
Notifications
You must be signed in to change notification settings - Fork 2.1k
feat: adding support for markdown files for github #5057
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 7 commits
dfb3ca0
94fc27f
b57c42e
7c18198
f993b49
9b58550
b67b53c
146fbc4
8440591
746d230
9da55be
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -12,6 +12,7 @@ | |
from github import Github | ||
from github import RateLimitExceededException | ||
from github import Repository | ||
from github.ContentFile import ContentFile | ||
from github.GithubException import GithubException | ||
from github.Issue import Issue | ||
from github.NamedUser import NamedUser | ||
|
@@ -343,6 +344,20 @@ def _convert_issue_to_document(issue: Issue) -> Document: | |
) | ||
|
||
|
||
def _convert_file_to_document(file: ContentFile) -> Document:
    """Convert a repository file fetched through PyGithub into a Document.

    ``ContentFile.content`` holds the payload exactly as the GitHub contents
    API returned it, which is base64 text rather than the file's real text.
    The payload is therefore decoded before it is placed in the section.
    When there is no payload the section text is left empty instead of
    raising. NOTE(review): GitHub is documented to omit inline content for
    large files; confirm the size limits that apply here.

    Args:
        file: The repository file to convert.

    Returns:
        A Document with a single text section that links back to the file.
    """
    raw_content = file.content
    if raw_content and file.encoding == "base64":
        # Replace undecodable bytes rather than raising, so that one oddly
        # encoded file cannot fail the whole conversion.
        text = file.decoded_content.decode("utf-8", errors="replace")
    else:
        # No payload, or a payload that is not base64: pass it through as-is.
        text = raw_content or ""

    # Resolve the repository name once so the identifier and the metadata
    # agree and neither dereferences a missing repository.
    repo_name = file.repository.full_name if file.repository else ""

    return Document(
        id=file.html_url,
        sections=[TextSection(link=file.html_url, text=text)],
        source=DocumentSource.GITHUB,
        semantic_identifier=f"{repo_name}/{file.path}" if repo_name else file.path,
        metadata={
            "object_type": "File",
            "repo": repo_name,
            "path": file.path,
        },
    )
|
||
|
||
class SerializedRepository(BaseModel): | ||
# id is part of the raw_data as well, just pulled out for convenience | ||
id: int | ||
|
@@ -359,6 +374,7 @@ class GithubConnectorStage(Enum): | |
START = "start" | ||
PRS = "prs" | ||
ISSUES = "issues" | ||
FILES_MD = "files_md" | ||
|
||
|
||
class GithubConnectorCheckpoint(ConnectorCheckpoint): | ||
|
@@ -402,12 +418,14 @@ def __init__( | |
state_filter: str = "all", | ||
include_prs: bool = True, | ||
include_issues: bool = False, | ||
include_files_md: bool = False, | ||
) -> None: | ||
self.repo_owner = repo_owner | ||
self.repositories = repositories | ||
self.state_filter = state_filter | ||
self.include_prs = include_prs | ||
self.include_issues = include_issues | ||
self.include_files_md = include_files_md | ||
self.github_client: Github | None = None | ||
|
||
def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None: | ||
|
@@ -504,6 +522,30 @@ def _issues_func( | |
state=self.state_filter, sort="updated", direction="desc" | ||
) | ||
|
||
def _files_md_func(self, repo: Repository.Repository) -> list[ContentFile]:
    """Collect every Markdown file in ``repo`` by walking its directory tree.

    The walk is breadth-first from the repository root: directories are
    expanded with ``repo.get_contents`` and files whose name ends in ``.md``
    (compared case-insensitively, so ``README.MD`` is included) are kept.

    Args:
        repo: The repository to scan.

    Returns:
        The Markdown files found, in breadth-first discovery order.
    """
    # Function-scope import keeps this method self-contained; a deque gives
    # O(1) pops from the front, unlike ``list.pop(0)``.
    from collections import deque

    def _as_list(result: ContentFile | list[ContentFile]) -> list[ContentFile]:
        # ``get_contents`` may return a single ContentFile or a list of them;
        # normalise to a list so the caller can treat both alike.
        if isinstance(result, ContentFile):
            return [cast(ContentFile, result)]
        return cast(list[ContentFile], result)

    md_files: list[ContentFile] = []
    pending = deque(_as_list(repo.get_contents("")))

    while pending:
        entry = pending.popleft()
        if entry.type == "dir":
            pending.extend(_as_list(repo.get_contents(entry.path)))
        elif entry.type == "file" and entry.name.lower().endswith(".md"):
            md_files.append(entry)

    return md_files
trial-danswer marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
def _fetch_from_github( | ||
self, | ||
checkpoint: GithubConnectorCheckpoint, | ||
|
@@ -587,6 +629,8 @@ def _fetch_from_github( | |
for pr in pr_batch: | ||
num_prs += 1 | ||
|
||
pr = cast(PullRequest, pr) | ||
|
||
# we iterate backwards in time, so at this point we stop processing prs | ||
if ( | ||
start is not None | ||
|
@@ -603,7 +647,7 @@ def _fetch_from_github( | |
): | ||
continue | ||
try: | ||
yield _convert_pr_to_document(cast(PullRequest, pr)) | ||
yield _convert_pr_to_document(pr) | ||
except Exception as e: | ||
error_msg = f"Error converting PR to document: {e}" | ||
logger.exception(error_msg) | ||
|
@@ -659,14 +703,14 @@ def _fetch_from_github( | |
for issue in issue_batch: | ||
num_issues += 1 | ||
issue = cast(Issue, issue) | ||
# we iterate backwards in time, so at this point we stop processing prs | ||
# we iterate backwards in time, so at this point we stop processing Issues | ||
if ( | ||
start is not None | ||
and issue.updated_at.replace(tzinfo=timezone.utc) < start | ||
): | ||
done_with_issues = True | ||
break | ||
# Skip PRs updated after the end date | ||
# Skip Issues updated after the end date | ||
if ( | ||
end is not None | ||
and issue.updated_at.replace(tzinfo=timezone.utc) > end | ||
|
@@ -700,6 +744,38 @@ def _fetch_from_github( | |
|
||
# if we went past the start date during the loop or there are no more | ||
# issues to get, we move on to the next repo | ||
checkpoint.stage = GithubConnectorStage.FILES_MD | ||
checkpoint.reset() | ||
|
||
checkpoint.stage = GithubConnectorStage.FILES_MD | ||
|
||
if self.include_files_md and checkpoint.stage == GithubConnectorStage.FILES_MD: | ||
logger.info(f"Fetching Markdown files for repo: {repo.name}") | ||
|
||
md_files = self._files_md_func(repo) | ||
|
||
checkpoint.curr_page += 1 | ||
num_files_md = 0 | ||
for file in md_files: | ||
num_files_md += 1 | ||
file = cast(ContentFile, file) | ||
try: | ||
yield _convert_file_to_document(file) | ||
except Exception as e: | ||
error_msg = f"Error converting Markdown file to document: {e}" | ||
logger.exception(error_msg) | ||
yield ConnectorFailure( | ||
failed_document=DocumentFailure( | ||
document_id=str(file.html_url), | ||
document_link=file.html_url, | ||
), | ||
failure_message=error_msg, | ||
exception=e, | ||
) | ||
|
||
continue | ||
|
||
logger.info(f"Fetched {num_files_md} Markdown files for repo: {repo.name}") | ||
checkpoint.stage = GithubConnectorStage.PRS | ||
checkpoint.reset() | ||
trial-danswer marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -5,6 +5,7 @@ | |
|
||
from onyx.configs.constants import DocumentSource | ||
from onyx.connectors.github.connector import GithubConnector | ||
from onyx.connectors.models import Document | ||
from tests.daily.connectors.utils import load_all_docs_from_checkpoint_connector | ||
|
||
|
||
|
@@ -15,6 +16,7 @@ def github_connector() -> GithubConnector: | |
repositories="documentation", | ||
include_prs=True, | ||
include_issues=True, | ||
include_files_md=True, | ||
) | ||
connector.load_credentials( | ||
{ | ||
|
@@ -32,9 +34,16 @@ def test_github_connector_basic(github_connector: GithubConnector) -> None: | |
) | ||
assert len(docs) > 1 # We expect at least one PR and one Issue to exist | ||
|
||
def get_issue_doc(docs: list[Document]) -> Document | None:
    """Return the first document whose metadata marks it as an Issue.

    Args:
        docs: Documents to search, in the order they should be considered.

    Returns:
        The first document whose ``object_type`` metadata equals ``"Issue"``,
        or ``None`` when no document matches.
    """
    issue_docs = (
        candidate
        for candidate in docs
        if candidate.metadata.get("object_type") == "Issue"
    )
    return next(issue_docs, None)
|
||
# Test the first document's structure | ||
pr_doc = docs[0] | ||
issue_doc = docs[-1] | ||
issue_doc = get_issue_doc(docs) | ||
file_doc = docs[-1] | ||
trial-danswer marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
# Verify basic document properties | ||
assert pr_doc.source == DocumentSource.GITHUB | ||
|
@@ -60,6 +69,7 @@ def test_github_connector_basic(github_connector: GithubConnector) -> None: | |
assert "created_at" in pr_doc.metadata | ||
|
||
# Verify Issue-specific properties | ||
assert issue_doc is not None, "Issue document not found" | ||
assert issue_doc.metadata is not None | ||
assert issue_doc.metadata.get("object_type") == "Issue" | ||
assert "id" in issue_doc.metadata | ||
|
@@ -70,6 +80,12 @@ def test_github_connector_basic(github_connector: GithubConnector) -> None: | |
assert "labels" in issue_doc.metadata | ||
assert "created_at" in issue_doc.metadata | ||
|
||
# Verify File-specific properties | ||
assert file_doc.metadata is not None | ||
assert file_doc.metadata.get("object_type") == "File" | ||
assert "repo" in file_doc.metadata | ||
assert "path" in file_doc.metadata | ||
|
||
# Verify sections | ||
assert len(pr_doc.sections) == 1 | ||
section = pr_doc.sections[0] | ||
|
Uh oh!
There was an error while loading. Please reload this page.