diff --git a/backend/onyx/connectors/github/connector.py b/backend/onyx/connectors/github/connector.py index 54fd1ea7d3f..ee45ded174a 100644 --- a/backend/onyx/connectors/github/connector.py +++ b/backend/onyx/connectors/github/connector.py @@ -40,7 +40,7 @@ logger = setup_logger() ITEMS_PER_PAGE = 100 -CURSOR_LOG_FREQUENCY = 100 +CURSOR_LOG_FREQUENCY = 50 _MAX_NUM_RATE_LIMIT_RETRIES = 5 @@ -118,7 +118,7 @@ def _paginate_until_error( "This will retrieve all pages before the one we are resuming from, " "which may take a while and consume many API calls." ) - pag_list = pag_list[prev_num_objs:] + pag_list = cast(PaginatedList[PullRequest | Issue], pag_list[prev_num_objs:]) num_objs = 0 try: @@ -297,6 +297,19 @@ def reset(self) -> None: self.cursor_url = None +def make_cursor_url_callback( + checkpoint: GithubConnectorCheckpoint, +) -> Callable[[str | None, int], None]: + def cursor_url_callback(cursor_url: str | None, num_objs: int) -> None: + # we want to maintain the old cursor url so code after retrieval + # can determine that we are using the fallback cursor-based pagination strategy + if cursor_url: + checkpoint.cursor_url = cursor_url + checkpoint.num_retrieved = num_objs + + return cursor_url_callback + + class GithubConnector(CheckpointedConnector[GithubConnectorCheckpoint]): def __init__( self, @@ -393,6 +406,20 @@ def _get_all_repos( _sleep_after_rate_limit_exception(github_client) return self._get_all_repos(github_client, attempt_num + 1) + def _pull_requests_func( + self, repo: Repository.Repository + ) -> Callable[[], PaginatedList[PullRequest]]: + return lambda: repo.get_pulls( + state=self.state_filter, sort="updated", direction="desc" + ) + + def _issues_func( + self, repo: Repository.Repository + ) -> Callable[[], PaginatedList[Issue]]: + return lambda: repo.get_issues( + state=self.state_filter, sort="updated", direction="desc" + ) + def _fetch_from_github( self, checkpoint: GithubConnectorCheckpoint, @@ -433,7 +460,8 @@ def _fetch_from_github( # save checkpoint with repo ids retrieved return checkpoint - assert checkpoint.cached_repo is not None, "No repo saved in checkpoint" + if checkpoint.cached_repo is None: + raise ValueError("No repo saved in checkpoint") # Try to access the requester - different PyGithub versions may use different attribute names try: @@ -455,22 +483,13 @@ def _fetch_from_github( repo_id = checkpoint.cached_repo.id repo = self.github_client.get_repo(repo_id) - def cursor_url_callback(cursor_url: str | None, num_objs: int) -> None: - checkpoint.cursor_url = cursor_url - checkpoint.num_retrieved = num_objs + cursor_url_callback = make_cursor_url_callback(checkpoint) - # TODO: all PRs are also issues, so we should be able to _only_ get issues - # and then filter appropriately whenever include_issues is True if self.include_prs and checkpoint.stage == GithubConnectorStage.PRS: logger.info(f"Fetching PRs for repo: {repo.name}") - def pull_requests_func() -> PaginatedList[PullRequest]: - return repo.get_pulls( - state=self.state_filter, sort="updated", direction="desc" - ) - pr_batch = _get_batch_rate_limited( - pull_requests_func, + self._pull_requests_func(repo), checkpoint.curr_page, checkpoint.cursor_url, checkpoint.num_retrieved, @@ -521,7 +540,9 @@ def pull_requests_func() -> PaginatedList[PullRequest]: # if we found any PRs on the page and there are more PRs to get, return the checkpoint. # In offset mode, while indexing without time constraints, the pr batch # will be empty when we're done. - if num_prs > 0 and not done_with_prs and not checkpoint.cursor_url: + used_cursor = checkpoint.cursor_url is not None + logger.info(f"Fetched {num_prs} PRs for repo: {repo.name}") + if num_prs > 0 and not done_with_prs and not used_cursor: return checkpoint # if we went past the start date during the loop or there are no more @@ -529,7 +550,7 @@ def pull_requests_func() -> PaginatedList[PullRequest]: checkpoint.stage = GithubConnectorStage.ISSUES checkpoint.reset() - if checkpoint.cursor_url: + if used_cursor: # save the checkpoint after changing stage; next run will continue from issues return checkpoint @@ -538,14 +559,9 @@ def pull_requests_func() -> PaginatedList[PullRequest]: if self.include_issues and checkpoint.stage == GithubConnectorStage.ISSUES: logger.info(f"Fetching issues for repo: {repo.name}") - def issues_func() -> PaginatedList[Issue]: - return repo.get_issues( - state=self.state_filter, sort="updated", direction="desc" - ) - issue_batch = list( _get_batch_rate_limited( - issues_func, + self._issues_func(repo), checkpoint.curr_page, checkpoint.cursor_url, checkpoint.num_retrieved, @@ -575,7 +591,6 @@ def issues_func() -> PaginatedList[Issue]: if issue.pull_request is not None: # PRs are handled separately - # TODO: but they shouldn't always be continue try: @@ -593,6 +608,7 @@ def issues_func() -> PaginatedList[Issue]: ) continue + logger.info(f"Fetched {num_issues} issues for repo: {repo.name}") # if we found any issues on the page, and we're not done, return the checkpoint. # don't return if we're using cursor-based pagination to avoid infinite loops if num_issues > 0 and not done_with_issues and not checkpoint.cursor_url: @@ -613,6 +629,8 @@ def issues_func() -> PaginatedList[Issue]: raw_data=next_repo.raw_data, ) + logger.info(f"{len(checkpoint.cached_repo_ids)} repos remaining") + return checkpoint @override diff --git a/backend/requirements/dev.txt b/backend/requirements/dev.txt index 3355a8bcbc8..3fe44dd9042 100644 --- a/backend/requirements/dev.txt +++ b/backend/requirements/dev.txt @@ -6,7 +6,7 @@ faker==37.1.0 lxml==5.3.0 lxml_html_clean==0.2.2 mypy-extensions==1.0.0 -mypy==1.15.0 +mypy==1.13.0 pandas-stubs==2.2.3.241009 pandas==2.2.3 posthog==3.7.4 diff --git a/backend/tests/unit/onyx/connectors/github/test_github_checkpointing.py b/backend/tests/unit/onyx/connectors/github/test_github_checkpointing.py index 8863d58ee20..055b3840d30 100644 --- a/backend/tests/unit/onyx/connectors/github/test_github_checkpointing.py +++ b/backend/tests/unit/onyx/connectors/github/test_github_checkpointing.py @@ -12,6 +12,7 @@ from github import RateLimitExceededException from github.GithubException import GithubException from github.Issue import Issue +from github.PaginatedList import PaginatedList from github.PullRequest import PullRequest from github.RateLimit import RateLimit from github.Repository import Repository @@ -21,6 +22,7 @@ from onyx.connectors.exceptions import CredentialExpiredError from onyx.connectors.exceptions import InsufficientPermissionsError from onyx.connectors.github.connector import GithubConnector +from onyx.connectors.github.connector import GithubConnectorStage from onyx.connectors.github.connector import SerializedRepository from onyx.connectors.models import Document from tests.unit.onyx.connectors.utils import load_everything_from_checkpoint_connector @@ -43,31 +45,31 @@ def repositories() -> str: def mock_github_client() -> MagicMock: """Create a mock GitHub client with proper typing""" mock = MagicMock(spec=Github) - # Add proper return typing for get_repo method - mock.get_repo = MagicMock(return_value=MagicMock(spec=Repository)) - # Add proper return typing for get_organization method + mock.get_repo = MagicMock() mock.get_organization = MagicMock() - # Add proper return typing for get_user method mock.get_user = MagicMock() - # Add proper return typing for get_rate_limit method mock.get_rate_limit = MagicMock(return_value=MagicMock(spec=RateLimit)) - # Add requester for repository deserialization - mock.requester = MagicMock(spec=Requester) + mock._requester = MagicMock(spec=Requester) return mock @pytest.fixture -def github_connector( +def build_github_connector( repo_owner: str, repositories: str, mock_github_client: MagicMock -) -> Generator[GithubConnector, None, None]: - connector = GithubConnector( - repo_owner=repo_owner, - repositories=repositories, - include_prs=True, - include_issues=True, - ) - connector.github_client = mock_github_client - yield connector +) -> Generator[Callable[..., GithubConnector], None, None]: + def _github_connector( + repo_owner: str = repo_owner, repositories: str = repositories + ) -> GithubConnector: + connector = GithubConnector( + repo_owner=repo_owner, + repositories=repositories, + include_prs=True, + include_issues=True, + ) + connector.github_client = mock_github_client + return connector + + yield _github_connector @pytest.fixture @@ -79,6 +81,7 @@ def _create_mock_pr( state: str = "open", merged: bool = False, updated_at: datetime = datetime(2023, 1, 1, tzinfo=timezone.utc), + html_url: str | None = None, ) -> MagicMock: """Helper to create a mock PullRequest object""" mock_pr = MagicMock(spec=PullRequest) @@ -88,7 +91,12 @@ def _create_mock_pr( mock_pr.state = state mock_pr.merged = merged mock_pr.updated_at = updated_at - mock_pr.html_url = f"https://github.com/test-org/test-repo/pull/{number}" + mock_pr.html_url = ( + html_url + if html_url is not None + else f"https://github.com/test-org/test-repo/pull/{number}" + ) + mock_pr.raw_data = {} return mock_pr return _create_mock_pr @@ -112,6 +120,7 @@ def _create_mock_issue( mock_issue.updated_at = updated_at mock_issue.html_url = f"https://github.com/test-org/test-repo/issues/{number}" mock_issue.pull_request = None # Not a PR + mock_issue.raw_data = {} return mock_issue return _create_mock_issue @@ -123,25 +132,32 @@ def _create_mock_repo( name: str = "test-repo", id: int = 1, ) -> MagicMock: - """Helper to create a mock Repository object""" - mock_repo = MagicMock(spec=Repository) + mock_repo = MagicMock() mock_repo.name = name mock_repo.id = id - mock_repo.raw_headers = {"status": "200 OK", "content-type": "application/json"} - mock_repo.raw_data = { - "id": str(id), + + headers_dict = {"status": "200 OK", "content-type": "application/json"} + data_dict = { + "id": id, "name": name, "full_name": f"test-org/{name}", - "private": str(False), + "private": False, "description": "Test repository", } + + mock_repo.configure_mock(raw_headers=headers_dict, raw_data=data_dict) + + mock_repo.get_pulls = MagicMock() + mock_repo.get_issues = MagicMock() + mock_repo.get_contents = MagicMock() + return mock_repo return _create_mock_repo def test_load_from_checkpoint_happy_path( - github_connector: GithubConnector, + build_github_connector: Callable[..., GithubConnector], mock_github_client: MagicMock, create_mock_repo: Callable[..., MagicMock], create_mock_pr: Callable[..., MagicMock], @@ -149,6 +165,7 @@ def test_load_from_checkpoint_happy_path( ) -> None: """Test loading from checkpoint - happy path""" # Set up mocked repo + github_connector = build_github_connector() mock_repo = create_mock_repo() github_connector.github_client = mock_github_client mock_github_client.get_repo.return_value = mock_repo @@ -215,13 +232,14 @@ def test_load_from_checkpoint_happy_path( def test_load_from_checkpoint_with_rate_limit( - github_connector: GithubConnector, + build_github_connector: Callable[..., GithubConnector], mock_github_client: MagicMock, create_mock_repo: Callable[..., MagicMock], create_mock_pr: Callable[..., MagicMock], ) -> None: """Test loading from checkpoint with rate limit handling""" # Set up mocked repo + github_connector = build_github_connector() mock_repo = create_mock_repo() github_connector.github_client = mock_github_client mock_github_client.get_repo.return_value = mock_repo @@ -265,13 +283,14 @@ def test_load_from_checkpoint_with_rate_limit( def test_load_from_checkpoint_with_empty_repo( - github_connector: GithubConnector, + build_github_connector: Callable[..., GithubConnector], mock_github_client: MagicMock, create_mock_repo: Callable[..., MagicMock], ) -> None: """Test loading from checkpoint with an empty repository""" # Set up mocked repo mock_repo = create_mock_repo() + github_connector = build_github_connector() github_connector.github_client = mock_github_client mock_github_client.get_repo.return_value = mock_repo @@ -296,13 +315,14 @@ def test_load_from_checkpoint_with_empty_repo( def test_load_from_checkpoint_with_prs_only( - github_connector: GithubConnector, + build_github_connector: Callable[..., GithubConnector], mock_github_client: MagicMock, create_mock_repo: Callable[..., MagicMock], create_mock_pr: Callable[..., MagicMock], ) -> None: """Test loading from checkpoint with only PRs enabled""" # Configure connector to only include PRs + github_connector = build_github_connector() github_connector.include_prs = True github_connector.include_issues = False @@ -341,13 +361,14 @@ def test_load_from_checkpoint_with_prs_only( def test_load_from_checkpoint_with_issues_only( - github_connector: GithubConnector, + build_github_connector: Callable[..., GithubConnector], mock_github_client: MagicMock, create_mock_repo: Callable[..., MagicMock], create_mock_issue: Callable[..., MagicMock], ) -> None: """Test loading from checkpoint with only issues enabled""" # Configure connector to only include issues + github_connector = build_github_connector() github_connector.include_prs = False github_connector.include_issues = True @@ -407,7 +428,7 @@ def test_load_from_checkpoint_with_issues_only( ], ) def test_validate_connector_settings_errors( - github_connector: GithubConnector, + build_github_connector: Callable[..., GithubConnector], status_code: int, expected_exception: type[Exception], expected_message: str, @@ -415,6 +436,7 @@ def test_validate_connector_settings_errors( """Test validation with various error scenarios""" error = GithubException(status=status_code, data={}, headers={}) + github_connector = build_github_connector() github_client = cast(Github, github_connector.github_client) get_repo_mock = cast(MagicMock, github_client.get_repo) get_repo_mock.side_effect = error @@ -425,13 +447,14 @@ def test_validate_connector_settings_errors( def test_validate_connector_settings_success( - github_connector: GithubConnector, + build_github_connector: Callable[..., GithubConnector], mock_github_client: MagicMock, create_mock_repo: Callable[..., MagicMock], ) -> None: """Test successful validation""" # Set up mocked repo mock_repo = create_mock_repo() + github_connector = build_github_connector() github_connector.github_client = mock_github_client mock_github_client.get_repo.return_value = mock_repo @@ -445,7 +468,7 @@ def test_validate_connector_settings_success( def test_load_from_checkpoint_with_cursor_fallback( - github_connector: GithubConnector, + build_github_connector: Callable[..., GithubConnector], mock_github_client: MagicMock, create_mock_repo: Callable[..., MagicMock], create_mock_pr: Callable[..., MagicMock], @@ -453,6 +476,7 @@ def test_load_from_checkpoint_with_cursor_fallback( """Test loading from checkpoint with fallback to cursor-based pagination""" # Set up mocked repo mock_repo = create_mock_repo() + github_connector = build_github_connector() github_connector.github_client = mock_github_client mock_github_client.get_repo.return_value = mock_repo @@ -506,7 +530,7 @@ def test_load_from_checkpoint_with_cursor_fallback( def test_load_from_checkpoint_resume_cursor_pagination( - github_connector: GithubConnector, + build_github_connector: Callable[..., GithubConnector], mock_github_client: MagicMock, create_mock_repo: Callable[..., MagicMock], create_mock_pr: Callable[..., MagicMock], @@ -514,6 +538,7 @@ def test_load_from_checkpoint_resume_cursor_pagination( """Test resuming from a checkpoint that was using cursor-based pagination""" # Set up mocked repo mock_repo = create_mock_repo() + github_connector = build_github_connector() github_connector.github_client = mock_github_client mock_github_client.get_repo.return_value = mock_repo @@ -558,7 +583,7 @@ def test_load_from_checkpoint_resume_cursor_pagination( def test_load_from_checkpoint_cursor_expiration( - github_connector: GithubConnector, + build_github_connector: Callable[..., GithubConnector], mock_github_client: MagicMock, create_mock_repo: Callable[..., MagicMock], create_mock_pr: Callable[..., MagicMock], @@ -566,6 +591,7 @@ def test_load_from_checkpoint_cursor_expiration( """Test handling of cursor expiration during cursor-based pagination""" # Set up mocked repo mock_repo = create_mock_repo() + github_connector = build_github_connector() github_connector.github_client = mock_github_client mock_github_client.get_repo.return_value = mock_repo @@ -629,3 +655,262 @@ def retry_iterator() -> Generator[PullRequest, None, None]: # Verify that the slice was called with the correct argument mock_retry_paginated_list.__getitem__.assert_called_once_with(slice(3, None)) + + +def test_load_from_checkpoint_cursor_pagination_completion( + build_github_connector: Callable[..., GithubConnector], + mock_github_client: MagicMock, + create_mock_repo: Callable[..., MagicMock], + create_mock_pr: Callable[..., MagicMock], +) -> None: + """Test behavior when cursor-based pagination completes and moves to next repository""" + # Set up two repositories + mock_repo1 = create_mock_repo(name="repo1", id=1) + mock_repo2 = create_mock_repo(name="repo2", id=2) + + # Initialize connector with no specific repositories, so _get_all_repos is used + github_connector = build_github_connector(repositories="") + github_connector.github_client = mock_github_client + mock_pr1 = create_mock_pr( + number=1, + title="PR 1 Repo 1", + html_url="https://github.com/test-org/repo1/pull/1", + ) + mock_pr2 = create_mock_pr( + number=2, + title="PR 2 Repo 1", + html_url="https://github.com/test-org/repo1/pull/2", + ) + mock_pr3 = create_mock_pr( + number=3, + title="PR 3 Repo 2", + html_url="https://github.com/test-org/repo2/pull/3", + ) + mock_pr4 = create_mock_pr( + number=4, + title="PR 4 Repo 2", + html_url="https://github.com/test-org/repo2/pull/4", + ) + checkpoint = github_connector.build_dummy_checkpoint() + mock_paginated_list_repo1_prs = MagicMock(spec=PaginatedList) + + def get_page_repo1_side_effect(page_num: int) -> list[PullRequest]: + if page_num == 0: + return [mock_pr1, mock_pr2] + else: + return [] + + mock_paginated_list_repo1_prs.get_page.side_effect = get_page_repo1_side_effect + mock_repo2_cursor_paginator = MagicMock(spec=PaginatedList) + + def repo2_cursor_iterator() -> Generator[PullRequest, None, None]: + print("setting next url to cursor_step_2") + mock_repo2_cursor_paginator.__nextUrl = "cursor_step_2" + yield mock_pr3 + print("setting next url to None") + mock_repo2_cursor_paginator.__nextUrl = None + yield mock_pr4 + + mock_repo2_cursor_paginator.__iter__.return_value = repo2_cursor_iterator() + mock_repo2_cursor_paginator.__nextUrl = None + pull_requests_func_invocation_count = 0 + + def replacement_pull_requests_func( + repo: Repository, + ) -> Callable[[], PaginatedList[PullRequest]]: + nonlocal pull_requests_func_invocation_count + pull_requests_func_invocation_count += 1 + current_repo_name = repo.name + lambda_call_count_for_current_repo = 0 + + def git_objs_lambda() -> PaginatedList[PullRequest]: + nonlocal lambda_call_count_for_current_repo + lambda_call_count_for_current_repo += 1 + if current_repo_name == mock_repo2.name: + if lambda_call_count_for_current_repo == 1: + pl_for_offset_failure = MagicMock(spec=PaginatedList) + + def get_page_raises_exception(page_num: int) -> list[PullRequest]: + raise GithubException(422, message="use cursor pagination") + + pl_for_offset_failure.get_page.side_effect = ( + get_page_raises_exception + ) + return pl_for_offset_failure + else: + return mock_repo2_cursor_paginator + elif current_repo_name == mock_repo1.name: + return mock_paginated_list_repo1_prs + else: + raise ValueError(f"Unexpected repo name: {current_repo_name}") + + return git_objs_lambda + + mock_requester = MagicMock(spec=Requester) + github_connector.github_client._requester = mock_requester + + def get_repo_side_effect(repo_id: int) -> MagicMock: + repo_to_return = None + headers_dict = None + data_dict = None + if repo_id == 1: + repo_to_return = mock_repo1 + headers_dict = {"status": "200 OK", "content-type": "application/json"} + data_dict = { + "id": 1, + "name": "repo1", + "full_name": "test-org/repo1", + "private": False, + "description": "Test repository", + } + elif repo_id == 2: + repo_to_return = mock_repo2 + headers_dict = {"status": "200 OK", "content-type": "application/json"} + data_dict = { + "id": 2, + "name": "repo2", + "full_name": "test-org/repo2", + "private": False, + "description": "Test repository", + } + else: + raise ValueError(f"Unexpected repo ID: {repo_id}") + if repo_to_return and headers_dict and data_dict: + repo_to_return.configure_mock(raw_headers=headers_dict, raw_data=data_dict) + return repo_to_return + + mock_github_client.get_repo.side_effect = get_repo_side_effect + + def to_repository_side_effect( + self_serialized_repo: SerializedRepository, requester_arg: Requester + ) -> Repository: + if self_serialized_repo.id == mock_repo1.id: + return mock_repo1 + elif self_serialized_repo.id == mock_repo2.id: + return mock_repo2 + raise ValueError(f"Unexpected repo ID: {self_serialized_repo.id}") + + mock_empty_issues_list = MagicMock(spec=PaginatedList) + mock_empty_issues_list.get_page.return_value = [] + mock_empty_issues_list.__iter__.return_value = iter([]) + type(mock_empty_issues_list)._PaginatedList__nextUrl = None + mock_repo1.get_issues.return_value = mock_empty_issues_list + mock_repo2.get_issues.return_value = mock_empty_issues_list + with patch.object( + github_connector, "_get_all_repos", return_value=[mock_repo1, mock_repo2] + ), patch.object( + github_connector, + "_pull_requests_func", + side_effect=replacement_pull_requests_func, + ), patch.object( + SerializedRepository, + "to_Repository", + side_effect=to_repository_side_effect, + autospec=True, + ) as mock_to_repository: + end_time = time.time() + outputs = list( + load_everything_from_checkpoint_connector_from_checkpoint( + github_connector, 0, end_time, checkpoint + ) + ) + + # --- Assertions --- + # Expected outputs: 5 based on the latest logic refinement + # 1. Initial cp + # 2. After repo2 PRs (cursor fallback) -> yields cp for repo2 issues + # 3. After repo2 issues (empty) -> yields cp for repo1 PRs + # 4. After repo1 PRs (page 0) -> yields cp for repo1 PRs page 1 + # 5. After repo1 PRs (page 1 empty) and repo1 issues (empty) -> yields final cp + + assert ( + len(outputs) == 5 + ) # Initial, Repo2-PRs, Repo2-Issues, Repo1-PRs-P0, Repo1-Issues(final) + + # Output 0: Initial checkpoint, after _get_all_repos + cp0 = outputs[0].next_checkpoint + assert cp0.has_more + assert cp0.cached_repo is not None + assert cp0.cached_repo.id == mock_repo2.id # mock_repo2 is popped first + assert cp0.cached_repo_ids == [mock_repo1.id] + assert cp0.stage == GithubConnectorStage.PRS + assert cp0.cursor_url is None + + # Output 1: After processing PRs for mock_repo2 (via cursor fallback) + # Items should be pr3, pr4 + assert len(outputs[1].items) == 2 + assert all(isinstance(item, Document) for item in outputs[1].items) + assert { + item.semantic_identifier for item in cast(list[Document], outputs[1].items) + } == {"PR 3 Repo 2", "PR 4 Repo 2"} + cp1 = outputs[1].next_checkpoint + assert ( + cp1.has_more + ) # Still have repo1 in cached_repo_ids at the time checkpoint is yielded + assert cp1.cached_repo is not None + assert cp1.cached_repo.id == mock_repo2.id + assert cp1.stage == GithubConnectorStage.ISSUES # Moved to issues for repo2 + assert cp1.cursor_url is None # Cursor completed and reset + assert cp1.num_retrieved == 0 # Reset + assert cp1.curr_page == 0 # Reset + + # Output 2: After processing Issues for mock_repo2 (empty) + assert len(outputs[2].items) == 0 + cp2 = outputs[2].next_checkpoint + assert cp2.has_more # Checkpoint yielded BEFORE final has_more check + assert cp2.cached_repo is not None + assert cp2.cached_repo.id == mock_repo1.id # Moved to repo1 + assert cp2.cached_repo_ids == [] # Popped repo1 id + assert cp2.stage == GithubConnectorStage.PRS # For repo1 + assert cp2.cursor_url is None + + # Output 3: After processing PRs for mock_repo1 (via offset, page 0) + assert len(outputs[3].items) == 2 + assert all(isinstance(item, Document) for item in outputs[3].items) + assert { + item.semantic_identifier for item in cast(list[Document], outputs[3].items) + } == {"PR 1 Repo 1", "PR 2 Repo 1"} + cp3 = outputs[3].next_checkpoint + # This checkpoint is returned early because offset had items. has_more reflects state then. + assert cp3.has_more # still need to do issues + assert cp3.cached_repo is not None + assert cp3.cached_repo.id == mock_repo1.id + assert cp3.stage == GithubConnectorStage.PRS # Still PRS stage + assert cp3.curr_page == 1 # Offset pagination incremented page for PRs + assert cp3.cursor_url is None + + # Output 4: After processing PRs page 1 (empty) and Issues for mock_repo1 (empty) - Final checkpoint + assert len(outputs[4].items) == 0 + cp4 = outputs[4].next_checkpoint + assert not cp4.has_more # All done + assert cp4.cached_repo is not None + assert cp4.cached_repo.id == mock_repo1.id # Last processed repo + assert ( + cp4.stage == GithubConnectorStage.PRS + ) # Reset for a hypothetical next run/repo + assert cp4.curr_page == 0 + assert cp4.num_retrieved == 0 + assert cp4.cursor_url is None + + # Verify to_Repository calls + print(mock_to_repository.call_args_list) + assert ( + mock_to_repository.call_count == 4 + ) # Twice for repo2, twice for repo1 (issues don't need it) + assert ( + mock_to_repository.call_args_list[0][0][0].id == mock_repo2.id + ) # First call was for repo2 + assert ( + mock_to_repository.call_args_list[1][0][0].id == mock_repo2.id + ) # Second call was for repo2 + assert ( + mock_to_repository.call_args_list[2][0][0].id == mock_repo1.id + ) # Third call was for repo1 + assert ( + mock_to_repository.call_args_list[3][0][0].id == mock_repo1.id + ) # Fourth call was for repo1 + + # Verify _pull_requests_func was invoked for both repos' PR stages + assert ( + pull_requests_func_invocation_count == 3 + ) # twice for repo2 PRs, once for repo1 PRs