Skip to content

Commit 6377b5e

Browse files
wenxi-onyx (Wenxi Onyx)
authored and committed
Fixed indexing when no sites are specified (onyx-dot-app#4822)
* Fixed indexing when no sites are specified * Added test for Sharepoint all sites index * Accounted for paginated results. * Typing * Typing --------- Co-authored-by: Wenxi Onyx <wenxi-onyx@Wenxis-MacBook-Pro.local>
1 parent 5bd55a0 commit 6377b5e

File tree

2 files changed

+36
-2
lines changed

2 files changed

+36
-2
lines changed

backend/onyx/connectors/sharepoint/connector.py

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import io
22
import os
3+
from collections.abc import Generator
34
from datetime import datetime
45
from datetime import timezone
56
from typing import Any
@@ -8,6 +9,8 @@
89
import msal # type: ignore
910
from office365.graph_client import GraphClient # type: ignore
1011
from office365.onedrive.driveitems.driveItem import DriveItem # type: ignore
12+
from office365.onedrive.sites.site import Site # type: ignore
13+
from office365.onedrive.sites.sites_with_root import SitesWithRoot # type: ignore
1114
from pydantic import BaseModel
1215

1316
from onyx.configs.app_configs import INDEX_BATCH_SIZE
@@ -227,14 +230,29 @@ def _fetch_driveitems(
227230

228231
return final_driveitems
229232

233+
def _handle_paginated_sites(
    self, sites: SitesWithRoot
) -> Generator[Site, None, None]:
    """Yield every site from a paginated collection, one page at a time.

    Emits the items of ``current_page``, then replaces the collection with
    the result of ``_get_next().execute_query()`` for as long as
    ``has_next`` is truthy. Iteration ends as soon as the collection is
    falsy or reports that there is no further page.

    Args:
        sites: The collection to walk, starting at its current page.

    Yields:
        Each item found in the successive ``current_page`` values.
    """
    page = sites
    while page:
        batch = page.current_page
        if batch:
            yield from batch
        if page.has_next:
            # NOTE(review): _get_next() is a private office365 helper;
            # presumably it queues the request for the following page -
            # confirm against the library before relying on it.
            page = page._get_next().execute_query()
        else:
            return
242+
230243
def _fetch_sites(self) -> list[SiteDescriptor]:
231-
sites = self.graph_client.sites.get_all().execute_query()
244+
sites = self.graph_client.sites.get_all_sites().execute_query()
245+
246+
if not sites:
247+
raise RuntimeError("No sites found in the tenant")
248+
232249
site_descriptors = [
233250
SiteDescriptor(
234-
url=sites.resource_url,
251+
url=site.web_url,
235252
drive_name=None,
236253
folder_path=None,
237254
)
255+
for site in self._handle_paginated_sites(sites)
238256
]
239257
return site_descriptors
240258

backend/tests/daily/connectors/sharepoint/test_sharepoint_connector.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,22 @@ def sharepoint_credentials() -> dict[str, str]:
8585
}
8686

8787

88+
def test_sharepoint_connector_all_sites(
    mock_get_unstructured_api_key: MagicMock,
    sharepoint_credentials: dict[str, str],
) -> None:
    """Check that a connector created without any sites still finds documents."""
    # No sites are passed to the constructor.
    all_sites_connector = SharepointConnector()
    all_sites_connector.load_credentials(sharepoint_credentials)

    # The contents of the test tenant can change at any time, so no specific
    # sites are asserted; finding any documents at all is enough to show that
    # the connector is working.
    batches = [*all_sites_connector.load_from_state()]
    assert batches, "Should find documents from all sites"
102+
103+
88104
def test_sharepoint_connector_specific_folder(
89105
mock_get_unstructured_api_key: MagicMock,
90106
sharepoint_credentials: dict[str, str],

0 commit comments

Comments
 (0)