@@ -142,6 +142,10 @@ class SharepointAuthMethod(Enum):
    CERTIFICATE = "certificate"


+class SizeCapExceeded(Exception):
+    """Exception raised when the size cap is exceeded."""
+
+
def load_certificate_from_pfx(pfx_data: bytes, password: str) -> CertificateData | None:
    """Load certificate from .pfx file for MSAL authentication"""
    try:
@@ -240,7 +244,7 @@ def _download_with_cap(url: str, timeout: int, cap: int) -> bytes:
    Behavior:
    - Checks `Content-Length` first and aborts early if it exceeds `cap`.
    - Otherwise streams the body in chunks and stops once `cap` is surpassed.
-    - Raises `RuntimeError('size_cap_exceeded')` when the cap would be exceeded.
+    - Raises `SizeCapExceeded` when the cap would be exceeded.
    - Returns the full bytes if the content fits within `cap`.
    """
    with requests.get(url, stream=True, timeout=timeout) as resp:
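For reference, a minimal usage sketch of this contract; the URL, timeout, and cap values are illustrative and not part of this PR:

```python
url = "https://example.com/big-file.bin"  # illustrative
try:
    data = _download_with_cap(url, timeout=30, cap=10 * 1024 * 1024)
except SizeCapExceeded:
    data = None  # oversized; the caller decides whether to skip or fall back
```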
@@ -254,7 +258,7 @@ def _download_with_cap(url: str, timeout: int, cap: int) -> bytes:
            logger.warning(
                f"Content-Length {content_len} exceeds cap {cap}; skipping download."
            )
-            raise RuntimeError("size_cap_exceeded")
+            raise SizeCapExceeded("pre_download")

        buf = io.BytesIO()
        # Stream in 64KB chunks; adjust if needed for slower networks.
@@ -267,11 +271,32 @@ def _download_with_cap(url: str, timeout: int, cap: int) -> bytes:
                logger.warning(
                    f"Streaming download exceeded cap {cap} bytes; aborting early."
                )
-                raise RuntimeError("size_cap_exceeded")
+                raise SizeCapExceeded("during_download")

    return buf.getvalue()


+def _download_via_sdk_with_cap(
+    driveitem: DriveItem, bytes_allowed: int, chunk_size: int = 64 * 1024
+) -> bytes:
+    """Use the Office365 SDK streaming download with a hard byte cap.
+
+    Raises SizeCapExceeded("during_sdk_download") if the cap would be exceeded.
+    """
+    buf = io.BytesIO()
+
+    def on_chunk(bytes_read: int) -> None:
+        # bytes_read is the cumulative byte count seen so far, per the SDK contract
+        if bytes_read > bytes_allowed:
+            raise SizeCapExceeded("during_sdk_download")
+
+    # Configure the drive item to stream its content into buf via a download session
+    driveitem.download_session(buf, chunk_downloaded=on_chunk, chunk_size=chunk_size)
+    # Execute the configured request with retries using the existing helper
+    sleep_and_retry(driveitem.context, "download_session")
+    return buf.getvalue()
+
+
def _convert_driveitem_to_document_with_permissions(
    driveitem: DriveItem,
    drive_name: str,
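The cap check above hinges on `chunk_downloaded` receiving a cumulative byte count. A self-contained sketch of the same pattern without the SDK, assuming only the `SizeCapExceeded` exception added in this PR (the helper name and reason string are hypothetical):

```python
import io

def copy_with_cap(src: io.BufferedIOBase, cap: int, chunk_size: int = 64 * 1024) -> bytes:
    # Mirrors _download_via_sdk_with_cap: track the cumulative byte count
    # and abort as soon as the cap would be exceeded.
    buf = io.BytesIO()
    total = 0
    while chunk := src.read(chunk_size):
        total += len(chunk)
        if total > cap:
            raise SizeCapExceeded("during_copy")
        buf.write(chunk)
    return buf.getvalue()

# e.g. copy_with_cap(io.BytesIO(b"x" * 2048), cap=1024) raises SizeCapExceeded
```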
@@ -322,19 +347,16 @@ def _convert_driveitem_to_document_with_permissions(
    content_bytes: bytes | None = None
    if download_url:
        try:
+            # Uncomment the next line to exercise the SDK size-cap fallback path:
+            # raise requests.RequestException("test")
            content_bytes = _download_with_cap(
                download_url,
                REQUEST_TIMEOUT_SECONDS,
                SHAREPOINT_CONNECTOR_SIZE_THRESHOLD,
            )
-        except RuntimeError as e:
-            if "size_cap_exceeded" in str(e):
-                logger.warning(
-                    f"Skipping '{driveitem.name}' exceeded size cap during streaming."
-                )
-                return None
-            else:
-                raise
+        except SizeCapExceeded as e:
+            logger.warning(f"Skipping '{driveitem.name}': exceeded size cap ({e})")
+            return None
        except requests.RequestException as e:
            status = e.response.status_code if e.response is not None else -1
            logger.warning(
@@ -343,13 +365,15 @@ def _convert_driveitem_to_document_with_permissions(

    # Fallback to SDK content if needed
    if content_bytes is None:
-        content = sleep_and_retry(driveitem.get_content(), "get_content")
-        if content is None or not isinstance(
-            getattr(content, "value", None), (bytes, bytearray)
-        ):
-            logger.warning(f"Could not access content for '{driveitem.name}'")
-            raise ValueError(f"Could not access content for '{driveitem.name}'")
-        content_bytes = bytes(content.value)
+        try:
+            content_bytes = _download_via_sdk_with_cap(
+                driveitem, SHAREPOINT_CONNECTOR_SIZE_THRESHOLD
+            )
+        except SizeCapExceeded:
+            logger.warning(
+                f"Skipping '{driveitem.name}': exceeded size cap during SDK streaming."
+            )
+            return None

    sections: list[TextSection | ImageSection] = []
    file_ext = driveitem.name.split(".")[-1]
@@ -370,24 +394,27 @@ def _convert_driveitem_to_document_with_permissions(
        sections.append(image_section)
    else:
        # Note: we don't process Onyx metadata for connectors like Drive & Sharepoint, but could
-        extraction_result = extract_text_and_images(
-            file=io.BytesIO(content_bytes), file_name=driveitem.name
-        )
-        if extraction_result.text_content:
-            sections.append(
-                TextSection(link=driveitem.web_url, text=extraction_result.text_content)
-            )
-
-        for idx, (img_data, img_name) in enumerate(extraction_result.embedded_images):
+        def _store_embedded_image(img_data: bytes, img_name: str) -> None:
            image_section, _ = store_image_and_create_section(
                image_data=img_data,
-                file_id=f"{driveitem.id}_img_{idx}",
-                display_name=img_name or f"{driveitem.name} - image {idx}",
+                file_id=f"{driveitem.id}_img_{len(sections)}",
+                display_name=img_name or f"{driveitem.name} - image {len(sections)}",
                file_origin=FileOrigin.CONNECTOR,
            )
            image_section.link = driveitem.web_url
            sections.append(image_section)

+        extraction_result = extract_text_and_images(
+            file=io.BytesIO(content_bytes),
+            file_name=driveitem.name,
+            image_callback=_store_embedded_image,
+        )
+        if extraction_result.text_content:
+            sections.append(
+                TextSection(link=driveitem.web_url, text=extraction_result.text_content)
+            )
+        # Any embedded images were stored via the callback; the returned list may be empty.
+
    if include_permissions and ctx is not None:
        logger.info(f"Getting external access for {driveitem.name}")
        external_access = get_sharepoint_external_access(
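The `image_callback` hook lets the extractor hand off each embedded image as it is found instead of accumulating them all in the result. A hedged usage sketch of that call shape, with the signature taken from the diff above; the file name and collector are illustrative:

```python
collected: list[tuple[bytes, str]] = []

def _collect(img_data: bytes, img_name: str) -> None:
    # Invoked once per embedded image as extraction proceeds.
    collected.append((img_data, img_name))

with open("report.docx", "rb") as f:  # illustrative file
    result = extract_text_and_images(
        file=f, file_name="report.docx", image_callback=_collect
    )
```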
@@ -729,6 +756,7 @@ def _get_drive_items_for_drive_name(
        for folder_part in site_descriptor.folder_path.split("/"):
            root_folder = root_folder.get_by_path(folder_part)

+    # TODO: consider ways to avoid materializing the entire list of files in memory
    query = root_folder.get_files(
        recursive=True,
        page_size=1000,
@@ -837,6 +865,7 @@ def _fetch_driveitems(
            root_folder = root_folder.get_by_path(folder_part)

    # Get all items recursively
+    # TODO: consider ways to avoid materializing the entire list of files in memory
    query = root_folder.get_files(
        recursive=True,
        page_size=1000,
@@ -985,6 +1014,8 @@ def _fetch_site_pages(
    all_pages = pages_data.get("value", [])

    # Handle pagination if there are more pages
+    # TODO: This accumulates all pages in memory and can be heavy on large tenants.
+    # We should process each page incrementally to avoid unbounded growth.
    while "@odata.nextLink" in pages_data:
        next_url = pages_data["@odata.nextLink"]
        response = requests.get(
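As a sketch of the incremental approach the TODO suggests, a generator can yield each page's items as they arrive instead of accumulating `all_pages`. This assumes the Graph-style `@odata.nextLink` responses handled here and the module's existing `requests`/timeout conventions; the helper name is hypothetical and not part of this PR:

```python
from collections.abc import Iterator

def _iter_pages(first_url: str, headers: dict[str, str]) -> Iterator[dict]:
    # Follow @odata.nextLink one page at a time; nothing is retained
    # beyond the page currently being yielded.
    url: str | None = first_url
    while url:
        resp = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT_SECONDS)
        resp.raise_for_status()
        data = resp.json()
        yield from data.get("value", [])
        url = data.get("@odata.nextLink")
```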