@@ -129,9 +129,34 @@ def _download_and_extract_sections_basic(
129
129
mime_type = file ["mimeType" ]
130
130
link = file .get (WEB_VIEW_LINK_KEY , "" )
131
131
132
- # skip images if not explicitly enabled
133
- if not allow_images and is_gdrive_image_mime_type (mime_type ):
134
- return []
132
+ # For non-Google files, download the file
133
+ # Use the correct API call for downloading files
134
+ # lazy evaluation to only download the file if necessary
135
+ def response_call () -> bytes :
136
+ return download_request (service , file_id )
137
+
138
+ if is_gdrive_image_mime_type (mime_type ):
139
+ # Skip images if not explicitly enabled
140
+ if not allow_images :
141
+ return []
142
+
143
+ # Store images for later processing
144
+ sections : list [TextSection | ImageSection ] = []
145
+ try :
146
+ with get_session_with_current_tenant () as db_session :
147
+ section , embedded_id = store_image_and_create_section (
148
+ db_session = db_session ,
149
+ image_data = response_call (),
150
+ file_id = file_id ,
151
+ display_name = file_name ,
152
+ media_type = mime_type ,
153
+ file_origin = FileOrigin .CONNECTOR ,
154
+ link = link ,
155
+ )
156
+ sections .append (section )
157
+ except Exception as e :
158
+ logger .error (f"Failed to process image { file_name } : { e } " )
159
+ return sections
135
160
136
161
# For Google Docs, Sheets, and Slides, export as plain text
137
162
if mime_type in GOOGLE_MIME_TYPES_TO_EXPORT :
@@ -154,12 +179,6 @@ def _download_and_extract_sections_basic(
154
179
text = response .decode ("utf-8" )
155
180
return [TextSection (link = link , text = text )]
156
181
157
- # For other file types, download the file
158
- # Use the correct API call for downloading files
159
- # lazy evaluation to only download the file if necessary
160
- def response_call () -> bytes :
161
- return download_request (service , file_id )
162
-
163
182
# Process based on mime type
164
183
if mime_type == "text/plain" :
165
184
try :
@@ -189,25 +208,6 @@ def response_call() -> bytes:
189
208
text = pptx_to_text (io .BytesIO (response_call ()), file_name = file_name )
190
209
return [TextSection (link = link , text = text )] if text else []
191
210
192
- elif is_gdrive_image_mime_type (mime_type ):
193
- # For images, store them for later processing
194
- sections : list [TextSection | ImageSection ] = []
195
- try :
196
- with get_session_with_current_tenant () as db_session :
197
- section , embedded_id = store_image_and_create_section (
198
- db_session = db_session ,
199
- image_data = response_call (),
200
- file_id = file_id ,
201
- display_name = file_name ,
202
- media_type = mime_type ,
203
- file_origin = FileOrigin .CONNECTOR ,
204
- link = link ,
205
- )
206
- sections .append (section )
207
- except Exception as e :
208
- logger .error (f"Failed to process image { file_name } : { e } " )
209
- return sections
210
-
211
211
elif mime_type == "application/pdf" :
212
212
text , _pdf_meta , images = read_pdf_file (io .BytesIO (response_call ()))
213
213
pdf_sections : list [TextSection | ImageSection ] = [
@@ -230,27 +230,18 @@ def response_call() -> bytes:
230
230
logger .error (f"Failed to process PDF images in { file_name } : { e } " )
231
231
return pdf_sections
232
232
233
- else :
234
- # For unsupported file types, try to extract text
235
- if mime_type in [
236
- "application/vnd.google-apps.video" ,
237
- "application/vnd.google-apps.audio" ,
238
- "application/zip" ,
239
- ]:
240
- return []
233
+ # Final attempt at extracting text
234
+ file_ext = get_file_ext (file .get ("name" , "" ))
235
+ if file_ext not in ALL_ACCEPTED_FILE_EXTENSIONS :
236
+ logger .warning (f"Skipping file { file .get ('name' )} due to extension." )
237
+ return []
241
238
242
- # don't download the file at all if it's an unhandled extension
243
- file_ext = get_file_ext (file .get ("name" , "" ))
244
- if file_ext not in ALL_ACCEPTED_FILE_EXTENSIONS :
245
- logger .warning (f"Skipping file { file .get ('name' )} due to extension." )
246
- return []
247
- # For unsupported file types, try to extract text
248
- try :
249
- text = extract_file_text (io .BytesIO (response_call ()), file_name )
250
- return [TextSection (link = link , text = text )]
251
- except Exception as e :
252
- logger .warning (f"Failed to extract text from { file_name } : { e } " )
253
- return []
239
+ try :
240
+ text = extract_file_text (io .BytesIO (response_call ()), file_name )
241
+ return [TextSection (link = link , text = text )]
242
+ except Exception as e :
243
+ logger .warning (f"Failed to extract text from { file_name } : { e } " )
244
+ return []
254
245
255
246
256
247
def _find_nth (haystack : str , needle : str , n : int , start : int = 0 ) -> int :
@@ -451,6 +442,19 @@ def _get_docs_service() -> GoogleDocsService:
451
442
logger .info ("Skipping shortcut/folder." )
452
443
return None
453
444
445
+ size_str = file .get ("size" )
446
+ if size_str :
447
+ try :
448
+ size_int = int (size_str )
449
+ except ValueError :
450
+ logger .warning (f"Parsing string to int failed: size_str={ size_str } " )
451
+ else :
452
+ if size_int > size_threshold :
453
+ logger .warning (
454
+ f"{ file .get ('name' )} exceeds size threshold of { size_threshold } . Skipping."
455
+ )
456
+ return None
457
+
454
458
# If it's a Google Doc, we might do advanced parsing
455
459
if file .get ("mimeType" ) == GDriveMimeType .DOC .value :
456
460
try :
@@ -476,22 +480,8 @@ def _get_docs_service() -> GoogleDocsService:
476
480
logger .warning (
477
481
f"Error in advanced parsing: { e } . Falling back to basic extraction."
478
482
)
479
-
480
- size_str = file .get ("size" )
481
- if size_str :
482
- try :
483
- size_int = int (size_str )
484
- except ValueError :
485
- logger .warning (f"Parsing string to int failed: size_str={ size_str } " )
486
- else :
487
- if size_int > size_threshold :
488
- logger .warning (
489
- f"{ file .get ('name' )} exceeds size threshold of { size_threshold } . Skipping."
490
- )
491
- return None
492
-
493
- # If we don't have sections yet, use the basic extraction method
494
- if not sections :
483
+ # Not Google Doc, attempt basic extraction
484
+ else :
495
485
sections = _download_and_extract_sections_basic (
496
486
file , _get_drive_service (), allow_images
497
487
)
0 commit comments