Skip to content

Commit 9bd5a1d

Browse files
wenxi-onyx and Wenxi Onyx authored
check file size first and clarify processing logic (#4985)
* check file size first and clarify processing logic * basic gdrive extraction clarify * typo --------- Co-authored-by: Wenxi Onyx <wenxi-onyx@Wenxis-MacBook-Pro.local>
1 parent d3c5a4f commit 9bd5a1d

File tree

1 file changed

+54
-64
lines changed

1 file changed

+54
-64
lines changed

backend/onyx/connectors/google_drive/doc_conversion.py

Lines changed: 54 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -129,9 +129,34 @@ def _download_and_extract_sections_basic(
129129
mime_type = file["mimeType"]
130130
link = file.get(WEB_VIEW_LINK_KEY, "")
131131

132-
# skip images if not explicitly enabled
133-
if not allow_images and is_gdrive_image_mime_type(mime_type):
134-
return []
132+
# For non-Google files, download the file
133+
# Use the correct API call for downloading files
134+
# lazy evaluation to only download the file if necessary
135+
def response_call() -> bytes:
136+
return download_request(service, file_id)
137+
138+
if is_gdrive_image_mime_type(mime_type):
139+
# Skip images if not explicitly enabled
140+
if not allow_images:
141+
return []
142+
143+
# Store images for later processing
144+
sections: list[TextSection | ImageSection] = []
145+
try:
146+
with get_session_with_current_tenant() as db_session:
147+
section, embedded_id = store_image_and_create_section(
148+
db_session=db_session,
149+
image_data=response_call(),
150+
file_id=file_id,
151+
display_name=file_name,
152+
media_type=mime_type,
153+
file_origin=FileOrigin.CONNECTOR,
154+
link=link,
155+
)
156+
sections.append(section)
157+
except Exception as e:
158+
logger.error(f"Failed to process image {file_name}: {e}")
159+
return sections
135160

136161
# For Google Docs, Sheets, and Slides, export as plain text
137162
if mime_type in GOOGLE_MIME_TYPES_TO_EXPORT:
@@ -154,12 +179,6 @@ def _download_and_extract_sections_basic(
154179
text = response.decode("utf-8")
155180
return [TextSection(link=link, text=text)]
156181

157-
# For other file types, download the file
158-
# Use the correct API call for downloading files
159-
# lazy evaluation to only download the file if necessary
160-
def response_call() -> bytes:
161-
return download_request(service, file_id)
162-
163182
# Process based on mime type
164183
if mime_type == "text/plain":
165184
try:
@@ -189,25 +208,6 @@ def response_call() -> bytes:
189208
text = pptx_to_text(io.BytesIO(response_call()), file_name=file_name)
190209
return [TextSection(link=link, text=text)] if text else []
191210

192-
elif is_gdrive_image_mime_type(mime_type):
193-
# For images, store them for later processing
194-
sections: list[TextSection | ImageSection] = []
195-
try:
196-
with get_session_with_current_tenant() as db_session:
197-
section, embedded_id = store_image_and_create_section(
198-
db_session=db_session,
199-
image_data=response_call(),
200-
file_id=file_id,
201-
display_name=file_name,
202-
media_type=mime_type,
203-
file_origin=FileOrigin.CONNECTOR,
204-
link=link,
205-
)
206-
sections.append(section)
207-
except Exception as e:
208-
logger.error(f"Failed to process image {file_name}: {e}")
209-
return sections
210-
211211
elif mime_type == "application/pdf":
212212
text, _pdf_meta, images = read_pdf_file(io.BytesIO(response_call()))
213213
pdf_sections: list[TextSection | ImageSection] = [
@@ -230,27 +230,18 @@ def response_call() -> bytes:
230230
logger.error(f"Failed to process PDF images in {file_name}: {e}")
231231
return pdf_sections
232232

233-
else:
234-
# For unsupported file types, try to extract text
235-
if mime_type in [
236-
"application/vnd.google-apps.video",
237-
"application/vnd.google-apps.audio",
238-
"application/zip",
239-
]:
240-
return []
233+
# Final attempt at extracting text
234+
file_ext = get_file_ext(file.get("name", ""))
235+
if file_ext not in ALL_ACCEPTED_FILE_EXTENSIONS:
236+
logger.warning(f"Skipping file {file.get('name')} due to extension.")
237+
return []
241238

242-
# don't download the file at all if it's an unhandled extension
243-
file_ext = get_file_ext(file.get("name", ""))
244-
if file_ext not in ALL_ACCEPTED_FILE_EXTENSIONS:
245-
logger.warning(f"Skipping file {file.get('name')} due to extension.")
246-
return []
247-
# For unsupported file types, try to extract text
248-
try:
249-
text = extract_file_text(io.BytesIO(response_call()), file_name)
250-
return [TextSection(link=link, text=text)]
251-
except Exception as e:
252-
logger.warning(f"Failed to extract text from {file_name}: {e}")
253-
return []
239+
try:
240+
text = extract_file_text(io.BytesIO(response_call()), file_name)
241+
return [TextSection(link=link, text=text)]
242+
except Exception as e:
243+
logger.warning(f"Failed to extract text from {file_name}: {e}")
244+
return []
254245

255246

256247
def _find_nth(haystack: str, needle: str, n: int, start: int = 0) -> int:
@@ -451,6 +442,19 @@ def _get_docs_service() -> GoogleDocsService:
451442
logger.info("Skipping shortcut/folder.")
452443
return None
453444

445+
size_str = file.get("size")
446+
if size_str:
447+
try:
448+
size_int = int(size_str)
449+
except ValueError:
450+
logger.warning(f"Parsing string to int failed: size_str={size_str}")
451+
else:
452+
if size_int > size_threshold:
453+
logger.warning(
454+
f"{file.get('name')} exceeds size threshold of {size_threshold}. Skipping."
455+
)
456+
return None
457+
454458
# If it's a Google Doc, we might do advanced parsing
455459
if file.get("mimeType") == GDriveMimeType.DOC.value:
456460
try:
@@ -476,22 +480,8 @@ def _get_docs_service() -> GoogleDocsService:
476480
logger.warning(
477481
f"Error in advanced parsing: {e}. Falling back to basic extraction."
478482
)
479-
480-
size_str = file.get("size")
481-
if size_str:
482-
try:
483-
size_int = int(size_str)
484-
except ValueError:
485-
logger.warning(f"Parsing string to int failed: size_str={size_str}")
486-
else:
487-
if size_int > size_threshold:
488-
logger.warning(
489-
f"{file.get('name')} exceeds size threshold of {size_threshold}. Skipping."
490-
)
491-
return None
492-
493-
# If we don't have sections yet, use the basic extraction method
494-
if not sections:
483+
# Not Google Doc, attempt basic extraction
484+
else:
495485
sections = _download_and_extract_sections_basic(
496486
file, _get_drive_service(), allow_images
497487
)

0 commit comments

Comments
 (0)