Skip to content

Commit bca4d28

Browse files
committed
file processing refactor
1 parent 0157ae0 commit bca4d28

File tree

7 files changed

+165
-124
lines changed

7 files changed

+165
-124
lines changed

backend/onyx/background/celery/celery_utils.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,10 +8,12 @@
88

99
from onyx.configs.app_configs import MAX_PRUNING_DOCUMENT_RETRIEVAL_PER_MINUTE
1010
from onyx.configs.app_configs import VESPA_REQUEST_TIMEOUT
11+
from onyx.connectors.connector_runner import batched_docs
1112
from onyx.connectors.cross_connector_utils.rate_limit_wrapper import (
1213
rate_limit_builder,
1314
)
1415
from onyx.connectors.interfaces import BaseConnector
16+
from onyx.connectors.interfaces import CheckpointedConnector
1517
from onyx.connectors.interfaces import LoadConnector
1618
from onyx.connectors.interfaces import PollConnector
1719
from onyx.connectors.interfaces import SlimConnector
@@ -22,6 +24,7 @@
2224

2325

2426
logger = setup_logger()
27+
PRUNING_CHECKPOINTED_BATCH_SIZE = 32
2528

2629

2730
def document_batch_to_ids(
@@ -54,6 +57,16 @@ def extract_ids_from_runnable_connector(
5457
start = datetime(1970, 1, 1, tzinfo=timezone.utc).timestamp()
5558
end = datetime.now(timezone.utc).timestamp()
5659
doc_batch_generator = runnable_connector.poll_source(start=start, end=end)
60+
elif isinstance(runnable_connector, CheckpointedConnector):
61+
start = datetime(1970, 1, 1, tzinfo=timezone.utc).timestamp()
62+
end = datetime.now(timezone.utc).timestamp()
63+
checkpoint = runnable_connector.build_dummy_checkpoint()
64+
checkpoint_generator = runnable_connector.load_from_checkpoint(
65+
start=start, end=end, checkpoint=checkpoint
66+
)
67+
doc_batch_generator = batched_docs(
68+
checkpoint_generator, batch_size=PRUNING_CHECKPOINTED_BATCH_SIZE
69+
)
5770
else:
5871
raise RuntimeError("Pruning job could not find a valid runnable_connector.")
5972

backend/onyx/connectors/connector_runner.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,24 @@
2525
CT = TypeVar("CT", bound=ConnectorCheckpoint)
2626

2727

def batched_docs(
    checkpoint_connector_generator: CheckpointOutput[CT],
    batch_size: int,
) -> Generator[list[Document], None, None]:
    """Group documents from a checkpointed connector into fixed-size batches.

    Wraps the checkpoint generator so each item is unpacked into
    (document, failure, next_checkpoint); failures and checkpoints are
    ignored here — only documents are collected. A final partial batch
    is yielded if any documents remain.
    """
    current_batch: list[Document] = []
    unpacked = CheckpointOutputWrapper[CT]()(checkpoint_connector_generator)
    for doc, _failure, _next_checkpoint in unpacked:
        if doc is None:
            # entry carried a failure or checkpoint, not a document
            continue
        current_batch.append(doc)
        if len(current_batch) >= batch_size:
            yield current_batch
            current_batch = []
    if current_batch:
        yield current_batch
2846
class CheckpointOutputWrapper(Generic[CT]):
2947
"""
3048
Wraps a CheckpointOutput generator to give things back in a more digestible format.

backend/onyx/file_processing/file_validation.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
def is_valid_image_type(mime_type: str) -> bool:
    """Check whether a MIME type names a supported image format.

    Args:
        mime_type: The MIME type string to check (may be empty).

    Returns:
        True if the MIME type is a valid image type, False otherwise
    """
    if not mime_type:
        return False
    if not mime_type.startswith("image/"):
        return False
    return mime_type not in EXCLUDED_IMAGE_TYPES
3840

3941

4042
def is_supported_by_vision_llm(mime_type: str) -> bool:

backend/onyx/file_store/utils.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,6 @@ def store_user_file_plaintext(user_file_id: int, plaintext_content: str) -> bool
4646
# Get plaintext file name
4747
plaintext_file_name = user_file_id_to_plaintext_file_name(user_file_id)
4848

49-
# Use a separate session to avoid committing the caller's transaction
5049
try:
5150
file_store = get_default_file_store()
5251
file_content = BytesIO(plaintext_content.encode("utf-8"))

backend/onyx/indexing/indexing_pipeline.py

Lines changed: 24 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -867,30 +867,31 @@ def index_doc_batch(
867867
for document_id in updatable_ids:
868868
# Only calculate token counts for documents that have a user file ID
869869
if (
870-
document_id in doc_id_to_user_file_id
871-
and doc_id_to_user_file_id[document_id] is not None
870+
document_id not in doc_id_to_user_file_id
871+
or doc_id_to_user_file_id[document_id] is None
872872
):
873-
user_file_id = doc_id_to_user_file_id[document_id]
874-
if not user_file_id:
875-
continue
876-
document_chunks = [
877-
chunk
878-
for chunk in chunks_with_embeddings
879-
if chunk.source_document.id == document_id
880-
]
881-
if document_chunks:
882-
combined_content = " ".join(
883-
[chunk.content for chunk in document_chunks]
884-
)
885-
token_count = (
886-
len(llm_tokenizer.encode(combined_content))
887-
if llm_tokenizer
888-
else 0
889-
)
890-
user_file_id_to_token_count[user_file_id] = token_count
891-
user_file_id_to_raw_text[user_file_id] = combined_content
892-
else:
893-
user_file_id_to_token_count[user_file_id] = None
873+
continue
874+
875+
user_file_id = doc_id_to_user_file_id[document_id]
876+
if user_file_id is None:
877+
continue
878+
879+
document_chunks = [
880+
chunk
881+
for chunk in chunks_with_embeddings
882+
if chunk.source_document.id == document_id
883+
]
884+
if document_chunks:
885+
combined_content = " ".join(
886+
[chunk.content for chunk in document_chunks]
887+
)
888+
token_count = (
889+
len(llm_tokenizer.encode(combined_content)) if llm_tokenizer else 0
890+
)
891+
user_file_id_to_token_count[user_file_id] = token_count
892+
user_file_id_to_raw_text[user_file_id] = combined_content
893+
else:
894+
user_file_id_to_token_count[user_file_id] = None
894895

895896
# we're concerned about race conditions where multiple simultaneous indexings might result
896897
# in one set of metadata overwriting another one in vespa.

backend/onyx/server/documents/connector.py

Lines changed: 15 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import io
12
import json
23
import mimetypes
34
import os
@@ -101,8 +102,9 @@
101102
from onyx.db.models import IndexingStatus
102103
from onyx.db.models import User
103104
from onyx.db.models import UserGroup__ConnectorCredentialPair
104-
from onyx.file_processing.extract_file_text import convert_docx_to_txt
105+
from onyx.file_processing.extract_file_text import extract_file_text
105106
from onyx.file_store.file_store import get_default_file_store
107+
from onyx.file_store.models import ChatFileType
106108
from onyx.key_value_store.interface import KvKeyNotFoundError
107109
from onyx.server.documents.models import AuthStatus
108110
from onyx.server.documents.models import AuthUrl
@@ -124,6 +126,7 @@
124126
from onyx.server.documents.models import ObjectCreationIdResponse
125127
from onyx.server.documents.models import RunConnectorRequest
126128
from onyx.server.models import StatusResponse
129+
from onyx.server.query_and_chat.chat_utils import mime_type_to_chat_file_type
127130
from onyx.utils.logger import setup_logger
128131
from onyx.utils.telemetry import create_milestone_and_report
129132
from onyx.utils.threadpool_concurrency import run_functions_tuples_in_parallel
@@ -434,6 +437,7 @@ def should_process_file(file_path: str) -> bool:
434437
file_store = get_default_file_store()
435438
seen_zip = False
436439
for file in files:
440+
file_type = mime_type_to_chat_file_type(file.content_type)
437441
if file.content_type and file.content_type.startswith("application/zip"):
438442
if seen_zip:
439443
raise HTTPException(status_code=400, detail=SEEN_ZIP_DETAIL)
@@ -462,12 +466,16 @@ def should_process_file(file_path: str) -> bool:
462466
deduped_file_paths.append(file_id)
463467
continue
464468

465-
# Special handling for docx files - only store the plaintext version
466-
if file.content_type and file.content_type.startswith(
467-
"application/vnd.openxmlformats-officedocument.wordprocessingml.document"
468-
):
469-
docx_file_id = convert_docx_to_txt(file, file_store)
470-
deduped_file_paths.append(docx_file_id)
469+
# Special handling for doc files - only store the plaintext version
470+
if file_type == ChatFileType.DOC:
471+
extracted_text = extract_file_text(file.file, file.filename or "")
472+
text_file_id = file_store.save_file(
473+
content=io.BytesIO(extracted_text.encode()),
474+
display_name=file.filename,
475+
file_origin=FileOrigin.CHAT_UPLOAD,
476+
file_type="text/plain",
477+
)
478+
deduped_file_paths.append(text_file_id)
471479
continue
472480

473481
# Default handling for all other file types

0 commit comments

Comments
 (0)