onyx-dot-app · Weves · Apr 27, 2025 · Apr 25, 2025 · Apr 26, 2025 · Apr 26, 2025
@@ -9,4 +9,6 @@ api_keys.py
 vespa-app.zip
 dynamic_config_storage/
 celerybeat-schedule*
-onyx/connectors/salesforce/data/
+onyx/connectors/salesforce/data/
+.test.env
+
@@ -184,10 +184,10 @@ def event_provide_iam_token_for_alembic(
             except Exception as e:
                 logger.error(f"Error migrating schema {schema}: {e}")
                 if not continue_on_error:
-                    logger.error("--continue is not set, raising exception!")
+                    logger.error("--continue=true is not set, raising exception!")
                     raise
 
-                logger.warning("--continue is set, continuing to next schema.")
+                logger.warning("--continue=true is set, continuing to next schema.")
 
     else:
         try:

@@ -0,0 +1,24 @@
+"""Add content type to UserFile
+
+Revision ID: 5c448911b12f
+Revises: 47a07e1a38f1
+Create Date: 2025-04-25 16:59:48.182672
+
+"""
+
+from alembic import op
+import sqlalchemy as sa
+
+# revision identifiers, used by Alembic.
+revision = "5c448911b12f"
+down_revision = "47a07e1a38f1"
+branch_labels: None = None
+depends_on: None = None
+
+
+def upgrade() -> None:
+    op.add_column("user_file", sa.Column("content_type", sa.String(), nullable=True))
+
+
+def downgrade() -> None:
+    op.drop_column("user_file", "content_type")
@@ -2477,6 +2477,7 @@ class UserFile(Base):
         "ConnectorCredentialPair", back_populates="user_file"
     )
     link_url: Mapped[str | None] = mapped_column(String, nullable=True)
+    content_type: Mapped[str | None] = mapped_column(String, nullable=True)
 
 
 """

@@ -40,6 +40,10 @@ def create_user_files(
     db_session: Session,
     link_url: str | None = None,
 ) -> list[UserFile]:
+    """NOTE(rkuo): This function can take -1 (RECENT_DOCS_FOLDER_ID) for folder_id.
+    Document what this does?
+    """
+
     # NOTE: At the moment, zip metadata is not used for user files.
     # Should revisit to decide whether this should be a feature.
     upload_response = upload_files(files, db_session)
@@ -54,21 +58,25 @@ def create_user_files(
             name=file.filename,
             token_count=None,
             link_url=link_url,
+            content_type=file.content_type,
         )
         db_session.add(new_file)
         user_files.append(new_file)
     db_session.commit()
     return user_files
 
 
-def create_user_file_with_indexing(
+def upload_files_to_user_files_with_indexing(
     files: List[UploadFile],
     folder_id: int | None,
     user: User,
     db_session: Session,
     trigger_index: bool = True,
 ) -> list[UserFile]:
-    """Create user files and trigger immediate indexing"""
+    """NOTE(rkuo): This function can take -1 (RECENT_DOCS_FOLDER_ID) for folder_id.
+    Document what this does?
+
+    Create user files and trigger immediate indexing"""
     # Create the user files first
     user_files = create_user_files(files, folder_id, user, db_session)
 

@@ -36,6 +36,7 @@
 
 logger = setup_logger()
 
+# NOTE(rkuo): Unify this with upload_files_for_chat and file_validation.py
 TEXT_SECTION_SEPARATOR = "\n\n"
 
 ACCEPTED_PLAIN_TEXT_FILE_EXTENSIONS = [

@@ -2,6 +2,8 @@
 Centralized file type validation utilities.
 """
 
+# NOTE(rkuo): Unify this with upload_files_for_chat and extract_file_text
+
 # Standard image MIME types supported by most vision LLMs
 IMAGE_MIME_TYPES = [
     "image/png",

@@ -14,6 +14,9 @@ class ChatFileType(str, Enum):
     # Plain text only contain the text
     PLAIN_TEXT = "plain_text"
     CSV = "csv"
+
+    # NOTE(rkuo): don't understand the motivation for this
+    # "user knowledge" is not a file type, it's a source or intent
     USER_KNOWLEDGE = "user_knowledge"
 
 

@@ -12,6 +12,7 @@
 from onyx.db.models import ChatMessage
 from onyx.db.models import UserFile
 from onyx.db.models import UserFolder
+from onyx.file_processing.extract_file_text import IMAGE_MEDIA_TYPES
 from onyx.file_store.file_store import get_default_file_store
 from onyx.file_store.models import ChatFileType
 from onyx.file_store.models import FileDescriptor
@@ -111,6 +112,9 @@ def load_user_folder(folder_id: int, db_session: Session) -> list[InMemoryChatFi
 
 
 def load_user_file(file_id: int, db_session: Session) -> InMemoryChatFile:
+    chat_file_type = ChatFileType.USER_KNOWLEDGE
+    status = "not_loaded"
+
     user_file = db_session.query(UserFile).filter(UserFile.id == file_id).first()
     if not user_file:
         raise ValueError(f"User file with id {file_id} not found")
@@ -119,26 +123,38 @@ def load_user_file(file_id: int, db_session: Session) -> InMemoryChatFile:
     file_store = get_default_file_store(db_session)
     plaintext_file_name = user_file_id_to_plaintext_file_name(file_id)
 
+    # check for plain text normalized version first, then use original file otherwise
     try:
         file_io = file_store.read_file(plaintext_file_name, mode="b")
-        return InMemoryChatFile(
+        chat_file = InMemoryChatFile(
             file_id=str(user_file.file_id),
             content=file_io.read(),
             file_type=ChatFileType.USER_KNOWLEDGE,
             filename=user_file.name,
         )
-    except Exception as e:
-        logger.warning(
-            f"Failed to load plaintext file {plaintext_file_name}, defaulting to original file: {e}"
-        )
+        status = "plaintext"
+        return chat_file
+    except Exception:
         # Fall back to original file if plaintext not available
         file_io = file_store.read_file(user_file.file_id, mode="b")
-        return InMemoryChatFile(
+        file_record = file_store.read_file_record(user_file.file_id)
+        if file_record.file_type in IMAGE_MEDIA_TYPES:
+            chat_file_type = ChatFileType.IMAGE
+
+        chat_file = InMemoryChatFile(
             file_id=str(user_file.file_id),
             content=file_io.read(),
-            file_type=ChatFileType.USER_KNOWLEDGE,
+            file_type=chat_file_type,
             filename=user_file.name,
         )
+        status = "original"
+        return chat_file
+    finally:
+        logger.debug(
+            f"load_user_file finished: file_id={user_file.file_id} "
+            f"chat_file_type={chat_file_type} "
+            f"status={status}"
+        )
 
 
 def load_all_user_files(

@@ -173,7 +173,7 @@ def undelete_persona(
     )
 
 
-# used for assistat profile pictures
+# used for assistant profile pictures
 @admin_router.post("/upload-image")
 def upload_file(
     file: UploadFile,

@@ -76,6 +76,7 @@
 )
 from onyx.server.documents.models import ConnectorBase
 from onyx.server.documents.models import CredentialBase
+from onyx.server.query_and_chat.chat_utils import mime_type_to_chat_file_type
 from onyx.server.query_and_chat.models import ChatFeedbackRequest
 from onyx.server.query_and_chat.models import ChatMessageIdentifier
 from onyx.server.query_and_chat.models import ChatRenameRequest
@@ -96,6 +97,7 @@
 from onyx.server.query_and_chat.models import UpdateChatSessionTemperatureRequest
 from onyx.server.query_and_chat.models import UpdateChatSessionThreadRequest
 from onyx.server.query_and_chat.token_limit import check_token_rate_limits
+from onyx.utils.file_types import UploadMimeTypes
 from onyx.utils.headers import get_custom_tool_additional_request_headers
 from onyx.utils.logger import setup_logger
 from onyx.utils.telemetry import create_milestone_and_report
@@ -661,51 +663,45 @@ def upload_files_for_chat(
     db_session: Session = Depends(get_session),
     user: User | None = Depends(current_user),
 ) -> dict[str, list[FileDescriptor]]:
-    image_content_types = {"image/jpeg", "image/png", "image/webp"}
-    csv_content_types = {"text/csv"}
-    text_content_types = {
-        "text/plain",
-        "text/markdown",
-        "text/x-markdown",
-        "text/x-config",
-        "text/tab-separated-values",
-        "application/json",
-        "application/xml",
-        "text/xml",
-        "application/x-yaml",
-    }
-    document_content_types = {
-        "application/pdf",
-        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
-        "application/vnd.openxmlformats-officedocument.presentationml.presentation",
-        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
-        "message/rfc822",
-        "application/epub+zip",
-    }
 
-    allowed_content_types = (
-        image_content_types.union(text_content_types)
-        .union(document_content_types)
-        .union(csv_content_types)
-    )
+    # NOTE(rkuo): Unify this with file_validation.py and extract_file_text.py
+    # image_content_types = {"image/jpeg", "image/png", "image/webp"}
+    # csv_content_types = {"text/csv"}
+    # text_content_types = {
+    #     "text/plain",
+    #     "text/markdown",
+    #     "text/x-markdown",
+    #     "text/x-config",
+    #     "text/tab-separated-values",
+    #     "application/json",
+    #     "application/xml",
+    #     "text/xml",
+    #     "application/x-yaml",
+    # }
+    # document_content_types = {
+    #     "application/pdf",
+    #     "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+    #     "application/vnd.openxmlformats-officedocument.presentationml.presentation",
+    #     "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
+    #     "message/rfc822",
+    #     "application/epub+zip",
+    # }
+
+    # allowed_content_types = (
+    #     image_content_types.union(text_content_types)
+    #     .union(document_content_types)
+    #     .union(csv_content_types)
+    # )
 
     for file in files:
         if not file.content_type:
             raise HTTPException(status_code=400, detail="File content type is required")
 
-        if file.content_type not in allowed_content_types:
-            if file.content_type in image_content_types:
-                error_detail = "Unsupported image file type. Supported image types include .jpg, .jpeg, .png, .webp."
-            elif file.content_type in text_content_types:
-                error_detail = "Unsupported text file type."
-            elif file.content_type in csv_content_types:
-                error_detail = "Unsupported CSV file type."
-            else:
-                error_detail = "Unsupported document file type."
-            raise HTTPException(status_code=400, detail=error_detail)
+        if file.content_type not in UploadMimeTypes.ALLOWED_MIME_TYPES:
+            raise HTTPException(status_code=400, detail="Unsupported file type.")
 
         if (
-            file.content_type in image_content_types
+            file.content_type in UploadMimeTypes.IMAGE_MIME_TYPES
             and file.size
             and file.size > 20 * 1024 * 1024
         ):
@@ -718,19 +714,7 @@ def upload_files_for_chat(
 
     file_info: list[tuple[str, str | None, ChatFileType]] = []
     for file in files:
-        file_type = (
-            ChatFileType.IMAGE
-            if file.content_type in image_content_types
-            else (
-                ChatFileType.CSV
-                if file.content_type in csv_content_types
-                else (
-                    ChatFileType.DOC
-                    if file.content_type in document_content_types
-                    else ChatFileType.PLAIN_TEXT
-                )
-            )
-        )
+        file_type = mime_type_to_chat_file_type(file.content_type)
 
         file_content = file.file.read()  # Read the file content
 

@@ -0,0 +1,18 @@
+from onyx.file_store.models import ChatFileType
+from onyx.utils.file_types import UploadMimeTypes
+
+
+def mime_type_to_chat_file_type(mime_type: str | None) -> ChatFileType:
+    if mime_type is None:
+        return ChatFileType.PLAIN_TEXT
+
+    if mime_type in UploadMimeTypes.IMAGE_MIME_TYPES:
+        return ChatFileType.IMAGE
+
+    if mime_type in UploadMimeTypes.CSV_MIME_TYPES:
+        return ChatFileType.CSV
+
+    if mime_type in UploadMimeTypes.DOCUMENT_MIME_TYPES:
+        return ChatFileType.DOC
+
+    return ChatFileType.PLAIN_TEXT
@@ -31,13 +31,13 @@
 from onyx.db.models import UserFile
 from onyx.db.models import UserFolder
 from onyx.db.user_documents import calculate_user_files_token_count
-from onyx.db.user_documents import create_user_file_with_indexing
 from onyx.db.user_documents import create_user_files
 from onyx.db.user_documents import get_user_file_indexing_status
 from onyx.db.user_documents import share_file_with_assistant
 from onyx.db.user_documents import share_folder_with_assistant
 from onyx.db.user_documents import unshare_file_with_assistant
 from onyx.db.user_documents import unshare_folder_with_assistant
+from onyx.db.user_documents import upload_files_to_user_files_with_indexing
 from onyx.file_processing.html_utils import web_html_cleanup
 from onyx.server.documents.connector import trigger_indexing_for_cc_pair
 from onyx.server.documents.models import ConnectorBase
@@ -156,7 +156,7 @@ def upload_user_files(
 
     try:
         # Use our consolidated function that handles indexing properly
-        user_files = create_user_file_with_indexing(
+        user_files = upload_files_to_user_files_with_indexing(
             files, folder_id or -1, user, db_session
         )
 

@@ -8,6 +8,8 @@
 from onyx.db.enums import ConnectorCredentialPairStatus
 from onyx.db.models import UserFile
 from onyx.db.models import UserFolder
+from onyx.file_store.models import ChatFileType
+from onyx.server.query_and_chat.chat_utils import mime_type_to_chat_file_type
 
 
 class UserFileStatus(str, PyEnum):
@@ -17,6 +19,7 @@ class UserFileStatus(str, PyEnum):
     REINDEXING = "REINDEXING"
 
 
+# this maps to FileResponse on the front end
 class UserFileSnapshot(BaseModel):
     id: int
     name: str
@@ -30,6 +33,7 @@ class UserFileSnapshot(BaseModel):
     indexed: bool
     link_url: str | None
     status: UserFileStatus
+    chat_file_type: ChatFileType
 
     @classmethod
     def from_model(cls, model: UserFile) -> "UserFileSnapshot":
@@ -73,6 +77,7 @@ def from_model(cls, model: UserFile) -> "UserFileSnapshot":
                 else False
             ),
             link_url=model.link_url,
+            chat_file_type=mime_type_to_chat_file_type(model.content_type),
         )
-Original file line number
+Diff line change
@@ Expand Up / @@ -2477,6 +2477,7 @@ class UserFile(Base): @@
             "ConnectorCredentialPair", back_populates="user_file"
         )
         link_url: Mapped[str | None] = mapped_column(String, nullable=True)
+        content_type: Mapped[str | None] = mapped_column(String, nullable=True)
     """
@@ Expand Down @@