diff --git a/backend/.gitignore b/backend/.gitignore index 9c2da46d35a..9128bc48ec0 100644 --- a/backend/.gitignore +++ b/backend/.gitignore @@ -9,4 +9,6 @@ api_keys.py vespa-app.zip dynamic_config_storage/ celerybeat-schedule* -onyx/connectors/salesforce/data/ \ No newline at end of file +onyx/connectors/salesforce/data/ +.test.env + diff --git a/backend/alembic/env.py b/backend/alembic/env.py index e820fdf86c6..1b22df5f418 100644 --- a/backend/alembic/env.py +++ b/backend/alembic/env.py @@ -184,10 +184,10 @@ def event_provide_iam_token_for_alembic( except Exception as e: logger.error(f"Error migrating schema {schema}: {e}") if not continue_on_error: - logger.error("--continue is not set, raising exception!") + logger.error("--continue=true is not set, raising exception!") raise - logger.warning("--continue is set, continuing to next schema.") + logger.warning("--continue=true is set, continuing to next schema.") else: try: diff --git a/backend/alembic/versions/5c448911b12f_add_content_type_to_userfile.py b/backend/alembic/versions/5c448911b12f_add_content_type_to_userfile.py new file mode 100644 index 00000000000..17d7b80c037 --- /dev/null +++ b/backend/alembic/versions/5c448911b12f_add_content_type_to_userfile.py @@ -0,0 +1,24 @@ +"""Add content type to UserFile + +Revision ID: 5c448911b12f +Revises: 47a07e1a38f1 +Create Date: 2025-04-25 16:59:48.182672 + +""" + +from alembic import op +import sqlalchemy as sa + +# revision identifiers, used by Alembic. 
+revision = "5c448911b12f" +down_revision = "47a07e1a38f1" +branch_labels: None = None +depends_on: None = None + + +def upgrade() -> None: + op.add_column("user_file", sa.Column("content_type", sa.String(), nullable=True)) + + +def downgrade() -> None: + op.drop_column("user_file", "content_type") diff --git a/backend/onyx/db/models.py b/backend/onyx/db/models.py index 274de758a07..3489be67967 100644 --- a/backend/onyx/db/models.py +++ b/backend/onyx/db/models.py @@ -2477,6 +2477,7 @@ class UserFile(Base): "ConnectorCredentialPair", back_populates="user_file" ) link_url: Mapped[str | None] = mapped_column(String, nullable=True) + content_type: Mapped[str | None] = mapped_column(String, nullable=True) """ diff --git a/backend/onyx/db/user_documents.py b/backend/onyx/db/user_documents.py index 03a6b3c7265..d2048544d99 100644 --- a/backend/onyx/db/user_documents.py +++ b/backend/onyx/db/user_documents.py @@ -40,6 +40,10 @@ def create_user_files( db_session: Session, link_url: str | None = None, ) -> list[UserFile]: + """NOTE(rkuo): This function can take -1 (RECENT_DOCS_FOLDER_ID) for folder_id. + Document what this does? + """ + # NOTE: At the moment, zip metadata is not used for user files. # Should revisit to decide whether this should be a feature. upload_response = upload_files(files, db_session) @@ -54,6 +58,7 @@ def create_user_files( name=file.filename, token_count=None, link_url=link_url, + content_type=file.content_type, ) db_session.add(new_file) user_files.append(new_file) @@ -61,14 +66,17 @@ def create_user_files( return user_files -def create_user_file_with_indexing( +def upload_files_to_user_files_with_indexing( files: List[UploadFile], folder_id: int | None, user: User, db_session: Session, trigger_index: bool = True, ) -> list[UserFile]: - """Create user files and trigger immediate indexing""" + """NOTE(rkuo): This function can take -1 (RECENT_DOCS_FOLDER_ID) for folder_id. + Document what this does? 
+ + Create user files and trigger immediate indexing""" # Create the user files first user_files = create_user_files(files, folder_id, user, db_session) diff --git a/backend/onyx/file_processing/extract_file_text.py b/backend/onyx/file_processing/extract_file_text.py index febdf54b203..deaba6e9894 100644 --- a/backend/onyx/file_processing/extract_file_text.py +++ b/backend/onyx/file_processing/extract_file_text.py @@ -36,6 +36,7 @@ logger = setup_logger() +# NOTE(rkuo): Unify this with upload_files_for_chat and file_validation.py TEXT_SECTION_SEPARATOR = "\n\n" ACCEPTED_PLAIN_TEXT_FILE_EXTENSIONS = [ diff --git a/backend/onyx/file_processing/file_validation.py b/backend/onyx/file_processing/file_validation.py index 747913f095d..34f33dd2f55 100644 --- a/backend/onyx/file_processing/file_validation.py +++ b/backend/onyx/file_processing/file_validation.py @@ -2,6 +2,8 @@ Centralized file type validation utilities. """ +# NOTE(rkuo): Unify this with upload_files_for_chat and extract_file_text + # Standard image MIME types supported by most vision LLMs IMAGE_MIME_TYPES = [ "image/png", diff --git a/backend/onyx/file_store/models.py b/backend/onyx/file_store/models.py index 86e48d7ed4f..a59e15aef20 100644 --- a/backend/onyx/file_store/models.py +++ b/backend/onyx/file_store/models.py @@ -14,6 +14,9 @@ class ChatFileType(str, Enum): # Plain text only contain the text PLAIN_TEXT = "plain_text" CSV = "csv" + + # NOTE(rkuo): don't understand the motivation for this + # "user knowledge" is not a file type, it's a source or intent USER_KNOWLEDGE = "user_knowledge" diff --git a/backend/onyx/file_store/utils.py b/backend/onyx/file_store/utils.py index 87095d62b11..ca19f765664 100644 --- a/backend/onyx/file_store/utils.py +++ b/backend/onyx/file_store/utils.py @@ -12,6 +12,7 @@ from onyx.db.models import ChatMessage from onyx.db.models import UserFile from onyx.db.models import UserFolder +from onyx.file_processing.extract_file_text import IMAGE_MEDIA_TYPES from 
onyx.file_store.file_store import get_default_file_store from onyx.file_store.models import ChatFileType from onyx.file_store.models import FileDescriptor @@ -111,6 +112,9 @@ def load_user_folder(folder_id: int, db_session: Session) -> list[InMemoryChatFi def load_user_file(file_id: int, db_session: Session) -> InMemoryChatFile: + chat_file_type = ChatFileType.USER_KNOWLEDGE + status = "not_loaded" + user_file = db_session.query(UserFile).filter(UserFile.id == file_id).first() if not user_file: raise ValueError(f"User file with id {file_id} not found") @@ -119,26 +123,38 @@ def load_user_file(file_id: int, db_session: Session) -> InMemoryChatFile: file_store = get_default_file_store(db_session) plaintext_file_name = user_file_id_to_plaintext_file_name(file_id) + # check for plain text normalized version first, then use original file otherwise try: file_io = file_store.read_file(plaintext_file_name, mode="b") - return InMemoryChatFile( + chat_file = InMemoryChatFile( file_id=str(user_file.file_id), content=file_io.read(), file_type=ChatFileType.USER_KNOWLEDGE, filename=user_file.name, ) - except Exception as e: - logger.warning( - f"Failed to load plaintext file {plaintext_file_name}, defaulting to original file: {e}" - ) + status = "plaintext" + return chat_file + except Exception: # Fall back to original file if plaintext not available file_io = file_store.read_file(user_file.file_id, mode="b") - return InMemoryChatFile( + file_record = file_store.read_file_record(user_file.file_id) + if file_record.file_type in IMAGE_MEDIA_TYPES: + chat_file_type = ChatFileType.IMAGE + + chat_file = InMemoryChatFile( file_id=str(user_file.file_id), content=file_io.read(), - file_type=ChatFileType.USER_KNOWLEDGE, + file_type=chat_file_type, filename=user_file.name, ) + status = "original" + return chat_file + finally: + logger.debug( + f"load_user_file finished: file_id={user_file.file_id} " + f"chat_file_type={chat_file_type} " + f"status={status}" + ) def load_all_user_files( 
diff --git a/backend/onyx/server/features/persona/api.py b/backend/onyx/server/features/persona/api.py index bd7ccd6d25e..b528a4891c2 100644 --- a/backend/onyx/server/features/persona/api.py +++ b/backend/onyx/server/features/persona/api.py @@ -173,7 +173,7 @@ def undelete_persona( ) -# used for assistat profile pictures +# used for assistant profile pictures @admin_router.post("/upload-image") def upload_file( file: UploadFile, diff --git a/backend/onyx/server/query_and_chat/chat_backend.py b/backend/onyx/server/query_and_chat/chat_backend.py index 70ba54dfef7..84e16b4ecf6 100644 --- a/backend/onyx/server/query_and_chat/chat_backend.py +++ b/backend/onyx/server/query_and_chat/chat_backend.py @@ -76,6 +76,7 @@ ) from onyx.server.documents.models import ConnectorBase from onyx.server.documents.models import CredentialBase +from onyx.server.query_and_chat.chat_utils import mime_type_to_chat_file_type from onyx.server.query_and_chat.models import ChatFeedbackRequest from onyx.server.query_and_chat.models import ChatMessageIdentifier from onyx.server.query_and_chat.models import ChatRenameRequest @@ -96,6 +97,7 @@ from onyx.server.query_and_chat.models import UpdateChatSessionTemperatureRequest from onyx.server.query_and_chat.models import UpdateChatSessionThreadRequest from onyx.server.query_and_chat.token_limit import check_token_rate_limits +from onyx.utils.file_types import UploadMimeTypes from onyx.utils.headers import get_custom_tool_additional_request_headers from onyx.utils.logger import setup_logger from onyx.utils.telemetry import create_milestone_and_report @@ -661,51 +663,45 @@ def upload_files_for_chat( db_session: Session = Depends(get_session), user: User | None = Depends(current_user), ) -> dict[str, list[FileDescriptor]]: - image_content_types = {"image/jpeg", "image/png", "image/webp"} - csv_content_types = {"text/csv"} - text_content_types = { - "text/plain", - "text/markdown", - "text/x-markdown", - "text/x-config", - "text/tab-separated-values", - 
"application/json", - "application/xml", - "text/xml", - "application/x-yaml", - } - document_content_types = { - "application/pdf", - "application/vnd.openxmlformats-officedocument.wordprocessingml.document", - "application/vnd.openxmlformats-officedocument.presentationml.presentation", - "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", - "message/rfc822", - "application/epub+zip", - } - allowed_content_types = ( - image_content_types.union(text_content_types) - .union(document_content_types) - .union(csv_content_types) - ) + # NOTE(rkuo): Unify this with file_validation.py and extract_file_text.py + # image_content_types = {"image/jpeg", "image/png", "image/webp"} + # csv_content_types = {"text/csv"} + # text_content_types = { + # "text/plain", + # "text/markdown", + # "text/x-markdown", + # "text/x-config", + # "text/tab-separated-values", + # "application/json", + # "application/xml", + # "text/xml", + # "application/x-yaml", + # } + # document_content_types = { + # "application/pdf", + # "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + # "application/vnd.openxmlformats-officedocument.presentationml.presentation", + # "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", + # "message/rfc822", + # "application/epub+zip", + # } + + # allowed_content_types = ( + # image_content_types.union(text_content_types) + # .union(document_content_types) + # .union(csv_content_types) + # ) for file in files: if not file.content_type: raise HTTPException(status_code=400, detail="File content type is required") - if file.content_type not in allowed_content_types: - if file.content_type in image_content_types: - error_detail = "Unsupported image file type. Supported image types include .jpg, .jpeg, .png, .webp." - elif file.content_type in text_content_types: - error_detail = "Unsupported text file type." - elif file.content_type in csv_content_types: - error_detail = "Unsupported CSV file type." 
- else: - error_detail = "Unsupported document file type." - raise HTTPException(status_code=400, detail=error_detail) + if file.content_type not in UploadMimeTypes.ALLOWED_MIME_TYPES: + raise HTTPException(status_code=400, detail="Unsupported file type.") if ( - file.content_type in image_content_types + file.content_type in UploadMimeTypes.IMAGE_MIME_TYPES and file.size and file.size > 20 * 1024 * 1024 ): @@ -718,19 +714,7 @@ def upload_files_for_chat( file_info: list[tuple[str, str | None, ChatFileType]] = [] for file in files: - file_type = ( - ChatFileType.IMAGE - if file.content_type in image_content_types - else ( - ChatFileType.CSV - if file.content_type in csv_content_types - else ( - ChatFileType.DOC - if file.content_type in document_content_types - else ChatFileType.PLAIN_TEXT - ) - ) - ) + file_type = mime_type_to_chat_file_type(file.content_type) file_content = file.file.read() # Read the file content diff --git a/backend/onyx/server/query_and_chat/chat_utils.py b/backend/onyx/server/query_and_chat/chat_utils.py new file mode 100644 index 00000000000..ced0276913d --- /dev/null +++ b/backend/onyx/server/query_and_chat/chat_utils.py @@ -0,0 +1,18 @@ +from onyx.file_store.models import ChatFileType +from onyx.utils.file_types import UploadMimeTypes + + +def mime_type_to_chat_file_type(mime_type: str | None) -> ChatFileType: + if mime_type is None: + return ChatFileType.PLAIN_TEXT + + if mime_type in UploadMimeTypes.IMAGE_MIME_TYPES: + return ChatFileType.IMAGE + + if mime_type in UploadMimeTypes.CSV_MIME_TYPES: + return ChatFileType.CSV + + if mime_type in UploadMimeTypes.DOCUMENT_MIME_TYPES: + return ChatFileType.DOC + + return ChatFileType.PLAIN_TEXT diff --git a/backend/onyx/server/user_documents/api.py b/backend/onyx/server/user_documents/api.py index 731ae954cd7..0012bf47d8a 100644 --- a/backend/onyx/server/user_documents/api.py +++ b/backend/onyx/server/user_documents/api.py @@ -31,13 +31,13 @@ from onyx.db.models import UserFile from 
onyx.db.models import UserFolder from onyx.db.user_documents import calculate_user_files_token_count -from onyx.db.user_documents import create_user_file_with_indexing from onyx.db.user_documents import create_user_files from onyx.db.user_documents import get_user_file_indexing_status from onyx.db.user_documents import share_file_with_assistant from onyx.db.user_documents import share_folder_with_assistant from onyx.db.user_documents import unshare_file_with_assistant from onyx.db.user_documents import unshare_folder_with_assistant +from onyx.db.user_documents import upload_files_to_user_files_with_indexing from onyx.file_processing.html_utils import web_html_cleanup from onyx.server.documents.connector import trigger_indexing_for_cc_pair from onyx.server.documents.models import ConnectorBase @@ -156,7 +156,7 @@ def upload_user_files( try: # Use our consolidated function that handles indexing properly - user_files = create_user_file_with_indexing( + user_files = upload_files_to_user_files_with_indexing( files, folder_id or -1, user, db_session ) diff --git a/backend/onyx/server/user_documents/models.py b/backend/onyx/server/user_documents/models.py index 4540005c8ab..8b20d5357bd 100644 --- a/backend/onyx/server/user_documents/models.py +++ b/backend/onyx/server/user_documents/models.py @@ -8,6 +8,8 @@ from onyx.db.enums import ConnectorCredentialPairStatus from onyx.db.models import UserFile from onyx.db.models import UserFolder +from onyx.file_store.models import ChatFileType +from onyx.server.query_and_chat.chat_utils import mime_type_to_chat_file_type class UserFileStatus(str, PyEnum): @@ -17,6 +19,7 @@ class UserFileStatus(str, PyEnum): REINDEXING = "REINDEXING" +# this maps to FileResponse on the front end class UserFileSnapshot(BaseModel): id: int name: str @@ -30,6 +33,7 @@ class UserFileSnapshot(BaseModel): indexed: bool link_url: str | None status: UserFileStatus + chat_file_type: ChatFileType @classmethod def from_model(cls, model: UserFile) -> 
"UserFileSnapshot": @@ -73,6 +77,7 @@ def from_model(cls, model: UserFile) -> "UserFileSnapshot": else False ), link_url=model.link_url, + chat_file_type=mime_type_to_chat_file_type(model.content_type), ) diff --git a/backend/onyx/utils/file_types.py b/backend/onyx/utils/file_types.py new file mode 100644 index 00000000000..306c8bcfb6d --- /dev/null +++ b/backend/onyx/utils/file_types.py @@ -0,0 +1,26 @@ +class UploadMimeTypes: + IMAGE_MIME_TYPES = {"image/jpeg", "image/png", "image/webp"} + CSV_MIME_TYPES = {"text/csv"} + TEXT_MIME_TYPES = { + "text/plain", + "text/markdown", + "text/x-markdown", + "text/x-config", + "text/tab-separated-values", + "application/json", + "application/xml", + "text/xml", + "application/x-yaml", + } + DOCUMENT_MIME_TYPES = { + "application/pdf", + "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + "application/vnd.openxmlformats-officedocument.presentationml.presentation", + "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", + "message/rfc822", + "application/epub+zip", + } + + ALLOWED_MIME_TYPES = IMAGE_MIME_TYPES.union( + TEXT_MIME_TYPES, DOCUMENT_MIME_TYPES, CSV_MIME_TYPES + ) diff --git a/web/src/app/chat/ChatPage.tsx b/web/src/app/chat/ChatPage.tsx index 7371991e5ea..37f97a889d7 100644 --- a/web/src/app/chat/ChatPage.tsx +++ b/web/src/app/chat/ChatPage.tsx @@ -141,6 +141,11 @@ const TEMP_USER_MESSAGE_ID = -1; const TEMP_ASSISTANT_MESSAGE_ID = -2; const SYSTEM_MESSAGE_ID = -3; +export enum UploadIntent { + ATTACH_TO_MESSAGE, // For files uploaded via ChatInputBar (paste, drag/drop) + ADD_TO_DOCUMENTS, // For files uploaded via FilePickerModal or similar (just add to repo) +} + export function ChatPage({ toggle, documentSidebarInitialWidth, @@ -633,7 +638,7 @@ export function ChatPage({ }: { messages: Message[]; // if calling this function repeatedly with short delay, stay may not update in time - // and result in weird behavipr + // and result in weird behavior 
completeMessageMapOverride?: Map | null; chatSessionId?: string; replacementsMap?: Map | null; @@ -1946,7 +1951,10 @@ export function ChatPage({ } }; - const handleImageUpload = async (acceptedFiles: File[]) => { + const handleImageUpload = async ( + acceptedFiles: File[], + intent: UploadIntent + ) => { const [_, llmModel] = getFinalLLM( llmProviders, liveAssistant, @@ -1969,14 +1977,34 @@ export function ChatPage({ updateChatState("uploading", currentSessionId()); + const newlyUploadedFileDescriptors: FileDescriptor[] = []; + for (let file of acceptedFiles) { const formData = new FormData(); formData.append("files", file); - const response = await uploadFile(formData, null); + const response: FileResponse[] = await uploadFile(formData, null); if (response.length > 0) { const uploadedFile = response[0]; - addSelectedFile(uploadedFile); + + if (intent == UploadIntent.ADD_TO_DOCUMENTS) { + addSelectedFile(uploadedFile); + } else { + const newFileDescriptor: FileDescriptor = { + // Use file_id (storage ID) if available, otherwise fallback to DB id + // Ensure it's a string as FileDescriptor expects + id: uploadedFile.file_id + ? String(uploadedFile.file_id) + : String(uploadedFile.id), + type: uploadedFile.chat_file_type + ? uploadedFile.chat_file_type + : ChatFileType.PLAIN_TEXT, + name: uploadedFile.name, + isUploading: false, // Mark as successfully uploaded + }; + + setCurrentMessageFiles((prev) => [...prev, newFileDescriptor]); + } } else { setPopup({ type: "error", @@ -2582,7 +2610,12 @@ export function ChatPage({ {documentSidebarInitialWidth !== undefined && isReady ? 
( + handleImageUpload( + acceptedFiles, + UploadIntent.ATTACH_TO_MESSAGE + ) + } noClick > {({ getRootProps }) => ( diff --git a/web/src/app/chat/input/ChatInputBar.tsx b/web/src/app/chat/input/ChatInputBar.tsx index 12ee9f8a191..9407da57c51 100644 --- a/web/src/app/chat/input/ChatInputBar.tsx +++ b/web/src/app/chat/input/ChatInputBar.tsx @@ -1,4 +1,4 @@ -import React, { useContext, useEffect, useRef, useState } from "react"; +import React, { useContext, useEffect, useMemo, useRef, useState } from "react"; import { FiPlusCircle, FiPlus, FiInfo, FiX, FiFilter } from "react-icons/fi"; import { FiLoader } from "react-icons/fi"; import { ChatInputOption } from "./ChatInputOption"; @@ -39,6 +39,7 @@ import { AgenticToggle } from "./AgenticToggle"; import { SettingsContext } from "@/components/settings/SettingsProvider"; import { getProviderIcon } from "@/app/admin/configuration/llm/interfaces"; import { useDocumentsContext } from "../my-documents/DocumentsContext"; +import { UploadIntent } from "../ChatPage"; const MAX_INPUT_HEIGHT = 200; export const SourceChip2 = ({ @@ -187,7 +188,7 @@ interface ChatInputBarProps { setAlternativeAssistant: (alternativeAssistant: Persona | null) => void; toggleDocumentSidebar: () => void; setFiles: (files: FileDescriptor[]) => void; - handleFileUpload: (files: File[]) => void; + handleFileUpload: (files: File[], intent: UploadIntent) => void; textAreaRef: React.RefObject; filterManager: FilterManager; availableSources: SourceMetadata[]; @@ -237,6 +238,13 @@ export function ChatInputBar({ setCurrentMessageFiles, } = useDocumentsContext(); + // Create a Set of IDs from currentMessageFiles for efficient lookup + // Assuming FileDescriptor.id corresponds conceptually to FileResponse.file_id or FileResponse.id + const currentMessageFileIds = useMemo( + () => new Set(currentMessageFiles.map((f) => String(f.id))), // Ensure IDs are strings for comparison + [currentMessageFiles] + ); + const settings = useContext(SettingsContext); useEffect(() 
=> { const textarea = textAreaRef.current; @@ -261,7 +269,7 @@ export function ChatInputBar({ } if (pastedFiles.length > 0) { event.preventDefault(); - handleFileUpload(pastedFiles); + handleFileUpload(pastedFiles, UploadIntent.ATTACH_TO_MESSAGE); } } }; @@ -662,14 +670,21 @@ export function ChatInputBar({ /> ))} - {selectedFiles.map((file) => ( - } - title={file.name} - onRemove={() => removeSelectedFile(file)} - /> - ))} + {/* This is excluding image types because they get rendered differently via currentMessageFiles.map + Seems quite hacky ... all rendering should probably be done in one place? */} + {selectedFiles.map( + (file) => + !currentMessageFileIds.has( + String(file.file_id || file.id) + ) && ( + } + title={file.name} + onRemove={() => removeSelectedFile(file)} + /> + ) + )} {selectedFolders.map((folder) => ( => { let authType: AuthType; - // Override fasapi users auth so we can use both + // Override fastapi users auth so we can use both if (NEXT_PUBLIC_CLOUD_ENABLED) { authType = "cloud"; } else {