Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion backend/.gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,6 @@ api_keys.py
vespa-app.zip
dynamic_config_storage/
celerybeat-schedule*
onyx/connectors/salesforce/data/
onyx/connectors/salesforce/data/
.test.env

4 changes: 2 additions & 2 deletions backend/alembic/env.py
Original file line number Diff line number Diff line change
Expand Up @@ -184,10 +184,10 @@ def event_provide_iam_token_for_alembic(
except Exception as e:
logger.error(f"Error migrating schema {schema}: {e}")
if not continue_on_error:
logger.error("--continue is not set, raising exception!")
logger.error("--continue=true is not set, raising exception!")
raise

logger.warning("--continue is set, continuing to next schema.")
logger.warning("--continue=true is set, continuing to next schema.")

else:
try:
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
"""Add content type to UserFile

Revision ID: 5c448911b12f
Revises: 47a07e1a38f1
Create Date: 2025-04-25 16:59:48.182672

"""

from alembic import op
import sqlalchemy as sa

# Revision identifiers, used by Alembic.
# `revision` is this migration's ID and `down_revision` is the revision it is
# applied on top of; both match the "Revision ID" / "Revises" lines in the
# module docstring.
revision = "5c448911b12f"
down_revision = "47a07e1a38f1"
# No branch label and no cross-revision dependency is declared here.
branch_labels: None = None
depends_on: None = None


def upgrade() -> None:
    """Add the nullable string column ``content_type`` to ``user_file``."""
    # Nullable, so rows that already exist need no backfill value.
    content_type_column = sa.Column("content_type", sa.String(), nullable=True)
    op.add_column("user_file", content_type_column)


def downgrade() -> None:
    """Drop the ``content_type`` column from ``user_file``."""
    op.drop_column(table_name="user_file", column_name="content_type")
1 change: 1 addition & 0 deletions backend/onyx/db/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -2477,6 +2477,7 @@ class UserFile(Base):
"ConnectorCredentialPair", back_populates="user_file"
)
link_url: Mapped[str | None] = mapped_column(String, nullable=True)
content_type: Mapped[str | None] = mapped_column(String, nullable=True)


"""
Expand Down
12 changes: 10 additions & 2 deletions backend/onyx/db/user_documents.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,10 @@ def create_user_files(
db_session: Session,
link_url: str | None = None,
) -> list[UserFile]:
"""NOTE(rkuo): This function can take -1 (RECENT_DOCS_FOLDER_ID) for folder_id.
Document what this does?
"""

# NOTE: At the moment, zip metadata is not used for user files.
# Should revisit to decide whether this should be a feature.
upload_response = upload_files(files, db_session)
Expand All @@ -54,21 +58,25 @@ def create_user_files(
name=file.filename,
token_count=None,
link_url=link_url,
content_type=file.content_type,
)
db_session.add(new_file)
user_files.append(new_file)
db_session.commit()
return user_files


def create_user_file_with_indexing(
def upload_files_to_user_files_with_indexing(
files: List[UploadFile],
folder_id: int | None,
user: User,
db_session: Session,
trigger_index: bool = True,
) -> list[UserFile]:
"""Create user files and trigger immediate indexing"""
"""NOTE(rkuo): This function can take -1 (RECENT_DOCS_FOLDER_ID) for folder_id.
Document what this does?

Create user files and trigger immediate indexing"""
# Create the user files first
user_files = create_user_files(files, folder_id, user, db_session)

Expand Down
1 change: 1 addition & 0 deletions backend/onyx/file_processing/extract_file_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@

logger = setup_logger()

# NOTE(rkuo): Unify this with upload_files_for_chat and file_validation.py
TEXT_SECTION_SEPARATOR = "\n\n"

ACCEPTED_PLAIN_TEXT_FILE_EXTENSIONS = [
Expand Down
2 changes: 2 additions & 0 deletions backend/onyx/file_processing/file_validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
Centralized file type validation utilities.
"""

# NOTE(rkuo): Unify this with upload_files_for_chat and extract_file_text

# Standard image MIME types supported by most vision LLMs
IMAGE_MIME_TYPES = [
"image/png",
Expand Down
3 changes: 3 additions & 0 deletions backend/onyx/file_store/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,9 @@ class ChatFileType(str, Enum):
# Plain text only contain the text
PLAIN_TEXT = "plain_text"
CSV = "csv"

# NOTE(rkuo): don't understand the motivation for this
# "user knowledge" is not a file type, it's a source or intent
USER_KNOWLEDGE = "user_knowledge"


Expand Down
30 changes: 23 additions & 7 deletions backend/onyx/file_store/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from onyx.db.models import ChatMessage
from onyx.db.models import UserFile
from onyx.db.models import UserFolder
from onyx.file_processing.extract_file_text import IMAGE_MEDIA_TYPES
from onyx.file_store.file_store import get_default_file_store
from onyx.file_store.models import ChatFileType
from onyx.file_store.models import FileDescriptor
Expand Down Expand Up @@ -111,6 +112,9 @@ def load_user_folder(folder_id: int, db_session: Session) -> list[InMemoryChatFi


def load_user_file(file_id: int, db_session: Session) -> InMemoryChatFile:
chat_file_type = ChatFileType.USER_KNOWLEDGE
status = "not_loaded"

user_file = db_session.query(UserFile).filter(UserFile.id == file_id).first()
if not user_file:
raise ValueError(f"User file with id {file_id} not found")
Expand All @@ -119,26 +123,38 @@ def load_user_file(file_id: int, db_session: Session) -> InMemoryChatFile:
file_store = get_default_file_store(db_session)
plaintext_file_name = user_file_id_to_plaintext_file_name(file_id)

# check for plain text normalized version first, then use original file otherwise
try:
file_io = file_store.read_file(plaintext_file_name, mode="b")
return InMemoryChatFile(
chat_file = InMemoryChatFile(
file_id=str(user_file.file_id),
content=file_io.read(),
file_type=ChatFileType.USER_KNOWLEDGE,
filename=user_file.name,
)
except Exception as e:
logger.warning(
f"Failed to load plaintext file {plaintext_file_name}, defaulting to original file: {e}"
)
status = "plaintext"
return chat_file
except Exception:
# Fall back to original file if plaintext not available
file_io = file_store.read_file(user_file.file_id, mode="b")
return InMemoryChatFile(
file_record = file_store.read_file_record(user_file.file_id)
if file_record.file_type in IMAGE_MEDIA_TYPES:
chat_file_type = ChatFileType.IMAGE

chat_file = InMemoryChatFile(
file_id=str(user_file.file_id),
content=file_io.read(),
file_type=ChatFileType.USER_KNOWLEDGE,
file_type=chat_file_type,
filename=user_file.name,
)
status = "original"
return chat_file
finally:
logger.debug(
f"load_user_file finished: file_id={user_file.file_id} "
f"chat_file_type={chat_file_type} "
f"status={status}"
)


def load_all_user_files(
Expand Down
2 changes: 1 addition & 1 deletion backend/onyx/server/features/persona/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -173,7 +173,7 @@ def undelete_persona(
)


# used for assistat profile pictures
# used for assistant profile pictures
@admin_router.post("/upload-image")
def upload_file(
file: UploadFile,
Expand Down
84 changes: 34 additions & 50 deletions backend/onyx/server/query_and_chat/chat_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@
)
from onyx.server.documents.models import ConnectorBase
from onyx.server.documents.models import CredentialBase
from onyx.server.query_and_chat.chat_utils import mime_type_to_chat_file_type
from onyx.server.query_and_chat.models import ChatFeedbackRequest
from onyx.server.query_and_chat.models import ChatMessageIdentifier
from onyx.server.query_and_chat.models import ChatRenameRequest
Expand All @@ -96,6 +97,7 @@
from onyx.server.query_and_chat.models import UpdateChatSessionTemperatureRequest
from onyx.server.query_and_chat.models import UpdateChatSessionThreadRequest
from onyx.server.query_and_chat.token_limit import check_token_rate_limits
from onyx.utils.file_types import UploadMimeTypes
from onyx.utils.headers import get_custom_tool_additional_request_headers
from onyx.utils.logger import setup_logger
from onyx.utils.telemetry import create_milestone_and_report
Expand Down Expand Up @@ -661,51 +663,45 @@ def upload_files_for_chat(
db_session: Session = Depends(get_session),
user: User | None = Depends(current_user),
) -> dict[str, list[FileDescriptor]]:
image_content_types = {"image/jpeg", "image/png", "image/webp"}
csv_content_types = {"text/csv"}
text_content_types = {
"text/plain",
"text/markdown",
"text/x-markdown",
"text/x-config",
"text/tab-separated-values",
"application/json",
"application/xml",
"text/xml",
"application/x-yaml",
}
document_content_types = {
"application/pdf",
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"application/vnd.openxmlformats-officedocument.presentationml.presentation",
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"message/rfc822",
"application/epub+zip",
}

allowed_content_types = (
image_content_types.union(text_content_types)
.union(document_content_types)
.union(csv_content_types)
)
# NOTE(rkuo): Unify this with file_validation.py and extract_file_text.py
# image_content_types = {"image/jpeg", "image/png", "image/webp"}
# csv_content_types = {"text/csv"}
# text_content_types = {
# "text/plain",
# "text/markdown",
# "text/x-markdown",
# "text/x-config",
# "text/tab-separated-values",
# "application/json",
# "application/xml",
# "text/xml",
# "application/x-yaml",
# }
# document_content_types = {
# "application/pdf",
# "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
# "application/vnd.openxmlformats-officedocument.presentationml.presentation",
# "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
# "message/rfc822",
# "application/epub+zip",
# }

# allowed_content_types = (
# image_content_types.union(text_content_types)
# .union(document_content_types)
# .union(csv_content_types)
# )

for file in files:
if not file.content_type:
raise HTTPException(status_code=400, detail="File content type is required")

if file.content_type not in allowed_content_types:
if file.content_type in image_content_types:
error_detail = "Unsupported image file type. Supported image types include .jpg, .jpeg, .png, .webp."
elif file.content_type in text_content_types:
error_detail = "Unsupported text file type."
elif file.content_type in csv_content_types:
error_detail = "Unsupported CSV file type."
else:
error_detail = "Unsupported document file type."
raise HTTPException(status_code=400, detail=error_detail)
if file.content_type not in UploadMimeTypes.ALLOWED_MIME_TYPES:
raise HTTPException(status_code=400, detail="Unsupported file type.")

if (
file.content_type in image_content_types
file.content_type in UploadMimeTypes.IMAGE_MIME_TYPES
and file.size
and file.size > 20 * 1024 * 1024
):
Expand All @@ -718,19 +714,7 @@ def upload_files_for_chat(

file_info: list[tuple[str, str | None, ChatFileType]] = []
for file in files:
file_type = (
ChatFileType.IMAGE
if file.content_type in image_content_types
else (
ChatFileType.CSV
if file.content_type in csv_content_types
else (
ChatFileType.DOC
if file.content_type in document_content_types
else ChatFileType.PLAIN_TEXT
)
)
)
file_type = mime_type_to_chat_file_type(file.content_type)

file_content = file.file.read() # Read the file content

Expand Down
18 changes: 18 additions & 0 deletions backend/onyx/server/query_and_chat/chat_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
from onyx.file_store.models import ChatFileType
from onyx.utils.file_types import UploadMimeTypes


def mime_type_to_chat_file_type(mime_type: str | None) -> ChatFileType:
    """Map an upload MIME type to the chat file type used to handle it.

    The MIME type is checked against the image, CSV and document groups of
    ``UploadMimeTypes`` in that order, and the first group containing it wins.
    A missing MIME type, or one found in none of the groups, is treated as
    plain text.
    """
    if mime_type is not None:
        # Order matters: it is the precedence used if a type is in several groups.
        ordered_groups = (
            (UploadMimeTypes.IMAGE_MIME_TYPES, ChatFileType.IMAGE),
            (UploadMimeTypes.CSV_MIME_TYPES, ChatFileType.CSV),
            (UploadMimeTypes.DOCUMENT_MIME_TYPES, ChatFileType.DOC),
        )
        for known_mime_types, chat_file_type in ordered_groups:
            if mime_type in known_mime_types:
                return chat_file_type

    return ChatFileType.PLAIN_TEXT
4 changes: 2 additions & 2 deletions backend/onyx/server/user_documents/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,13 +31,13 @@
from onyx.db.models import UserFile
from onyx.db.models import UserFolder
from onyx.db.user_documents import calculate_user_files_token_count
from onyx.db.user_documents import create_user_file_with_indexing
from onyx.db.user_documents import create_user_files
from onyx.db.user_documents import get_user_file_indexing_status
from onyx.db.user_documents import share_file_with_assistant
from onyx.db.user_documents import share_folder_with_assistant
from onyx.db.user_documents import unshare_file_with_assistant
from onyx.db.user_documents import unshare_folder_with_assistant
from onyx.db.user_documents import upload_files_to_user_files_with_indexing
from onyx.file_processing.html_utils import web_html_cleanup
from onyx.server.documents.connector import trigger_indexing_for_cc_pair
from onyx.server.documents.models import ConnectorBase
Expand Down Expand Up @@ -156,7 +156,7 @@ def upload_user_files(

try:
# Use our consolidated function that handles indexing properly
user_files = create_user_file_with_indexing(
user_files = upload_files_to_user_files_with_indexing(
files, folder_id or -1, user, db_session
)

Expand Down
5 changes: 5 additions & 0 deletions backend/onyx/server/user_documents/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
from onyx.db.enums import ConnectorCredentialPairStatus
from onyx.db.models import UserFile
from onyx.db.models import UserFolder
from onyx.file_store.models import ChatFileType
from onyx.server.query_and_chat.chat_utils import mime_type_to_chat_file_type


class UserFileStatus(str, PyEnum):
Expand All @@ -17,6 +19,7 @@ class UserFileStatus(str, PyEnum):
REINDEXING = "REINDEXING"


# this maps to FileResponse on the front end
class UserFileSnapshot(BaseModel):
id: int
name: str
Expand All @@ -30,6 +33,7 @@ class UserFileSnapshot(BaseModel):
indexed: bool
link_url: str | None
status: UserFileStatus
chat_file_type: ChatFileType

@classmethod
def from_model(cls, model: UserFile) -> "UserFileSnapshot":
Expand Down Expand Up @@ -73,6 +77,7 @@ def from_model(cls, model: UserFile) -> "UserFileSnapshot":
else False
),
link_url=model.link_url,
chat_file_type=mime_type_to_chat_file_type(model.content_type),
)


Expand Down
Loading
Loading