Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 15 additions & 2 deletions backend/onyx/connectors/confluence/connector.py
Original file line number Diff line number Diff line change
Expand Up @@ -373,11 +373,24 @@ def _fetch_page_attachments(
cql=attachment_query,
expand=",".join(_ATTACHMENT_EXPANSION_FIELDS),
):
attachment["metadata"].get("mediaType", "")
media_type: str = attachment.get("metadata", {}).get("mediaType", "")

# TODO(rkuo): this check is partially redundant with validate_attachment_filetype
# and checks in convert_attachment_to_content/process_attachment
# but doing the check here avoids an unnecessary download. Due for refactoring.
if not self.allow_images:
if media_type.startswith("image/"):
logger.info(
f"Skipping attachment because allow images is False: {attachment['title']}"
)
continue

if not validate_attachment_filetype(
attachment,
):
logger.info(f"Skipping attachment: {attachment['title']}")
logger.info(
f"Skipping attachment because it is not an accepted file type: {attachment['title']}"
)
continue

logger.info(f"Processing attachment: {attachment['title']}")
Expand Down
13 changes: 9 additions & 4 deletions backend/onyx/connectors/confluence/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,11 @@
from onyx.db.pg_file_store import create_populate_lobj
from onyx.db.pg_file_store import save_bytes_to_pgfilestore
from onyx.db.pg_file_store import upsert_pgfilestore
from onyx.file_processing.extract_file_text import extract_file_text
from onyx.file_processing.extract_file_text import (
OnyxExtensionType,
extract_file_text,
is_accepted_file_ext,
)
from onyx.file_processing.file_validation import is_valid_image_type
from onyx.file_processing.image_utils import store_image_and_create_section
from onyx.utils.logger import setup_logger
Expand All @@ -59,16 +63,17 @@ def validate_attachment_filetype(
"""
Validates if the attachment is a supported file type.
"""
attachment.get("metadata", {})
media_type = attachment.get("metadata", {}).get("mediaType", "")

if media_type.startswith("image/"):
return is_valid_image_type(media_type)

# For non-image files, check if we support the extension
title = attachment.get("title", "")
extension = Path(title).suffix.lstrip(".").lower() if "." in title else ""
return extension in ["pdf", "doc", "docx", "txt", "md", "rtf"]

return is_accepted_file_ext(
"." + extension, OnyxExtensionType.Plain | OnyxExtensionType.Document
)


class AttachmentProcessingResult(BaseModel):
Expand Down
Loading