Skip to content

Commit 7372d48

Browse files
committed
fix(sharepoint): Add secondary filter for embedded images
1 parent d186d8e commit 7372d48

File tree

1 file changed

+20
-0
lines changed

1 file changed

+20
-0
lines changed

backend/onyx/connectors/sharepoint/connector.py

Lines changed: 20 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -59,6 +59,7 @@
5959
from onyx.file_processing.extract_file_text import get_file_ext
6060
from onyx.file_processing.file_validation import EXCLUDED_IMAGE_TYPES
6161
from onyx.file_processing.image_utils import store_image_and_create_section
62+
from onyx.utils.b64 import get_image_type_from_bytes
6263
from onyx.utils.logger import setup_logger
6364

6465
logger = setup_logger()
@@ -395,6 +396,25 @@ def _convert_driveitem_to_document_with_permissions(
395396
else:
396397
# Note: we don't process Onyx metadata for connectors like Drive & Sharepoint, but could
397398
def _store_embedded_image(img_data: bytes, img_name: str) -> None:
399+
try:
400+
mime_type = get_image_type_from_bytes(img_data)
401+
except ValueError:
402+
logger.debug(
403+
"Skipping embedded image with unknown format for %s",
404+
driveitem.name,
405+
)
406+
return
407+
408+
# The only mime type that would be returned by get_image_type_from_bytes that is in
409+
# EXCLUDED_IMAGE_TYPES is image/gif.
410+
if mime_type in EXCLUDED_IMAGE_TYPES:
411+
logger.debug(
412+
"Skipping embedded image of excluded type %s for %s",
413+
mime_type,
414+
driveitem.name,
415+
)
416+
return
417+
398418
image_section, _ = store_image_and_create_section(
399419
image_data=img_data,
400420
file_id=f"{driveitem.id}_img_{len(sections)}",

0 commit comments

Comments
 (0)