From 39708889834a33288a4b1d45bce1d33ed370783a Mon Sep 17 00:00:00 2001 From: Emerson Gomes Date: Mon, 30 Jun 2025 11:57:26 -0500 Subject: [PATCH] Remove hardcoded image extraction flag for PDFs PDFs currently always have their images extracted because the flag is hardcoded to True. This change makes PDF extraction use the "Enable Image Extraction and Analysis" workspace configuration instead. --- backend/onyx/file_processing/extract_file_text.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/backend/onyx/file_processing/extract_file_text.py b/backend/onyx/file_processing/extract_file_text.py index ab636d0ae4b..9c0cb47527a 100644 --- a/backend/onyx/file_processing/extract_file_text.py +++ b/backend/onyx/file_processing/extract_file_text.py @@ -28,6 +28,7 @@ from onyx.configs.constants import FileOrigin from onyx.configs.constants import ONYX_METADATA_FILENAME +from onyx.configs.llm_configs import get_image_extraction_and_analysis_enabled from onyx.file_processing.html_utils import parse_html_page_basic from onyx.file_processing.unstructured import get_unstructured_api_key from onyx.file_processing.unstructured import unstructured_to_text @@ -533,7 +534,7 @@ def extract_text_and_images( if extension == ".pdf": file.seek(0) text_content, pdf_metadata, images = read_pdf_file( - file, pdf_pass, extract_images=True + file, pdf_pass, extract_images=get_image_extraction_and_analysis_enabled() ) return ExtractionResult( text_content=text_content, embedded_images=images, metadata=pdf_metadata