Skip to content

Commit 22e83ab

Browse files
committed
passing tests
1 parent 5a15078 commit 22e83ab

File tree

2 files changed

+117
-13
lines changed

2 files changed

+117
-13
lines changed

backend/onyx/file_processing/extract_file_text.py

Lines changed: 11 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
1-
import base64
21
import io
32
import json
43
import os
@@ -308,17 +307,16 @@ def read_pdf_file(
308307
return "", metadata, []
309308

310309

311-
def extract_docx_images(markdown: str) -> list[tuple[bytes, str]]:
    """
    Extract inline base64-encoded images from a markdown string.

    Looks for markdown image tags whose target is a data URI of the form
    ``data:image/<subtype>;base64,<payload>`` and decodes each payload.

    Returns a list of ``(image_bytes, file_name)`` tuples in document order,
    where ``file_name`` is ``image_<index>.<subtype>`` (e.g. ``image_0.png``).

    Errors raised by ``base64.b64decode`` (e.g. ``binascii.Error`` on
    incorrectly padded data) are propagated to the caller.
    """
    # Capture the MIME subtype directly so the generated name is
    # "image_0.png" rather than "image_0.image/png" (the full MIME type
    # would put a path separator into the file name).
    pattern = re.compile(r"!\[[^\]]*\]\(data:image/([^;]+);base64,([^)]*)\)")
    images = []
    for i, m in enumerate(pattern.finditer(markdown)):
        subtype, b64 = m.groups()
        images.append((base64.b64decode(b64), f"image_{i}.{subtype}"))
    return images
310+
def extract_docx_images(docx_bytes: IO[Any]) -> list[tuple[bytes, str]]:
    """
    Extract embedded images from a docx file.

    The docx is opened as a zip archive and every file entry stored under
    ``word/media/`` is read.

    Returns a list of ``(image_bytes, file_name)`` tuples, where ``file_name``
    is the last path component of the archive entry.

    This is best-effort: if the archive cannot be opened or an entry cannot be
    read, the error is logged and whatever was collected so far is returned.
    """
    out: list[tuple[bytes, str]] = []
    try:
        with zipfile.ZipFile(docx_bytes) as z:
            for info in z.infolist():
                name = info.filename
                if not name.startswith("word/media/"):
                    continue
                # Archives may contain explicit directory entries such as
                # "word/media/"; those are not images and would otherwise
                # produce an empty payload with an empty file name.
                base_name = name.rsplit("/", 1)[-1]
                if info.is_dir() or not base_name:
                    continue
                out.append((z.read(info), base_name))
    except Exception:
        logger.exception("Failed to extract all docx images")
    return out
322320

323321

324322
def docx_to_text_and_images(
@@ -349,7 +347,7 @@ def docx_to_text_and_images(
349347
)
350348
return text_content_raw or "", []
351349

352-
return doc.markdown, extract_docx_images(doc.markdown)
350+
return doc.markdown, extract_docx_images(to_bytesio(file))
353351

354352

355353
def pptx_to_text(file: IO[Any], file_name: str = "") -> str:

backend/tests/integration/tests/image_indexing/test_indexing_images.py

Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121

2222
FILE_NAME = "Sample.pdf"
2323
FILE_PATH = "tests/integration/common_utils/test_files"
24+
DOCX_FILE_NAME = "three_images.docx"
2425

2526

2627
def test_image_indexing(
@@ -110,3 +111,108 @@ def test_image_indexing(
110111
else:
111112
assert document.image_file_id is not None
112113
assert file_paths[0] in document.image_file_id
114+
115+
116+
def test_docx_image_indexing(
    reset: None,
    admin_user: DATestUser,
    vespa_client: vespa_fixture,
) -> None:
    """Test that images from docx files are correctly extracted and indexed.

    Uploads ``DOCX_FILE_NAME`` through a file connector with image extraction
    enabled, waits for indexing, and checks that the indexed documents consist
    of exactly 3 image documents plus at least 1 text document.
    """
    os.makedirs(FILE_PATH, exist_ok=True)
    test_file_path = os.path.join(FILE_PATH, DOCX_FILE_NAME)

    # Use FileManager to upload the test file
    upload_response = FileManager.upload_file_for_connector(
        file_path=test_file_path,
        file_name=DOCX_FILE_NAME,
        user_performing_action=admin_user,
    )

    # Fail fast: nothing below is meaningful without an uploaded file.
    file_paths = upload_response.file_paths
    if not file_paths:
        pytest.fail("File upload failed - no file paths returned")

    LLMProviderManager.create(
        name="test_llm_docx",
        user_performing_action=admin_user,
    )

    SettingsManager.update_settings(
        DATestSettings(
            search_time_image_analysis_enabled=True,
            image_extraction_and_analysis_enabled=True,
        ),
        user_performing_action=admin_user,
    )

    # Create a dummy credential for the file connector
    credential = CredentialManager.create(
        source=DocumentSource.FILE,
        credential_json={},
        user_performing_action=admin_user,
    )

    # Create the connector
    connector_name = f"DocxFileConnector-{int(datetime.now().timestamp())}"
    connector = ConnectorManager.create(
        name=connector_name,
        source=DocumentSource.FILE,
        input_type=InputType.LOAD_STATE,
        connector_specific_config={"file_locations": file_paths, "zip_metadata": {}},
        access_type=AccessType.PUBLIC,
        groups=[],
        user_performing_action=admin_user,
    )

    # Link the credential to the connector
    cc_pair = CCPairManager.create(
        credential_id=credential.id,
        connector_id=connector.id,
        access_type=AccessType.PUBLIC,
        user_performing_action=admin_user,
    )

    # Explicitly run the connector to start indexing
    CCPairManager.run_once(
        cc_pair=cc_pair,
        from_beginning=True,
        user_performing_action=admin_user,
    )
    CCPairManager.wait_for_indexing_completion(
        cc_pair=cc_pair,
        after=datetime.now(timezone.utc),
        timeout=300,
        user_performing_action=admin_user,
    )

    with get_session_with_current_tenant() as db_session:
        # Fetch documents from Vespa - expect text content plus 3 images
        documents = DocumentManager.fetch_documents_for_cc_pair(
            cc_pair_id=cc_pair.id,
            db_session=db_session,
            vespa_client=vespa_client,
        )

        # 3 image documents plus at least 1 text document means at least 4
        # documents overall; this matches the per-kind assertions below.
        assert (
            len(documents) >= 4
        ), f"Expected at least 4 documents (text + 3 images), got {len(documents)}"

        # Count documents with images
        image_documents = [doc for doc in documents if doc.image_file_id is not None]
        text_documents = [doc for doc in documents if doc.image_file_id is None]

        assert (
            len(image_documents) == 3
        ), f"Expected exactly 3 image documents, got {len(image_documents)}"
        assert (
            len(text_documents) >= 1
        ), f"Expected at least 1 text document, got {len(text_documents)}"

        # Verify each image document has a valid image_file_id pointing to our uploaded file
        for image_doc in image_documents:
            assert file_paths[0] in (
                image_doc.image_file_id or ""
            ), f"Image document should reference uploaded file: {image_doc.image_file_id}"

0 commit comments

Comments
 (0)