Skip to content

Commit e09fa3c

Browse files
committed
Add file names, extensions, and extra checkpoints to memory-usage logging
1 parent 7a24d65 commit e09fa3c

File tree

2 files changed

+47
-3
lines changed

2 files changed

+47
-3
lines changed

backend/onyx/connectors/sharepoint/connector.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1696,7 +1696,7 @@ def _load_from_checkpoint(
16961696

16971697
try:
16981698
log_memory_usage(
1699-
f"_load_from_checkpoint:before_convert_driveitem:{driveitem.id}"
1699+
f"_load_from_checkpoint:before_convert_driveitem:{driveitem.id}:{driveitem.name}"
17001700
)
17011701
doc = _convert_driveitem_to_document_with_permissions(
17021702
driveitem,
@@ -1706,7 +1706,7 @@ def _load_from_checkpoint(
17061706
include_permissions=include_permissions,
17071707
)
17081708
log_memory_usage(
1709-
f"_load_from_checkpoint:after_convert_driveitem:{driveitem.id}",
1709+
f"_load_from_checkpoint:after_convert_driveitem:{driveitem.id}:{driveitem.name}",
17101710
doc,
17111711
"converted_doc",
17121712
)

backend/onyx/file_processing/extract_file_text.py

Lines changed: 45 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -223,6 +223,11 @@ def read_text_file(
223223
"""
224224
metadata = {}
225225
file_content_raw = ""
226+
log_memory_usage(
227+
f"read_text_file:before_read_text_file:{encoding}:{errors}:{ignore_onyx_metadata}",
228+
file,
229+
"file",
230+
)
226231
for ind, line in enumerate(file):
227232
# decode
228233
try:
@@ -243,6 +248,16 @@ def read_text_file(
243248

244249
file_content_raw += line
245250

251+
log_memory_usage(
252+
f"read_text_file:after_read_text_file:{encoding}:{errors}:{ignore_onyx_metadata}",
253+
file_content_raw,
254+
"file_content_raw",
255+
)
256+
log_memory_usage(
257+
f"read_text_file:after_read_text_file:{encoding}:{errors}:{ignore_onyx_metadata}",
258+
metadata,
259+
"metadata",
260+
)
246261
return file_content_raw, metadata
247262

248263

@@ -604,6 +619,11 @@ def _extract_text_and_images(
604619
) -> ExtractionResult:
605620
file.seek(0)
606621

622+
log_memory_usage(
623+
f"extract_text_and_images:before_unstructured:{file_name}:{content_type}",
624+
file,
625+
"file",
626+
)
607627
if get_unstructured_api_key():
608628
try:
609629
text_content = unstructured_to_text(file, file_name)
@@ -627,7 +647,11 @@ def _extract_text_and_images(
627647
# Default processing
628648
try:
629649
extension = get_file_ext(file_name)
630-
650+
log_memory_usage(
651+
f"extract_text_and_images:before_unstructured:{file_name}:{content_type}:{extension}",
652+
file,
653+
"file",
654+
)
631655
# docx example for embedded images
632656
if extension == ".docx":
633657
log_memory_usage(
@@ -655,12 +679,32 @@ def _extract_text_and_images(
655679
# PDF example: we do not show complicated PDF image extraction here
656680
# so we simply extract text for now and skip images.
657681
if extension == ".pdf":
682+
log_memory_usage(
683+
"extract_text_and_images:before_read_pdf_file",
684+
file,
685+
"file",
686+
)
658687
text_content, pdf_metadata, images = read_pdf_file(
659688
file,
660689
pdf_pass,
661690
extract_images=get_image_extraction_and_analysis_enabled(),
662691
image_callback=image_callback,
663692
)
693+
log_memory_usage(
694+
"extract_text_and_images:after_read_pdf_file",
695+
text_content,
696+
"text_content",
697+
)
698+
log_memory_usage(
699+
"extract_text_and_images:after_read_pdf_file",
700+
pdf_metadata,
701+
"pdf_metadata",
702+
)
703+
log_memory_usage(
704+
"extract_text_and_images:after_read_pdf_file",
705+
images,
706+
"images",
707+
)
664708
return ExtractionResult(
665709
text_content=text_content, embedded_images=images, metadata=pdf_metadata
666710
)

0 commit comments

Comments
 (0)