@@ -223,6 +223,11 @@ def read_text_file(
223
223
"""
224
224
metadata = {}
225
225
file_content_raw = ""
226
+ log_memory_usage (
227
+ f"read_text_file:before_read_text_file:{ encoding } :{ errors } :{ ignore_onyx_metadata } " ,
228
+ file ,
229
+ "file" ,
230
+ )
226
231
for ind , line in enumerate (file ):
227
232
# decode
228
233
try :
@@ -243,6 +248,16 @@ def read_text_file(
243
248
244
249
file_content_raw += line
245
250
251
+ log_memory_usage (
252
+ f"read_text_file:after_read_text_file:{ encoding } :{ errors } :{ ignore_onyx_metadata } " ,
253
+ file_content_raw ,
254
+ "file_content_raw" ,
255
+ )
256
+ log_memory_usage (
257
+ f"read_text_file:after_read_text_file:{ encoding } :{ errors } :{ ignore_onyx_metadata } " ,
258
+ metadata ,
259
+ "metadata" ,
260
+ )
246
261
return file_content_raw , metadata
247
262
248
263
@@ -604,6 +619,11 @@ def _extract_text_and_images(
604
619
) -> ExtractionResult :
605
620
file .seek (0 )
606
621
622
+ log_memory_usage (
623
+ f"extract_text_and_images:before_unstructured:{ file_name } :{ content_type } " ,
624
+ file ,
625
+ "file" ,
626
+ )
607
627
if get_unstructured_api_key ():
608
628
try :
609
629
text_content = unstructured_to_text (file , file_name )
@@ -627,7 +647,11 @@ def _extract_text_and_images(
627
647
# Default processing
628
648
try :
629
649
extension = get_file_ext (file_name )
630
-
650
+ log_memory_usage (
651
+ f"extract_text_and_images:before_unstructured:{ file_name } :{ content_type } :{ extension } " ,
652
+ file ,
653
+ "file" ,
654
+ )
631
655
# docx example for embedded images
632
656
if extension == ".docx" :
633
657
log_memory_usage (
@@ -655,12 +679,32 @@ def _extract_text_and_images(
655
679
# PDF example: we do not show complicated PDF image extraction here
656
680
# so we simply extract text for now and skip images.
657
681
if extension == ".pdf" :
682
+ log_memory_usage (
683
+ "extract_text_and_images:before_read_pdf_file" ,
684
+ file ,
685
+ "file" ,
686
+ )
658
687
text_content , pdf_metadata , images = read_pdf_file (
659
688
file ,
660
689
pdf_pass ,
661
690
extract_images = get_image_extraction_and_analysis_enabled (),
662
691
image_callback = image_callback ,
663
692
)
693
+ log_memory_usage (
694
+ "extract_text_and_images:after_read_pdf_file" ,
695
+ text_content ,
696
+ "text_content" ,
697
+ )
698
+ log_memory_usage (
699
+ "extract_text_and_images:after_read_pdf_file" ,
700
+ pdf_metadata ,
701
+ "pdf_metadata" ,
702
+ )
703
+ log_memory_usage (
704
+ "extract_text_and_images:after_read_pdf_file" ,
705
+ images ,
706
+ "images" ,
707
+ )
664
708
return ExtractionResult (
665
709
text_content = text_content , embedded_images = images , metadata = pdf_metadata
666
710
)
0 commit comments