@@ -31,22 +31,41 @@ class DocumentHelper:
31
31
def __init__(self, ax_client: Axiomatic):
    """Keep a reference to the client that the parsing helpers call."""
    # Stored privately; pdf_from_file and pdf_from_url send their uploads
    # through self._ax_client.document.parse.
    self._ax_client = ax_client
33
33
34
- def pdf_from_url (self , url : str ) -> ParseResponse :
35
- """Download a PDF document from a URL and parse it into a Markdown response."""
36
- if "arxiv" in url and "abs" in url :
37
- url = url .replace ("abs" , "pdf" )
38
- print ("The URL is an arXiv abstract page. Replacing 'abs' with 'pdf' to download the PDF." )
39
- file = requests .get (url )
40
- response = self ._ax_client .document .parse (file = file .content )
41
- return response
42
-
43
- def pdf_from_file (self , path : str ) -> ParseResponse :
34
def pdf_from_file(self, path: str):
    """Open a PDF document from a file path and parse it into a Markdown response.

    Args:
        path: Location of the PDF on disk. Anything ``open`` accepts as a
            path works, including ``os.PathLike`` objects.

    Returns:
        Whatever ``self._ax_client.document.parse`` returns for the upload.
    """
    import os

    with open(path, "rb") as f:
        file_bytes = f.read()

    # os.path.basename honours the platform's separators and accepts
    # os.PathLike objects; splitting on "/" did neither.
    file_name = os.path.basename(path)

    # Create a tuple with (filename, content, content-type);
    # we do this because .parse expects a FastAPI UploadFile.
    file_tuple = (file_name, file_bytes, "application/pdf")

    return self._ax_client.document.parse(file=file_tuple)
49
46
47
def pdf_from_url(self, url: str, *, timeout: float = 60.0):
    """Download a PDF document from a URL and parse it into a Markdown response.

    arXiv abstract links (``.../abs/<id>``) are rewritten to the matching
    ``.../pdf/<id>`` link before the download.

    Args:
        url: Address of the PDF, or of an arXiv abstract page.
        timeout: Seconds passed to ``requests.get`` as its ``timeout`` so a
            stalled server cannot block the caller forever.

    Returns:
        Whatever ``self._ax_client.document.parse`` returns for the upload.

    Raises:
        Exception: If the server answers with a status code other than 200.
    """
    from posixpath import basename
    from urllib.parse import urlsplit

    # Rewrite only the "/abs/" path segment, and only once: a blanket
    # replace of "abs" would corrupt any other occurrence in the URL.
    if "arxiv.org" in url and "/abs/" in url:
        url = url.replace("/abs/", "/pdf/", 1)
        print("The URL is an arXiv abstract page. Replacing 'abs' with 'pdf' to download the PDF.")

    response = requests.get(url, timeout=timeout)

    if response.status_code != 200:
        raise Exception(f"Failed to download PDF. Status code: {response.status_code}")

    # Take the name from the URL *path* so a query string or fragment
    # (e.g. "paper.pdf?download=1") does not hide the real filename;
    # fall back to a default when the path does not name a PDF.
    file_name = basename(urlsplit(url).path)
    if not file_name.lower().endswith(".pdf"):
        file_name = "document.pdf"

    # Create a tuple with (filename, content, content-type);
    # we do this because .parse expects a FastAPI UploadFile.
    file_tuple = (file_name, response.content, "application/pdf")

    return self._ax_client.document.parse(file=file_tuple)
68
+
50
69
def plot_b64_images (self , images : Dict [str , str ]):
51
70
"""Plot a dictionary of base64 images."""
52
71
import ipywidgets as widgets # type: ignore
0 commit comments