Skip to content

Commit 254a7d2

Browse files
fix parser nb
1 parent 4a41b8e commit 254a7d2

File tree

1 file changed

+31
-12
lines changed

1 file changed

+31
-12
lines changed

src/axiomatic/client.py

+31-12
Original file line numberDiff line numberDiff line change
@@ -31,22 +31,41 @@ class DocumentHelper:
3131
def __init__(self, ax_client: Axiomatic):
3232
self._ax_client = ax_client
3333

34-
def pdf_from_url(self, url: str) -> ParseResponse:
35-
"""Download a PDF document from a URL and parse it into a Markdown response."""
36-
if "arxiv" in url and "abs" in url:
37-
url = url.replace("abs", "pdf")
38-
print("The URL is an arXiv abstract page. Replacing 'abs' with 'pdf' to download the PDF.")
39-
file = requests.get(url)
40-
response = self._ax_client.document.parse(file=file.content)
41-
return response
42-
43-
def pdf_from_file(self, path: str) -> ParseResponse:
34+
def pdf_from_file(self, path: str):
    """Open a PDF document from a file path and parse it into a Markdown response.

    Args:
        path: Location of the PDF on disk. Anything accepted by ``open`` and
            ``os.path.basename`` works, including ``pathlib.Path`` objects.

    Returns:
        Whatever ``self._ax_client.document.parse`` returns for the upload.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    import os  # local import keeps this method independent of file-level imports

    with open(path, "rb") as f:
        file_bytes = f.read()

    # Build a (filename, content, content-type) tuple; per the original note,
    # .parse expects a FastAPI UploadFile-style upload rather than raw bytes.
    # os.path.basename understands the platform's separators and path-like
    # objects, which splitting on "/" does not.
    file_name = os.path.basename(path)
    file_tuple = (file_name, file_bytes, "application/pdf")

    return self._ax_client.document.parse(file=file_tuple)
4946

47+
def pdf_from_url(self, url: str, *, timeout: float = 60.0):
    """Download a PDF document from a URL and parse it into a Markdown response.

    arXiv abstract URLs (``.../abs/<id>``) are rewritten to the matching
    ``.../pdf/<id>`` URL before downloading.

    Args:
        url: Address of the PDF, or of an arXiv abstract page.
        timeout: Seconds passed to ``requests.get`` so a stalled server
            cannot block the call forever.

    Returns:
        Whatever ``self._ax_client.document.parse`` returns for the upload.

    Raises:
        Exception: If the download does not answer with HTTP 200.
    """
    from urllib.parse import urlsplit  # local import keeps the method self-contained

    # Only rewrite the "/abs/" path segment, and only once; a blanket
    # replace("abs", "pdf") would also mangle any other "abs" in the URL.
    if "arxiv.org" in url and "/abs/" in url:
        url = url.replace("/abs/", "/pdf/", 1)
        print("The URL is an arXiv abstract page. Replacing 'abs' with 'pdf' to download the PDF.")
    response = requests.get(url, timeout=timeout)

    if response.status_code != 200:
        raise Exception(f"Failed to download PDF. Status code: {response.status_code}")

    # Take the filename from the URL path only, so query strings and
    # fragments do not hide a ".pdf" suffix; otherwise use a default.
    file_name = urlsplit(url).path.rsplit("/", 1)[-1]
    if not file_name.lower().endswith(".pdf"):
        file_name = "document.pdf"

    # Build a (filename, content, content-type) tuple; per the original note,
    # .parse expects a FastAPI UploadFile-style upload rather than raw bytes.
    file_tuple = (file_name, response.content, "application/pdf")

    return self._ax_client.document.parse(file=file_tuple)
68+
5069
def plot_b64_images(self, images: Dict[str, str]):
5170
"""Plot a dictionary of base64 images."""
5271
import ipywidgets as widgets # type: ignore

0 commit comments

Comments
 (0)