Skip to content

Commit 70af9f1

Browse files
authored
Merge pull request #113 from CatchTheTornado/feature/54-add-docling-support
Feature/54 add docling support
2 parents da03a37 + 341c8eb commit 70af9f1

File tree

10 files changed

+177
-34
lines changed

10 files changed

+177
-34
lines changed

Makefile

Lines changed: 36 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ install:
3131
printf "%s\n" "$$padding"; \
3232
printf "\e[1;34m Do you want to run the application locally or with Docker?\e[0m\n"; \
3333
printf "\e[1;33m [L] \e[0m Local - Run the application locally\n"; \
34-
printf "\e[1;33m [D] \e[0m Docker - Run the axpplication in Docker\n"; \
34+
printf "\e[1;33m [D] \e[0m Docker - Run the application in Docker\n"; \
3535
read -p " > " choice; \
3636
case "$$choice" in \
3737
[lL]) echo -e "\033[1;32m ✔ You chose: Local Setup\033[0m"; $(MAKE) setup-local ;; \
@@ -47,38 +47,46 @@ setup-local:
4747
cp .env.localhost.example .env.localhost; \
4848
fi
4949
@while true; do \
50-
printf "\n\e[1;34m Python setup environment...\e[0m"; \
50+
printf "\n\e[1;34m Python setup environment \e[0m"; \
5151
printf "\e[1;34m\n Do you want to install requirements?\e[0m\n"; \
52-
printf "\e[1;33m [y] \e[0m Yes - Install and then run application locally\n"; \
53-
printf "\e[1;33m [n] \e[0m No - Skip and run application locally \n"; \
54-
read -p " > " choice; \
55-
case "$$choice" in \
56-
[yY]) \
57-
echo -e "\033[1;32m ✔ Installing Python dependencies...\033[0m"; \
58-
$(MAKE) install-requirements; \
59-
$(MAKE) run; \
60-
break; \
61-
;; \
62-
[nN]|[sS]) \
63-
echo -e "\033[1;33m Skipping requirement installation. Starting the local server instead...\033[0m"; \
64-
$(MAKE) run; \
65-
break; \
66-
;; \
67-
*) \
68-
echo -e "\033[1;31m Invalid input: Please enter 'y', 'n', or 's' to proceed.\033[0m"; \
69-
;; \
70-
esac; \
52+
printf "\e[1;33m [Y] \e[0m Yes - Install and then run application locally\n"; \
53+
printf "\e[1;33m [N] \e[0m No - Skip and run application locally \n"; \
54+
read -p " > " choice; \
55+
case "$$choice" in \
56+
[yY]) \
57+
echo -e "\033[1;32m ✔ Installing Python dependencies...\033[0m"; \
58+
$(MAKE) install-requirements; \
59+
$(MAKE) run; \
60+
break; \
61+
;; \
62+
[nN]|[sS]) \
63+
echo -e "\033[1;32m ✔ Skipping requirement installation. Starting the local server instead...\033[0m"; \
64+
$(MAKE) run; \
65+
break; \
66+
;; \
67+
*) \
68+
echo -e "\033[1;31m Invalid input: Please enter 'y', 'n', or 's' to proceed.\033[0m"; \
69+
;; \
70+
esac; \
7171
done
7272

7373
.PHONY: install-linux
7474
install-linux:
75-
@echo -e "\033[1;34m Installing Linux dependencies...\033[0m"; \
75+
@echo -e "\033[1;32m ✔ Installing Linux dependencies...\033[0m"; \
7676
sudo apt update && sudo apt install -y libmagic1 poppler-utils pkg-config
7777

7878
.PHONY: install-macos
7979
install-macos:
80-
@echo -e "\033[1;34m Installing macOS dependencies...\033[0m"; \
81-
brew update && brew install libmagic poppler pkg-config ghostscript ffmpeg automake autoconf
80+
@commandToRun="brew update && brew install libmagic poppler pkg-config ghostscript ffmpeg automake autoconf"; \
81+
printf "\e[1;34m\n The installer will execute the following command:\n > $$commandToRun\033[0m\n"; \
82+
printf " Press \e[1;33m[ANY KEY]\e[0m to proceed with the installation, or \e[1;33m[N]\e[0m to skip (ensure these dependencies are installed manually):\n"; \
83+
read -p " > " choice; \
84+
if [ "$$choice" != "n" ] && [ "$$choice" != "N" ]; then \
85+
sh -c "$$commandToRun"; \
86+
echo -e "\033[1;32m ✔ macOS dependencies installed successfully.\033[0m"; \
87+
else \
88+
echo -e "\033[2m ➖ macOS dependency installation skipped.\033[0m"; \
89+
fi
8290

8391
.PHONY: install-requirements
8492
install-requirements:
@@ -89,7 +97,7 @@ install-requirements:
8997
.PHONY: run
9098
run:
9199
@$(call load_env,.env.localhost)
92-
@echo "Starting the local application server..."; \
100+
@printf "\033[1;32m ✔ Starting the local application server...\033[0m"; \
93101
DISABLE_VENV=$(DISABLE_VENV) DISABLE_LOCAL_OLLAMA=$(DISABLE_LOCAL_OLLAMA) ./run.sh
94102

95103
.PHONY: setup-docker
@@ -111,17 +119,17 @@ setup-docker:
111119

112120
.PHONY: run-docker
113121
run-docker:
114-
@echo -e "\033[1;34m Starting Docker container with CPU support...\033[0m";
122+
@echo -e "\033[1;32m ✔ Starting Docker container with CPU support...\033[0m";
115123
@docker-compose -f docker-compose.yml up --build
116124

117125
.PHONY: run-docker-gpu
118126
run-docker-gpu:
119-
@echo -e "\033[1;34m Starting Docker container with GPU support...\033[0m";
127+
@echo -e "\033[1;32m ✔ Starting Docker container with GPU support...\033[0m";
120128
@docker-compose -f docker-compose.gpu.yml -p text-extract-api-gpu up --build
121129

122130
.PHONY: clean
123131
clean:
124-
@echo "Cleaning project..."; \
132+
@echo "\033[1;32m ✔ Cleaning project...\033[0m"; \
125133
docker-compose down -v; \
126134
$(MAKE) clear-cache
127135

config/strategies.yaml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,10 @@ strategies:
99
prompt: You are OCR. Convert image to markdown. Return only the markdown with no explanation text. Do not exclude any content from the page.
1010
easyocr:
1111
class: text_extract_api.extract.strategies.easyocr.EasyOCRStrategy
12+
docling:
13+
class: text_extract_api.extract.strategies.docling.DoclingStrategy
14+
model: llama3.1
15+
prompt: You are OCR. Convert image to markdown. Return only the markdown with no explanation text. Do not exclude any content from the page.
1216
remote:
1317
class: text_extract_api.extract.strategies.remote.RemoteStrategy
1418
url:

examples/example-word-lorem.docx

49 KB
Binary file not shown.

pyproject.toml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ dependencies = [
2828
"google-auth-httplib2",
2929
"google-auth-oauthlib",
3030
"transformers",
31+
"accelerate",
3132
"boto3",
3233
"Pillow",
3334
"python-magic==0.4.27",
@@ -40,6 +41,8 @@ dependencies = [
4041
"numpy",
4142
"pydantic",
4243
"python-dotenv",
44+
"docling",
45+
"docling-parse"
4346
]
4447
[project.optional-dependencies]
4548
dev = [
Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
import tempfile
2+
3+
from docling.document_converter import DocumentConverter
4+
from docling_core.types.doc.document import ( # Assuming a compatible Docling library or module
5+
DoclingDocument,
6+
)
7+
8+
from text_extract_api.extract.extract_result import ExtractResult
9+
from text_extract_api.extract.strategies.strategy import Strategy
10+
from text_extract_api.files.file_formats import FileFormat, PdfFileFormat
11+
12+
class DoclingStrategy(Strategy):
    """
    Extraction strategy that converts documents with Docling.

    The input bytes are written to a temporary file, converted with Docling's
    ``DocumentConverter`` and rendered to markdown on demand through
    ``text_gatherer``.
    """

    def name(self) -> str:
        """Return the identifier of this strategy (``"docling"``)."""
        return "docling"

    def extract_text(
        self, file_format: FileFormat, language: str = "en"
    ) -> ExtractResult:
        """
        Extracts text from a file using Docling and returns an ExtractResult.

        :param file_format: FileFormat instance whose ``binary`` content is converted.
        :param language: Language of the text (default is 'en'). Not used by this method.
        :return: ExtractResult holding the DoclingDocument as its value and
            ``text_gatherer`` as the callable that renders it to markdown.
        :raises RuntimeError: If Docling fails to convert the document.
        """
        import os  # local import: only needed here for temp-file cleanup

        # Docling is given a path on disk, so persist the bytes first.
        temp_file_path = self._save_to_temp_file(file_format)
        try:
            docling_document = self._convert_to_docling(temp_file_path)
        finally:
            # The temp file is created with delete=False, so it must be removed
            # explicitly; otherwise every extraction leaks one file on disk.
            try:
                os.unlink(temp_file_path)
            except OSError:
                # Best-effort cleanup: never mask a conversion error (or a
                # successful result) with a failure to delete the temp file.
                pass

        return ExtractResult(value=docling_document, text_gatherer=self.text_gatherer)

    def text_gatherer(self, docling_document: DoclingDocument) -> str:
        """
        Gathers text content from a DoclingDocument in markdown format.

        :param docling_document: Instance of DoclingDocument.
        :return: Text content in markdown format.
        """
        return docling_document.export_to_markdown()

    def _convert_to_docling(self, file_path: str) -> DoclingDocument:
        """
        Converts a file into a DoclingDocument instance.

        :param file_path: Path to the file to be converted.
        :return: DoclingDocument instance.
        :raises RuntimeError: If the Docling conversion raises any exception;
            the original exception is attached as ``__cause__``.
        """
        try:
            converter = DocumentConverter()
            return converter.convert(file_path).document
        except Exception as e:
            # Chain the cause so the underlying Docling traceback is preserved.
            raise RuntimeError(f"Failed to convert document using Docling: {e}") from e

    def _save_to_temp_file(self, file_format: FileFormat) -> str:
        """
        Saves the content of a FileFormat instance to a temporary file.

        The file is created with ``delete=False``; the caller is responsible
        for removing it once it is no longer needed.

        :param file_format: Instance of FileFormat.
        :return: Path to the temporary file containing the file content.
        """
        # NOTE(review): the suffix is always ".pdf", even for non-PDF input.
        # Confirm that Docling's format detection does not rely on the file
        # extension for the formats routed to this strategy.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
            temp_file.write(file_format.binary)
            return temp_file.name

text_extract_api/extract/strategies/ollama.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,8 @@
22
import tempfile
33
import time
44

5-
import ollama
5+
import httpx
6+
from ollama import Client
67

78
from extract.extract_result import ExtractResult
89
from text_extract_api.extract.strategies.strategy import Strategy
@@ -26,7 +27,6 @@ def extract_text(self, file_format: FileFormat, language: str = 'en') -> Extract
2627
raise TypeError(
2728
f"Ollama OCR - format {file_format.mime_type} is not supported (yet?)"
2829
)
29-
3030
images = FileFormat.convert_to(file_format, ImageFileFormat)
3131
extracted_text = ""
3232
start_time = time.time()
@@ -38,9 +38,10 @@ def extract_text(self, file_format: FileFormat, language: str = 'en') -> Extract
3838
temp_file.write(image.binary)
3939
temp_filename = temp_file.name
4040

41-
print(self._strategy_config)
4241
# Generate text using the specified model
4342
try:
43+
timeout = httpx.Timeout(connect=180.0, read=180.0, write=180.0, pool=180.0) # @todo move those values to .env
44+
ollama = Client(timeout=timeout)
4445
response = ollama.chat(self._strategy_config.get('model'), [{
4546
'role': 'user',
4647
'content': self._strategy_config.get('prompt'),
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
2+
### WARNING
3+
### This file is generated dynamically before git commit.
4+
### Run ./scripts/dev/gen-file-format-init.sh from repository root.
5+
6+
from .file_format import FileFormat
7+
from .docling import DoclingFileFormat
8+
from .pdf import PdfFileFormat
9+
from .image import ImageFileFormat
Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
from typing import Type, Dict, Callable, Iterator
2+
from text_extract_api.files.file_formats.file_format import FileFormat
3+
4+
5+
class DoclingFileFormat(FileFormat):
    """
    File format for documents that are handed to Docling as-is.

    It declares no converters of its own (``convertible_to`` is empty) and
    only checks that the content is non-empty.
    """

    DEFAULT_FILENAME: str = "document.docling"
    DEFAULT_MIME_TYPE: str = "application/vnd.docling"

    @staticmethod
    def accepted_mime_types() -> list[str]:
        """
        Return the MIME types claimed by this file format.

        Do not put every format Docling can handle here - only those that are
        not supported by dedicated file formats.
        """
        native = ["application/vnd.docling"]  # Docling documents
        textual = ["text/plain", "text/markdown", "text/html"]
        word_processing = [
            "application/msword",
            "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
            "application/vnd.oasis.opendocument.text",
        ]
        spreadsheets = [
            "application/vnd.ms-excel",
            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        ]
        presentations = [
            "application/vnd.ms-powerpoint",
            "application/vnd.openxmlformats-officedocument.presentationml.presentation",
        ]
        structured_data = ["text/csv", "application/json", "application/xml"]
        # Concatenation order matches the original flat list.
        return [
            *native,
            *textual,
            *word_processing,
            *spreadsheets,
            *presentations,
            *structured_data,
        ]

    @staticmethod
    def is_pageable() -> bool:
        """Report this format as pageable (always ``True``)."""
        return True

    @classmethod
    def default_iterator_file_format(cls) -> Type[FileFormat]:
        """Return this class itself as the default iterator file format."""
        return cls

    @staticmethod
    def convertible_to() -> Dict[Type["FileFormat"], Callable[[], Iterator["FileFormat"]]]:
        """Return an empty converter map; the strategy handles conversion."""
        return {}

    @staticmethod
    def validate(binary_file_content: bytes) -> None:
        """
        Reject empty content.

        :param binary_file_content: Raw bytes of the file.
        :raises ValueError: If the content is missing or has zero length.
        """
        if binary_file_content and len(binary_file_content) > 0:
            return
        raise ValueError("Empty file content")

text_extract_api/files/file_formats/file_format.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,8 @@ def from_binary(
6565
filename: Optional[str] = None,
6666
mime_type: Optional[str] = None
6767
) -> Type["FileFormat"]:
68+
if mime_type == "application/octet-stream":
69+
mime_type = None
6870
mime_type = mime_type or FileFormat._guess_mime_type(binary_data=binary, filename=filename)
6971
from text_extract_api.files.file_formats.pdf import PdfFileFormat # type: ignore
7072
file_format_class = cls._get_file_format_class(mime_type)
@@ -196,6 +198,7 @@ def unify(self) -> "FileFormat":
196198
def _get_file_format_class(mime_type: str) -> Type["FileFormat"]:
197199
import text_extract_api.files.file_formats.pdf # noqa - its not unused import @todo autodiscover
198200
import text_extract_api.files.file_formats.image # noqa - its not unused import @todo autodiscover
201+
import text_extract_api.files.file_formats.docling # noqa - its not unused import @todo autodiscover
199202
for subclass in FileFormat.__subclasses__():
200203
if mime_type in subclass.accepted_mime_types():
201204
return subclass

text_extract_api/main.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,6 @@
1919
# Define base path as text_extract_api - required for keeping absolute namespaces
2020
sys.path.insert(0, str(pathlib.Path(__file__).parent.resolve()))
2121

22-
2322
def storage_profile_exists(profile_name: str) -> bool:
2423
profile_path = os.path.abspath(
2524
os.path.join(os.getenv('STORAGE_PROFILE_PATH', './storage_profiles'), f'{profile_name}.yaml'))
@@ -29,13 +28,11 @@ def storage_profile_exists(profile_name: str) -> bool:
2928
return os.path.isfile(sub_profile_path)
3029
return True
3130

32-
3331
app = FastAPI()
3432
# Connect to Redis
3533
redis_url = os.getenv('REDIS_CACHE_URL', 'redis://redis:6379/1')
3634
redis_client = redis.StrictRedis.from_url(redis_url)
3735

38-
3936
@app.post("/ocr")
4037
async def ocr_endpoint(
4138
strategy: str = Form(...),

0 commit comments

Comments
 (0)