Skip to content

Commit 067ade2

Browse files
authored
feat: parsing documents using markitdown
2 parents 1b0c4e1 + 1387c55 commit 067ade2

File tree

9 files changed

+459
-49
lines changed

9 files changed

+459
-49
lines changed

.env.example

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
# Log Level (Optional)
2+
LOG_LEVEL=INFO
3+
4+
# General Config (Optional)
5+
CORS_ALLOW_ORIGINS=*
6+
MAX_FILE_SIZE=10.0
7+
PDF_MAX_IMAGES=10
8+
9+
# Audio Config (Optional)
10+
AZURE_SPEECH_KEY=
11+
AZURE_SPEECH_REGION=
12+
13+
# Storage Config (Optional)
14+
STORAGE_TYPE=
15+
LOCAL_STORAGE_DOMAIN=
16+
17+
# AWS S3 (Optional)
18+
S3_SIGN_VERSION=
19+
S3_ACCESS_KEY=
20+
S3_SECRET_KEY=
21+
S3_BUCKET=
22+
S3_REGION=
23+
24+
# Telegram CDN (Optional)
25+
TG_ENDPOINT=
26+
TG_PASSWORD=
27+
28+
# OCR Config (Optional)
29+
OCR_ENDPOINT=
30+
31+
# MarkItDown Config (Optional)
32+
MARKITDOWN_ENABLE=false
33+
MARKITDOWN_ENABLE_PLUGINS=false
34+
MARKITDOWN_USE_DOCINTEL=false
35+
MARKITDOWN_DOCINTEL_ENDPOINT=
36+
MARKITDOWN_DOCINTEL_KEY=
37+
MARKITDOWN_USE_LLM=false
38+
MARKITDOWN_LLM_MODEL=gpt-4o
39+
MARKITDOWN_LLM_ENDPOINT=https://api.openai.com/v1
40+
MARKITDOWN_LLM_API_KEY=

README.md

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
- 📦 **Multiple Storage Options**: Base64, Local, S3, Cloudflare R2, MinIO, Telegram CDN etc.
1717
- 🔍 **OCR Support**: Extract Text from Image (Require Paddle OCR API)
1818
- 🔊 **Audio Support**: Convert Audio to Text (Require Azure Speech to Text Service)
19+
- 📄 **MarkItDown Support**: Advanced document parsing using Microsoft's MarkItDown library
1920

2021
## Supported File Types
2122
- Text
@@ -191,6 +192,27 @@ Response
191192
- `OCR_ENDPOINT` Paddle OCR Endpoint
192193
- *e.g.: http://example.com:8000*
193194

195+
## `5` 📄 MarkItDown Support
196+
You can enable MarkItDown for better document parsing by configuring the following environment variables:
197+
198+
```
199+
MARKITDOWN_ENABLE=true # Enable MarkItDown
200+
MARKITDOWN_ENABLE_PLUGINS=false # Whether to enable plugins
201+
MARKITDOWN_USE_DOCINTEL=false # Whether to use Document Intelligence
202+
MARKITDOWN_DOCINTEL_ENDPOINT= # Document Intelligence endpoint
203+
MARKITDOWN_DOCINTEL_KEY= # Document Intelligence API key
204+
MARKITDOWN_USE_LLM=false # Whether to use LLM for image descriptions
205+
MARKITDOWN_LLM_MODEL=gpt-4o # LLM model for image descriptions
206+
MARKITDOWN_LLM_ENDPOINT= # Custom OpenAI API endpoint (optional)
207+
MARKITDOWN_LLM_API_KEY= # Custom OpenAI API key (optional)
208+
```
209+
210+
When MarkItDown is enabled, it will be used as the primary processor for supported file types, falling back to the built-in processors if any error occurs.
211+
212+
You can use custom OpenAI API endpoints by setting `MARKITDOWN_LLM_ENDPOINT` to your proxy or mirror site URL (e.g., `https://your-proxy.com/v1`). This is useful if you need to access OpenAI services through a proxy or alternative service provider.
213+
214+
For Azure Document Intelligence, you can provide both the endpoint and an API key instead of relying on `DefaultAzureCredential`. Using a dedicated key provides better security by limiting the permissions of the authentication method.
215+
194216
## Common Errors
195217
- *Cannot Use `Save All` Options Without Storage Config*:
196218
- This error occurs when you enable `save_all` option without storage config. You need to set `STORAGE_TYPE` to `local` or other storage type to use this option.

config.py

Lines changed: 159 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -1,86 +1,199 @@
11
from os import environ
2-
2+
import os
3+
from pathlib import Path
4+
from dotenv import load_dotenv
5+
from utils import logger
6+
7+
# Memoized results of the to_* helpers below; reset by reload_config().
config_cache = {}
8+
9+
def load_env_files():
    """Load variables from a ``.env`` file in the current working directory.

    When the file exists it is loaded with ``override=True``, so values in
    ``.env`` replace variables already present in the process environment.
    When it does not exist, only the existing environment is used.
    """
    env_file = Path.cwd() / '.env'

    # Guard clause: nothing to load, just record that fact and leave.
    if not env_file.exists():
        logger.info("No .env file found, using environment variables only")
        return

    logger.info(f"Loading environment variables from {env_file}")
    load_dotenv(dotenv_path=str(env_file), override=True)
19+
20+
# Runs at import time, before init_config() at the bottom of the module reads the environment.
load_env_files()
21+
22+
def reload_config():
    """Discard all cached values and recompute the module-level settings.

    Used when command-line parameters override environment variables after
    this module has already been imported.
    """
    global config_cache
    # Rebind to a fresh dict so every to_* helper re-reads the environment.
    config_cache = dict()
    init_config()
    logger.info("Configuration reloaded")
28+
29+
def get_env(key: str, default=None):
    """Look up ``key`` in the process environment.

    Values from the ``.env`` file take priority because ``load_env_files``
    loads that file with ``override=True`` before any lookup happens.

    Args:
        key: Name of the environment variable.
        default: Value returned when the variable is not set.

    Returns:
        The variable's raw string value, or ``default`` when it is unset.
    """
    return os.getenv(key, default)
335

436
def to_str(key: str, default: str = "") -> str:
    """Return the whitespace-stripped value of environment variable ``key``.

    Args:
        key: Name of the environment variable.
        default: Value used when the variable is unset or empty.

    Returns:
        The variable's value with surrounding whitespace removed, or the
        stripped ``default`` when the variable is unset or empty.

    The result is memoized in ``config_cache``. The cache entry is keyed by
    both ``key`` and ``default``: the other ``to_*`` helpers read the same
    variable through ``to_str(key, "")``, and keying on ``key`` alone would
    let that empty result shadow a later call that supplies a real default.
    """
    cache_key = ("str", key, default)
    if cache_key in config_cache:
        return config_cache[cache_key]

    value = get_env(key, default)
    # Tolerate a None default instead of raising AttributeError on .strip().
    result = value.strip() if value else (default or "").strip()

    config_cache[cache_key] = result
    return result
946

1047

1148
def to_none_str(key: str, default: str = None) -> str:
    """Return the stripped value of ``key``, or ``None`` when it is blank.

    Args:
        key: Name of the environment variable.
        default: Value used when the variable is unset.

    Returns:
        The value with surrounding whitespace removed, or ``None`` when the
        variable (and ``default``) is unset, empty, or whitespace-only.

    The result is memoized in ``config_cache`` under ``"none_str:<key>"``.
    """
    cache_key = f"none_str:{key}"
    if cache_key in config_cache:
        return config_cache[cache_key]

    value = get_env(key, default)
    # Strip before the emptiness test so a whitespace-only value becomes
    # None rather than an empty string.
    result = (value.strip() or None) if value else None

    config_cache[cache_key] = result
    return result
1658

1759

1860
def to_endpoint(key: str, default: str = "") -> str:
    """Return the value of ``key`` with any trailing ``/`` characters removed.

    The value is read through ``to_str`` (so it is also whitespace-stripped)
    and memoized in ``config_cache`` under ``"endpoint:<key>"``.

    Args:
        key: Name of the environment variable.
        default: Value used when the variable is unset or empty.

    Returns:
        The stripped value without trailing slashes.
    """
    cache_key = f"endpoint:{key}"
    if cache_key not in config_cache:
        config_cache[cache_key] = to_str(key, default).rstrip("/")
    return config_cache[cache_key]
2169

2270

2371
def to_list(key: str, default: list) -> list:
    """Return environment variable ``key`` parsed as a comma-separated list.

    Args:
        key: Name of the environment variable.
        default: List returned when the variable is unset or empty.

    Returns:
        The non-empty items of the value, each with surrounding whitespace
        removed, or ``default`` when the variable is unset or empty.

    The result is memoized in ``config_cache`` under ``"list:<key>"``.
    """
    cache_key = f"list:{key}"
    if cache_key in config_cache:
        return config_cache[cache_key]

    raw = to_str(key, "")
    if not raw:
        config_cache[cache_key] = default
        return default

    # Strip each item so "a, b" yields ["a", "b"] rather than ["a", " b"],
    # and drop items that are empty once stripped.
    result = [item.strip() for item in raw.split(",") if item.strip()]

    config_cache[cache_key] = result
    return result
3085

3186

3287
def to_bool(key: str, default: bool) -> bool:
    """Return environment variable ``key`` interpreted as a boolean.

    Args:
        key: Name of the environment variable.
        default: Value returned when the variable is unset or empty.

    Returns:
        ``True`` when the value is ``"1"`` or equals ``"true"`` ignoring
        case, ``False`` for any other non-empty value, and ``default`` when
        the variable is unset or empty.

    The result is memoized in ``config_cache`` under ``"bool:<key>"``.
    """
    cache_key = f"bool:{key}"
    if cache_key in config_cache:
        return config_cache[cache_key]

    raw = to_str(key, "")
    flag = (raw.lower() == "true" or raw == "1") if raw else default

    config_cache[cache_key] = flag
    return flag
39101

40102

41103
def to_float(key: str, default: float) -> float:
    """Return environment variable ``key`` parsed as a float.

    Args:
        key: Name of the environment variable.
        default: Value returned when the variable is unset, empty, or not a
            valid float.

    Returns:
        The parsed float, or ``default``; an unparsable value is reported
        with a warning rather than raising.

    The result is memoized in ``config_cache`` under ``"float:<key>"``.
    """
    cache_key = f"float:{key}"
    if cache_key in config_cache:
        return config_cache[cache_key]

    value = to_str(key, "")
    result = default
    if value:
        try:
            result = float(value)
        except ValueError:
            logger.warning(f"Could not convert {key}={value} to float, using default {default}")

    config_cache[cache_key] = result
    return result
48121

49122

50-
def to_int(value: str, default: int) -> int:
123+
def to_int(key: str, default: int) -> int:
    """Return environment variable ``key`` parsed as an int.

    Args:
        key: Name of the environment variable.
        default: Value returned when the variable is unset, empty, or not a
            valid integer.

    Returns:
        The parsed int, or ``default``; an unparsable value is reported with
        a warning rather than raising.

    The result is memoized in ``config_cache`` under ``"int:<key>"``.
    """
    cache_key = f"int:{key}"
    if cache_key in config_cache:
        return config_cache[cache_key]

    value = to_str(key, "")
    result = default
    if value:
        try:
            result = int(value)
        except ValueError:
            logger.warning(f"Could not convert {key}={value} to int, using default {default}")

    config_cache[cache_key] = result
    return result
55141

56-
return int(value)
57-
58-
59-
# General Config
60-
CORS_ALLOW_ORIGINS = to_list("CORS_ALLOW_ORIGINS", ["*"]) # CORS Allow Origins
61-
MAX_FILE_SIZE = to_float("MAX_FILE_SIZE", -1) # Max File Size
62-
PDF_MAX_IMAGES = to_int("PDF_MAX_IMAGES", 10) # PDF Max Images
63-
AZURE_SPEECH_KEY = to_str("AZURE_SPEECH_KEY") # Azure Speech Key
64-
AZURE_SPEECH_REGION = to_str("AZURE_SPEECH_REGION") # Azure Speech Region
65-
ENABLE_AZURE_SPEECH = AZURE_SPEECH_KEY and AZURE_SPEECH_REGION # Enable Azure Speech
66-
67-
# Storage Config
68-
STORAGE_TYPE = to_str("STORAGE_TYPE", "common") # Storage Type
69-
LOCAL_STORAGE_DOMAIN = to_str("LOCAL_STORAGE_DOMAIN", "").rstrip("/") # Local Storage Domain
70-
S3_BUCKET = to_str("S3_BUCKET", "") # S3 Bucket
71-
S3_ACCESS_KEY = to_str("S3_ACCESS_KEY", "") # S3 Access Key
72-
S3_SECRET_KEY = to_str("S3_SECRET_KEY", "") # S3 Secret Key
73-
S3_REGION = to_str("S3_REGION", "") # S3 Region
74-
S3_DOMAIN = to_endpoint("S3_DOMAIN", "") # S3 Domain (Optional)
75-
S3_DIRECT_URL_DOMAIN = to_endpoint("S3_DIRECT_URL_DOMAIN", "") # S3 Direct/Proxy URL Domain (Optional)
76-
S3_SIGN_VERSION = to_none_str("S3_SIGN_VERSION") # S3 Sign Version
77-
S3_API = S3_DOMAIN or f"https://{S3_BUCKET}.s3.{S3_REGION}.amazonaws.com" # S3 API
78-
S3_SPACE = S3_DIRECT_URL_DOMAIN or S3_API # S3 Image URL Domain
79-
TG_ENDPOINT = to_endpoint("TG_ENDPOINT", "") # Telegram Endpoint
80-
TG_PASSWORD = to_str("TG_PASSWORD", "") # Telegram Password
81-
TG_API = TG_ENDPOINT + "/api" + (f"?pass={TG_PASSWORD}" if TG_PASSWORD and len(TG_PASSWORD) > 0 else "") # Telegram API
82-
83-
# OCR Config
84-
OCR_ENDPOINT = to_endpoint("OCR_ENDPOINT", "") # OCR Endpoint
85-
OCR_SKIP_MODELS = to_list("OCR_SKIP_MODELS", []) # OCR Skip Models
86-
OCR_SPEC_MODELS = to_list("OCR_SPEC_MODELS", []) # OCR Specific Models
142+
def init_config():
    """Read every setting from the environment into module-level globals.

    Each value is obtained through the ``to_*`` helpers, so results are
    memoized in ``config_cache``; ``reload_config`` clears that cache and
    calls this function again. Derived settings (``ENABLE_AZURE_SPEECH``,
    ``S3_API``, ``S3_SPACE``, ``TG_API``) are computed from the values read
    just before them, so the assignment order matters.
    """
    global CORS_ALLOW_ORIGINS, MAX_FILE_SIZE, PDF_MAX_IMAGES
    global AZURE_SPEECH_KEY, AZURE_SPEECH_REGION, ENABLE_AZURE_SPEECH
    global STORAGE_TYPE, LOCAL_STORAGE_DOMAIN
    global S3_BUCKET, S3_ACCESS_KEY, S3_SECRET_KEY, S3_REGION
    global S3_DOMAIN, S3_DIRECT_URL_DOMAIN, S3_SIGN_VERSION
    global S3_API, S3_SPACE
    global TG_ENDPOINT, TG_PASSWORD, TG_API
    global OCR_ENDPOINT, OCR_SKIP_MODELS, OCR_SPEC_MODELS
    global LOG_LEVEL
    global MARKITDOWN_ENABLE, MARKITDOWN_ENABLE_PLUGINS, MARKITDOWN_USE_DOCINTEL
    global MARKITDOWN_DOCINTEL_ENDPOINT, MARKITDOWN_DOCINTEL_KEY, MARKITDOWN_USE_LLM, MARKITDOWN_LLM_MODEL
    global MARKITDOWN_LLM_ENDPOINT, MARKITDOWN_LLM_API_KEY

    # General Config
    CORS_ALLOW_ORIGINS = to_list("CORS_ALLOW_ORIGINS", ["*"])  # CORS Allow Origins
    MAX_FILE_SIZE = to_float("MAX_FILE_SIZE", -1)  # Max File Size
    PDF_MAX_IMAGES = to_int("PDF_MAX_IMAGES", 10)  # PDF Max Images
    AZURE_SPEECH_KEY = to_str("AZURE_SPEECH_KEY")  # Azure Speech Key
    AZURE_SPEECH_REGION = to_str("AZURE_SPEECH_REGION")  # Azure Speech Region
    # bool() keeps this a real flag; a bare `and` would store the region string.
    ENABLE_AZURE_SPEECH = bool(AZURE_SPEECH_KEY and AZURE_SPEECH_REGION)  # Enable Azure Speech

    # Storage Config
    STORAGE_TYPE = to_str("STORAGE_TYPE", "common")  # Storage Type
    LOCAL_STORAGE_DOMAIN = to_str("LOCAL_STORAGE_DOMAIN", "").rstrip("/")  # Local Storage Domain
    S3_BUCKET = to_str("S3_BUCKET", "")  # S3 Bucket
    S3_ACCESS_KEY = to_str("S3_ACCESS_KEY", "")  # S3 Access Key
    S3_SECRET_KEY = to_str("S3_SECRET_KEY", "")  # S3 Secret Key
    S3_REGION = to_str("S3_REGION", "")  # S3 Region
    S3_DOMAIN = to_endpoint("S3_DOMAIN", "")  # S3 Domain (Optional)
    S3_DIRECT_URL_DOMAIN = to_endpoint("S3_DIRECT_URL_DOMAIN", "")  # S3 Direct/Proxy URL Domain (Optional)
    S3_SIGN_VERSION = to_none_str("S3_SIGN_VERSION")  # S3 Sign Version
    S3_API = S3_DOMAIN or f"https://{S3_BUCKET}.s3.{S3_REGION}.amazonaws.com"  # S3 API
    S3_SPACE = S3_DIRECT_URL_DOMAIN or S3_API  # S3 Image URL Domain
    TG_ENDPOINT = to_endpoint("TG_ENDPOINT", "")  # Telegram Endpoint
    TG_PASSWORD = to_str("TG_PASSWORD", "")  # Telegram Password
    # The password is appended verbatim as the `pass` query parameter when set.
    TG_API = TG_ENDPOINT + "/api" + (f"?pass={TG_PASSWORD}" if TG_PASSWORD else "")  # Telegram API

    # OCR Config
    OCR_ENDPOINT = to_endpoint("OCR_ENDPOINT", "")  # OCR Endpoint
    OCR_SKIP_MODELS = to_list("OCR_SKIP_MODELS", [])  # OCR Skip Models
    OCR_SPEC_MODELS = to_list("OCR_SPEC_MODELS", [])  # OCR Specific Models

    # MarkItDown Config
    MARKITDOWN_ENABLE = to_bool("MARKITDOWN_ENABLE", False)  # Enable MarkItDown
    MARKITDOWN_ENABLE_PLUGINS = to_bool("MARKITDOWN_ENABLE_PLUGINS", False)  # Enable MarkItDown Plugins
    MARKITDOWN_USE_DOCINTEL = to_bool("MARKITDOWN_USE_DOCINTEL", False)  # Use Document Intelligence
    MARKITDOWN_DOCINTEL_ENDPOINT = to_str("MARKITDOWN_DOCINTEL_ENDPOINT", "")  # Document Intelligence Endpoint
    MARKITDOWN_DOCINTEL_KEY = to_str("MARKITDOWN_DOCINTEL_KEY", "")  # Document Intelligence API Key
    MARKITDOWN_USE_LLM = to_bool("MARKITDOWN_USE_LLM", False)  # Use LLM for image descriptions
    MARKITDOWN_LLM_MODEL = to_str("MARKITDOWN_LLM_MODEL", "gpt-4o")  # LLM Model for image descriptions
    MARKITDOWN_LLM_ENDPOINT = to_str("MARKITDOWN_LLM_ENDPOINT", "")  # LLM Endpoint
    MARKITDOWN_LLM_API_KEY = to_str("MARKITDOWN_LLM_API_KEY", "")  # LLM API Key

    LOG_LEVEL = to_str("LOG_LEVEL", "INFO").upper()  # log level
198+
199+
# Populate the module-level settings once at import time.
init_config()

0 commit comments

Comments
 (0)