Skip to content

Commit 4e7c70a

Browse files
Limits for docmuncher (#535)
* add content limits for ingesting documents
* update import modal to show errors
* debugging
* add page limits to workspace
* modify error messages in terms of pages
* fix linting
* remove page quota saving to db
1 parent 5f06f3b commit 4e7c70a

File tree

10 files changed

+110
-66
lines changed

10 files changed

+110
-66
lines changed

.secrets.baseline

Lines changed: 1 addition & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -348,57 +348,6 @@
348348
"line_number": 15
349349
}
350350
],
351-
"core_backend/tests/api/conftest.py": [
352-
{
353-
"type": "Secret Keyword",
354-
"filename": "core_backend/tests/api/conftest.py",
355-
"hashed_secret": "407c6798fe20fd5d75de4a233c156cc0fce510e3",
356-
"is_verified": false,
357-
"line_number": 46
358-
},
359-
{
360-
"type": "Secret Keyword",
361-
"filename": "core_backend/tests/api/conftest.py",
362-
"hashed_secret": "42553e798bc193bcf25368b5e53ec7cd771483a7",
363-
"is_verified": false,
364-
"line_number": 47
365-
},
366-
{
367-
"type": "Secret Keyword",
368-
"filename": "core_backend/tests/api/conftest.py",
369-
"hashed_secret": "9fb7fe1217aed442b04c0f5e43b5d5a7d3287097",
370-
"is_verified": false,
371-
"line_number": 50
372-
},
373-
{
374-
"type": "Secret Keyword",
375-
"filename": "core_backend/tests/api/conftest.py",
376-
"hashed_secret": "767ef7376d44bb6e52b390ddcd12c1cb1b3902a4",
377-
"is_verified": false,
378-
"line_number": 51
379-
},
380-
{
381-
"type": "Secret Keyword",
382-
"filename": "core_backend/tests/api/conftest.py",
383-
"hashed_secret": "70240b5d0947cc97447de496284791c12b2e678a",
384-
"is_verified": false,
385-
"line_number": 56
386-
},
387-
{
388-
"type": "Secret Keyword",
389-
"filename": "core_backend/tests/api/conftest.py",
390-
"hashed_secret": "80fea3e25cb7e28550d13af9dfda7a9bd08c1a78",
391-
"is_verified": false,
392-
"line_number": 57
393-
},
394-
{
395-
"type": "Secret Keyword",
396-
"filename": "core_backend/tests/api/conftest.py",
397-
"hashed_secret": "3465834d516797458465ae4ed2c62e7020032c4e",
398-
"is_verified": false,
399-
"line_number": 317
400-
}
401-
],
402351
"core_backend/tests/api/test.env": [
403352
{
404353
"type": "Secret Keyword",
@@ -581,5 +530,5 @@
581530
}
582531
]
583532
},
584-
"generated_at": "2025-01-24T13:35:08Z"
533+
"generated_at": "2025-04-21T11:10:33Z"
585534
}

admin_app/src/app/content/api.ts

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -313,7 +313,11 @@ const usePostDocumentToIndex = (token: string) => {
313313
},
314314
});
315315
return { status: response.status, detail: response.data };
316-
} catch (error) {
316+
} catch (error: any) {
317+
if (error.response) {
318+
const errorResponse = error.response.data;
319+
throw new Error(errorResponse.detail || "Error indexing document");
320+
}
317321
throw new Error("Error indexing document");
318322
}
319323
},

admin_app/src/app/content/components/ImportFromPDFModal.tsx

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -90,8 +90,8 @@ export const ImportFromPDFModal: React.FC<ImportFromPDFModalProps> = ({
9090
setImportSuccess(true);
9191
setFiles([]);
9292
},
93-
onError: (error) => {
94-
setImportErrorMessages(["An unknown error occurred"]);
93+
onError: (error: Error) => {
94+
setImportErrorMessages(["An error occurred: " + error.message]);
9595
},
9696
onSettled: () => {
9797
setLoading(false);

core_backend/app/config.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,7 @@
8585
DEFAULT_CONTENT_QUOTA = int(os.environ.get("DEFAULT_CONTENT_QUOTA", 50))
8686
DEFAULT_API_QUOTA = int(os.environ.get("DEFAULT_API_QUOTA", 100))
8787
CHECK_API_LIMIT = os.environ.get("CHECK_API_LIMIT", True)
88+
PAGES_TO_CARDS_CONVERSION = int(os.environ.get("PAGES_TO_CARDS_CONVERSION", 2))
8889

8990
# Alignment Score variables
9091
ALIGN_SCORE_THRESHOLD = os.environ.get("ALIGN_SCORE_THRESHOLD", 0.7)

core_backend/app/docmuncher/dependencies.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,20 @@
11
import json
22
import os
33
from datetime import datetime, timezone
4+
from io import BytesIO
45
from typing import Dict
56

67
from fastapi import HTTPException, Request, status
78
from langchain.text_splitter import MarkdownHeaderTextSplitter
89
from langchain_core.documents import Document
910
from mistralai import DocumentURLChunk, Mistral
11+
from PyPDF2 import PdfReader
1012
from sqlalchemy.ext.asyncio import AsyncSession
1113

1214
from ..config import (
1315
LITELLM_MODEL_DOCMUNCHER_PARAPHRASE_TABLE,
1416
LITELLM_MODEL_DOCMUNCHER_TITLE,
17+
PAGES_TO_CARDS_CONVERSION,
1518
REDIS_DOC_INGEST_EXPIRY_TIME,
1619
)
1720
from ..contents.models import save_content_to_db
@@ -455,6 +458,19 @@ async def process_pdf_file(
455458

456459
finally:
457460
await redis.set(task_id, job_status_pydantic.model_dump_json())
461+
462+
temp_docmuncher_contents = await redis.get(
463+
f"{workspace_id}_docmuncher_contents"
464+
)
465+
num_pages = len(PdfReader(BytesIO(content)).pages)
466+
467+
# Update expected contents since task has finished
468+
await redis.set(
469+
f"{workspace_id}_docmuncher_contents",
470+
max(
471+
0, int(temp_docmuncher_contents) - num_pages * PAGES_TO_CARDS_CONVERSION
472+
),
473+
)
458474
await asession.close()
459475

460476
return job_status_pydantic

core_backend/app/docmuncher/routers.py

Lines changed: 76 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import json
2+
import re
23
import zipfile
34
from datetime import datetime, timezone
45
from io import BytesIO
@@ -16,17 +17,24 @@
1617
UploadFile,
1718
status,
1819
)
20+
from PyPDF2 import PdfReader
1921
from sqlalchemy.ext.asyncio import AsyncSession
2022

2123
from ..auth.dependencies import get_current_user, get_current_workspace_name
22-
from ..config import REDIS_DOC_INGEST_EXPIRY_TIME as REDIS_EXPIRATION_SECONDS
24+
from ..config import (
25+
CHECK_CONTENT_LIMIT,
26+
PAGES_TO_CARDS_CONVERSION,
27+
REDIS_DOC_INGEST_EXPIRY_TIME,
28+
)
29+
from ..contents.routers import (
30+
ExceedsContentQuotaError,
31+
_check_content_quota_availability,
32+
)
2333
from ..database import get_async_session
2434
from ..users.models import UserDB, user_has_required_role_in_workspace
2535
from ..users.schemas import UserRoles
2636
from ..utils import setup_logger
27-
from ..workspaces.utils import (
28-
get_workspace_by_workspace_name,
29-
)
37+
from ..workspaces.utils import get_workspace_by_workspace_name
3038
from .dependencies import JOB_KEY_PREFIX, process_pdf_file
3139
from .schemas import (
3240
DocIngestionStatusZip,
@@ -59,8 +67,9 @@ async def upload_document(
5967
The process is as follows:
6068
6169
1. Parameters for the endpoint are checked first.
62-
2. Create a copy of the file and asession
63-
3. Start a document ingestion job and return a job ID.
70+
2. Check if content / page limits are reached
71+
3. Create a copy of the file and asession
72+
4. Start a document ingestion job and return a job ID.
6473
6574
Parameters
6675
----------
@@ -86,6 +95,7 @@ async def upload_document(
8695
If the file is not a .pdf or .zip file.
8796
"""
8897
logger.info("Document upload request received.")
98+
redis = request.app.state.redis
8999

90100
# 1.
91101
workspace_db = await get_workspace_by_workspace_name(
@@ -110,6 +120,7 @@ async def upload_document(
110120
)
111121

112122
pdf_files = []
123+
num_pages = 0
113124
parent_file_name = None
114125
if file.filename.endswith(".zip"):
115126
parent_file_name = file.filename
@@ -123,10 +134,14 @@ async def upload_document(
123134
status_code=status.HTTP_400_BAD_REQUEST,
124135
detail="The zip file does not contain any PDF files.",
125136
)
137+
num_pages = sum(
138+
len(PdfReader(BytesIO(content)).pages) for _, content in pdf_files
139+
)
126140
await file.close()
127141
elif file.filename.endswith(".pdf"):
128142
file_content = await file.read()
129143
pdf_files = [(file.filename, file_content)]
144+
num_pages = len(PdfReader(BytesIO(file_content)).pages)
130145
await file.close()
131146

132147
else:
@@ -135,17 +150,63 @@ async def upload_document(
135150
detail="Only PDF files are supported for document ingestion.",
136151
)
137152

138-
# 2.
153+
# 2.
154+
# Get temporary log of expected contents to be created by running jobs
155+
temp_docmuncher_contents = await redis.get(
156+
f"{workspace_db.workspace_id}_docmuncher_contents"
157+
)
158+
if not temp_docmuncher_contents:
159+
temp_docmuncher_contents = 0
160+
else:
161+
temp_docmuncher_contents = int(temp_docmuncher_contents.decode("utf-8"))
162+
num_expected_contents = (
163+
num_pages * PAGES_TO_CARDS_CONVERSION + temp_docmuncher_contents
164+
)
165+
166+
if CHECK_CONTENT_LIMIT:
167+
if workspace_db.content_quota and (
168+
num_pages > workspace_db.content_quota / PAGES_TO_CARDS_CONVERSION
169+
):
170+
raise HTTPException(
171+
status_code=status.HTTP_403_FORBIDDEN,
172+
detail=f"Document ingestion exceeds page quota:\n\
173+
There are {num_pages} pages in your upload, but only\
174+
{workspace_db.content_quota / PAGES_TO_CARDS_CONVERSION}\
175+
pages are allowed.",
176+
)
177+
try:
178+
await _check_content_quota_availability(
179+
asession=asession,
180+
n_contents_to_add=num_expected_contents,
181+
workspace_id=workspace_db.workspace_id,
182+
)
183+
except ExceedsContentQuotaError as e:
184+
match = re.search(r"existing (\d+) in the database", str(e))
185+
existing_contents = 0
186+
if match:
187+
existing_contents = int(match.group(1))
188+
pages_left = max(
189+
0,
190+
workspace_db.content_quota
191+
- temp_docmuncher_contents
192+
- existing_contents,
193+
)
194+
pages_left = np.floor(pages_left / PAGES_TO_CARDS_CONVERSION).astype(int)
195+
raise HTTPException(
196+
status_code=status.HTTP_403_FORBIDDEN,
197+
detail=f"Document ingestion could exceed content quota:\n\
198+
There are {num_pages} pages in your upload, but only\
199+
{pages_left} more pages are allowed.",
200+
) from e
201+
139202
upload_id = str(uuid4())
140203
tasks: list[DocUploadResponsePdf] = []
141204
zip_created_datetime_utc = datetime.now(timezone.utc)
142205

143206
for filename, content in pdf_files:
144207
bg_asession = AsyncSession(asession.bind)
145-
146208
# 3.
147209
# Log task in redis
148-
redis = request.app.state.redis
149210
task_id = f"{JOB_KEY_PREFIX}{str(uuid4())}"
150211
task_status = DocUploadResponsePdf(
151212
upload_id=upload_id,
@@ -160,7 +221,11 @@ async def upload_document(
160221
tasks.append(task_status)
161222

162223
await redis.set(
163-
task_id, task_status.model_dump_json(), ex=REDIS_EXPIRATION_SECONDS
224+
task_id, task_status.model_dump_json(), ex=REDIS_DOC_INGEST_EXPIRY_TIME
225+
)
226+
# Update expected contents from running jobs
227+
await redis.set(
228+
f"{workspace_db.workspace_id}_docmuncher_contents", num_expected_contents
164229
)
165230

166231
background_tasks.add_task(
@@ -173,6 +238,7 @@ async def upload_document(
173238
asession=bg_asession,
174239
)
175240

241+
# 4.
176242
if len(pdf_files) == 1:
177243
return tasks[0]
178244
else:

core_backend/app/workspaces/routers.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -701,7 +701,11 @@ async def check_update_workspace_call(
701701
updating_content_quota = content_quota is None or content_quota >= 0
702702

703703
if not any(
704-
[updating_api_daily_quota, updating_content_quota, workspace_name is not None]
704+
[
705+
updating_api_daily_quota,
706+
updating_content_quota,
707+
workspace_name is not None,
708+
]
705709
):
706710
raise HTTPException(
707711
status_code=status.HTTP_400_BAD_REQUEST,

core_backend/requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ psycopg2==2.9.9
77
asyncpg==0.28.0
88
litellm==1.61.15
99
sqlalchemy[asyncio]==2.0.20
10+
PyPDF2==3.0.1
1011
python-multipart==0.0.18
1112
pyjwt[crypto]==2.8.0
1213
prometheus_client==0.19.0

core_backend/tests/api/conftest.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,7 @@
8383
TEST_WORKSPACE_CONTENT_QUOTA_4 = 50
8484
TEST_WORKSPACE_CONTENT_QUOTA_DATA_API_1 = 50
8585
TEST_WORKSPACE_CONTENT_QUOTA_DATA_API_2 = 50
86+
8687
TEST_WORKSPACE_NAME_1 = "test_workspace_1"
8788
TEST_WORKSPACE_NAME_2 = "test_workspace_2"
8889
TEST_WORKSPACE_NAME_3 = "test_workspace_3"

deployment/docker-compose/template.core_backend.env

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,8 @@ PROMETHEUS_MULTIPROC_DIR="/tmp"
4646
#### Application-wide content limits ##########################################
4747
# CHECK_CONTENT_LIMIT=True
4848
# DEFAULT_CONTENT_QUOTA=50
49+
# PAGES_TO_CARDS_CONVERSION=2 # for DocMuncher, estimate of cards per page
50+
4951

5052
#### Number of top content to return for /search. #############################
5153
# N_TOP_CONTENT=5

0 commit comments

Comments
 (0)