Limits for docmuncher (#535)

poornimaramesh · web-flow · commit 4e7c70aba292 · 2025-04-21T18:13:53.000Z
* add content limits for ingesting documents

* update import modal to show errors

* debugging

* add page limits to workspace

* modify error messages in terms of pages

* fix linting

* remove page quota saving to db
diff --git a/.secrets.baseline b/.secrets.baseline
@@ -348,57 +348,6 @@
         "line_number": 15
       }
     ],
-    "core_backend/tests/api/conftest.py": [
-      {
-        "type": "Secret Keyword",
-        "filename": "core_backend/tests/api/conftest.py",
-        "hashed_secret": "407c6798fe20fd5d75de4a233c156cc0fce510e3",
-        "is_verified": false,
-        "line_number": 46
-      },
-      {
-        "type": "Secret Keyword",
-        "filename": "core_backend/tests/api/conftest.py",
-        "hashed_secret": "42553e798bc193bcf25368b5e53ec7cd771483a7",
-        "is_verified": false,
-        "line_number": 47
-      },
-      {
-        "type": "Secret Keyword",
-        "filename": "core_backend/tests/api/conftest.py",
-        "hashed_secret": "9fb7fe1217aed442b04c0f5e43b5d5a7d3287097",
-        "is_verified": false,
-        "line_number": 50
-      },
-      {
-        "type": "Secret Keyword",
-        "filename": "core_backend/tests/api/conftest.py",
-        "hashed_secret": "767ef7376d44bb6e52b390ddcd12c1cb1b3902a4",
-        "is_verified": false,
-        "line_number": 51
-      },
-      {
-        "type": "Secret Keyword",
-        "filename": "core_backend/tests/api/conftest.py",
-        "hashed_secret": "70240b5d0947cc97447de496284791c12b2e678a",
-        "is_verified": false,
-        "line_number": 56
-      },
-      {
-        "type": "Secret Keyword",
-        "filename": "core_backend/tests/api/conftest.py",
-        "hashed_secret": "80fea3e25cb7e28550d13af9dfda7a9bd08c1a78",
-        "is_verified": false,
-        "line_number": 57
-      },
-      {
-        "type": "Secret Keyword",
-        "filename": "core_backend/tests/api/conftest.py",
-        "hashed_secret": "3465834d516797458465ae4ed2c62e7020032c4e",
-        "is_verified": false,
-        "line_number": 317
-      }
-    ],
     "core_backend/tests/api/test.env": [
       {
         "type": "Secret Keyword",
@@ -581,5 +530,5 @@
       }
     ]
   },
-  "generated_at": "2025-01-24T13:35:08Z"
+  "generated_at": "2025-04-21T11:10:33Z"
 }
diff --git a/admin_app/src/app/content/api.ts b/admin_app/src/app/content/api.ts
@@ -313,7 +313,11 @@ const usePostDocumentToIndex = (token: string) => {
           },
         });
         return { status: response.status, detail: response.data };
-      } catch (error) {
+      } catch (error: any) {
+        if (error.response) {
+          const errorResponse = error.response.data;
+          throw new Error(errorResponse.detail || "Error indexing document");
+        }
         throw new Error("Error indexing document");
       }
     },
diff --git a/admin_app/src/app/content/components/ImportFromPDFModal.tsx b/admin_app/src/app/content/components/ImportFromPDFModal.tsx
@@ -90,8 +90,8 @@ export const ImportFromPDFModal: React.FC<ImportFromPDFModalProps> = ({
             setImportSuccess(true);
             setFiles([]);
           },
-          onError: (error) => {
-            setImportErrorMessages(["An unknown error occurred"]);
+          onError: (error: Error) => {
+            setImportErrorMessages(["An error occurred: " + error.message]);
           },
           onSettled: () => {
             setLoading(false);
diff --git a/core_backend/app/config.py b/core_backend/app/config.py
@@ -85,6 +85,7 @@
 DEFAULT_CONTENT_QUOTA = int(os.environ.get("DEFAULT_CONTENT_QUOTA", 50))
 DEFAULT_API_QUOTA = int(os.environ.get("DEFAULT_API_QUOTA", 100))
 CHECK_API_LIMIT = os.environ.get("CHECK_API_LIMIT", True)
+PAGES_TO_CARDS_CONVERSION = int(os.environ.get("PAGES_TO_CARDS_CONVERSION", 2))
 
 # Alignment Score variables
 ALIGN_SCORE_THRESHOLD = os.environ.get("ALIGN_SCORE_THRESHOLD", 0.7)
diff --git a/core_backend/app/docmuncher/dependencies.py b/core_backend/app/docmuncher/dependencies.py
@@ -1,17 +1,20 @@
 import json
 import os
 from datetime import datetime, timezone
+from io import BytesIO
 from typing import Dict
 
 from fastapi import HTTPException, Request, status
 from langchain.text_splitter import MarkdownHeaderTextSplitter
 from langchain_core.documents import Document
 from mistralai import DocumentURLChunk, Mistral
+from PyPDF2 import PdfReader
 from sqlalchemy.ext.asyncio import AsyncSession
 
 from ..config import (
     LITELLM_MODEL_DOCMUNCHER_PARAPHRASE_TABLE,
     LITELLM_MODEL_DOCMUNCHER_TITLE,
+    PAGES_TO_CARDS_CONVERSION,
     REDIS_DOC_INGEST_EXPIRY_TIME,
 )
 from ..contents.models import save_content_to_db
@@ -455,6 +458,19 @@ async def process_pdf_file(
 
     finally:
         await redis.set(task_id, job_status_pydantic.model_dump_json())
+
+        temp_docmuncher_contents = await redis.get(
+            f"{workspace_id}_docmuncher_contents"
+        )
+        num_pages = len(PdfReader(BytesIO(content)).pages)
+
+        # Update expected contents since task has finished
+        await redis.set(
+            f"{workspace_id}_docmuncher_contents",
+            max(
+                0, int(temp_docmuncher_contents) - num_pages * PAGES_TO_CARDS_CONVERSION
+            ),
+        )
         await asession.close()
 
     return job_status_pydantic
diff --git a/core_backend/app/docmuncher/routers.py b/core_backend/app/docmuncher/routers.py
@@ -1,4 +1,5 @@
 import json
+import re
 import zipfile
 from datetime import datetime, timezone
 from io import BytesIO
@@ -16,17 +17,24 @@
     UploadFile,
     status,
 )
+from PyPDF2 import PdfReader
 from sqlalchemy.ext.asyncio import AsyncSession
 
 from ..auth.dependencies import get_current_user, get_current_workspace_name
-from ..config import REDIS_DOC_INGEST_EXPIRY_TIME as REDIS_EXPIRATION_SECONDS
+from ..config import (
+    CHECK_CONTENT_LIMIT,
+    PAGES_TO_CARDS_CONVERSION,
+    REDIS_DOC_INGEST_EXPIRY_TIME,
+)
+from ..contents.routers import (
+    ExceedsContentQuotaError,
+    _check_content_quota_availability,
+)
 from ..database import get_async_session
 from ..users.models import UserDB, user_has_required_role_in_workspace
 from ..users.schemas import UserRoles
 from ..utils import setup_logger
-from ..workspaces.utils import (
-    get_workspace_by_workspace_name,
-)
+from ..workspaces.utils import get_workspace_by_workspace_name
 from .dependencies import JOB_KEY_PREFIX, process_pdf_file
 from .schemas import (
     DocIngestionStatusZip,
@@ -59,8 +67,9 @@ async def upload_document(
     The process is as follows:
 
     1. Parameters for the endpoint are checked first.
-    2. Create a copy of the file and asession
-    3. Start a document ingestion job and return a job ID.
+    2. Check if content / page limits are reached
+    3. Create a copy of the file and asession
+    4. Start a document ingestion job and return a job ID.
 
     Parameters
     ----------
@@ -86,6 +95,7 @@ async def upload_document(
         If the file is not a .pdf or .zip file.
     """
     logger.info("Document upload request received.")
+    redis = request.app.state.redis
 
     # 1.
     workspace_db = await get_workspace_by_workspace_name(
@@ -110,6 +120,7 @@ async def upload_document(
         )
 
     pdf_files = []
+    num_pages = 0
     parent_file_name = None
     if file.filename.endswith(".zip"):
         parent_file_name = file.filename
@@ -123,10 +134,14 @@ async def upload_document(
                     status_code=status.HTTP_400_BAD_REQUEST,
                     detail="The zip file does not contain any PDF files.",
                 )
+            num_pages = sum(
+                len(PdfReader(BytesIO(content)).pages) for _, content in pdf_files
+            )
         await file.close()
     elif file.filename.endswith(".pdf"):
         file_content = await file.read()
         pdf_files = [(file.filename, file_content)]
+        num_pages = len(PdfReader(BytesIO(file_content)).pages)
         await file.close()
 
     else:
@@ -135,17 +150,63 @@ async def upload_document(
             detail="Only PDF files are supported for document ingestion.",
         )
 
-    # 2.
+    # 2.
+    # Get temporary log of expected contents to be created by running jobs
+    temp_docmuncher_contents = await redis.get(
+        f"{workspace_db.workspace_id}_docmuncher_contents"
+    )
+    if not temp_docmuncher_contents:
+        temp_docmuncher_contents = 0
+    else:
+        temp_docmuncher_contents = int(temp_docmuncher_contents.decode("utf-8"))
+    num_expected_contents = (
+        num_pages * PAGES_TO_CARDS_CONVERSION + temp_docmuncher_contents
+    )
+
+    if CHECK_CONTENT_LIMIT:
+        if workspace_db.content_quota and (
+            num_pages > workspace_db.content_quota / PAGES_TO_CARDS_CONVERSION
+        ):
+            raise HTTPException(
+                status_code=status.HTTP_403_FORBIDDEN,
+                detail=f"Document ingestion exceeds page quota:\n\
+                    There are {num_pages} pages in your upload, but only\
+                    {workspace_db.content_quota / PAGES_TO_CARDS_CONVERSION}\
+                    pages are allowed.",
+            )
+        try:
+            await _check_content_quota_availability(
+                asession=asession,
+                n_contents_to_add=num_expected_contents,
+                workspace_id=workspace_db.workspace_id,
+            )
+        except ExceedsContentQuotaError as e:
+            match = re.search(r"existing (\d+) in the database", str(e))
+            existing_contents = 0
+            if match:
+                existing_contents = int(match.group(1))
+            pages_left = max(
+                0,
+                workspace_db.content_quota
+                - temp_docmuncher_contents
+                - existing_contents,
+            )
+            pages_left = np.floor(pages_left / PAGES_TO_CARDS_CONVERSION).astype(int)
+            raise HTTPException(
+                status_code=status.HTTP_403_FORBIDDEN,
+                detail=f"Document ingestion could exceed content quota:\n\
+                    There are {num_pages} pages in your upload, but only\
+                    {pages_left} more pages are allowed.",
+            ) from e
+
     upload_id = str(uuid4())
     tasks: list[DocUploadResponsePdf] = []
     zip_created_datetime_utc = datetime.now(timezone.utc)
 
     for filename, content in pdf_files:
         bg_asession = AsyncSession(asession.bind)
-
         # 3.
         # Log task in redis
-        redis = request.app.state.redis
         task_id = f"{JOB_KEY_PREFIX}{str(uuid4())}"
         task_status = DocUploadResponsePdf(
             upload_id=upload_id,
@@ -160,7 +221,11 @@ async def upload_document(
         tasks.append(task_status)
 
         await redis.set(
-            task_id, task_status.model_dump_json(), ex=REDIS_EXPIRATION_SECONDS
+            task_id, task_status.model_dump_json(), ex=REDIS_DOC_INGEST_EXPIRY_TIME
+        )
+        # Update expected contents from running jobs
+        await redis.set(
+            f"{workspace_db.workspace_id}_docmuncher_contents", num_expected_contents
         )
 
         background_tasks.add_task(
@@ -173,6 +238,7 @@ async def upload_document(
             asession=bg_asession,
         )
 
+    # 4.
     if len(pdf_files) == 1:
         return tasks[0]
     else:
diff --git a/core_backend/app/workspaces/routers.py b/core_backend/app/workspaces/routers.py
@@ -701,7 +701,11 @@ async def check_update_workspace_call(
     updating_content_quota = content_quota is None or content_quota >= 0
 
     if not any(
-        [updating_api_daily_quota, updating_content_quota, workspace_name is not None]
+        [
+            updating_api_daily_quota,
+            updating_content_quota,
+            workspace_name is not None,
+        ]
     ):
         raise HTTPException(
             status_code=status.HTTP_400_BAD_REQUEST,
diff --git a/core_backend/requirements.txt b/core_backend/requirements.txt
@@ -7,6 +7,7 @@ psycopg2==2.9.9
 asyncpg==0.28.0
 litellm==1.61.15
 sqlalchemy[asyncio]==2.0.20
+PyPDF2==3.0.1
 python-multipart==0.0.18
 pyjwt[crypto]==2.8.0
 prometheus_client==0.19.0
diff --git a/core_backend/tests/api/conftest.py b/core_backend/tests/api/conftest.py
@@ -83,6 +83,7 @@
 TEST_WORKSPACE_CONTENT_QUOTA_4 = 50
 TEST_WORKSPACE_CONTENT_QUOTA_DATA_API_1 = 50
 TEST_WORKSPACE_CONTENT_QUOTA_DATA_API_2 = 50
+
 TEST_WORKSPACE_NAME_1 = "test_workspace_1"
 TEST_WORKSPACE_NAME_2 = "test_workspace_2"
 TEST_WORKSPACE_NAME_3 = "test_workspace_3"
diff --git a/deployment/docker-compose/template.core_backend.env b/deployment/docker-compose/template.core_backend.env
@@ -46,6 +46,8 @@ PROMETHEUS_MULTIPROC_DIR="/tmp"
 #### Application-wide content limits ##########################################
 # CHECK_CONTENT_LIMIT=True
 # DEFAULT_CONTENT_QUOTA=50
+# PAGES_TO_CARDS_CONVERSION=2  # for DocMuncher, estimate of cards per page
+
 
 #### Number of top content to return for /search. #############################
 # N_TOP_CONTENT=5