fix: Display whisper-hash in case of errors (#180)

chandrasekharan-zipstack · web-flow · commit 8c8b5bca1a68 · 2025-04-21T14:35:36.000+05:30
Signed-off-by: Chandrasekharan M <117059509+chandrasekharan-zipstack@users.noreply.github.com>
diff --git a/src/unstract/sdk/adapters/x2text/llm_whisperer_v2/src/constants.py b/src/unstract/sdk/adapters/x2text/llm_whisperer_v2/src/constants.py
@@ -39,12 +39,8 @@ class WhispererEnv:
     Can be used to alter behaviour at runtime.
 
     Attributes:
-        POLL_INTERVAL: Time in seconds to wait before polling
-            LLMWhisperer's status API. Defaults to 30s
-        MAX_POLLS: Total number of times to poll the status API.
-            Set to -1 to poll indefinitely. Defaults to -1
-        STATUS_RETRIES: Number of times to retry calling LLLMWhisperer's
-        status API on failure during polling. Defaults to 5.
+        WAIT_TIMEOUT: Timeout for the extraction in seconds. Defaults to 900s
+        LOG_LEVEL: Logging level for the client library. Defaults to INFO
     """
 
     WAIT_TIMEOUT = "ADAPTER_LLMW_WAIT_TIMEOUT"
@@ -108,6 +104,6 @@ class WhispererDefaults:
     URL_IN_POST = False
     TAG = "default"
     TEXT_ONLY = False
-    WAIT_TIMEOUT = int(os.getenv(WhispererEnv.WAIT_TIMEOUT, 300))
+    WAIT_TIMEOUT = int(os.getenv(WhispererEnv.WAIT_TIMEOUT, 900))
     WAIT_FOR_COMPLETION = True
     LOGGING_LEVEL = os.getenv(WhispererEnv.LOG_LEVEL, "INFO")
diff --git a/src/unstract/sdk/adapters/x2text/llm_whisperer_v2/src/helper.py b/src/unstract/sdk/adapters/x2text/llm_whisperer_v2/src/helper.py
@@ -60,38 +60,41 @@ def test_connection_request(
                 "Unable to connect to LLMWhisperer service, please check the URL",
                 actual_err=e,
                 status_code=503,
-            )
+            ) from e
         except Timeout as e:
             msg = "Request to LLMWhisperer has timed out"
             logger.error(f"{msg}: {e}")
-            raise ExtractorError(msg, actual_err=e, status_code=504)
+            raise ExtractorError(msg, actual_err=e, status_code=504) from e
         except HTTPError as e:
             logger.error(f"Adapter error: {e}")
             default_err = "Error while calling the LLMWhisperer service"
             msg = AdapterUtils.get_msg_from_request_exc(
                 err=e, message_key="message", default_err=default_err
             )
-            raise ExtractorError(msg, status_code=e.response.status_code, actual_err=e)
+            raise ExtractorError(
+                msg, status_code=e.response.status_code, actual_err=e
+            ) from e
 
     @staticmethod
     def make_request(
         config: dict[str, Any],
         headers: dict[str, Any] | None = None,
         params: dict[str, Any] | None = None,
-        data: Any | None = None,
+        data: BytesIO | None = None,
         type: str = "whisper",
     ) -> Response:
         """Makes a request to LLMWhisperer service.
 
         Args:
-            request_method (HTTPMethod): HTTPMethod to call. Can be GET or POST
-            request_endpoint (str): LLMWhisperer endpoint to hit
+            config (dict[str, Any]): LLMWhisperer config to use
             headers (Optional[dict[str, Any]], optional): Headers to pass.
                 Defaults to None.
             params (Optional[dict[str, Any]], optional): Query params to pass.
                 Defaults to None.
-            data (Optional[Any], optional): Data to pass in case of POST.
+            data (Optional[BytesIO], optional): Data to pass in case of POST.
                 Defaults to None.
+            type (str, optional): Type of request / endpoint in LLMWhisperer.
+                Defaults to "whisper".
 
         Returns:
             Response: Response from the request
@@ -110,11 +113,19 @@ def make_request(
             if type == "whisper":
                 response = client.whisper(**params, stream=data)
                 if response["status_code"] == 200:
+                    logger.debug(
+                        "Successfully extracted for whisper hash: "
+                        f"{response.get(X2TextConstants.WHISPER_HASH_V2, '')}"
+                    )
                     response["extraction"][X2TextConstants.WHISPER_HASH_V2] = (
                         response.get(X2TextConstants.WHISPER_HASH_V2, "")
                     )
                     return response["extraction"]
                 else:
+                    response["message"] += (
+                        ". Whisper hash: "
+                        f"{response.get(X2TextConstants.WHISPER_HASH_V2, '')}"
+                    )
                     raise ExtractorError(
                         response["message"],
                         response["status_code"],
@@ -130,18 +141,18 @@ def make_request(
                 "Unable to connect to LLMWhisperer service, please check the URL",
                 actual_err=e,
                 status_code=503,
-            )
+            ) from e
         except Timeout as e:
             msg = "Request to LLMWhisperer has timed out"
             logger.error(f"{msg}: {e}")
-            raise ExtractorError(msg, actual_err=e, status_code=504)
+            raise ExtractorError(msg, actual_err=e, status_code=504) from e
         except LLMWhispererClientException as e:
             logger.error(f"LLM Whisperer error: {e}")
             raise ExtractorError(
                 message=f"LLM Whisperer error: {e}",
                 actual_err=e,
                 status_code=500,
-            )
+            ) from e
 
         return response
 
@@ -251,7 +262,7 @@ def send_whisper_request(
                 response["line_metadata"] = highlight_data
         except OSError as e:
             logger.error(f"OS error while reading {input_file_path}: {e}")
-            raise ExtractorError(str(e))
+            raise ExtractorError(str(e)) from e
         return response
 
     @staticmethod
@@ -261,10 +272,12 @@ def make_highlight_data_request(
         """Makes a call to get highlight data from LLMWhisperer.
 
         Args:
+            config (dict[str, Any]): LLMWhisperer config to use
             whisper_hash (str): Identifier of the extraction
+            enable_highlight (bool): Whether to enable highlight
 
         Returns:
-            str: Extracted contents from the file
+            dict[Any, Any]: Highlight data
         """
         logger.info(f"Extracting async for whisper hash: {whisper_hash}")
 
@@ -307,14 +320,17 @@ def write_output_to_file(
         output_file_path: Path,
         fs: FileStorage = FileStorage(provider=FileStorageProvider.LOCAL),
     ) -> None:
-        """Writes the extracted text and metadata to the specified output file
+        """Write LLMW outputs to file.
+
+        Writes the extracted text and metadata to the specified output file
         and metadata file.
 
         Args:
             output_json (dict): The dictionary containing the extracted data,
                 with "text" as the key for the main content.
             output_file_path (Path): The file path where the extracted text
                 should be written.
+            fs (FileStorage): File storage instance to use for writing
 
         Raises:
             ExtractorError: If there is an error while writing the output file.
@@ -330,7 +346,7 @@ def write_output_to_file(
             )
         except Exception as e:
             logger.error(f"Error while writing {output_file_path}: {e}")
-            raise ExtractorError(str(e))
+            raise ExtractorError(str(e)) from e
         try:
             # Define the directory of the output file and metadata paths
             output_dir = output_file_path.parent
diff --git a/src/unstract/sdk/prompt.py b/src/unstract/sdk/prompt.py
@@ -21,10 +21,13 @@ def __init__(
         prompt_port: str,
         is_public_call: bool = False,
     ) -> None:
-        """Args:
-        tool (AbstractTool): Instance of AbstractTool
-        prompt_host (str): Host of platform service
-        prompt_host (str): Port of platform service
+        """Class to interact with prompt-service.
+
+        Args:
+            tool (AbstractTool): Instance of AbstractTool
+            prompt_host (str): Host of platform service
+            prompt_port (str): Port of platform service
+            is_public_call (bool): Whether the call is public. Defaults to False
         """
         self.tool = tool
         self.base_url = SdkHelper.get_platform_base_url(prompt_host, prompt_port)
@@ -113,8 +116,7 @@ def _post_call(
         params: dict[str, str] | None = None,
         headers: dict[str, str] | None = None,
     ) -> dict[str, Any]:
-        """Invokes and communicates to prompt service to fetch response for the
-        prompt.
+        """Communicates with the prompt service to fetch the response for the prompt.
 
         Args:
             url_path (str): URL path to the service endpoint