Skip to content

Commit f19a569

Browse files
authored
Merge pull request #25 from Zipstack/fix/lw-158-whisper-status-error-handling
LW-158 [FIX] Improve error handling for non-JSON or empty API responses in LLMWhispererClientV2.whisper_status
2 parents 764a0b6 + 7c5780f commit f19a569

File tree

3 files changed

+27
-12
lines changed

3 files changed

+27
-12
lines changed

src/unstract/llmwhisperer/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,4 @@
1-
__version__ = "2.4.0"
1+
__version__ = "2.4.1"
22

33
from .client_v2 import LLMWhispererClientV2 # noqa: F401
44

src/unstract/llmwhisperer/client_v2.py

Lines changed: 11 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -446,9 +446,17 @@ def whisper_status(self, whisper_hash: str) -> Any:
446446
s = requests.Session()
447447
response = s.send(prepared, timeout=self.api_timeout)
448448
if response.status_code != 200:
449-
err = json.loads(response.text)
450-
err["status_code"] = response.status_code
451-
raise LLMWhispererClientException(err)
449+
if not (response.text or "").strip():
450+
self.logger.error(f"API error - empty response body, status code: {response.status_code}")
451+
raise LLMWhispererClientException("API error: empty response body", response.status_code)
452+
try:
453+
err = json.loads(response.text)
454+
except json.JSONDecodeError as e:
455+
# Truncate response text if too long to avoid log pollution
456+
response_preview = response.text[:500] + "..." if len(response.text) > 500 else response.text
457+
self.logger.error(f"API error - JSON decode failed: {e}; Response preview: {response_preview!r}")
458+
raise LLMWhispererClientException(f"API error: non-JSON response - {response_preview}", response.status_code) from e
459+
raise LLMWhispererClientException(err, response.status_code)
452460
message = json.loads(response.text)
453461
message["status_code"] = response.status_code
454462
return message

tests/integration/client_v2_test.py

Lines changed: 15 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -11,6 +11,12 @@
1111

1212
logger = logging.getLogger(__name__)
1313

14+
# Test tolerance constants for better maintainability
15+
COORDINATE_TOLERANCE = 2
16+
PERCENTAGE_TOLERANCE = 0.05
17+
PAGE_HEIGHT_TOLERANCE = 5
18+
OCR_SIMILARITY_THRESHOLD = 0.90
19+
1420

1521
def test_get_usage_info(client_v2: LLMWhispererClientV2) -> None:
1622
usage_info = client_v2.get_usage_info()
@@ -28,6 +34,7 @@ def test_get_usage_info(client_v2: LLMWhispererClientV2) -> None:
2834
"overage_page_count",
2935
"subscription_plan",
3036
"today_page_count",
37+
"current_page_count_table",
3138
]
3239
assert set(usage_info.keys()) == set(expected_keys), f"usage_info {usage_info} does not contain the expected keys"
3340

@@ -103,12 +110,12 @@ def test_highlight(client_v2: LLMWhispererClientV2, data_dir: str, input_file: s
103110

104111
# Assert line 2 data
105112
line2 = highlight_data["2"]
106-
assert line2["base_y"] == 155
107-
assert line2["base_y_percent"] == pytest.approx(4.8927) # Using approx for float comparison
108-
assert line2["height"] == 51
109-
assert line2["height_percent"] == pytest.approx(1.6098) # Using approx for float comparison
113+
assert line2["base_y"] == pytest.approx(155, abs=COORDINATE_TOLERANCE)
114+
assert line2["base_y_percent"] == pytest.approx(4.8927, abs=PERCENTAGE_TOLERANCE)
115+
assert line2["height"] == pytest.approx(51, abs=COORDINATE_TOLERANCE)
116+
assert line2["height_percent"] == pytest.approx(1.6098, abs=PERCENTAGE_TOLERANCE)
110117
assert line2["page"] == 0
111-
assert line2["page_height"] == 3168
118+
assert line2["page_height"] == pytest.approx(3168, abs=PAGE_HEIGHT_TOLERANCE)
112119

113120

114121
@pytest.mark.parametrize(
@@ -170,7 +177,7 @@ def test_whisper_v2_url_in_post(
170177
"url,token,webhook_name",
171178
[
172179
(
173-
"https://webhook.site/0990fff9-ce95-4d11-95e1-be9ad38c40d6", # need to find a clean solution
180+
os.getenv("WEBHOOK_TEST_URL", "https://httpbin.org/post"), # configurable via env var, defaults to httpbin.org
174181
"",
175182
"client_v2_test",
176183
),
@@ -237,13 +244,13 @@ def assert_extracted_text(file_path: str, whisper_result: dict, mode: str, outpu
237244
assert whisper_result["status_code"] == 200
238245

239246
# For OCR based processing
240-
threshold = 0.94
247+
threshold = OCR_SIMILARITY_THRESHOLD
241248

242249
# For text based processing
243250
if mode == "native_text" and output_mode == "text":
244251
threshold = 0.99
245252
elif mode == "low_cost":
246-
threshold = 0.90
253+
threshold = OCR_SIMILARITY_THRESHOLD
247254
extracted_text = whisper_result["extraction"]["result_text"]
248255
similarity = SequenceMatcher(None, extracted_text, exp).ratio()
249256

0 commit comments

Comments
 (0)