Skip to content

Commit aaa5d2c

Browse files
authored
fix(nodes): tolerate doubled-brace JSON output from models like DeepSeek (#1085)
The default GenerateAnswerNode format_instructions show the expected shape as {{"content": ...}} (LangChain's escaped braces). Strongly instruction-following models emit single braces, but some models (notably DeepSeek) copy the doubled braces verbatim, yielding {{"content": ...}} which JsonOutputParser rejects with 'Invalid json output'. Add TolerantJsonOutputParser, a JsonOutputParser subclass that retries once with a single layer of wrapping braces removed, only on the parse-failure path. Behaviour is unchanged for any model already returning valid JSON. Use it in the schema-less branch of GenerateAnswerNode. Adds unit tests.
1 parent b528325 commit aaa5d2c

3 files changed

Lines changed: 90 additions & 4 deletions

File tree

scrapegraphai/nodes/generate_answer_node.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,6 @@
99
from langchain_core.prompts import PromptTemplate
1010
from langchain_aws import ChatBedrock
1111
from langchain_ollama import ChatOllama
12-
from langchain_core.output_parsers import JsonOutputParser
1312
from langchain_core.runnables import RunnableParallel
1413
from langchain_openai import ChatOpenAI
1514
from requests.exceptions import Timeout
@@ -23,7 +22,10 @@
2322
TEMPLATE_NO_CHUNKS,
2423
TEMPLATE_NO_CHUNKS_MD,
2524
)
26-
from ..utils.output_parser import get_pydantic_output_parser
25+
from ..utils.output_parser import (
26+
TolerantJsonOutputParser,
27+
get_pydantic_output_parser,
28+
)
2729
from .base_node import BaseNode
2830

2931

@@ -148,7 +150,7 @@ def execute(self, state: dict) -> dict:
148150
format_instructions = ""
149151
else:
150152
if not isinstance(self.llm_model, ChatBedrock):
151-
output_parser = JsonOutputParser()
153+
output_parser = TolerantJsonOutputParser()
152154
format_instructions = (
153155
"You must respond with a JSON object. Your response should be formatted as a valid JSON "
154156
"with a 'content' field containing your analysis. For example:\n"

scrapegraphai/utils/output_parser.py

Lines changed: 41 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,13 +2,53 @@
22
Functions to retrieve the correct output parser and format instructions for the LLM model.
33
"""
44

5-
from typing import Any, Callable, Dict, Type, Union
5+
from typing import Any, Callable, Dict, List, Type, Union
66

7+
from langchain_core.exceptions import OutputParserException
8+
from langchain_core.outputs import Generation
79
from langchain_core.output_parsers import JsonOutputParser
810
from pydantic import BaseModel as BaseModelV2
911
from pydantic.v1 import BaseModel as BaseModelV1
1012

1113

14+
def _strip_doubled_braces(text: str) -> str:
    """Remove one outer brace pair from output wrapped in ``{{ ... }}``.

    The default ``format_instructions`` illustrate the expected shape with
    LangChain's escaped braces, e.g. ``{{"content": "..."}}``. Some models
    (notably DeepSeek) echo those doubled braces verbatim instead of emitting
    single braces, which is not valid JSON.

    The check is made on the whitespace-trimmed text. When the trimmed text
    both starts with ``{{`` and ends with ``}}``, the trimmed text minus its
    first and last character is returned. In every other case the original
    ``text`` is returned untouched (surrounding whitespace included).
    """
    candidate = text.strip()
    double_wrapped = candidate.startswith("{{") and candidate.endswith("}}")
    if not double_wrapped:
        # Not the doubled-brace shape: hand back the caller's string as-is.
        return text
    # Drop exactly one brace from each end, leaving a single-wrapped object.
    return candidate[1:-1]
27+
28+
29+
class TolerantJsonOutputParser(JsonOutputParser):
    """``JsonOutputParser`` variant that recovers from ``{{ ... }}`` output.

    Parsing is always attempted with the parent implementation first, so any
    output the parent accepts is returned exactly as the parent returns it.
    The fallback runs only when the parent raises ``OutputParserException``
    and ``_strip_doubled_braces`` actually changes the text; in that case the
    parent parser is invoked one more time on the unwrapped text. This keeps
    providers such as DeepSeek, which may echo the prompt's escaped braces,
    working without touching the path taken by already-valid JSON.
    """

    def parse_result(self, result: List[Generation], *, partial: bool = False) -> Any:
        """Parse ``result`` with a single doubled-brace retry on failure.

        Args:
            result: Generations to parse; only ``result[0].text`` is inspected
                by the fallback path.
            partial: Forwarded unchanged to the parent ``parse_result`` on
                both the first attempt and the retry.

        Returns:
            Whatever the parent ``parse_result`` returns for the original or
            the unwrapped text.

        Raises:
            OutputParserException: The original error when the text is not
                wrapped in doubled braces, or the retry's error when the
                unwrapped text still cannot be parsed.
        """
        try:
            return super().parse_result(result, partial=partial)
        except OutputParserException:
            raw_output = result[0].text
            unwrapped = _strip_doubled_braces(raw_output)
            if unwrapped == raw_output:
                # Nothing was stripped, so a retry would fail identically:
                # surface the parent's original error.
                raise
            retry_input = [Generation(text=unwrapped)]
            return super().parse_result(retry_input, partial=partial)
50+
51+
1252
def get_structured_output_parser(
1353
schema: Union[Dict[str, Any], Type[BaseModelV1 | BaseModelV2], Type],
1454
) -> Callable:

tests/utils/output_parser_test.py

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
"""Tests for scrapegraphai.utils.output_parser.TolerantJsonOutputParser."""
2+
3+
import pytest
4+
5+
from scrapegraphai.utils.output_parser import (
6+
TolerantJsonOutputParser,
7+
_strip_doubled_braces,
8+
)
9+
10+
11+
def test_strip_doubled_braces_unwraps_single_layer():
    """A fully double-wrapped object loses exactly one outer brace pair."""
    wrapped = '{{"content": "hi"}}'
    expected = '{"content": "hi"}'
    assert _strip_doubled_braces(wrapped) == expected
13+
14+
15+
def test_strip_doubled_braces_is_noop_for_clean_json():
    """Single-brace JSON comes back exactly as it was passed in."""
    clean = '{"content": "hi"}'
    result = _strip_doubled_braces(clean)
    assert result == clean
18+
19+
20+
def test_strip_doubled_braces_ignores_unbalanced():
    """Doubled opening braces without doubled closing braces are left alone."""
    lopsided = '{{"content": "hi"}'
    result = _strip_doubled_braces(lopsided)
    assert result == lopsided
23+
24+
25+
def test_tolerant_parser_parses_clean_json_unchanged():
    """Valid JSON is decoded to the expected dict."""
    parsed = TolerantJsonOutputParser().parse('{"content": "hi"}')
    assert parsed == {"content": "hi"}
28+
29+
30+
def test_tolerant_parser_recovers_doubled_braces():
    """Doubled-brace output (as echoed by models such as DeepSeek) still parses."""
    echoed_output = '{{"content": "hi"}}'
    parsed = TolerantJsonOutputParser().parse(echoed_output)
    assert parsed == {"content": "hi"}
34+
35+
36+
def test_tolerant_parser_recovers_doubled_braces_with_whitespace():
    """Whitespace around doubled-brace output does not prevent recovery."""
    padded_output = ' {{"content": "hi"}} '
    parsed = TolerantJsonOutputParser().parse(padded_output)
    assert parsed == {"content": "hi"}
39+
40+
41+
def test_tolerant_parser_still_raises_on_irrecoverable_output():
    """Text that is neither JSON nor doubled-brace-wrapped must still fail."""
    # Imported locally so the test pins the parser's own failure type.
    # ``pytest.raises(Exception)`` would also pass on unrelated errors
    # (e.g. a TypeError introduced by a regression in the fallback path).
    from langchain_core.exceptions import OutputParserException

    parser = TolerantJsonOutputParser()
    with pytest.raises(OutputParserException):
        parser.parse("this is not json at all")

0 commit comments

Comments
 (0)