fix(nodes): tolerate doubled-brace JSON output from models like DeepSeek (#1085)

mjmirza · web-flow · commit aaa5d2cf6d26 · 2026-06-11T11:16:32.000+02:00
The default GenerateAnswerNode format_instructions show the expected shape as
{{"content": ...}} (LangChain's escaped braces). Strongly instruction-following
models emit single braces, but some models (notably DeepSeek) copy the doubled
braces verbatim, yielding {{"content": ...}} which JsonOutputParser rejects with
'Invalid json output'.

Add TolerantJsonOutputParser, a JsonOutputParser subclass that retries once with a
single layer of wrapping braces removed, only on the parse-failure path. Behaviour
is unchanged for any model already returning valid JSON. Use it in the schema-less
branch of GenerateAnswerNode. Add unit tests.
diff --git a/scrapegraphai/nodes/generate_answer_node.py b/scrapegraphai/nodes/generate_answer_node.py
@@ -9,7 +9,6 @@
 from langchain_core.prompts import PromptTemplate
 from langchain_aws import ChatBedrock
 from langchain_ollama import ChatOllama
-from langchain_core.output_parsers import JsonOutputParser
 from langchain_core.runnables import RunnableParallel
 from langchain_openai import ChatOpenAI
 from requests.exceptions import Timeout
@@ -23,7 +22,10 @@
     TEMPLATE_NO_CHUNKS,
     TEMPLATE_NO_CHUNKS_MD,
 )
-from ..utils.output_parser import get_pydantic_output_parser
+from ..utils.output_parser import (
+    TolerantJsonOutputParser,
+    get_pydantic_output_parser,
+)
 from .base_node import BaseNode
 
 
@@ -148,7 +150,7 @@ def execute(self, state: dict) -> dict:
                     format_instructions = ""
         else:
             if not isinstance(self.llm_model, ChatBedrock):
-                output_parser = JsonOutputParser()
+                output_parser = TolerantJsonOutputParser()
                 format_instructions = (
                     "You must respond with a JSON object. Your response should be formatted as a valid JSON "
                     "with a 'content' field containing your analysis. For example:\n"
diff --git a/scrapegraphai/utils/output_parser.py b/scrapegraphai/utils/output_parser.py
@@ -2,13 +2,53 @@
 Functions to retrieve the correct output parser and format instructions for the LLM model.
 """
 
-from typing import Any, Callable, Dict, Type, Union
+from typing import Any, Callable, Dict, List, Type, Union
 
+from langchain_core.exceptions import OutputParserException
+from langchain_core.outputs import Generation
 from langchain_core.output_parsers import JsonOutputParser
 from pydantic import BaseModel as BaseModelV2
 from pydantic.v1 import BaseModel as BaseModelV1
 
 
+def _strip_doubled_braces(text: str) -> str:
+    """Strip one layer of the doubled braces some models echo from the prompt.
+
+    The default ``format_instructions`` show the expected shape using LangChain's
+    escaped braces, e.g. ``{{"content": "..."}}``. Strongly instruction-following
+    models (GPT-4o, etc.) emit single braces, but some models (notably DeepSeek)
+    copy the doubled braces verbatim, producing ``{{"content": "..."}}`` which is
+    not valid JSON.
+
+    The doubled braces are recognized when they wrap the whole whitespace-trimmed
+    text, and also when they wrap the body of a single Markdown code fence (three
+    backticks with an optional language tag such as ``json``), because the JSON
+    parser accepts fenced output and a model may echo the braces inside a fence.
+
+    Args:
+        text: Raw model output.
+
+    Returns:
+        The payload with one outer brace layer removed (surrounding whitespace and
+        code fence dropped) when it is wrapped in doubled braces; otherwise
+        ``text`` itself, unchanged.
+    """
+    fence = "```"
+    candidate = text.strip()
+    # Require room for both an opening and a closing fence so that a lone
+    # "```" is not mistaken for an empty fenced block.
+    is_fenced = (
+        len(candidate) >= 2 * len(fence)
+        and candidate.startswith(fence)
+        and candidate.endswith(fence)
+    )
+    if is_fenced:
+        fenced_body = candidate[len(fence):-len(fence)]
+        first_line, newline, remainder = fenced_body.partition("\n")
+        # The opening fence line may carry a language tag such as "json";
+        # drop that line unless the payload itself already starts on it.
+        if newline and "{" not in first_line:
+            fenced_body = remainder
+        candidate = fenced_body.strip()
+    if candidate.startswith("{{") and candidate.endswith("}}"):
+        return candidate[1:-1]
+    return text
+
+
+class TolerantJsonOutputParser(JsonOutputParser):
+    """JSON output parser that forgives one extra layer of wrapping braces.
+
+    Parsing is first delegated, untouched, to :class:`JsonOutputParser`, so output
+    that already parses is returned exactly as the parent returns it. Only if the
+    parent raises ``OutputParserException`` and ``_strip_doubled_braces`` changes
+    the first generation's text is the parent parser run one more time, on the
+    changed text. In every other failure case the original exception propagates.
+    """
+
+    def parse_result(self, result: List[Generation], *, partial: bool = False) -> Any:
+        """Parse ``result``, retrying once on normalized text after a failure.
+
+        Args:
+            result: Generations to parse; the retry reads only ``result[0].text``.
+            partial: Forwarded unchanged to the parent parser on both attempts.
+
+        Returns:
+            Whatever the parent ``parse_result`` returns for the original text or,
+            on the retry path, for the normalized text.
+
+        Raises:
+            OutputParserException: Re-raised when normalization leaves the text
+                unchanged; also propagates from the retry if the normalized text
+                still fails to parse.
+        """
+        try:
+            parsed = super().parse_result(result, partial=partial)
+        except OutputParserException:
+            raw_text = result[0].text
+            unwrapped = _strip_doubled_braces(raw_text)
+            if unwrapped == raw_text:
+                # Nothing was normalized: surface the parent's original error.
+                raise
+            retry_input = [Generation(text=unwrapped)]
+            return super().parse_result(retry_input, partial=partial)
+        return parsed
+
+
 def get_structured_output_parser(
     schema: Union[Dict[str, Any], Type[BaseModelV1 | BaseModelV2], Type],
 ) -> Callable:
diff --git a/tests/utils/output_parser_test.py b/tests/utils/output_parser_test.py
@@ -0,0 +1,44 @@
+"""Tests for scrapegraphai.utils.output_parser.TolerantJsonOutputParser."""
+
+import pytest
+
+from scrapegraphai.utils.output_parser import (
+    TolerantJsonOutputParser,
+    _strip_doubled_braces,
+)
+
+
+def test_strip_doubled_braces_unwraps_single_layer():
+    """One layer of wrapping braces is removed from doubled-brace text."""
+    doubled = '{{"content": "hi"}}'
+    assert _strip_doubled_braces(doubled) == '{"content": "hi"}'
+
+
+def test_strip_doubled_braces_is_noop_for_clean_json():
+    """Single-brace JSON text is handed back exactly as given."""
+    clean = '{"content": "hi"}'
+    result = _strip_doubled_braces(clean)
+    assert result == clean
+
+
+def test_strip_doubled_braces_ignores_unbalanced():
+    """A doubled opening brace without a doubled closing brace is left alone."""
+    lopsided = '{{"content": "hi"}'
+    result = _strip_doubled_braces(lopsided)
+    assert result == lopsided
+
+
+def test_tolerant_parser_parses_clean_json_unchanged():
+    """Valid single-brace JSON is parsed into the expected dict."""
+    expected = {"content": "hi"}
+    assert TolerantJsonOutputParser().parse('{"content": "hi"}') == expected
+
+
+def test_tolerant_parser_recovers_doubled_braces():
+    """Doubled braces echoed from the prompt (as DeepSeek does) still parse."""
+    echoed = '{{"content": "hi"}}'
+    parsed = TolerantJsonOutputParser().parse(echoed)
+    assert parsed == {"content": "hi"}
+
+
+def test_tolerant_parser_recovers_doubled_braces_with_whitespace():
+    """Surrounding whitespace does not prevent recovery of doubled braces."""
+    padded = '  {{"content": "hi"}}  '
+    outcome = TolerantJsonOutputParser().parse(padded)
+    assert outcome == {"content": "hi"}
+
+
+def test_tolerant_parser_still_raises_on_irrecoverable_output():
+    """Text that is not JSON at all must still surface the parser's own error."""
+    # Imported locally so this test is self-contained. Asserting the specific
+    # exception type keeps an unrelated bug (e.g. AttributeError) from passing,
+    # which a bare ``Exception`` match would allow.
+    from langchain_core.exceptions import OutputParserException
+
+    parser = TolerantJsonOutputParser()
+    with pytest.raises(OutputParserException):
+        parser.parse("this is not json at all")