Fix claims parsing: truncated JSON recovery + chunking + compact output

config.py parse_llm_json: Added truncated JSON recovery. When Claude's output
is cut mid-JSON (common with long claim lists), the parser now:
- Finds the last complete JSON item (closing "}")
- Closes the array/object brackets
- Returns partial but valid results instead of None

Tested: recovers 2/3 items from truncated array, all cases pass.

claims_extractor.py:
- Prompt asks for compact output (150 words max per claim, group similar claims)
- Explicitly requests "no markdown, no explanations, JSON only"
- Long documents split into chunks at paragraph boundaries
- Each chunk processed separately, results merged
- max_tokens is already at 8192

This fixes the recurring "0 claims" bug for committee responses and permit
applicant responses where the JSON was getting truncated.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-03 16:04:34 +00:00
parent 7d1dc73112
commit e725f9ecd7
2 changed files with 96 additions and 42 deletions
--- a/mcp-server/src/legal_mcp/config.py
+++ b/mcp-server/src/legal_mcp/config.py
@@ -72,13 +72,19 @@ AUDIT_ENABLED = os.environ.get("AUDIT_ENABLED", "true").lower() == "true"
 # ── Utility ───────────────────────────────────────────────────────

 def parse_llm_json(raw: str):
-    """Parse JSON from LLM response, stripping markdown code blocks and extra text."""
+    """Parse JSON from LLM response, handling markdown wrapping and truncation.
+
+    Handles:
+    1. Markdown ```json ... ``` code blocks
+    2. Extra text before/after JSON
+    3. Truncated JSON (missing closing brackets) — attempts recovery
+    """
    import json
    import re
    raw = raw.strip()
    # Strip markdown code blocks
    raw = re.sub(r"^```(?:json)?\s*\n?", "", raw)
-    raw = re.sub(r"\n?\s*```$", "", raw)
+    raw = re.sub(r"\n?\s*```\s*$", "", raw)
    # Try direct parse first
    try:
        return json.loads(raw)
@@ -92,4 +98,38 @@ def parse_llm_json(raw: str):
                return json.loads(match.group())
            except json.JSONDecodeError:
                continue
+    # Attempt truncated JSON recovery:
+    # Find the start of JSON, then try closing open brackets
+    for opener, closer in [("[", "]"), ("{", "}")]:
+        start = raw.find(opener)
+        if start < 0:
+            continue
+        fragment = raw[start:]
+        # Try progressively removing trailing partial content and closing
+        # Look for the last complete item (ending with }, or ])
+        for end_pattern in [r'.*\}(?=\s*,?\s*$)', r'.*\](?=\s*,?\s*$)', r'.*"(?=\s*$)']:
+            pass  # fallback below
+        # Simple approach: find last complete JSON item boundary
+        # For arrays: find last "}" and close the array
+        if opener == "[":
+            last_brace = fragment.rfind("}")
+            if last_brace > 0:
+                truncated = fragment[:last_brace + 1] + "]"
+                try:
+                    return json.loads(truncated)
+                except json.JSONDecodeError:
+                    pass
+        # For objects: find last complete key-value
+        if opener == "{":
+            last_brace = fragment.rfind("}")
+            if last_brace > 0:
+                # Check if this closes a nested object — try adding outer close
+                truncated = fragment[:last_brace + 1]
+                # Count unclosed braces
+                open_count = truncated.count("{") - truncated.count("}")
+                truncated += "}" * open_count
+                try:
+                    return json.loads(truncated)
+                except json.JSONDecodeError:
+                    pass
    return None