Fix claims parsing: truncated JSON recovery + chunking + compact output

config.py parse_llm_json:
Added truncated JSON recovery. When Claude's output is cut mid-JSON (common with long claim lists), the parser now:
- Finds the last complete JSON item (closing "}")
- Closes the array/object brackets
- Returns partial but valid results instead of None

Tested: recovers 2 of 3 items from a truncated array; all cases pass.

claims_extractor.py:
- Prompt asks for compact output (150 words max per claim, group similar claims)
- Explicitly requests "no markdown, no explanations, JSON only"
- Long documents are split into chunks at paragraph boundaries
- Each chunk is processed separately and the results are merged
- max_tokens already at 8192

This fixes the recurring "0 claims" bug for committee responses and permit applicant responses where the JSON was getting truncated.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-03 16:04:34 +00:00
parent 7d1dc73112
commit e725f9ecd7
2 changed files with 96 additions and 42 deletions
--- a/mcp-server/src/legal_mcp/config.py
+++ b/mcp-server/src/legal_mcp/config.py
@@ -72,13 +72,19 @@ AUDIT_ENABLED = os.environ.get("AUDIT_ENABLED", "true").lower() == "true"
 # ── Utility ───────────────────────────────────────────────────────
 def parse_llm_json(raw: str):
-    """Parse JSON from LLM response, stripping markdown code blocks and extra text."""
+    """Parse JSON from LLM response, handling markdown wrapping and truncation.
    Handles:
    1. Markdown ```json ... ``` code blocks
    2. Extra text before/after JSON
    3. Truncated JSON (missing closing brackets) — attempts recovery
    """
    import json
    import re
    raw = raw.strip()
    # Strip markdown code blocks
    raw = re.sub(r"^```(?:json)?\s*\n?", "", raw)
-    raw = re.sub(r"\n?\s*```$", "", raw)
+    raw = re.sub(r"\n?\s*```\s*$", "", raw)
    # Try direct parse first
    try:
        return json.loads(raw)
@@ -92,4 +98,38 @@ def parse_llm_json(raw: str):
                return json.loads(match.group())
            except json.JSONDecodeError:
                continue
    # Attempt truncated JSON recovery:
    # Find the start of JSON, then try closing open brackets
    for opener, closer in [("[", "]"), ("{", "}")]:
        start = raw.find(opener)
        if start < 0:
            continue
        fragment = raw[start:]
        # Try progressively removing trailing partial content and closing
        # Look for the last complete item (ending with }, or ])
        for end_pattern in [r'.*\}(?=\s*,?\s*$)', r'.*\](?=\s*,?\s*$)', r'.*"(?=\s*$)']:
            pass  # fallback below
        # Simple approach: find last complete JSON item boundary
        # For arrays: find last "}" and close the array
        if opener == "[":
            last_brace = fragment.rfind("}")
            if last_brace > 0:
                truncated = fragment[:last_brace + 1] + "]"
                try:
                    return json.loads(truncated)
                except json.JSONDecodeError:
                    pass
        # For objects: find last complete key-value
        if opener == "{":
            last_brace = fragment.rfind("}")
            if last_brace > 0:
                # Check if this closes a nested object — try adding outer close
                truncated = fragment[:last_brace + 1]
                # Count unclosed braces
                open_count = truncated.count("{") - truncated.count("}")
                truncated += "}" * open_count
                try:
                    return json.loads(truncated)
                except json.JSONDecodeError:
                    pass
    return None
--- a/mcp-server/src/legal_mcp/services/claims_extractor.py
+++ b/mcp-server/src/legal_mcp/services/claims_extractor.py
@@ -44,16 +44,13 @@ EXTRACT_CLAIMS_PROMPT = """אתה מנתח מסמכים משפטיים בתחו
 - permit_applicant — מבקש/ת היתר
 ## פלט:
-החזר JSON array בלבד:
+החזר JSON array בלבד — ללא markdown, ללא הסברים, רק JSON:
-[
+[{"party_role": "appellant", "claim_text": "הטענה בגוף שלישי", "topic": "נושא"}]
  {
    "party_role": "appellant",
    "claim_text": "הטענה בגוף שלישי, בעברית",
    "topic": "נושא הטענה בקצרה (3-5 מילים)"
  }
 ]
-אם אין טענות — החזר [].
+חשוב:
 - claim_text קצר — עד 150 מילים לכל טענה
 - קבץ טענות דומות לטענה אחת
 - אם אין טענות החזר []
 """
@@ -72,23 +69,34 @@ async def extract_claims_with_ai(
    Returns:
        רשימת טענות עם party_role, claim_text, topic
    """
    # For very long documents, truncate but try to keep complete paragraphs
    max_chars = 25000
    if len(text) > max_chars:
        # Find a paragraph break near the limit
        cutoff = text.rfind("\n\n", 0, max_chars)
        if cutoff < max_chars // 2:
            cutoff = max_chars
        sample = text[:cutoff]
        logger.info("Document truncated from %d to %d chars", len(text), len(sample))
    else:
        sample = text
    context = f"סוג המסמך: {doc_type}"
    if party_hint:
        context += f"\nהצד המגיש: {party_hint}"
    # For very long documents, split into chunks and merge results
    max_chars_per_call = 25000
    chunks = []
    if len(text) > max_chars_per_call:
        # Split at paragraph boundaries
        pos = 0
        while pos < len(text):
            end = min(pos + max_chars_per_call, len(text))
            if end < len(text):
                # Find paragraph break near the limit
                break_pos = text.rfind("\n\n", pos, end)
                if break_pos > pos + max_chars_per_call // 2:
                    end = break_pos
            chunks.append(text[pos:end])
            pos = end
        logger.info("Document split into %d chunks (%d chars total)", len(chunks), len(text))
    else:
        chunks = [text]
    all_claims = []
    client = _get_anthropic()
    for i, chunk in enumerate(chunks):
        chunk_label = f" (חלק {i+1}/{len(chunks)})" if len(chunks) > 1 else ""
        message = client.messages.create(
            model="claude-sonnet-4-20250514",
            max_tokens=8192,
@@ -97,8 +105,8 @@ async def extract_claims_with_ai(
                    "role": "user",
                    "content": (
                        f"{EXTRACT_CLAIMS_PROMPT}\n\n"
-                    f"{context}\n\n"
+                        f"{context}{chunk_label}\n\n"
-                    f"--- תחילת מסמך ---\n{sample}\n--- סוף מסמך ---"
+                        f"--- תחילת מסמך ---\n{chunk}\n--- סוף מסמך ---"
                    ),
                }
            ],
@@ -107,7 +115,13 @@ async def extract_claims_with_ai(
        raw = message.content[0].text.strip()
        claims = parse_llm_json(raw)
        if claims is None:
-        logger.warning("Failed to parse claims response: %s", raw[:200])
+            logger.warning("Failed to parse claims for chunk %d: %s", i, raw[:200])
            continue
        if isinstance(claims, list):
            all_claims.extend(claims)
    claims = all_claims
    if not claims:
        return []
    if not isinstance(claims, list):