From e725f9ecd74d744b1dabe63212922d113e2d3d70 Mon Sep 17 00:00:00 2001 From: Chaim Date: Fri, 3 Apr 2026 16:04:34 +0000 Subject: [PATCH] Fix claims parsing: truncated JSON recovery + chunking + compact output config.py parse_llm_json: Added truncated JSON recovery. When Claude's output is cut mid-JSON (common with long claim lists), the parser now: - Finds the last complete JSON item (closing "}") - Closes the array/object brackets - Returns partial but valid results instead of None Tested: recovers 2/3 items from truncated array, all cases pass. claims_extractor.py: - Prompt asks for compact output (150 words max per claim, group similar) - Explicitly requests "no markdown, no explanations, JSON only" - Long documents split into chunks at paragraph boundaries - Each chunk processed separately, results merged - max_tokens already at 8192 This fixes the recurring "0 claims" bug for committee responses and permit applicant responses where the JSON was getting truncated. Co-Authored-By: Claude Opus 4.6 (1M context) --- mcp-server/src/legal_mcp/config.py | 44 ++++++++- .../legal_mcp/services/claims_extractor.py | 94 +++++++++++-------- 2 files changed, 96 insertions(+), 42 deletions(-) diff --git a/mcp-server/src/legal_mcp/config.py b/mcp-server/src/legal_mcp/config.py index 3b922de..722be5c 100644 --- a/mcp-server/src/legal_mcp/config.py +++ b/mcp-server/src/legal_mcp/config.py @@ -72,13 +72,19 @@ AUDIT_ENABLED = os.environ.get("AUDIT_ENABLED", "true").lower() == "true" # ── Utility ─────────────────────────────────────────────────────── def parse_llm_json(raw: str): - """Parse JSON from LLM response, stripping markdown code blocks and extra text.""" + """Parse JSON from LLM response, handling markdown wrapping and truncation. + + Handles: + 1. Markdown ```json ... ``` code blocks + 2. Extra text before/after JSON + 3. Truncated JSON (missing closing brackets) — attempts recovery + """ import json import re raw = raw.strip() # Strip markdown code blocks raw = re.sub(r"^```(?:json)?\s*\n?", "", raw) - raw = re.sub(r"\n?\s*```$", "", raw) + raw = re.sub(r"\n?\s*```\s*$", "", raw) # Try direct parse first try: return json.loads(raw) @@ -92,4 +98,38 @@ def parse_llm_json(raw: str): return json.loads(match.group()) except json.JSONDecodeError: continue + # Attempt truncated JSON recovery: + # Find the start of JSON, then try closing open brackets + for opener, closer in [("[", "]"), ("{", "}")]: + start = raw.find(opener) + if start < 0: + continue + fragment = raw[start:] + # Try progressively removing trailing partial content and closing + # Look for the last complete item (ending with }, or ]) + for end_pattern in [r'.*\}(?=\s*,?\s*$)', r'.*\](?=\s*,?\s*$)', r'.*"(?=\s*$)']: + pass # fallback below + # Simple approach: find last complete JSON item boundary + # For arrays: find last "}" and close the array + if opener == "[": + last_brace = fragment.rfind("}") + if last_brace > 0: + truncated = fragment[:last_brace + 1] + "]" + try: + return json.loads(truncated) + except json.JSONDecodeError: + pass + # For objects: find last complete key-value + if opener == "{": + last_brace = fragment.rfind("}") + if last_brace > 0: + # Check if this closes a nested object — try adding outer close + truncated = fragment[:last_brace + 1] + # Count unclosed braces + open_count = truncated.count("{") - truncated.count("}") + truncated += "}" * open_count + try: + return json.loads(truncated) + except json.JSONDecodeError: + pass return None diff --git a/mcp-server/src/legal_mcp/services/claims_extractor.py b/mcp-server/src/legal_mcp/services/claims_extractor.py index fc42e20..16edcbe 100644 --- a/mcp-server/src/legal_mcp/services/claims_extractor.py +++ b/mcp-server/src/legal_mcp/services/claims_extractor.py @@ -44,16 +44,13 @@ EXTRACT_CLAIMS_PROMPT = """אתה מנתח מסמכים משפטיים בתחו - permit_applicant — מבקש/ת היתר ## פלט: -החזר JSON array בלבד: -[ - { - "party_role": "appellant", - "claim_text": "הטענה בגוף שלישי, בעברית", - "topic": "נושא הטענה בקצרה (3-5 מילים)" - } -] +החזר JSON array בלבד — ללא markdown, ללא הסברים, רק JSON: +[{"party_role": "appellant", "claim_text": "הטענה בגוף שלישי", "topic": "נושא"}] -אם אין טענות — החזר []. +חשוב: +- claim_text קצר — עד 150 מילים לכל טענה +- קבץ טענות דומות לטענה אחת +- אם אין טענות החזר [] """ @@ -72,42 +69,59 @@ async def extract_claims_with_ai( Returns: רשימת טענות עם party_role, claim_text, topic """ - # For very long documents, truncate but try to keep complete paragraphs - max_chars = 25000 - if len(text) > max_chars: - # Find a paragraph break near the limit - cutoff = text.rfind("\n\n", 0, max_chars) - if cutoff < max_chars // 2: - cutoff = max_chars - sample = text[:cutoff] - logger.info("Document truncated from %d to %d chars", len(text), len(sample)) - else: - sample = text - context = f"סוג המסמך: {doc_type}" if party_hint: context += f"\nהצד המגיש: {party_hint}" - client = _get_anthropic() - message = client.messages.create( - model="claude-sonnet-4-20250514", - max_tokens=8192, - messages=[ - { - "role": "user", - "content": ( - f"{EXTRACT_CLAIMS_PROMPT}\n\n" - f"{context}\n\n" - f"--- תחילת מסמך ---\n{sample}\n--- סוף מסמך ---" - ), - } - ], - ) + # For very long documents, split into chunks and merge results + max_chars_per_call = 25000 + chunks = [] + if len(text) > max_chars_per_call: + # Split at paragraph boundaries + pos = 0 + while pos < len(text): + end = min(pos + max_chars_per_call, len(text)) + if end < len(text): + # Find paragraph break near the limit + break_pos = text.rfind("\n\n", pos, end) + if break_pos > pos + max_chars_per_call // 2: + end = break_pos + chunks.append(text[pos:end]) + pos = end + logger.info("Document split into %d chunks (%d chars total)", len(chunks), len(text)) + else: + chunks = [text] - raw = message.content[0].text.strip() - claims = parse_llm_json(raw) - if claims is None: - logger.warning("Failed to parse claims response: %s", raw[:200]) + all_claims = [] + client = _get_anthropic() + + for i, chunk in enumerate(chunks): + chunk_label = f" (חלק {i+1}/{len(chunks)})" if len(chunks) > 1 else "" + message = client.messages.create( + model="claude-sonnet-4-20250514", + max_tokens=8192, + messages=[ + { + "role": "user", + "content": ( + f"{EXTRACT_CLAIMS_PROMPT}\n\n" + f"{context}{chunk_label}\n\n" + f"--- תחילת מסמך ---\n{chunk}\n--- סוף מסמך ---" + ), + } + ], + ) + + raw = message.content[0].text.strip() + claims = parse_llm_json(raw) + if claims is None: + logger.warning("Failed to parse claims for chunk %d: %s", i, raw[:200]) + continue + if isinstance(claims, list): + all_claims.extend(claims) + + claims = all_claims + if not claims: return [] if not isinstance(claims, list):