Fix claims parsing: truncated JSON recovery + chunking + compact output

config.py parse_llm_json: Added truncated JSON recovery. When Claude's
output is cut mid-JSON (common with long claim lists), the parser now:
- Finds the last complete JSON item (closing "}")
- Closes the array/object brackets
- Returns partial but valid results instead of None
Tested: recovers 2 of 3 items from a truncated array; all test cases pass.

claims_extractor.py:
- Prompt asks for compact output (150 words max per claim, similar claims grouped into one)
- Explicitly requests "no markdown, no explanations, JSON only"
- Long documents (over 25,000 chars) split into chunks, at paragraph boundaries where possible
- Each chunk processed separately, results merged
- max_tokens unchanged (already 8192)

This fixes the recurring "0 claims" bug for committee responses and
permit applicant responses where the JSON was getting truncated.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-04-03 16:04:34 +00:00
parent 7d1dc73112
commit e725f9ecd7
2 changed files with 96 additions and 42 deletions

View File

@@ -72,13 +72,19 @@ AUDIT_ENABLED = os.environ.get("AUDIT_ENABLED", "true").lower() == "true"
# ── Utility ─────────────────────────────────────────────────────── # ── Utility ───────────────────────────────────────────────────────
def parse_llm_json(raw: str): def parse_llm_json(raw: str):
"""Parse JSON from LLM response, stripping markdown code blocks and extra text.""" """Parse JSON from LLM response, handling markdown wrapping and truncation.
Handles:
1. Markdown ```json ... ``` code blocks
2. Extra text before/after JSON
3. Truncated JSON (missing closing brackets) — attempts recovery
"""
import json import json
import re import re
raw = raw.strip() raw = raw.strip()
# Strip markdown code blocks # Strip markdown code blocks
raw = re.sub(r"^```(?:json)?\s*\n?", "", raw) raw = re.sub(r"^```(?:json)?\s*\n?", "", raw)
raw = re.sub(r"\n?\s*```$", "", raw) raw = re.sub(r"\n?\s*```\s*$", "", raw)
# Try direct parse first # Try direct parse first
try: try:
return json.loads(raw) return json.loads(raw)
@@ -92,4 +98,38 @@ def parse_llm_json(raw: str):
return json.loads(match.group()) return json.loads(match.group())
except json.JSONDecodeError: except json.JSONDecodeError:
continue continue
# Attempt truncated JSON recovery:
# Find the start of JSON, then try closing open brackets
for opener, closer in [("[", "]"), ("{", "}")]:
start = raw.find(opener)
if start < 0:
continue
fragment = raw[start:]
# Try progressively removing trailing partial content and closing
# Look for the last complete item (ending with }, or ])
for end_pattern in [r'.*\}(?=\s*,?\s*$)', r'.*\](?=\s*,?\s*$)', r'.*"(?=\s*$)']:
pass # fallback below
# Simple approach: find last complete JSON item boundary
# For arrays: find last "}" and close the array
if opener == "[":
last_brace = fragment.rfind("}")
if last_brace > 0:
truncated = fragment[:last_brace + 1] + "]"
try:
return json.loads(truncated)
except json.JSONDecodeError:
pass
# For objects: find last complete key-value
if opener == "{":
last_brace = fragment.rfind("}")
if last_brace > 0:
# Check if this closes a nested object — try adding outer close
truncated = fragment[:last_brace + 1]
# Count unclosed braces
open_count = truncated.count("{") - truncated.count("}")
truncated += "}" * open_count
try:
return json.loads(truncated)
except json.JSONDecodeError:
pass
return None return None

View File

@@ -44,16 +44,13 @@ EXTRACT_CLAIMS_PROMPT = """אתה מנתח מסמכים משפטיים בתחו
- permit_applicant — מבקש/ת היתר - permit_applicant — מבקש/ת היתר
## פלט: ## פלט:
החזר JSON array בלבד: החזר JSON array בלבד — ללא markdown, ללא הסברים, רק JSON:
[ [{"party_role": "appellant", "claim_text": "הטענה בגוף שלישי", "topic": "נושא"}]
{
"party_role": "appellant",
"claim_text": "הטענה בגוף שלישי, בעברית",
"topic": "נושא הטענה בקצרה (3-5 מילים)"
}
]
אם אין טענות — החזר []. חשוב:
- claim_text קצר — עד 150 מילים לכל טענה
- קבץ טענות דומות לטענה אחת
- אם אין טענות החזר []
""" """
@@ -72,23 +69,34 @@ async def extract_claims_with_ai(
Returns: Returns:
רשימת טענות עם party_role, claim_text, topic רשימת טענות עם party_role, claim_text, topic
""" """
# For very long documents, truncate but try to keep complete paragraphs
max_chars = 25000
if len(text) > max_chars:
# Find a paragraph break near the limit
cutoff = text.rfind("\n\n", 0, max_chars)
if cutoff < max_chars // 2:
cutoff = max_chars
sample = text[:cutoff]
logger.info("Document truncated from %d to %d chars", len(text), len(sample))
else:
sample = text
context = f"סוג המסמך: {doc_type}" context = f"סוג המסמך: {doc_type}"
if party_hint: if party_hint:
context += f"\nהצד המגיש: {party_hint}" context += f"\nהצד המגיש: {party_hint}"
# For very long documents, split into chunks and merge results
max_chars_per_call = 25000
chunks = []
if len(text) > max_chars_per_call:
# Split at paragraph boundaries
pos = 0
while pos < len(text):
end = min(pos + max_chars_per_call, len(text))
if end < len(text):
# Find paragraph break near the limit
break_pos = text.rfind("\n\n", pos, end)
if break_pos > pos + max_chars_per_call // 2:
end = break_pos
chunks.append(text[pos:end])
pos = end
logger.info("Document split into %d chunks (%d chars total)", len(chunks), len(text))
else:
chunks = [text]
all_claims = []
client = _get_anthropic() client = _get_anthropic()
for i, chunk in enumerate(chunks):
chunk_label = f" (חלק {i+1}/{len(chunks)})" if len(chunks) > 1 else ""
message = client.messages.create( message = client.messages.create(
model="claude-sonnet-4-20250514", model="claude-sonnet-4-20250514",
max_tokens=8192, max_tokens=8192,
@@ -97,8 +105,8 @@ async def extract_claims_with_ai(
"role": "user", "role": "user",
"content": ( "content": (
f"{EXTRACT_CLAIMS_PROMPT}\n\n" f"{EXTRACT_CLAIMS_PROMPT}\n\n"
f"{context}\n\n" f"{context}{chunk_label}\n\n"
f"--- תחילת מסמך ---\n{sample}\n--- סוף מסמך ---" f"--- תחילת מסמך ---\n{chunk}\n--- סוף מסמך ---"
), ),
} }
], ],
@@ -107,7 +115,13 @@ async def extract_claims_with_ai(
raw = message.content[0].text.strip() raw = message.content[0].text.strip()
claims = parse_llm_json(raw) claims = parse_llm_json(raw)
if claims is None: if claims is None:
logger.warning("Failed to parse claims response: %s", raw[:200]) logger.warning("Failed to parse claims for chunk %d: %s", i, raw[:200])
continue
if isinstance(claims, list):
all_claims.extend(claims)
claims = all_claims
if not claims:
return [] return []
if not isinstance(claims, list): if not isinstance(claims, list):