Fix claims parsing: truncated JSON recovery + chunking + compact output
config.py parse_llm_json: Added truncated JSON recovery. When Claude's output is cut off mid-JSON (common with long claim lists), the parser now:
- Finds the last complete JSON item (closing "}")
- Closes the array/object brackets
- Returns partial but valid results instead of None
Tested: recovers 2/3 items from a truncated array; all cases pass.

claims_extractor.py:
- Prompt asks for compact output (150 words max per claim, similar claims grouped)
- Explicitly requests "no markdown, no explanations, JSON only"
- Long documents are split into chunks at paragraph boundaries
- Each chunk is processed separately and the results are merged
- max_tokens already at 8192

This fixes the recurring "0 claims" bug for committee responses and permit applicant responses, where the JSON was getting truncated.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -44,16 +44,13 @@ EXTRACT_CLAIMS_PROMPT = """אתה מנתח מסמכים משפטיים בתחו
|
||||
- permit_applicant — מבקש/ת היתר
|
||||
|
||||
## פלט:
|
||||
החזר JSON array בלבד:
|
||||
[
|
||||
{
|
||||
"party_role": "appellant",
|
||||
"claim_text": "הטענה בגוף שלישי, בעברית",
|
||||
"topic": "נושא הטענה בקצרה (3-5 מילים)"
|
||||
}
|
||||
]
|
||||
החזר JSON array בלבד — ללא markdown, ללא הסברים, רק JSON:
|
||||
[{"party_role": "appellant", "claim_text": "הטענה בגוף שלישי", "topic": "נושא"}]
|
||||
|
||||
אם אין טענות — החזר [].
|
||||
חשוב:
|
||||
- claim_text קצר — עד 150 מילים לכל טענה
|
||||
- קבץ טענות דומות לטענה אחת
|
||||
- אם אין טענות החזר []
|
||||
"""
|
||||
|
||||
|
||||
@@ -72,42 +69,59 @@ async def extract_claims_with_ai(
|
||||
Returns:
|
||||
רשימת טענות עם party_role, claim_text, topic
|
||||
"""
|
||||
# For very long documents, truncate but try to keep complete paragraphs
|
||||
max_chars = 25000
|
||||
if len(text) > max_chars:
|
||||
# Find a paragraph break near the limit
|
||||
cutoff = text.rfind("\n\n", 0, max_chars)
|
||||
if cutoff < max_chars // 2:
|
||||
cutoff = max_chars
|
||||
sample = text[:cutoff]
|
||||
logger.info("Document truncated from %d to %d chars", len(text), len(sample))
|
||||
else:
|
||||
sample = text
|
||||
|
||||
context = f"סוג המסמך: {doc_type}"
|
||||
if party_hint:
|
||||
context += f"\nהצד המגיש: {party_hint}"
|
||||
|
||||
client = _get_anthropic()
|
||||
message = client.messages.create(
|
||||
model="claude-sonnet-4-20250514",
|
||||
max_tokens=8192,
|
||||
messages=[
|
||||
{
|
||||
"role": "user",
|
||||
"content": (
|
||||
f"{EXTRACT_CLAIMS_PROMPT}\n\n"
|
||||
f"{context}\n\n"
|
||||
f"--- תחילת מסמך ---\n{sample}\n--- סוף מסמך ---"
|
||||
),
|
||||
}
|
||||
],
|
||||
)
|
||||
# For very long documents, split into chunks and merge results
|
||||
max_chars_per_call = 25000
|
||||
chunks = []
|
||||
if len(text) > max_chars_per_call:
|
||||
# Split at paragraph boundaries
|
||||
pos = 0
|
||||
while pos < len(text):
|
||||
end = min(pos + max_chars_per_call, len(text))
|
||||
if end < len(text):
|
||||
# Find paragraph break near the limit
|
||||
break_pos = text.rfind("\n\n", pos, end)
|
||||
if break_pos > pos + max_chars_per_call // 2:
|
||||
end = break_pos
|
||||
chunks.append(text[pos:end])
|
||||
pos = end
|
||||
logger.info("Document split into %d chunks (%d chars total)", len(chunks), len(text))
|
||||
else:
|
||||
chunks = [text]
|
||||
|
||||
raw = message.content[0].text.strip()
|
||||
claims = parse_llm_json(raw)
|
||||
if claims is None:
|
||||
logger.warning("Failed to parse claims response: %s", raw[:200])
|
||||
all_claims = []
|
||||
client = _get_anthropic()
|
||||
|
||||
for i, chunk in enumerate(chunks):
|
||||
chunk_label = f" (חלק {i+1}/{len(chunks)})" if len(chunks) > 1 else ""
|
||||
message = client.messages.create(
|
||||
model="claude-sonnet-4-20250514",
|
||||
max_tokens=8192,
|
||||
messages=[
|
||||
{
|
||||
"role": "user",
|
||||
"content": (
|
||||
f"{EXTRACT_CLAIMS_PROMPT}\n\n"
|
||||
f"{context}{chunk_label}\n\n"
|
||||
f"--- תחילת מסמך ---\n{chunk}\n--- סוף מסמך ---"
|
||||
),
|
||||
}
|
||||
],
|
||||
)
|
||||
|
||||
raw = message.content[0].text.strip()
|
||||
claims = parse_llm_json(raw)
|
||||
if claims is None:
|
||||
logger.warning("Failed to parse claims for chunk %d: %s", i, raw[:200])
|
||||
continue
|
||||
if isinstance(claims, list):
|
||||
all_claims.extend(claims)
|
||||
|
||||
claims = all_claims
|
||||
if not claims:
|
||||
return []
|
||||
|
||||
if not isinstance(claims, list):
|
||||
|
||||
Reference in New Issue
Block a user