fix(chunker): תיקון זיהוי כותרות טענות הצדדים ביחיד/נקבה + שלוש שכבות הגנה
All checks were successful
G12 Leak-Guard / leak-guard (pull_request) Successful in 4s
Lint — undefined names / undefined-names (pull_request) Successful in 9s

**סיבת-שורש:** רג'קס respondent_claims כיסה רק צורת רבים (המשיבים/המשיבין),
ולא יחיד נקבה (המשיבה) ויחיד זכר (המשיב). הכותרת "טענות המשיבה:" בתיק 8181-21
נבלעה לתוך מקטע ruling → חולצה כהלכה שגויה.

**שלוש שכבות הגנה:**
1. chunker.py — הרחבת SECTION_PATTERNS לכסות יחיד/זכר/נקבה + תגובת/תשובת
2. halacha_extractor.py — עיגון חיובי: drop ל-ruling chunks שלפני legal_analysis
3. halacha_quality.py — FLAG_PARTY_CLAIM: זיהוי שפת-טענות-צד בציטוט התומך

**היקף:** 93 תיקים עם 0 chunks של טענות (כנראה בגלל כותרות ביחיד שלא הוכרו);
628 הלכות מאושרות מתיקים אלה — חלקן עשויות להיות תקינות, יש לעשות re-chunk.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-06-17 15:12:05 +00:00
parent fa7fe85177
commit 42376db4c5
5 changed files with 275 additions and 5 deletions

View File

@@ -22,8 +22,20 @@ from legal_mcp import config
# court rulings use slightly different vocabulary (פסק דין, נימוקים, סוף דבר).
SECTION_PATTERNS = [
(r"רקע\s*עובדתי|רקע\s*כללי|העובדות|הרקע", "facts"),
(r"טענות\s*העוררי[םן]|טענות\s*המערערי[םן]|עיקר\s*טענות\s*העוררי[םן]", "appellant_claims"),
(r"טענות\s*המשיבי[םן]|תשובת\s*המשיבי[םן]|עיקר\s*טענות\s*המשיבי[םן]", "respondent_claims"),
# appellant_claims: covers singular (עורר/עוררת, מערער/מערערת) and plural
# (עוררים/עוררין, מערערים/מערערין). Previously only plural was matched, so
# headers like "טענות העורר:" were silently absorbed into the preceding section.
(
r"(?:טענות|עיקר\s*טענות)\s*ה(?:עוררי[םן]|עורר[ת]?|מערערי[םן]|מערער[ת]?)",
"appellant_claims",
),
# respondent_claims: covers singular (משיב/משיבה) and plural (משיבים/משיבין),
# plus the header nouns תשובת/תגובת ("reply of" / "response of"). The header
# "טענות המשיבה:" (feminine singular) was the root cause of halacha 8181-21
# index-11 being extracted from party claims.
(
r"(?:טענות|תשובת|תגובת|עיקר\s*טענות)\s*ה(?:משיבי[םן]|משיב[ה]?)",
"respondent_claims",
),
(r"דיון\s*והכרעה|דיון|הכרעה|ניתוח\s*משפטי|המסגרת\s*המשפטית|נימוקים", "legal_analysis"),
(r"מסקנ[הות]|סיכום|סוף\s*דבר", "conclusion"),
(r"פסק[- ]?דין|החלטה|לפיכך\s*אני\s*מחליט|התוצאה", "ruling"),

View File

@@ -638,12 +638,20 @@ async def _select_extractable_chunks(
extraction. Previously the fallback took *all* chunks, re-admitting exactly
the sections the primary filter excludes.
Positive-anchor guard: when a document has a "דיון/הכרעה" section
(section_type='legal_analysis'), extraction starts from that section
onwards. Any 'ruling' chunks that appear BEFORE the first legal_analysis
chunk by position are dropped — they most likely result from a party-claims
section whose header was not recognised by the chunker and was therefore
absorbed into the preceding section.
Returns ``(chunks, used_fallback)`` so the caller can log the fallback once.
"""
chunks = await db.list_precedent_chunks(
case_law_id, section_types=EXTRACTABLE_SECTIONS,
)
if chunks:
chunks = _apply_discussion_anchor(chunks)
return chunks, False
all_chunks = await db.list_precedent_chunks(case_law_id)
filtered = [
@@ -653,6 +661,37 @@ async def _select_extractable_chunks(
return filtered, True
def _apply_discussion_anchor(chunks: list[dict]) -> list[dict]:
"""Drop 'ruling' chunks that precede the first 'legal_analysis' chunk.
In Israeli planning-committee decisions the discussion section
(דיון / הכרעה / דיון והכרעה) always comes after the parties' claims.
A 'ruling'-labelled chunk that appears before the discussion is a strong
signal that a party-claims section was silently absorbed into it (chunker
regex didn't match the header). Dropping those early 'ruling' chunks is
safe because all reasoning content falls at or after the 'דיון' anchor.
"""
analysis_indices = [
c["chunk_index"] for c in chunks
if c.get("section_type") == "legal_analysis"
]
if not analysis_indices:
return chunks
first_analysis = min(analysis_indices)
filtered = [
c for c in chunks
if not (c.get("section_type") == "ruling" and c["chunk_index"] < first_analysis)
]
dropped = len(chunks) - len(filtered)
if dropped:
logger.info(
"halacha_extractor: positive-anchor guard dropped %d pre-discussion "
"'ruling' chunk(s) (first legal_analysis at chunk_index=%d)",
dropped, first_analysis,
)
return filtered
async def _extract_impl(case_law_id: UUID, force: bool = False,
effort: str | None = None) -> dict:
"""Core extraction (caller holds the global advisory lock for the duration).

View File

@@ -279,9 +279,50 @@ FLAG_NON_DECISION = "non_decision"
FLAG_TRUNCATED_QUOTE = "truncated_quote"
FLAG_THIN_RESTATEMENT = "thin_restatement"
FLAG_QUOTE_UNVERIFIED = "quote_unverified"
FLAG_NLI_UNSUPPORTED = "nli_unsupported" # rule not entailed by its quote (#81.3)
FLAG_APPLICATION = "application" # fact-dependent, not a holding (#81.4)
FLAG_NEAR_DUPLICATE = "near_duplicate" # cosine-tail lexical dup (#82.3)
FLAG_NLI_UNSUPPORTED = "nli_unsupported" # rule not entailed by its quote (#81.3)
FLAG_APPLICATION = "application" # fact-dependent, not a holding (#81.4)
FLAG_NEAR_DUPLICATE = "near_duplicate" # cosine-tail lexical dup (#82.3)
FLAG_PARTY_CLAIM = "party_claim_language" # quote reads as a party's position, not the court's
# ── Party-claim language: the quote should be the court's words, not a party's ──
#
# Positive markers that a quote comes from a party's argument section rather
# than the court's own reasoning. The chunker now correctly classifies these
# sections, but a belt-and-suspenders lexical gate catches any case where
# the chunker still absorbs a party-claims section into a reasoning chunk
# (e.g. an unrecognised header variant). We scan the supporting_quote only —
# the rule_statement is already abstracted and should not contain these phrases.
_PARTY_CLAIM_MARKERS = (
# Named-party attribution forms — always party-claim language, never court reasoning
"לטענת העורר",
"לטענת העוררת",
"לטענת העוררים",
"לטענת המשיב",
"לטענת המשיבה",
"לטענת המשיבים",
"טוען העורר",
"טוענת העוררת",
"טוען המשיב",
"טוענת המשיבה",
# Excluded (too broad — courts also use these in their own reasoning):
# "נטען כי", "נטען על ידי", "נטען על-ידי", "לטענתו", "לטענתה", "לטענתם"
)
def detect_party_claim_language(supporting_quote: str) -> str | None:
    """Look for party-claim wording in a supporting quote.

    The quote is passed through ``normalize_text`` and then tested, by plain
    substring containment, against each entry of ``_PARTY_CLAIM_MARKERS`` in
    tuple order. Only the quote is examined; the rule statement is not an
    input to this check.

    Args:
        supporting_quote: The quote text backing an extracted rule.

    Returns:
        The first marker (in tuple order) contained in the normalized quote,
        or ``None`` when none occurs. A hit suggests the quote was taken from
        a party's argument rather than from the court's own reasoning.
    """
    normalized_quote = normalize_text(supporting_quote)
    return next(
        (
            candidate
            for candidate in _PARTY_CLAIM_MARKERS
            if candidate in normalized_quote
        ),
        None,
    )
# ── NLI entailment check (rule_statement ⊨ supporting_quote) — #81.3 ──
@@ -417,4 +458,8 @@ def compute_quality_flags(
# rule_type='application' and add a high-precision deixis catch.
if rule_type == "application" or is_fact_dependent(rule_statement):
flags.append(FLAG_APPLICATION)
# Belt-and-suspenders: if the quote contains party-claim language the
# chunker's section filter should have excluded, flag for manual review.
if detect_party_claim_language(supporting_quote):
flags.append(FLAG_PARTY_CLAIM)
return flags