fix(chunker): תיקון זיהוי כותרות טענות הצדדים ביחיד/נקבה + שלוש שכבות הגנה
**סיבת-שורש:** רג'קס respondent_claims כיסה רק צורת רבים (המשיבים/המשיבין), ולא יחיד נקבה (המשיבה) ויחיד זכר (המשיב). הכותרת "טענות המשיבה:" בתיק 8181-21 נבלעה לתוך מקטע ruling → חולצה כהלכה שגויה.

**שלוש שכבות הגנה:**
1. chunker.py — הרחבת SECTION_PATTERNS לכסות יחיד/זכר/נקבה + תגובת/תשובת
2. halacha_extractor.py — עיגון חיובי: drop ל-ruling chunks שלפני legal_analysis
3. halacha_quality.py — FLAG_PARTY_CLAIM: זיהוי שפת-טענות-צד בציטוט התומך

**היקף:** 93 תיקים עם 0 chunks של טענות (כנראה בגלל כותרות ביחיד שלא הוכרו); 628 הלכות מאושרות מתיקים אלה — חלקן עשויות להיות תקינות, יש לעשות re-chunk.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -279,9 +279,50 @@ FLAG_NON_DECISION = "non_decision"
|
||||
# Quality-flag identifiers. Each constant is the string label stored for a flag;
# every label is defined exactly once.
FLAG_TRUNCATED_QUOTE = "truncated_quote"
FLAG_THIN_RESTATEMENT = "thin_restatement"
FLAG_QUOTE_UNVERIFIED = "quote_unverified"
FLAG_NLI_UNSUPPORTED = "nli_unsupported"  # rule not entailed by its quote (#81.3)
FLAG_APPLICATION = "application"  # fact-dependent, not a holding (#81.4)
FLAG_NEAR_DUPLICATE = "near_duplicate"  # cosine-tail lexical dup (#82.3)
FLAG_PARTY_CLAIM = "party_claim_language"  # quote reads as a party's position, not the court's
|
||||
|
||||
|
||||
# ── Party-claim language: the quote must be the court's words, not a party's ──
|
||||
#
|
||||
# Positive markers that a quote comes from a party's argument section rather
|
||||
# than the court's own reasoning. The chunker now correctly classifies these
|
||||
# sections, but a belt-and-suspenders lexical gate catches any case where
|
||||
# the chunker still absorbs a party-claims section into a reasoning chunk
|
||||
# (e.g. an unrecognised header variant). We scan the supporting_quote only —
|
||||
# the rule_statement is already abstracted and should not contain these phrases.
|
||||
|
||||
# Phrases that attribute a statement to a named party (appellant / respondent).
# Entries are matched as plain substrings, so a shorter entry such as
# "לטענת העורר" also matches its longer feminine/plural variants; the longer
# forms are listed anyway so the covered set is explicit.
_PARTY_CLAIM_MARKERS = (
    # Named-party attribution forms — always party-claim language, never court reasoning
    # "according to the claim of <party>": masculine / feminine / plural
    "לטענת העורר",
    "לטענת העוררת",
    "לטענת העוררים",
    "לטענת המשיב",
    "לטענת המשיבה",
    "לטענת המשיבים",
    # "<party> argues": masculine / feminine / plural
    "טוען העורר",
    "טוענת העוררת",
    "טוענים העוררים",
    "טוען המשיב",
    "טוענת המשיבה",
    "טוענים המשיבים",
    # Excluded (too broad — courts also use these in their own reasoning):
    # "נטען כי", "נטען על ידי", "נטען על-ידי", "לטענתו", "לטענתה", "לטענתם"
)
|
||||
|
||||
|
||||
def detect_party_claim_language(supporting_quote: str) -> str | None:
|
||||
"""Return the first party-claim marker found in the quote (or None).
|
||||
|
||||
Only the supporting_quote is scanned — rule_statement is already abstracted.
|
||||
A match means the LLM likely extracted from a party's argument section
|
||||
rather than the court's reasoning.
|
||||
"""
|
||||
norm = normalize_text(supporting_quote)
|
||||
for marker in _PARTY_CLAIM_MARKERS:
|
||||
if marker in norm:
|
||||
return marker
|
||||
return None
|
||||
|
||||
|
||||
# ── NLI entailment check (rule_statement ⊨ supporting_quote) — #81.3 ──
|
||||
@@ -417,4 +458,8 @@ def compute_quality_flags(
|
||||
# rule_type='application' and add a high-precision deixis catch.
|
||||
if rule_type == "application" or is_fact_dependent(rule_statement):
|
||||
flags.append(FLAG_APPLICATION)
|
||||
# Belt-and-suspenders: if the quote contains party-claim language the
|
||||
# chunker's section filter should have excluded, flag for manual review.
|
||||
if detect_party_claim_language(supporting_quote):
|
||||
flags.append(FLAG_PARTY_CLAIM)
|
||||
return flags
|
||||
|
||||
Reference in New Issue
Block a user