fix(chunker): תיקון זיהוי כותרות טענות הצדדים ביחיד/נקבה + שלוש שכבות הגנה
**סיבת-שורש:** רג'קס respondent_claims כיסה רק צורת רבים (המשיבים/המשיבין), ולא יחיד נקבה (המשיבה) ויחיד זכר (המשיב). הכותרת "טענות המשיבה:" בתיק 8181-21 נבלעה לתוך מקטע ruling → חולצה כהלכה שגויה. **שלוש שכבות הגנה:** 1. chunker.py — הרחבת SECTION_PATTERNS לכסות יחיד/זכר/נקבה + תגובת/תשובת 2. halacha_extractor.py — עיגון חיובי: drop ל-ruling chunks שלפני legal_analysis 3. halacha_quality.py — FLAG_PARTY_CLAIM: זיהוי שפת-טענות-צד בציטוט התומך **היקף:** 93 תיקים עם 0 chunks של טענות (כנראה בגלל כותרות ביחיד שלא הוכרו); 628 הלכות מאושרות מתיקים אלה — חלקן עשויות להיות תקינות, יש לעשות re-chunk. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -22,8 +22,20 @@ from legal_mcp import config
|
|||||||
# court rulings use slightly different vocabulary (פסק דין, נימוקים, סוף דבר).
|
# court rulings use slightly different vocabulary (פסק דין, נימוקים, סוף דבר).
|
||||||
SECTION_PATTERNS = [
|
SECTION_PATTERNS = [
|
||||||
(r"רקע\s*עובדתי|רקע\s*כללי|העובדות|הרקע", "facts"),
|
(r"רקע\s*עובדתי|רקע\s*כללי|העובדות|הרקע", "facts"),
|
||||||
(r"טענות\s*העוררי[םן]|טענות\s*המערערי[םן]|עיקר\s*טענות\s*העוררי[םן]", "appellant_claims"),
|
# appellant_claims: covers singular (עורר/עוררת, מערער/מערערת) and plural
|
||||||
(r"טענות\s*המשיבי[םן]|תשובת\s*המשיבי[םן]|עיקר\s*טענות\s*המשיבי[םן]", "respondent_claims"),
|
# (עוררים/עוררין, מערערים). Previously only plural was matched, so headers
|
||||||
|
# like "טענות העורר:" were silently absorbed into the preceding section.
|
||||||
|
(
|
||||||
|
r"(?:טענות|עיקר\s*טענות)\s*ה(?:עוררי[םן]|עורר[ת]?|מערערי[םן]|מערער[ת]?)",
|
||||||
|
"appellant_claims",
|
||||||
|
),
|
||||||
|
# respondent_claims: covers singular (משיב/משיבה) and plural (משיבים/משיבין),
|
||||||
|
# plus the construct-noun headers תשובת/תגובת. "טענות המשיבה:" (feminine singular) was the
|
||||||
|
# root cause of halacha 8181-21 index-11 being extracted from party claims.
|
||||||
|
(
|
||||||
|
r"(?:טענות|תשובת|תגובת|עיקר\s*טענות)\s*ה(?:משיבי[םן]|משיב[ה]?)",
|
||||||
|
"respondent_claims",
|
||||||
|
),
|
||||||
(r"דיון\s*והכרעה|דיון|הכרעה|ניתוח\s*משפטי|המסגרת\s*המשפטית|נימוקים", "legal_analysis"),
|
(r"דיון\s*והכרעה|דיון|הכרעה|ניתוח\s*משפטי|המסגרת\s*המשפטית|נימוקים", "legal_analysis"),
|
||||||
(r"מסקנ[הות]|סיכום|סוף\s*דבר", "conclusion"),
|
(r"מסקנ[הות]|סיכום|סוף\s*דבר", "conclusion"),
|
||||||
(r"פסק[- ]?דין|החלטה|לפיכך\s*אני\s*מחליט|התוצאה", "ruling"),
|
(r"פסק[- ]?דין|החלטה|לפיכך\s*אני\s*מחליט|התוצאה", "ruling"),
|
||||||
|
|||||||
@@ -638,12 +638,20 @@ async def _select_extractable_chunks(
|
|||||||
extraction. Previously the fallback took *all* chunks, re-admitting exactly
|
extraction. Previously the fallback took *all* chunks, re-admitting exactly
|
||||||
the sections the primary filter excludes.
|
the sections the primary filter excludes.
|
||||||
|
|
||||||
|
Positive-anchor guard: when a document has a "דיון/הכרעה" section
|
||||||
|
(section_type='legal_analysis'), extraction starts from that section
|
||||||
|
onwards. Any 'ruling' chunks that appear BEFORE the first legal_analysis
|
||||||
|
chunk by position are dropped — they most likely result from a party-claims
|
||||||
|
section whose header was not recognised by the chunker and was therefore
|
||||||
|
absorbed into the preceding section.
|
||||||
|
|
||||||
Returns ``(chunks, used_fallback)`` so the caller can log the fallback once.
|
Returns ``(chunks, used_fallback)`` so the caller can log the fallback once.
|
||||||
"""
|
"""
|
||||||
chunks = await db.list_precedent_chunks(
|
chunks = await db.list_precedent_chunks(
|
||||||
case_law_id, section_types=EXTRACTABLE_SECTIONS,
|
case_law_id, section_types=EXTRACTABLE_SECTIONS,
|
||||||
)
|
)
|
||||||
if chunks:
|
if chunks:
|
||||||
|
chunks = _apply_discussion_anchor(chunks)
|
||||||
return chunks, False
|
return chunks, False
|
||||||
all_chunks = await db.list_precedent_chunks(case_law_id)
|
all_chunks = await db.list_precedent_chunks(case_law_id)
|
||||||
filtered = [
|
filtered = [
|
||||||
@@ -653,6 +661,37 @@ async def _select_extractable_chunks(
|
|||||||
return filtered, True
|
return filtered, True
|
||||||
|
|
||||||
|
|
||||||
|
def _apply_discussion_anchor(chunks: list[dict]) -> list[dict]:
    """Filter out 'ruling' chunks positioned ahead of the discussion anchor.

    The anchor is the smallest ``chunk_index`` among the chunks whose
    ``section_type`` is 'legal_analysis'. When no such chunk exists the input
    list is returned untouched. Otherwise a new list is returned that omits
    every 'ruling' chunk whose ``chunk_index`` is below the anchor; chunks of
    any other section type are kept regardless of their position.

    Rationale: in Israeli planning-committee decisions the discussion section
    (דיון / הכרעה / דיון והכרעה) follows the parties' claims, so a 'ruling'
    chunk that precedes it most likely holds a party-claims section whose
    header the chunker regex did not match.
    """
    anchor_candidates = [
        chunk["chunk_index"]
        for chunk in chunks
        if chunk.get("section_type") == "legal_analysis"
    ]
    if not anchor_candidates:
        # No discussion section: there is nothing to anchor on.
        return chunks

    first_analysis = min(anchor_candidates)
    kept: list[dict] = []
    for chunk in chunks:
        is_ruling = chunk.get("section_type") == "ruling"
        if is_ruling and chunk["chunk_index"] < first_analysis:
            continue  # pre-discussion 'ruling' chunk: skip it
        kept.append(chunk)

    dropped = len(chunks) - len(kept)
    if dropped:
        logger.info(
            "halacha_extractor: positive-anchor guard dropped %d pre-discussion "
            "'ruling' chunk(s) (first legal_analysis at chunk_index=%d)",
            dropped, first_analysis,
        )
    return kept
|
||||||
|
|
||||||
|
|
||||||
async def _extract_impl(case_law_id: UUID, force: bool = False,
|
async def _extract_impl(case_law_id: UUID, force: bool = False,
|
||||||
effort: str | None = None) -> dict:
|
effort: str | None = None) -> dict:
|
||||||
"""Core extraction (caller holds the global advisory lock for the duration).
|
"""Core extraction (caller holds the global advisory lock for the duration).
|
||||||
|
|||||||
@@ -282,6 +282,47 @@ FLAG_QUOTE_UNVERIFIED = "quote_unverified"
|
|||||||
FLAG_NLI_UNSUPPORTED = "nli_unsupported" # rule not entailed by its quote (#81.3)
|
FLAG_NLI_UNSUPPORTED = "nli_unsupported" # rule not entailed by its quote (#81.3)
|
||||||
FLAG_APPLICATION = "application" # fact-dependent, not a holding (#81.4)
|
FLAG_APPLICATION = "application" # fact-dependent, not a holding (#81.4)
|
||||||
FLAG_NEAR_DUPLICATE = "near_duplicate" # cosine-tail lexical dup (#82.3)
|
FLAG_NEAR_DUPLICATE = "near_duplicate" # cosine-tail lexical dup (#82.3)
|
||||||
|
FLAG_PARTY_CLAIM = "party_claim_language" # quote reads as a party's position, not the court's
|
||||||
|
|
||||||
|
|
||||||
|
# ── Party-claim language: the quote must be the court's words, not a party's ──
|
||||||
|
#
|
||||||
|
# Positive markers that a quote comes from a party's argument section rather
|
||||||
|
# than the court's own reasoning. The chunker now correctly classifies these
|
||||||
|
# sections, but a belt-and-suspenders lexical gate catches any case where
|
||||||
|
# the chunker still absorbs a party-claims section into a reasoning chunk
|
||||||
|
# (e.g. an unrecognised header variant). We scan the supporting_quote only —
|
||||||
|
# the rule_statement is already abstracted and should not contain these phrases.
|
||||||
|
|
||||||
|
# Substring markers checked in order by detect_party_claim_language().
# Within each family the longer forms come first: "לטענת העורר" is a prefix of
# "לטענת העוררת" / "לטענת העוררים", so listing the short form first would make
# the longer entries unreachable and the reported marker less specific.
_PARTY_CLAIM_MARKERS = (
    # Named-party attribution forms — always party-claim language, never court reasoning
    # "לטענת <party>" (according to the claim of ...)
    "לטענת העוררים",
    "לטענת העוררת",
    "לטענת העורר",
    "לטענת המערערים",
    "לטענת המערערת",
    "לטענת המערער",
    "לטענת המשיבים",
    "לטענת המשיבה",
    "לטענת המשיב",
    # "<argues> <party>" — verb precedes the named party
    "טוענת העוררת",
    "טוען העורר",
    "טוענת המערערת",
    "טוען המערער",
    "טוענת המשיבה",
    "טוען המשיב",
    # Excluded (too broad — courts also use these in their own reasoning):
    # "נטען כי", "נטען על ידי", "נטען על-ידי", "לטענתו", "לטענתה", "לטענתם"
)


def detect_party_claim_language(supporting_quote: str) -> str | None:
    """Return the first party-claim marker found in the quote (or None).

    "First" means first in ``_PARTY_CLAIM_MARKERS`` order, not earliest
    position inside the quote. Only the supporting_quote is scanned; the
    rule_statement is not looked at here. The quote is passed through
    ``normalize_text`` before the substring search.

    A match suggests the quote was taken from a party's argument section
    rather than from the court's own reasoning.

    Args:
        supporting_quote: The verbatim quote backing an extracted rule.

    Returns:
        The matching marker string, or None when no marker occurs or the
        quote is empty.
    """
    if not supporting_quote:
        # Nothing to scan; also keeps a missing quote away from normalize_text.
        return None
    norm = normalize_text(supporting_quote)
    return next(
        (marker for marker in _PARTY_CLAIM_MARKERS if marker in norm),
        None,
    )
|
||||||
|
|
||||||
|
|
||||||
# ── NLI entailment check (rule_statement ⊨ supporting_quote) — #81.3 ──
|
# ── NLI entailment check (rule_statement ⊨ supporting_quote) — #81.3 ──
|
||||||
@@ -417,4 +458,8 @@ def compute_quality_flags(
|
|||||||
# rule_type='application' and add a high-precision deixis catch.
|
# rule_type='application' and add a high-precision deixis catch.
|
||||||
if rule_type == "application" or is_fact_dependent(rule_statement):
|
if rule_type == "application" or is_fact_dependent(rule_statement):
|
||||||
flags.append(FLAG_APPLICATION)
|
flags.append(FLAG_APPLICATION)
|
||||||
|
# Belt-and-suspenders: if the quote contains party-claim language the
|
||||||
|
# chunker's section filter should have excluded, flag for manual review.
|
||||||
|
if detect_party_claim_language(supporting_quote):
|
||||||
|
flags.append(FLAG_PARTY_CLAIM)
|
||||||
return flags
|
return flags
|
||||||
|
|||||||
122
mcp-server/tests/test_chunker_section_patterns.py
Normal file
122
mcp-server/tests/test_chunker_section_patterns.py
Normal file
@@ -0,0 +1,122 @@
|
|||||||
|
"""Tests for the Hebrew section-header patterns in the chunker.
|
||||||
|
|
||||||
|
Focuses on the singular/feminine forms that were previously missing and
|
||||||
|
caused party-claims sections to bleed into the preceding section type.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from legal_mcp.services.chunker import chunk_document
|
||||||
|
|
||||||
|
|
||||||
|
def _section_types(text: str) -> list[str]:
    """Chunk *text* and list every chunk's section_type in output order."""
    chunks = chunk_document(text)
    return [chunk.section_type for chunk in chunks]
|
||||||
|
|
||||||
|
|
||||||
|
def _has_section(text: str, section: str) -> bool:
    """Tell whether chunking *text* yields at least one chunk labelled *section*."""
    return any(label == section for label in _section_types(text))
|
||||||
|
|
||||||
|
|
||||||
|
# ── respondent_claims patterns ──────────────────────────────────────────────
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("header", [
    "טענות המשיבים:",        # plural masculine — already matched before the fix
    "טענות המשיבין:",        # plural, alternative ending — already matched
    "תשובת המשיבים:",        # "reply of" + plural — already matched
    "טענות המשיבה:",         # singular feminine — BUG FIX (8181-21)
    "טענות המשיב:",          # singular masculine — BUG FIX
    "תשובת המשיבה:",         # "reply of" + singular feminine — BUG FIX
    "תגובת המשיבה:",         # "response of" + singular feminine — BUG FIX
    "תגובת המשיבים:",        # "response of" + plural — BUG FIX
    "עיקר טענות המשיבה:",    # "main claims" prefix + singular — BUG FIX
])
def test_respondent_claims_recognized(header):
    """Every respondent header variant must yield a respondent_claims chunk."""
    facts_body = "עובדות כלליות. " * 20
    claims_body = "הוועדה המקומית טוענת כי אין מקום לקבל את הערר. " * 15
    analysis_body = "לאחר בחינת הטענות אנו קובעים כי יש לקבל את הערר בחלקו. " * 15
    # Pieces are joined explicitly so each header sits on its own line,
    # separated from the previous section by a blank line.
    text = "".join([
        "רקע עובדתי\n",
        facts_body, "\n\n",
        f"{header}\n",
        claims_body, "\n\n",
        "דיון והכרעה\n",
        analysis_body,
    ])
    assert _has_section(text, "respondent_claims"), (
        f"Header '{header}' should produce respondent_claims chunk"
    )
|
||||||
|
|
||||||
|
|
||||||
|
# ── appellant_claims patterns ────────────────────────────────────────────────
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("header", [
    "טענות העוררים:",       # plural masculine — already matched before the fix
    "טענות העוררין:",       # plural, alternative ending — already matched
    "טענות המערערים:",      # מערערים plural — already matched
    "טענות העורר:",         # singular masculine — BUG FIX
    "טענות העוררת:",        # singular feminine — BUG FIX
    "טענות המערער:",        # מערער singular masculine — BUG FIX
    "טענות המערערת:",       # מערערת singular feminine — BUG FIX
    "עיקר טענות העורר:",    # "main claims" prefix + singular — BUG FIX
])
def test_appellant_claims_recognized(header):
    """Every appellant header variant must yield an appellant_claims chunk."""
    facts_body = "עובדות כלליות. " * 20
    claims_body = "ב\"כ העורר טוען כי יש לקבל את הערר ולבטל את ההחלטה. " * 15
    analysis_body = "לאחר בחינת הטענות אנו קובעים כי יש לקבל את הערר בחלקו. " * 15
    text = "".join([
        "רקע עובדתי\n",
        facts_body, "\n\n",
        f"{header}\n",
        claims_body, "\n\n",
        "דיון והכרעה\n",
        analysis_body,
    ])
    assert _has_section(text, "appellant_claims"), (
        f"Header '{header}' should produce appellant_claims chunk"
    )
|
||||||
|
|
||||||
|
|
||||||
|
# ── regression: existing plural forms still work ────────────────────────────
|
||||||
|
|
||||||
|
def test_regression_plural_respondent_and_legal_analysis():
    """Plural-header decision: both claims sections and the analysis must appear."""
    sections = [
        ("מבוא\n", "ערר זה הוגש על החלטת הוועדה המקומית לתכנון ובנייה. "),
        ("רקע עובדתי\n", "העורר רכש את הנכס בשנת 2010 ופנה לקבלת היתר בנייה. "),
        ("טענות העוררים:\n", "ב\"כ העוררים טוען כי נפל פגם יסודי בהחלטת הוועדה. "),
        ("טענות המשיבים:\n", "הוועדה המקומית סבורה כי ההחלטה תקינה ומבוססת. "),
        ("דיון והכרעה\n", "לאחר בחינת הטענות קובעים כי הערר מתקבל בחלקו. "),
    ]
    # Each section is its header line followed by ten repetitions of the
    # sentence; sections are separated by a blank line.
    text = "\n\n".join(heading + sentence * 10 for heading, sentence in sections)

    types = set(_section_types(text))
    for expected in ("appellant_claims", "respondent_claims", "legal_analysis"):
        assert expected in types
|
||||||
|
|
||||||
|
|
||||||
|
# ── party claims do NOT bleed into ruling ───────────────────────────────────
|
||||||
|
|
||||||
|
def test_respondent_singular_does_not_bleed_into_ruling():
    """Singular-feminine respondent header must open its own section.

    Before the fix, "טענות המשיבה:" was absorbed into the preceding section
    (here a 'ruling' section opened by "החלטה"). After the fix it must
    produce respondent_claims chunks, and the respondent's text must not
    show up in any 'ruling' chunk.
    """
    # A distinctive phrase taken verbatim from the respondent paragraph below.
    # The bleed check must search for text that really occurs in the fixture,
    # otherwise the assertion can never fail.
    respondent_phrase = "טוענת שאין כל השבחה"
    respondent_sentence = f"הוועדה המקומית {respondent_phrase} בנסיבות העניין. "
    text = (
        "החלטה\n"
        + "הוועדה דנה בערר שהוגש על החלטת הוועדה המקומית. " * 10 + "\n\n"
        + "טענות המשיבה:\n"
        + respondent_sentence * 10 + "\n\n"
        + "דיון\n"
        + "אנו בוחנים את הטענות לאחר עיון בחומר שהוגש. " * 10
    )
    chunks = chunk_document(text)

    # The respondent claims must get their own section label.
    respondent_chunks = [c for c in chunks if c.section_type == "respondent_claims"]
    assert len(respondent_chunks) > 0, (
        "Singular 'טענות המשיבה' must produce respondent_claims chunks, not bleed into ruling"
    )

    # The respondent text must not have ended up inside any ruling chunk.
    ruling_text = " ".join(c.content for c in chunks if c.section_type == "ruling")
    assert respondent_phrase not in ruling_text, (
        "Respondent claim text must not appear in ruling chunks"
    )
|
||||||
@@ -270,3 +270,55 @@ def test_lexical_near_duplicate_band():
|
|||||||
assert hq.lexical_near_duplicate(a, b) is True
|
assert hq.lexical_near_duplicate(a, b) is True
|
||||||
c = "המועד להגשת ערר על שומה הוא שלושים ימים"
|
c = "המועד להגשת ערר על שומה הוא שלושים ימים"
|
||||||
assert hq.lexical_near_duplicate(a, c) is False
|
assert hq.lexical_near_duplicate(a, c) is False
|
||||||
|
|
||||||
|
|
||||||
|
# ── party-claim language detector (FLAG_PARTY_CLAIM) ────────────────────────
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("quote", [
    # Quotes that attribute the statement to a named party
    "לטענת העורר אין בסיס לחיוב בהיטל",
    "לטענת המשיבה יש לדחות את הערר",
    "לטענת המשיב לא הייתה כל השבחה",
    "לטענת המשיבים ההחלטה תקינה",
    "טוענת המשיבה כי אין מקום לפיצוי",
])
def test_detect_party_claim_language_hits(quote):
    """Named-party attribution in the quote must be detected."""
    found = hq.detect_party_claim_language(quote)
    assert found is not None, (
        f"Quote '{quote[:40]}' should trigger party_claim detection"
    )
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("quote", [
    # The tribunal's own reasoning — must stay unflagged
    "ועדת הערר קובעת כי ההיתר תואם את התכנית",
    "מן הטעמים האמורים, הערר מתקבל",
    "לאחר בחינת הטענות אנו סבורים כי יש לדחות",
    # Broad passive forms that are deliberately not markers
    "נטען כי ההיתר ניתן שלא כדין",
    "נטען על ידי העורר כי השומה שגויה",
    # The bad 8181-21 halacha quote: it carries no party-claim marker
    "גם אם ניתן בעבר פטור לעוררת על בסיס פרשנות הוראות החוק שהשתנתה כיום הרי שיש לפעול בהתאם לפרשנות כיום",
])
def test_detect_party_claim_language_misses(quote):
    """Quotes without a named-party marker must not be detected."""
    found = hq.detect_party_claim_language(quote)
    assert found is None, (
        f"Quote '{quote[:40]}' must NOT trigger party_claim detection"
    )
|
||||||
|
|
||||||
|
|
||||||
|
def test_party_claim_flag_in_compute_quality_flags():
    """A quote carrying a party-claim marker must add FLAG_PARTY_CLAIM."""
    first_arg = "ביטול היתר מחייב בסיס חוקי"
    # Second positional argument; it contains the "לטענת העורר" marker.
    second_arg = "לטענת העורר ההיתר ניתן שלא כדין ויש לבטלו"
    flags = hq.compute_quality_flags(first_arg, second_arg, quote_verified=True)
    assert hq.FLAG_PARTY_CLAIM in flags
|
||||||
|
|
||||||
|
|
||||||
|
def test_court_quote_does_not_trigger_party_claim_flag():
    """A quote in the tribunal's own voice must not add FLAG_PARTY_CLAIM."""
    first_arg = "ביטול היתר מחייב בסיס חוקי מוצק"
    # Second positional argument; it contains no party-claim marker.
    second_arg = (
        "ועדת הערר קובעת כי ביטול היתר מחייב בסיס חוקי מוצק בדמות פגיעה ממשית באינטרס ציבורי"
    )
    flags = hq.compute_quality_flags(
        first_arg,
        second_arg,
        quote_verified=True,
        rule_type="holding",
    )
    assert hq.FLAG_PARTY_CLAIM not in flags
|
||||||
|
|||||||
Reference in New Issue
Block a user