fix(chunker): תיקון זיהוי כותרות טענות הצדדים ביחיד/נקבה + שלוש שכבות הגנה

**סיבת-שורש:** רג'קס respondent_claims כיסה רק צורת רבים (המשיבים/המשיבין), ולא יחיד נקבה (המשיבה) ויחיד זכר (המשיב). הכותרת "טענות המשיבה:" בתיק 8181-21 נבלעה לתוך מקטע ruling → חולצה כהלכה שגויה. **שלוש שכבות הגנה:** 1. chunker.py — הרחבת SECTION_PATTERNS לכסות יחיד/זכר/נקבה + תגובת/תשובת 2. halacha_extractor.py — עיגון חיובי: drop ל-ruling chunks שלפני legal_analysis 3. halacha_quality.py — FLAG_PARTY_CLAIM: זיהוי שפת-טענות-צד בציטוט התומך **היקף:** 93 תיקים עם 0 chunks של טענות (כנראה בגלל כותרות ביחיד שלא הוכרו); 628 הלכות מאושרות מתיקים אלה — חלקן עשויות להיות תקינות, יש לעשות re-chunk. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-06-17 15:12:05 +00:00
parent fa7fe85177
commit 42376db4c5
5 changed files with 275 additions and 5 deletions
--- a/mcp-server/src/legal_mcp/services/chunker.py
+++ b/mcp-server/src/legal_mcp/services/chunker.py
@@ -22,8 +22,20 @@ from legal_mcp import config
 # court rulings use slightly different vocabulary (פסק דין, נימוקים, סוף דבר).
 SECTION_PATTERNS = [
    (r"רקע\s*עובדתי|רקע\s*כללי|העובדות|הרקע", "facts"),
-    (r"טענות\s*העוררי[םן]|טענות\s*המערערי[םן]|עיקר\s*טענות\s*העוררי[םן]", "appellant_claims"),
+    # appellant_claims: covers singular (עורר/עוררת, מערער/מערערת) and plural
-    (r"טענות\s*המשיבי[םן]|תשובת\s*המשיבי[םן]|עיקר\s*טענות\s*המשיבי[םן]", "respondent_claims"),
+    # (עוררים/עוררין, מערערים). Previously only plural was matched, so headers
    # like "טענות העורר:" were silently absorbed into the preceding section.
    (
        r"(?:טענות|עיקר\s*טענות)\s*ה(?:עוררי[םן]|עורר[ת]?|מערערי[םן]|מערער[ת]?)",
        "appellant_claims",
    ),
    # respondent_claims: covers singular (משיב/משיבה) and plural (משיבים/משיבין),
    # plus verb forms תשובת/תגובת. "טענות המשיבה:" (feminine singular) was the
    # root cause of halacha 8181-21 index-11 being extracted from party claims.
    (
        r"(?:טענות|תשובת|תגובת|עיקר\s*טענות)\s*ה(?:משיבי[םן]|משיב[ה]?)",
        "respondent_claims",
    ),
    (r"דיון\s*והכרעה|דיון|הכרעה|ניתוח\s*משפטי|המסגרת\s*המשפטית|נימוקים", "legal_analysis"),
    (r"מסקנ[הות]|סיכום|סוף\s*דבר", "conclusion"),
    (r"פסק[- ]?דין|החלטה|לפיכך\s*אני\s*מחליט|התוצאה", "ruling"),
--- a/mcp-server/src/legal_mcp/services/halacha_extractor.py
+++ b/mcp-server/src/legal_mcp/services/halacha_extractor.py
@@ -638,12 +638,20 @@ async def _select_extractable_chunks(
    extraction. Previously the fallback took *all* chunks, re-admitting exactly
    the sections the primary filter excludes.
    Positive-anchor guard: when a document has a "דיון/הכרעה" section
    (section_type='legal_analysis'), extraction starts from that section
    onwards.  Any 'ruling' chunks that appear BEFORE the first legal_analysis
    chunk by position are dropped — they most likely result from a party-claims
    section whose header was not recognised by the chunker and was therefore
    absorbed into the preceding section.
    Returns ``(chunks, used_fallback)`` so the caller can log the fallback once.
    """
    chunks = await db.list_precedent_chunks(
        case_law_id, section_types=EXTRACTABLE_SECTIONS,
    )
    if chunks:
        chunks = _apply_discussion_anchor(chunks)
        return chunks, False
    all_chunks = await db.list_precedent_chunks(case_law_id)
    filtered = [
@@ -653,6 +661,37 @@ async def _select_extractable_chunks(
    return filtered, True
 def _apply_discussion_anchor(chunks: list[dict]) -> list[dict]:
    """Remove 'ruling' chunks positioned before the discussion section.

    The lowest ``chunk_index`` among the 'legal_analysis' chunks serves as an
    anchor.  The design assumption is that in Israeli planning-committee
    decisions the discussion (דיון / הכרעה / דיון והכרעה) follows the parties'
    claims, so a 'ruling'-labelled chunk located before the anchor most likely
    holds a party-claims section whose header the chunker failed to recognise.
    Only 'ruling' chunks are affected; every other section type is kept
    regardless of position.

    Args:
        chunks: Chunk dicts, read through their ``section_type`` and
            ``chunk_index`` keys.

    Returns:
        The very same list when no 'legal_analysis' chunk exists; otherwise a
        new list with the pre-discussion 'ruling' chunks left out.
    """
    anchor_candidates = [
        chunk["chunk_index"]
        for chunk in chunks
        if chunk.get("section_type") == "legal_analysis"
    ]
    if not anchor_candidates:
        # Without a discussion section there is nothing to anchor on.
        return chunks
    first_analysis = min(anchor_candidates)

    def _is_early_ruling(chunk: dict) -> bool:
        # True only for 'ruling' chunks strictly before the anchor position.
        return (
            chunk.get("section_type") == "ruling"
            and chunk["chunk_index"] < first_analysis
        )

    kept = [chunk for chunk in chunks if not _is_early_ruling(chunk)]
    removed_count = len(chunks) - len(kept)
    if removed_count > 0:
        logger.info(
            "halacha_extractor: positive-anchor guard dropped %d pre-discussion "
            "'ruling' chunk(s) (first legal_analysis at chunk_index=%d)",
            removed_count, first_analysis,
        )
    return kept
 async def _extract_impl(case_law_id: UUID, force: bool = False,
                        effort: str | None = None) -> dict:
    """Core extraction (caller holds the global advisory lock for the duration).
--- a/mcp-server/src/legal_mcp/services/halacha_quality.py
+++ b/mcp-server/src/legal_mcp/services/halacha_quality.py
@@ -282,6 +282,47 @@ FLAG_QUOTE_UNVERIFIED = "quote_unverified"
 FLAG_NLI_UNSUPPORTED = "nli_unsupported"   # rule not entailed by its quote (#81.3)
 FLAG_APPLICATION = "application"            # fact-dependent, not a holding (#81.4)
 FLAG_NEAR_DUPLICATE = "near_duplicate"      # cosine-tail lexical dup (#82.3)
 FLAG_PARTY_CLAIM = "party_claim_language"   # quote reads as a party's position, not the court's
 # ── Party-claim language: quote is the court's words, not a party's ──
 #
 # Positive markers that a quote comes from a party's argument section rather
 # than the court's own reasoning.  The chunker now correctly classifies these
 # sections, but a belt-and-suspenders lexical gate catches any case where
 # the chunker still absorbs a party-claims section into a reasoning chunk
 # (e.g. an unrecognised header variant).  We scan the supporting_quote only —
 # the rule_statement is already abstracted and should not contain these phrases.
 _PARTY_CLAIM_MARKERS = (
    # Phrases that attribute a statement to a named party: "according to the
    # appellant/respondent ..." and "the appellant/respondent argues ...",
    # in masculine, feminine and (for the first form) plural inflections.
    #
    # NOTE: detect_party_claim_language() walks this tuple in order and uses
    # plain substring matching.  The masculine-singular "according to ..."
    # entries are prefixes of their feminine/plural inflections, so (assuming
    # normalize_text leaves these letters intact) the shorter entry is the one
    # reported for all of them; the longer entries are kept for explicitness.
    #
    # Named-party attribution forms — always party-claim language, never court reasoning
    "לטענת העורר",
    "לטענת העוררת",
    "לטענת העוררים",
    "לטענת המשיב",
    "לטענת המשיבה",
    "לטענת המשיבים",
    "טוען העורר",
    "טוענת העוררת",
    "טוען המשיב",
    "טוענת המשיבה",
    # Excluded (too broad — courts also use these in their own reasoning):
    # "נטען כי", "נטען על ידי", "נטען על-ידי", "לטענתו", "לטענתה", "לטענתם"
 )
 def detect_party_claim_language(supporting_quote: str) -> str | None:
    """Find party-attribution wording inside a halacha's supporting quote.

    The quote is normalised and then checked, in tuple order, against every
    phrase in ``_PARTY_CLAIM_MARKERS`` using substring matching.  The
    rule_statement is deliberately not inspected, since it is already an
    abstraction of the quote.

    Args:
        supporting_quote: The quote backing the extracted rule.

    Returns:
        The first marker phrase contained in the normalised quote, or ``None``
        when no marker occurs.  A hit suggests the quote was lifted from a
        party's argument section rather than from the court's reasoning.
    """
    normalized_quote = normalize_text(supporting_quote)
    return next(
        (phrase for phrase in _PARTY_CLAIM_MARKERS if phrase in normalized_quote),
        None,
    )
 # ── NLI entailment check (rule_statement ⊨ supporting_quote) — #81.3 ──
@@ -417,4 +458,8 @@ def compute_quality_flags(
    # rule_type='application' and add a high-precision deixis catch.
    if rule_type == "application" or is_fact_dependent(rule_statement):
        flags.append(FLAG_APPLICATION)
    # Belt-and-suspenders: if the quote contains party-claim language the
    # chunker's section filter should have excluded, flag for manual review.
    if detect_party_claim_language(supporting_quote):
        flags.append(FLAG_PARTY_CLAIM)
    return flags
--- a/mcp-server/tests/test_chunker_section_patterns.py
+++ b/mcp-server/tests/test_chunker_section_patterns.py
@@ -0,0 +1,122 @@
 """Tests for the Hebrew section-header patterns in the chunker.
 Focuses on the singular/feminine forms that were previously missing and
 caused party-claims sections to bleed into the preceding section type.
 """
 from __future__ import annotations
 import pytest
 from legal_mcp.services.chunker import chunk_document
 def _section_types(text: str) -> list[str]:
    """Chunk *text* and list each chunk's ``section_type`` in output order."""
    chunks = chunk_document(text)
    return [chunk.section_type for chunk in chunks]
 def _has_section(text: str, section: str) -> bool:
    """Tell whether chunking *text* yields at least one chunk labelled *section*."""
    return any(label == section for label in _section_types(text))
 # ── respondent_claims patterns ──────────────────────────────────────────────
@pytest.mark.parametrize("header", [
    "טענות המשיבים:",          # plural masculine — matched before the fix
    "טענות המשיבין:",          # alternative plural ending — matched before the fix
    "תשובת המשיבים:",          # plural, "reply of" form — matched before the fix
    "טענות המשיבה:",           # singular feminine — BUG FIX (8181-21)
    "טענות המשיב:",            # singular masculine — BUG FIX
    "תשובת המשיבה:",           # singular feminine, "reply of" form — BUG FIX
    "תגובת המשיבה:",           # singular feminine, "response of" form — BUG FIX
    "תגובת המשיבים:",          # plural, "response of" form — BUG FIX
    "עיקר טענות המשיבה:",     # "main claims" prefix + singular — BUG FIX
])
def test_respondent_claims_recognized(header):
    """Every respondent header variant must open a respondent_claims section."""
    facts_body = "עובדות כלליות. " * 20
    claims_body = "הוועדה המקומית טוענת כי אין מקום לקבל את הערר. " * 15
    discussion_body = "לאחר בחינת הטענות אנו קובעים כי יש לקבל את הערר בחלקו. " * 15
    # Pieces are joined explicitly so each "\n\n" separator stays a distinct
    # element and cannot merge into the header that follows it.
    text = "".join([
        "רקע עובדתי\n", facts_body, "\n\n",
        f"{header}\n", claims_body, "\n\n",
        "דיון והכרעה\n", discussion_body,
    ])
    recognised = _has_section(text, "respondent_claims")
    assert recognised, f"Header '{header}' should produce respondent_claims chunk"
 # ── appellant_claims patterns ────────────────────────────────────────────────
@pytest.mark.parametrize("header", [
    "טענות העוררים:",          # plural masculine — matched before the fix
    "טענות העוררין:",          # alternative plural ending — matched before the fix
    "טענות המערערים:",         # plural, second appellant term — matched before the fix
    "טענות העורר:",            # singular masculine — BUG FIX
    "טענות העוררת:",           # singular feminine — BUG FIX
    "טענות המערער:",           # singular masculine, second appellant term — BUG FIX
    "טענות המערערת:",          # singular feminine, second appellant term — BUG FIX
    "עיקר טענות העורר:",      # "main claims" prefix + singular — BUG FIX
])
def test_appellant_claims_recognized(header):
    """Every appellant header variant must open an appellant_claims section."""
    # Three sections (facts, appellant claims, discussion), each a header line
    # followed by a repeated filler sentence, separated by blank lines.
    facts = "רקע עובדתי\n" + "עובדות כלליות. " * 20
    claims = f"{header}\n" + "ב\"כ העורר טוען כי יש לקבל את הערר ולבטל את ההחלטה. " * 15
    discussion = "דיון והכרעה\n" + "לאחר בחינת הטענות אנו קובעים כי יש לקבל את הערר בחלקו. " * 15
    text = "\n\n".join((facts, claims, discussion))
    recognised = _has_section(text, "appellant_claims")
    assert recognised, f"Header '{header}' should produce appellant_claims chunk"
 # ── regression: existing plural forms still work ────────────────────────────
 def test_regression_plural_respondent_and_legal_analysis():
    """Plural-form headers in a full decision still yield every expected section."""
    # (header, filler sentence) per section, in document order.
    sections = [
        ("מבוא", "ערר זה הוגש על החלטת הוועדה המקומית לתכנון ובנייה. "),
        ("רקע עובדתי", "העורר רכש את הנכס בשנת 2010 ופנה לקבלת היתר בנייה. "),
        ("טענות העוררים:", "ב\"כ העוררים טוען כי נפל פגם יסודי בהחלטת הוועדה. "),
        ("טענות המשיבים:", "הוועדה המקומית סבורה כי ההחלטה תקינה ומבוססת. "),
        ("דיון והכרעה", "לאחר בחינת הטענות קובעים כי הערר מתקבל בחלקו. "),
    ]
    # Each section is "<header>\n" plus the sentence ten times; sections are
    # separated by a blank line, with no trailing separator after the last.
    text = "\n\n".join(
        f"{header}\n" + sentence * 10 for header, sentence in sections
    )
    found = set(_section_types(text))
    for required in ("appellant_claims", "respondent_claims", "legal_analysis"):
        assert required in found
 # ── party claims do NOT bleed into the preceding ruling section ─────────────
 def test_respondent_singular_does_not_bleed_into_ruling():
    """A singular-feminine respondent header must start its own section.

    Before the fix, "טענות המשיבה:" was absorbed into the preceding section
    (here a 'ruling' section opened by "החלטה").  After the fix the header must
    produce respondent_claims chunks, and the respondent's sentence must not
    appear in any 'ruling' chunk.
    """
    # The claim sentence is defined once and reused in the negative assertion
    # below, so the searched substring cannot drift from the fixture text.
    # (Previously the assertion looked for a phrase that did not occur in the
    # document at all, so it passed vacuously.)
    claim_sentence = "הוועדה המקומית טוענת שאין כל השבחה בנסיבות העניין. "
    text = (
        "החלטה\n"
        + "הוועדה דנה בערר שהוגש על החלטת הוועדה המקומית. " * 10 + "\n\n"
        + "טענות המשיבה:\n"
        + claim_sentence * 10 + "\n\n"
        + "דיון\n"
        + "אנו בוחנים את הטענות לאחר עיון בחומר שהוגש. " * 10
    )
    chunks = chunk_document(text)

    # The respondent claims must be labelled as such, not remain 'ruling'.
    respondent_chunks = [c for c in chunks if c.section_type == "respondent_claims"]
    assert respondent_chunks, (
        "Singular 'טענות המשיבה' must produce respondent_claims chunks, not bleed into ruling"
    )

    # Verify the respondent text didn't end up in ruling chunks.
    ruling_text = " ".join(c.content for c in chunks if c.section_type == "ruling")
    assert claim_sentence.strip() not in ruling_text, (
        "Respondent claim text must not appear in ruling chunks"
    )
--- a/mcp-server/tests/test_halacha_quality.py
+++ b/mcp-server/tests/test_halacha_quality.py
@@ -270,3 +270,55 @@ def test_lexical_near_duplicate_band():
    assert hq.lexical_near_duplicate(a, b) is True
    c = "המועד להגשת ערר על שומה הוא שלושים ימים"
    assert hq.lexical_near_duplicate(a, c) is False
 # ── party-claim language detector (FLAG_PARTY_CLAIM) ────────────────────────
@pytest.mark.parametrize("quote", [
    # Quotes that open with a named-party attribution phrase
    "לטענת העורר אין בסיס לחיוב בהיטל",
    "לטענת המשיבה יש לדחות את הערר",
    "לטענת המשיב לא הייתה כל השבחה",
    "לטענת המשיבים ההחלטה תקינה",
    "טוענת המשיבה כי אין מקום לפיצוי",
])
def test_detect_party_claim_language_hits(quote):
    """Quotes attributing a position to a named party must yield a marker."""
    matched_marker = hq.detect_party_claim_language(quote)
    assert matched_marker is not None, (
        f"Quote '{quote[:40]}' should trigger party_claim detection"
    )
@pytest.mark.parametrize("quote", [
    # The tribunal's own reasoning — no marker expected
    "ועדת הערר קובעת כי ההיתר תואם את התכנית",
    "מן הטעמים האמורים, הערר מתקבל",
    "לאחר בחינת הטענות אנו סבורים כי יש לדחות",
    # Broad passive wording that is deliberately left out of the marker list
    "נטען כי ההיתר ניתן שלא כדין",
    "נטען על ידי העורר כי השומה שגויה",
    # Quote of the bad 8181-21 halacha — it contains none of the markers
    "גם אם ניתן בעבר פטור לעוררת על בסיס פרשנות הוראות החוק שהשתנתה כיום הרי שיש לפעול בהתאם לפרשנות כיום",
])
def test_detect_party_claim_language_misses(quote):
    """Quotes without a named-party attribution phrase must yield no marker."""
    matched_marker = hq.detect_party_claim_language(quote)
    assert matched_marker is None, (
        f"Quote '{quote[:40]}' must NOT trigger party_claim detection"
    )
 def test_party_claim_flag_in_compute_quality_flags():
    """A quote carrying a party-attribution marker gets FLAG_PARTY_CLAIM."""
    rule = "ביטול היתר מחייב בסיס חוקי"
    party_quote = "לטענת העורר ההיתר ניתן שלא כדין ויש לבטלו"
    computed_flags = hq.compute_quality_flags(rule, party_quote, quote_verified=True)
    assert hq.FLAG_PARTY_CLAIM in computed_flags
 def test_court_quote_does_not_trigger_party_claim_flag():
    """A quote in the tribunal's own voice must not get FLAG_PARTY_CLAIM."""
    rule = "ביטול היתר מחייב בסיס חוקי מוצק"
    court_quote = (
        "ועדת הערר קובעת כי ביטול היתר מחייב בסיס חוקי מוצק בדמות פגיעה ממשית באינטרס ציבורי"
    )
    computed_flags = hq.compute_quality_flags(
        rule,
        court_quote,
        quote_verified=True,
        rule_type="holding",
    )
    assert hq.FLAG_PARTY_CLAIM not in computed_flags