From 42376db4c52c0da7756807e7fc34f8d2c63b4853 Mon Sep 17 00:00:00 2001
From: Chaim <chaim@marcus-law.co.il>
Date: Wed, 17 Jun 2026 15:12:05 +0000
Subject: [PATCH] =?UTF-8?q?fix(chunker):=20=D7=AA=D7=99=D7=A7=D7=95=D7=9F?=
 =?UTF-8?q?=20=D7=96=D7=99=D7=94=D7=95=D7=99=20=D7=9B=D7=95=D7=AA=D7=A8?=
 =?UTF-8?q?=D7=95=D7=AA=20=D7=98=D7=A2=D7=A0=D7=95=D7=AA=20=D7=94=D7=A6?=
 =?UTF-8?q?=D7=93=D7=93=D7=99=D7=9D=20=D7=91=D7=99=D7=97=D7=99=D7=93/?=
 =?UTF-8?q?=D7=A0=D7=A7=D7=91=D7=94=20+=20=D7=A9=D7=9C=D7=95=D7=A9=20?=
 =?UTF-8?q?=D7=A9=D7=9B=D7=91=D7=95=D7=AA=20=D7=94=D7=92=D7=A0=D7=94?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

**סיבת-שורש:** רג'קס respondent_claims כיסה רק צורת רבים (המשיבים/המשיבין),
ולא יחיד נקבה (המשיבה) ויחיד זכר (המשיב). הכותרת "טענות המשיבה:" בתיק 8181-21
נבלעה לתוך מקטע ruling → חולצה כהלכה שגויה.

**שלוש שכבות הגנה:**
1. chunker.py — הרחבת SECTION_PATTERNS לכסות יחיד/זכר/נקבה + תגובת/תשובת
2. halacha_extractor.py — עיגון חיובי: drop ל-ruling chunks שלפני legal_analysis
3. halacha_quality.py — FLAG_PARTY_CLAIM: זיהוי שפת-טענות-צד בציטוט התומך

**היקף:** 93 תיקים עם 0 chunks של טענות (כנראה בגלל כותרות ביחיד שלא הוכרו);
628 הלכות מאושרות מתיקים אלה — חלקן עשויות להיות תקינות, יש לעשות re-chunk.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 mcp-server/src/legal_mcp/services/chunker.py  |  16 ++-
 .../legal_mcp/services/halacha_extractor.py   |  39 ++++++
 .../src/legal_mcp/services/halacha_quality.py |  51 +++++++-
 .../tests/test_chunker_section_patterns.py    | 122 ++++++++++++++++++
 mcp-server/tests/test_halacha_quality.py      |  52 ++++++++
 5 files changed, 275 insertions(+), 5 deletions(-)
 create mode 100644 mcp-server/tests/test_chunker_section_patterns.py

diff --git a/mcp-server/src/legal_mcp/services/chunker.py b/mcp-server/src/legal_mcp/services/chunker.py
index af3320a..e41656d 100644
--- a/mcp-server/src/legal_mcp/services/chunker.py
+++ b/mcp-server/src/legal_mcp/services/chunker.py
@@ -22,8 +22,20 @@ from legal_mcp import config
 # court rulings use slightly different vocabulary (פסק דין, נימוקים, סוף דבר).
 SECTION_PATTERNS = [
     (r"רקע\s*עובדתי|רקע\s*כללי|העובדות|הרקע", "facts"),
-    (r"טענות\s*העוררי[םן]|טענות\s*המערערי[םן]|עיקר\s*טענות\s*העוררי[םן]", "appellant_claims"),
-    (r"טענות\s*המשיבי[םן]|תשובת\s*המשיבי[םן]|עיקר\s*טענות\s*המשיבי[םן]", "respondent_claims"),
+    # appellant_claims: covers singular (עורר/עוררת, מערער/מערערת) and plural
+    # (עוררים/עוררין, מערערים). Previously only plural was matched, so headers
+    # like "טענות העורר:" were silently absorbed into the preceding section.
+    (
+        r"(?:טענות|עיקר\s*טענות)\s*ה(?:עוררי[םן]|עורר[ת]?|מערערי[םן]|מערער[ת]?)",
+        "appellant_claims",
+    ),
+    # respondent_claims: covers singular (משיב/משיבה) and plural (משיבים/משיבין),
+    # plus the "response of" headers תשובת/תגובת. An unmatched "טענות המשיבה:"
+    # (feminine singular) is why halacha 8181-21 index-11 came from party claims.
+    (
+        r"(?:טענות|תשובת|תגובת|עיקר\s*טענות)\s*ה(?:משיבי[םן]|משיב[ה]?)",
+        "respondent_claims",
+    ),
     (r"דיון\s*והכרעה|דיון|הכרעה|ניתוח\s*משפטי|המסגרת\s*המשפטית|נימוקים", "legal_analysis"),
     (r"מסקנ[הות]|סיכום|סוף\s*דבר", "conclusion"),
     (r"פסק[- ]?דין|החלטה|לפיכך\s*אני\s*מחליט|התוצאה", "ruling"),
diff --git a/mcp-server/src/legal_mcp/services/halacha_extractor.py b/mcp-server/src/legal_mcp/services/halacha_extractor.py
index c83373c..796c011 100644
--- a/mcp-server/src/legal_mcp/services/halacha_extractor.py
+++ b/mcp-server/src/legal_mcp/services/halacha_extractor.py
@@ -638,12 +638,20 @@ async def _select_extractable_chunks(
     extraction. Previously the fallback took *all* chunks, re-admitting exactly
     the sections the primary filter excludes.
 
+    Positive-anchor guard: when a document has a "דיון/הכרעה" section
+    (section_type='legal_analysis'), extraction starts from that section
+    onwards.  Any 'ruling' chunks that appear BEFORE the first legal_analysis
+    chunk by position are dropped — they most likely result from a party-claims
+    section whose header was not recognised by the chunker and was therefore
+    absorbed into the preceding section.
+
     Returns ``(chunks, used_fallback)`` so the caller can log the fallback once.
     """
     chunks = await db.list_precedent_chunks(
         case_law_id, section_types=EXTRACTABLE_SECTIONS,
     )
     if chunks:
+        chunks = _apply_discussion_anchor(chunks)
         return chunks, False
     all_chunks = await db.list_precedent_chunks(case_law_id)
     filtered = [
@@ -653,6 +661,37 @@ async def _select_extractable_chunks(
     return filtered, True
 
 
+def _apply_discussion_anchor(chunks: list[dict]) -> list[dict]:
+    """Filter out 'ruling' chunks positioned before the discussion section.
+
+    The anchor is the smallest ``chunk_index`` among 'legal_analysis' chunks.
+    A 'ruling' chunk with a lower ``chunk_index`` is treated as a mislabelled
+    party-claims section (header not recognised by the chunker) and removed;
+    every other chunk is kept, in the original order.  When there is no
+    'legal_analysis' chunk the input list is returned as-is.  The number of
+    dropped chunks is logged at INFO level.
+    """
+    anchor = min(
+        (c["chunk_index"] for c in chunks if c.get("section_type") == "legal_analysis"),
+        default=None,
+    )
+    if anchor is None:
+        return chunks
+    # Keep a chunk unless it is BOTH labelled 'ruling' AND before the anchor.
+    kept = [
+        c for c in chunks
+        if c.get("section_type") != "ruling" or c["chunk_index"] >= anchor
+    ]
+    n_dropped = len(chunks) - len(kept)
+    if n_dropped:
+        logger.info(
+            "halacha_extractor: positive-anchor guard dropped %d pre-discussion "
+            "'ruling' chunk(s) (first legal_analysis at chunk_index=%d)",
+            n_dropped, anchor,
+        )
+    return kept
+
+
 async def _extract_impl(case_law_id: UUID, force: bool = False,
                         effort: str | None = None) -> dict:
     """Core extraction (caller holds the global advisory lock for the duration).
diff --git a/mcp-server/src/legal_mcp/services/halacha_quality.py b/mcp-server/src/legal_mcp/services/halacha_quality.py
index ee90d05..7948bee 100644
--- a/mcp-server/src/legal_mcp/services/halacha_quality.py
+++ b/mcp-server/src/legal_mcp/services/halacha_quality.py
@@ -279,9 +279,50 @@ FLAG_NON_DECISION = "non_decision"
 FLAG_TRUNCATED_QUOTE = "truncated_quote"
 FLAG_THIN_RESTATEMENT = "thin_restatement"
 FLAG_QUOTE_UNVERIFIED = "quote_unverified"
-FLAG_NLI_UNSUPPORTED = "nli_unsupported"  # rule not entailed by its quote (#81.3)
-FLAG_APPLICATION = "application"           # fact-dependent, not a holding (#81.4)
-FLAG_NEAR_DUPLICATE = "near_duplicate"     # cosine-tail lexical dup (#82.3)
+FLAG_NLI_UNSUPPORTED = "nli_unsupported"   # rule not entailed by its quote (#81.3)
+FLAG_APPLICATION = "application"            # fact-dependent, not a holding (#81.4)
+FLAG_NEAR_DUPLICATE = "near_duplicate"      # cosine-tail lexical dup (#82.3)
+FLAG_PARTY_CLAIM = "party_claim_language"   # quote reads as a party's position, not the court's
+
+
+# ── Party-claim language: the quote must be the court's words, not a party's ──
+#
+# Positive markers that a quote comes from a party's argument section rather
+# than the court's own reasoning.  The chunker now correctly classifies these
+# sections, but a belt-and-suspenders lexical gate catches any case where
+# the chunker still absorbs a party-claims section into a reasoning chunk
+# (e.g. an unrecognised header variant).  We scan the supporting_quote only —
+# the rule_statement is already abstracted and should not contain these phrases.
+
+_PARTY_CLAIM_MARKERS = (
+    # Named-party attribution forms — always party-claim language, never court reasoning
+    "לטענת העורר",
+    "לטענת העוררת",
+    "לטענת העוררים",
+    "לטענת המשיב",
+    "לטענת המשיבה",
+    "לטענת המשיבים",
+    "טוען העורר",
+    "טוענת העוררת",
+    "טוען המשיב",
+    "טוענת המשיבה",
+    # Excluded (too broad — courts also use these in their own reasoning):
+    # "נטען כי", "נטען על ידי", "נטען על-ידי", "לטענתו", "לטענתה", "לטענתם"
+)
+
+
+def detect_party_claim_language(supporting_quote: str) -> str | None:
+    """Find party-attribution wording inside a supporting quote.
+
+    The quote is passed through ``normalize_text`` and then searched for each
+    entry of ``_PARTY_CLAIM_MARKERS`` in tuple order.  Returns the first marker
+    that occurs as a substring, or ``None`` when none of them occurs.
+    """
+    haystack = normalize_text(supporting_quote)
+    return next(
+        (phrase for phrase in _PARTY_CLAIM_MARKERS if phrase in haystack),
+        None,
+    )
 
 
 # ── NLI entailment check (rule_statement ⊨ supporting_quote) — #81.3 ──
@@ -417,4 +458,8 @@ def compute_quality_flags(
     # rule_type='application' and add a high-precision deixis catch.
     if rule_type == "application" or is_fact_dependent(rule_statement):
         flags.append(FLAG_APPLICATION)
+    # Belt-and-suspenders: if the quote contains party-claim language the
+    # chunker's section filter should have excluded, flag for manual review.
+    if detect_party_claim_language(supporting_quote):
+        flags.append(FLAG_PARTY_CLAIM)
     return flags
diff --git a/mcp-server/tests/test_chunker_section_patterns.py b/mcp-server/tests/test_chunker_section_patterns.py
new file mode 100644
index 0000000..45085a5
--- /dev/null
+++ b/mcp-server/tests/test_chunker_section_patterns.py
@@ -0,0 +1,122 @@
+"""Tests for the Hebrew section-header patterns in the chunker.
+
+Focuses on the singular/feminine forms that were previously missing and
+caused party-claims sections to bleed into the preceding section type.
+"""
+from __future__ import annotations
+
+import pytest
+
+from legal_mcp.services.chunker import chunk_document
+
+
+def _section_types(text: str) -> list[str]:
+    """Return the sequence of section_type values in the chunked output."""
+    return [c.section_type for c in chunk_document(text)]
+
+
+def _has_section(text: str, section: str) -> bool:
+    return section in _section_types(text)
+
+
+# ── respondent_claims patterns ──────────────────────────────────────────────
+
+@pytest.mark.parametrize("header", [
+    "טענות המשיבים:",          # plural masculine — was working before
+    "טענות המשיבין:",          # plural alternative — was working
+    "תשובת המשיבים:",          # plural verb form — was working
+    "טענות המשיבה:",           # singular feminine — BUG FIX (8181-21)
+    "טענות המשיב:",            # singular masculine — BUG FIX
+    "תשובת המשיבה:",           # singular feminine verb form — BUG FIX
+    "תגובת המשיבה:",           # תגובה form — BUG FIX
+    "תגובת המשיבים:",          # plural תגובה — BUG FIX
+    "עיקר טענות המשיבה:",     # prefix + singular — BUG FIX
+])
+def test_respondent_claims_recognized(header):
+    # Explicit + avoids Python implicit string-literal concatenation merging
+    # the "\n\n" separator into the following header string.
+    text = (
+        "רקע עובדתי\n"
+        + "עובדות כלליות. " * 20 + "\n\n"
+        + f"{header}\n"
+        + "הוועדה המקומית טוענת כי אין מקום לקבל את הערר. " * 15 + "\n\n"
+        + "דיון והכרעה\n"
+        + "לאחר בחינת הטענות אנו קובעים כי יש לקבל את הערר בחלקו. " * 15
+    )
+    assert _has_section(text, "respondent_claims"), (
+        f"Header '{header}' should produce respondent_claims chunk"
+    )
+
+
+# ── appellant_claims patterns ────────────────────────────────────────────────
+
+@pytest.mark.parametrize("header", [
+    "טענות העוררים:",          # plural masculine — was working before
+    "טענות העוררין:",          # plural alternative — was working
+    "טענות המערערים:",         # מערערים plural — was working
+    "טענות העורר:",            # singular masculine — BUG FIX
+    "טענות העוררת:",           # singular feminine — BUG FIX
+    "טענות המערער:",           # מערער singular — BUG FIX
+    "טענות המערערת:",          # מערערת singular feminine — BUG FIX
+    "עיקר טענות העורר:",      # prefix + singular — BUG FIX
+])
+def test_appellant_claims_recognized(header):
+    text = (
+        "רקע עובדתי\n"
+        + "עובדות כלליות. " * 20 + "\n\n"
+        + f"{header}\n"
+        + "ב\"כ העורר טוען כי יש לקבל את הערר ולבטל את ההחלטה. " * 15 + "\n\n"
+        + "דיון והכרעה\n"
+        + "לאחר בחינת הטענות אנו קובעים כי יש לקבל את הערר בחלקו. " * 15
+    )
+    assert _has_section(text, "appellant_claims"), (
+        f"Header '{header}' should produce appellant_claims chunk"
+    )
+
+
+# ── regression: existing plural forms still work ────────────────────────────
+
+def test_regression_plural_respondent_and_legal_analysis():
+    """Full decision with plural respondent — all sections must be preserved."""
+    text = (
+        "מבוא\n"
+        + "ערר זה הוגש על החלטת הוועדה המקומית לתכנון ובנייה. " * 10 + "\n\n"
+        + "רקע עובדתי\n"
+        + "העורר רכש את הנכס בשנת 2010 ופנה לקבלת היתר בנייה. " * 10 + "\n\n"
+        + "טענות העוררים:\n"
+        + "ב\"כ העוררים טוען כי נפל פגם יסודי בהחלטת הוועדה. " * 10 + "\n\n"
+        + "טענות המשיבים:\n"
+        + "הוועדה המקומית סבורה כי ההחלטה תקינה ומבוססת. " * 10 + "\n\n"
+        + "דיון והכרעה\n"
+        + "לאחר בחינת הטענות קובעים כי הערר מתקבל בחלקו. " * 10
+    )
+    types = set(_section_types(text))
+    assert "appellant_claims" in types
+    assert "respondent_claims" in types
+    assert "legal_analysis" in types
+
+
+# ── party claims do NOT bleed into the preceding 'ruling' section ────────────
+
+def test_respondent_singular_does_not_bleed_into_ruling():
+    """A singular "טענות המשיבה:" header must open its own respondent_claims
+    section instead of being absorbed into the preceding 'ruling' section."""
+    text = (
+        "החלטה\n"
+        + "הוועדה דנה בערר שהוגש על החלטת הוועדה המקומית. " * 10 + "\n\n"
+        + "טענות המשיבה:\n"
+        + "הוועדה המקומית טוענת שאין כל השבחה בנסיבות העניין. " * 10 + "\n\n"
+        + "דיון\n"
+        + "אנו בוחנים את הטענות לאחר עיון בחומר שהוגש. " * 10
+    )
+    # The respondent claims must NOT remain labeled as 'ruling'
+    chunks = chunk_document(text)
+    respondent_chunks = [c for c in chunks if c.section_type == "respondent_claims"]
+    assert len(respondent_chunks) > 0, (
+        "Singular 'טענות המשיבה' must produce respondent_claims chunks, not bleed into ruling"
+    )
+    # Needle is copied verbatim from the fixture; a non-occurring needle is vacuous.
+    ruling_text = " ".join(c.content for c in chunks if c.section_type == "ruling")
+    assert "הוועדה המקומית טוענת שאין כל השבחה" not in ruling_text, (
+        "Respondent claim text must not appear in ruling chunks"
+    )
diff --git a/mcp-server/tests/test_halacha_quality.py b/mcp-server/tests/test_halacha_quality.py
index bf6c2ac..d68b81f 100644
--- a/mcp-server/tests/test_halacha_quality.py
+++ b/mcp-server/tests/test_halacha_quality.py
@@ -270,3 +270,55 @@ def test_lexical_near_duplicate_band():
     assert hq.lexical_near_duplicate(a, b) is True
     c = "המועד להגשת ערר על שומה הוא שלושים ימים"
     assert hq.lexical_near_duplicate(a, c) is False
+
+
+# ── party-claim language detector (FLAG_PARTY_CLAIM) ────────────────────────
+
+@pytest.mark.parametrize("quote", [
+    # Named-party attribution — always party-claim language
+    "לטענת העורר אין בסיס לחיוב בהיטל",
+    "לטענת המשיבה יש לדחות את הערר",
+    "לטענת המשיב לא הייתה כל השבחה",
+    "לטענת המשיבים ההחלטה תקינה",
+    "טוענת המשיבה כי אין מקום לפיצוי",
+])
+def test_detect_party_claim_language_hits(quote):
+    assert hq.detect_party_claim_language(quote) is not None, (
+        f"Quote '{quote[:40]}' should trigger party_claim detection"
+    )
+
+
+@pytest.mark.parametrize("quote", [
+    # Court's own reasoning — must NOT trigger the flag
+    "ועדת הערר קובעת כי ההיתר תואם את התכנית",
+    "מן הטעמים האמורים, הערר מתקבל",
+    "לאחר בחינת הטענות אנו סבורים כי יש לדחות",
+    # Broad passive forms courts also use in their reasoning (excluded from markers)
+    "נטען כי ההיתר ניתן שלא כדין",
+    "נטען על ידי העורר כי השומה שגויה",
+    # The 8181-21 bad halacha quote — no party-claim markers
+    "גם אם ניתן בעבר פטור לעוררת על בסיס פרשנות הוראות החוק שהשתנתה כיום הרי שיש לפעול בהתאם לפרשנות כיום",
+])
+def test_detect_party_claim_language_misses(quote):
+    assert hq.detect_party_claim_language(quote) is None, (
+        f"Quote '{quote[:40]}' must NOT trigger party_claim detection"
+    )
+
+
+def test_party_claim_flag_in_compute_quality_flags():
+    flags = hq.compute_quality_flags(
+        "ביטול היתר מחייב בסיס חוקי",
+        "לטענת העורר ההיתר ניתן שלא כדין ויש לבטלו",
+        quote_verified=True,
+    )
+    assert hq.FLAG_PARTY_CLAIM in flags
+
+
+def test_court_quote_does_not_trigger_party_claim_flag():
+    flags = hq.compute_quality_flags(
+        "ביטול היתר מחייב בסיס חוקי מוצק",
+        "ועדת הערר קובעת כי ביטול היתר מחייב בסיס חוקי מוצק בדמות פגיעה ממשית באינטרס ציבורי",
+        quote_verified=True,
+        rule_type="holding",
+    )
+    assert hq.FLAG_PARTY_CLAIM not in flags