From 42376db4c52c0da7756807e7fc34f8d2c63b4853 Mon Sep 17 00:00:00 2001
From: Chaim
Date: Wed, 17 Jun 2026 15:12:05 +0000
Subject: [PATCH] =?UTF-8?q?fix(chunker):=20=D7=AA=D7=99=D7=A7=D7=95=D7=9F?=
 =?UTF-8?q?=20=D7=96=D7=99=D7=94=D7=95=D7=99=20=D7=9B=D7=95=D7=AA=D7=A8?=
 =?UTF-8?q?=D7=95=D7=AA=20=D7=98=D7=A2=D7=A0=D7=95=D7=AA=20=D7=94=D7=A6?=
 =?UTF-8?q?=D7=93=D7=93=D7=99=D7=9D=20=D7=91=D7=99=D7=97=D7=99=D7=93/?=
 =?UTF-8?q?=D7=A0=D7=A7=D7=91=D7=94=20+=20=D7=A9=D7=9C=D7=95=D7=A9=20?=
 =?UTF-8?q?=D7=A9=D7=9B=D7=91=D7=95=D7=AA=20=D7=94=D7=92=D7=A0=D7=94?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

**סיבת-שורש:** רג'קס respondent_claims כיסה רק צורת רבים (המשיבים/המשיבין), ולא יחיד נקבה (המשיבה) ויחיד זכר (המשיב). הכותרת "טענות המשיבה:" בתיק 8181-21 נבלעה לתוך מקטע ruling → חולצה כהלכה שגויה.

**שלוש שכבות הגנה:**
1. chunker.py — הרחבת SECTION_PATTERNS לכסות יחיד/זכר/נקבה + תגובת/תשובת
2. halacha_extractor.py — עיגון חיובי: drop ל-ruling chunks שלפני legal_analysis
3. halacha_quality.py — FLAG_PARTY_CLAIM: זיהוי שפת-טענות-צד בציטוט התומך

**היקף:** 93 תיקים עם 0 chunks של טענות (כנראה בגלל כותרות ביחיד שלא הוכרו); 628 הלכות מאושרות מתיקים אלה — חלקן עשויות להיות תקינות, יש לעשות re-chunk.

Co-Authored-By: Claude Sonnet 4.6
---
 mcp-server/src/legal_mcp/services/chunker.py  |  16 ++-
 .../legal_mcp/services/halacha_extractor.py   |  39 ++++++
 .../src/legal_mcp/services/halacha_quality.py |  51 +++++++-
 .../tests/test_chunker_section_patterns.py    | 122 ++++++++++++++++++
 mcp-server/tests/test_halacha_quality.py      |  52 ++++++++
 5 files changed, 275 insertions(+), 5 deletions(-)
 create mode 100644 mcp-server/tests/test_chunker_section_patterns.py

diff --git a/mcp-server/src/legal_mcp/services/chunker.py b/mcp-server/src/legal_mcp/services/chunker.py
index af3320a..e41656d 100644
--- a/mcp-server/src/legal_mcp/services/chunker.py
+++ b/mcp-server/src/legal_mcp/services/chunker.py
@@ -22,8 +22,20 @@ from legal_mcp import config
 # court rulings use slightly different vocabulary (פסק דין, נימוקים, סוף דבר).
 SECTION_PATTERNS = [
     (r"רקע\s*עובדתי|רקע\s*כללי|העובדות|הרקע", "facts"),
-    (r"טענות\s*העוררי[םן]|טענות\s*המערערי[םן]|עיקר\s*טענות\s*העוררי[םן]", "appellant_claims"),
-    (r"טענות\s*המשיבי[םן]|תשובת\s*המשיבי[םן]|עיקר\s*טענות\s*המשיבי[םן]", "respondent_claims"),
+    # appellant_claims: covers singular (עורר/עוררת, מערער/מערערת) and plural
+    # (עוררים/עוררין, מערערים). Previously only plural was matched, so headers
+    # like "טענות העורר:" were silently absorbed into the preceding section.
+    (
+        r"(?:טענות|עיקר\s*טענות)\s*ה(?:עוררי[םן]|עורר[ת]?|מערערי[םן]|מערער[ת]?)",
+        "appellant_claims",
+    ),
+    # respondent_claims: covers singular (משיב/משיבה) and plural (משיבים/משיבין),
+    # plus verb forms תשובת/תגובת. "טענות המשיבה:" (feminine singular) was the
+    # root cause of halacha 8181-21 index-11 being extracted from party claims.
+    (
+        r"(?:טענות|תשובת|תגובת|עיקר\s*טענות)\s*ה(?:משיבי[םן]|משיב[ה]?)",
+        "respondent_claims",
+    ),
     (r"דיון\s*והכרעה|דיון|הכרעה|ניתוח\s*משפטי|המסגרת\s*המשפטית|נימוקים", "legal_analysis"),
     (r"מסקנ[הות]|סיכום|סוף\s*דבר", "conclusion"),
     (r"פסק[- ]?דין|החלטה|לפיכך\s*אני\s*מחליט|התוצאה", "ruling"),
diff --git a/mcp-server/src/legal_mcp/services/halacha_extractor.py b/mcp-server/src/legal_mcp/services/halacha_extractor.py
index c83373c..796c011 100644
--- a/mcp-server/src/legal_mcp/services/halacha_extractor.py
+++ b/mcp-server/src/legal_mcp/services/halacha_extractor.py
@@ -638,12 +638,20 @@ async def _select_extractable_chunks(
     extraction. Previously the fallback took *all* chunks, re-admitting
     exactly the sections the primary filter excludes.
 
+    Positive-anchor guard: when a document has a "דיון/הכרעה" section
+    (section_type='legal_analysis'), extraction starts from that section
+    onwards. Any 'ruling' chunks that appear BEFORE the first legal_analysis
+    chunk by position are dropped — they most likely result from a party-claims
+    section whose header was not recognised by the chunker and was therefore
+    absorbed into the preceding section.
+
     Returns ``(chunks, used_fallback)`` so the caller can log the fallback once.
     """
     chunks = await db.list_precedent_chunks(
         case_law_id, section_types=EXTRACTABLE_SECTIONS,
     )
     if chunks:
+        chunks = _apply_discussion_anchor(chunks)
         return chunks, False
     all_chunks = await db.list_precedent_chunks(case_law_id)
     filtered = [
@@ -653,6 +661,37 @@ async def _select_extractable_chunks(
     return filtered, True
 
 
+def _apply_discussion_anchor(chunks: list[dict]) -> list[dict]:
+    """Drop 'ruling' chunks that precede the first 'legal_analysis' chunk.
+
+    In Israeli planning-committee decisions the discussion section
+    (דיון / הכרעה / דיון והכרעה) always comes after the parties' claims.
+    A 'ruling'-labelled chunk that appears before the discussion is a strong
+    signal that a party-claims section was silently absorbed into it (chunker
+    regex didn't match the header). Dropping those early 'ruling' chunks is
+    safe because all reasoning content falls at or after the 'דיון' anchor.
+    """
+    analysis_indices = [
+        c["chunk_index"] for c in chunks
+        if c.get("section_type") == "legal_analysis"
+    ]
+    if not analysis_indices:
+        return chunks
+    first_analysis = min(analysis_indices)
+    filtered = [
+        c for c in chunks
+        if not (c.get("section_type") == "ruling" and c["chunk_index"] < first_analysis)
+    ]
+    dropped = len(chunks) - len(filtered)
+    if dropped:
+        logger.info(
+            "halacha_extractor: positive-anchor guard dropped %d pre-discussion "
+            "'ruling' chunk(s) (first legal_analysis at chunk_index=%d)",
+            dropped, first_analysis,
+        )
+    return filtered
+
+
 async def _extract_impl(case_law_id: UUID, force: bool = False, effort: str | None = None) -> dict:
     """Core extraction (caller holds the global advisory lock for the duration).
 
diff --git a/mcp-server/src/legal_mcp/services/halacha_quality.py b/mcp-server/src/legal_mcp/services/halacha_quality.py
index ee90d05..7948bee 100644
--- a/mcp-server/src/legal_mcp/services/halacha_quality.py
+++ b/mcp-server/src/legal_mcp/services/halacha_quality.py
@@ -279,9 +279,50 @@ FLAG_NON_DECISION = "non_decision"
 FLAG_TRUNCATED_QUOTE = "truncated_quote"
 FLAG_THIN_RESTATEMENT = "thin_restatement"
 FLAG_QUOTE_UNVERIFIED = "quote_unverified"
-FLAG_NLI_UNSUPPORTED = "nli_unsupported"  # rule not entailed by its quote (#81.3)
-FLAG_APPLICATION = "application"          # fact-dependent, not a holding (#81.4)
-FLAG_NEAR_DUPLICATE = "near_duplicate"    # cosine-tail lexical dup (#82.3)
+FLAG_NLI_UNSUPPORTED = "nli_unsupported"   # rule not entailed by its quote (#81.3)
+FLAG_APPLICATION = "application"           # fact-dependent, not a holding (#81.4)
+FLAG_NEAR_DUPLICATE = "near_duplicate"     # cosine-tail lexical dup (#82.3)
+FLAG_PARTY_CLAIM = "party_claim_language"  # quote reads as a party's position, not the court's
+
+
+# ── Party-claim language: quote is the court's words, not a party's ──
+#
+# Positive markers that a quote comes from a party's argument section rather
+# than the court's own reasoning. The chunker now correctly classifies these
+# sections, but a belt-and-suspenders lexical gate catches any case where
+# the chunker still absorbs a party-claims section into a reasoning chunk
+# (e.g. an unrecognised header variant). We scan the supporting_quote only —
+# the rule_statement is already abstracted and should not contain these phrases.
+
+_PARTY_CLAIM_MARKERS = (
+    # Named-party attribution forms — always party-claim language, never court reasoning
+    "לטענת העורר",
+    "לטענת העוררת",
+    "לטענת העוררים",
+    "לטענת המשיב",
+    "לטענת המשיבה",
+    "לטענת המשיבים",
+    "טוען העורר",
+    "טוענת העוררת",
+    "טוען המשיב",
+    "טוענת המשיבה",
+    # Excluded (too broad — courts also use these in their own reasoning):
+    #   "נטען כי", "נטען על ידי", "נטען על-ידי", "לטענתו", "לטענתה", "לטענתם"
+)
+
+
+def detect_party_claim_language(supporting_quote: str) -> str | None:
+    """Return the first party-claim marker found in the quote (or None).
+
+    Only the supporting_quote is scanned — rule_statement is already abstracted.
+    A match means the LLM likely extracted from a party's argument section
+    rather than the court's reasoning.
+    """
+    norm = normalize_text(supporting_quote)
+    for marker in _PARTY_CLAIM_MARKERS:
+        if marker in norm:
+            return marker
+    return None
 
 
 # ── NLI entailment check (rule_statement ⊨ supporting_quote) — #81.3 ──
@@ -417,4 +458,8 @@ def compute_quality_flags(
     # rule_type='application' and add a high-precision deixis catch.
     if rule_type == "application" or is_fact_dependent(rule_statement):
         flags.append(FLAG_APPLICATION)
+    # Belt-and-suspenders: if the quote contains party-claim language the
+    # chunker's section filter should have excluded, flag for manual review.
+    if detect_party_claim_language(supporting_quote):
+        flags.append(FLAG_PARTY_CLAIM)
     return flags
diff --git a/mcp-server/tests/test_chunker_section_patterns.py b/mcp-server/tests/test_chunker_section_patterns.py
new file mode 100644
index 0000000..45085a5
--- /dev/null
+++ b/mcp-server/tests/test_chunker_section_patterns.py
@@ -0,0 +1,122 @@
+"""Tests for the Hebrew section-header patterns in the chunker.
+
+Focuses on the singular/feminine forms that were previously missing and
+caused party-claims sections to bleed into the preceding section type.
+"""
+from __future__ import annotations
+
+import pytest
+
+from legal_mcp.services.chunker import chunk_document
+
+
+def _section_types(text: str) -> list[str]:
+    """Return the sequence of section_type values in the chunked output."""
+    return [c.section_type for c in chunk_document(text)]
+
+
+def _has_section(text: str, section: str) -> bool:
+    return section in _section_types(text)
+
+
+# ── respondent_claims patterns ──────────────────────────────────────────────
+
+@pytest.mark.parametrize("header", [
+    "טענות המשיבים:",  # plural masculine — was working before
+    "טענות המשיבין:",  # plural alternative — was working
+    "תשובת המשיבים:",  # plural verb form — was working
+    "טענות המשיבה:",  # singular feminine — BUG FIX (8181-21)
+    "טענות המשיב:",  # singular masculine — BUG FIX
+    "תשובת המשיבה:",  # singular feminine verb form — BUG FIX
+    "תגובת המשיבה:",  # תגובה form — BUG FIX
+    "תגובת המשיבים:",  # plural תגובה — BUG FIX
+    "עיקר טענות המשיבה:",  # prefix + singular — BUG FIX
+])
+def test_respondent_claims_recognized(header):
+    # Explicit + avoids Python implicit string-literal concatenation merging
+    # the "\n\n" separator into the following header string.
+    text = (
+        "רקע עובדתי\n"
+        + "עובדות כלליות. " * 20 + "\n\n"
+        + f"{header}\n"
+        + "הוועדה המקומית טוענת כי אין מקום לקבל את הערר. " * 15 + "\n\n"
+        + "דיון והכרעה\n"
+        + "לאחר בחינת הטענות אנו קובעים כי יש לקבל את הערר בחלקו. " * 15
+    )
+    assert _has_section(text, "respondent_claims"), (
+        f"Header '{header}' should produce respondent_claims chunk"
+    )
+
+
+# ── appellant_claims patterns ────────────────────────────────────────────────
+
+@pytest.mark.parametrize("header", [
+    "טענות העוררים:",  # plural masculine — was working before
+    "טענות העוררין:",  # plural alternative — was working
+    "טענות המערערים:",  # מערערים plural — was working
+    "טענות העורר:",  # singular masculine — BUG FIX
+    "טענות העוררת:",  # singular feminine — BUG FIX
+    "טענות המערער:",  # מערער singular — BUG FIX
+    "טענות המערערת:",  # מערערת singular feminine — BUG FIX
+    "עיקר טענות העורר:",  # prefix + singular — BUG FIX
+])
+def test_appellant_claims_recognized(header):
+    text = (
+        "רקע עובדתי\n"
+        + "עובדות כלליות. " * 20 + "\n\n"
+        + f"{header}\n"
+        + "ב\"כ העורר טוען כי יש לקבל את הערר ולבטל את ההחלטה. " * 15 + "\n\n"
+        + "דיון והכרעה\n"
+        + "לאחר בחינת הטענות אנו קובעים כי יש לקבל את הערר בחלקו. " * 15
+    )
+    assert _has_section(text, "appellant_claims"), (
+        f"Header '{header}' should produce appellant_claims chunk"
+    )
+
+
+# ── regression: existing plural forms still work ────────────────────────────
+
+def test_regression_plural_respondent_and_legal_analysis():
+    """Full decision with plural respondent — all sections must be preserved."""
+    text = (
+        "מבוא\n"
+        + "ערר זה הוגש על החלטת הוועדה המקומית לתכנון ובנייה. " * 10 + "\n\n"
+        + "רקע עובדתי\n"
+        + "העורר רכש את הנכס בשנת 2010 ופנה לקבלת היתר בנייה. " * 10 + "\n\n"
+        + "טענות העוררים:\n"
+        + "ב\"כ העוררים טוען כי נפל פגם יסודי בהחלטת הוועדה. " * 10 + "\n\n"
+        + "טענות המשיבים:\n"
+        + "הוועדה המקומית סבורה כי ההחלטה תקינה ומבוססת. " * 10 + "\n\n"
+        + "דיון והכרעה\n"
+        + "לאחר בחינת הטענות קובעים כי הערר מתקבל בחלקו. " * 10
+    )
+    types = set(_section_types(text))
+    assert "appellant_claims" in types
+    assert "respondent_claims" in types
+    assert "legal_analysis" in types
+
+
+# ── party claims do NOT bleed into ruling ───────────────────────────────────
+
+def test_respondent_singular_does_not_bleed_into_ruling():
+    """Before the fix, "טענות המשיבה:" was absorbed into the preceding section.
+    After the fix it must produce its own respondent_claims chunk."""
+    text = (
+        "החלטה\n"
+        + "הוועדה דנה בערר שהוגש על החלטת הוועדה המקומית. " * 10 + "\n\n"
+        + "טענות המשיבה:\n"
+        + "הוועדה המקומית טוענת שאין כל השבחה בנסיבות העניין. " * 10 + "\n\n"
+        + "דיון\n"
+        + "אנו בוחנים את הטענות לאחר עיון בחומר שהוגש. " * 10
+    )
+    # The respondent claims must NOT remain labeled as 'ruling'
+    chunks = chunk_document(text)
+    respondent_chunks = [c for c in chunks if c.section_type == "respondent_claims"]
+    assert len(respondent_chunks) > 0, (
+        "Singular 'טענות המשיבה' must produce respondent_claims chunks, not bleed into ruling"
+    )
+    # Verify the respondent text didn't end up in ruling chunks
+    ruling_text = " ".join(c.content for c in chunks if c.section_type == "ruling")
+    assert "הוועדה המקומית טוענת שאין כל השבחה" not in ruling_text, (
+        "Respondent claim text must not appear in ruling chunks"
+    )
diff --git a/mcp-server/tests/test_halacha_quality.py b/mcp-server/tests/test_halacha_quality.py
index bf6c2ac..d68b81f 100644
--- a/mcp-server/tests/test_halacha_quality.py
+++ b/mcp-server/tests/test_halacha_quality.py
@@ -270,3 +270,55 @@ def test_lexical_near_duplicate_band():
     assert hq.lexical_near_duplicate(a, b) is True
     c = "המועד להגשת ערר על שומה הוא שלושים ימים"
     assert hq.lexical_near_duplicate(a, c) is False
+
+
+# ── party-claim language detector (FLAG_PARTY_CLAIM) ────────────────────────
+
+@pytest.mark.parametrize("quote", [
+    # Named-party attribution — always party-claim language
+    "לטענת העורר אין בסיס לחיוב בהיטל",
+    "לטענת המשיבה יש לדחות את הערר",
+    "לטענת המשיב לא הייתה כל השבחה",
+    "לטענת המשיבים ההחלטה תקינה",
+    "טוענת המשיבה כי אין מקום לפיצוי",
+])
+def test_detect_party_claim_language_hits(quote):
+    assert hq.detect_party_claim_language(quote) is not None, (
+        f"Quote '{quote[:40]}' should trigger party_claim detection"
+    )
+
+
+@pytest.mark.parametrize("quote", [
+    # Court's own reasoning — must NOT trigger the flag
+    "ועדת הערר קובעת כי ההיתר תואם את התכנית",
+    "מן הטעמים האמורים, הערר מתקבל",
+    "לאחר בחינת הטענות אנו סבורים כי יש לדחות",
+    # Broad passive forms courts also use in their reasoning (excluded from markers)
+    "נטען כי ההיתר ניתן שלא כדין",
+    "נטען על ידי העורר כי השומה שגויה",
+    # The 8181-21 bad halacha quote — no party-claim markers
+    "גם אם ניתן בעבר פטור לעוררת על בסיס פרשנות הוראות החוק שהשתנתה כיום הרי שיש לפעול בהתאם לפרשנות כיום",
+])
+def test_detect_party_claim_language_misses(quote):
+    assert hq.detect_party_claim_language(quote) is None, (
+        f"Quote '{quote[:40]}' must NOT trigger party_claim detection"
+    )
+
+
+def test_party_claim_flag_in_compute_quality_flags():
+    flags = hq.compute_quality_flags(
+        "ביטול היתר מחייב בסיס חוקי",
+        "לטענת העורר ההיתר ניתן שלא כדין ויש לבטלו",
+        quote_verified=True,
+    )
+    assert hq.FLAG_PARTY_CLAIM in flags
+
+
+def test_court_quote_does_not_trigger_party_claim_flag():
+    flags = hq.compute_quality_flags(
+        "ביטול היתר מחייב בסיס חוקי מוצק",
+        "ועדת הערר קובעת כי ביטול היתר מחייב בסיס חוקי מוצק בדמות פגיעה ממשית באינטרס ציבורי",
+        quote_verified=True,
+        rule_type="holding",
+    )
+    assert hq.FLAG_PARTY_CLAIM not in flags