From 3c169a76f262bca0d022343266713d875015bdfc Mon Sep 17 00:00:00 2001 From: Chaim Date: Thu, 11 Jun 2026 15:52:13 +0000 Subject: [PATCH] =?UTF-8?q?feat(halacha):=20rhetorical-role=20pre-filter?= =?UTF-8?q?=20=E2=80=94=20fallback=20excludes=20facts/arguments=20(#81.6)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit חילוץ-הלכות מוגבל למקטעי הנמקה/הכרעה בלבד (INV-LRN2 quality-at-source). הפער שנסגר: מסלול ה-fallback (כשה-chunker לא תייג שום מקטע כ-extractable, כותרות לא-תקניות → הכול 'other') נפל קודם ל**כל** ה-chunks — והחזיר בדיוק את המקטעים שהמסנן הראשי מחריג (רקע עובדתי + טענות הצדדים). בלבול Facts↔Reasoning הוא מחלקת-השגיאה הדומיננטית (LegalSeg), כך שהזנת עובדות לחילוץ פוגעת ישירות ב-precision. - NON_REASONING_SECTIONS = (facts, appellant_claims, respondent_claims, intro) - _select_extractable_chunks(): מרכז את מדיניות-הבחירה (primary + fallback) בפונקציה אחת המשמשת גם את הבחירה הראשית וגם את ה-re-read לקביעת-סטטוס (G2 — מקור-אמת יחיד, אין מסלול מקביל). ה-fallback מחריג את NON_REASONING_SECTIONS ועדיין מגיע להנמקה שנחתה תחת 'other'. invariants: G1 (נרמול-במקור, לא תיקון-בקריאה) · G2 (אין מסלול מקביל) · INV-LRN2 (quality-at-source). tests: 4 חדשות (primary/fallback-excludes-args/all-nonreasoning/disjoint-sets) + 61 בדיקות-הלכה קיימות עוברות. Co-Authored-By: Claude Opus 4.8 --- .../legal_mcp/services/halacha_extractor.py | 76 +++++++--- .../test_halacha_rhetorical_prefilter.py | 137 ++++++++++++++++++ 2 files changed, 193 insertions(+), 20 deletions(-) create mode 100644 mcp-server/tests/test_halacha_rhetorical_prefilter.py diff --git a/mcp-server/src/legal_mcp/services/halacha_extractor.py b/mcp-server/src/legal_mcp/services/halacha_extractor.py index 2d232bb..fcdedb6 100644 --- a/mcp-server/src/legal_mcp/services/halacha_extractor.py +++ b/mcp-server/src/legal_mcp/services/halacha_extractor.py @@ -64,6 +64,15 @@ EXTRACTION_FAILURE_THRESHOLD = 0.5 # never contain holdings, only positions, so we skip them. EXTRACTABLE_SECTIONS = ("legal_analysis", "ruling", "conclusion") +# Sections confidently classified as NON-reasoning (parties' positions, the +# factual background, the opening). The fallback path — taken when the chunker +# labeled nothing as an extractable section (non-standard headings → 'other') — +# excludes these so facts/arguments are NEVER fed into extraction, while +# reasoning that merely landed under 'other' is still reached. Raises precision +# on the dominant Facts↔Reasoning confusion class (#81.6; INV-LRN2 +# quality-at-source; LegalSeg / rhetorical-role labeling). +NON_REASONING_SECTIONS = ("facts", "appellant_claims", "respondent_claims", "intro") + # Two prompts — choose by source's is_binding flag. # @@ -498,6 +507,39 @@ async def extract(case_law_id: UUID | str, force: bool = False, await pool.release(lock_conn) +async def _select_extractable_chunks( + case_law_id: UUID, +) -> tuple[list[dict], bool]: + """Pick the chunks that are candidates for halacha extraction (#81.6). + + Rhetorical-role pre-filter (INV-LRN2 quality-at-source): only + reasoning/decision sections feed extraction. + + Primary: chunks labeled as an extractable section + (``EXTRACTABLE_SECTIONS``). Fallback — taken only when the chunker labeled + *nothing* extractable (non-standard headings collapse everything to + 'other') — is every chunk EXCEPT those confidently classified as + non-reasoning (``NON_REASONING_SECTIONS``: facts / parties' arguments / + intro). This preserves recall for reasoning that landed under 'other' while + never feeding the factual background or the parties' positions into + extraction. Previously the fallback took *all* chunks, re-admitting exactly + the sections the primary filter excludes. + + Returns ``(chunks, used_fallback)`` so the caller can log the fallback once. + """ + chunks = await db.list_precedent_chunks( + case_law_id, section_types=EXTRACTABLE_SECTIONS, + ) + if chunks: + return chunks, False + all_chunks = await db.list_precedent_chunks(case_law_id) + filtered = [ + c for c in all_chunks + if c.get("section_type") not in NON_REASONING_SECTIONS + ] + return filtered, True + + async def _extract_impl(case_law_id: UUID, force: bool = False, effort: str | None = None) -> dict: """Core extraction (caller holds the global advisory lock for the duration). @@ -514,22 +556,16 @@ async def _extract_impl(case_law_id: UUID, force: bool = False, is_binding = bool(record.get("is_binding")) - # Try the targeted sections first (legal_analysis / ruling / conclusion). - # If the chunker labeled everything as 'other' (common when a ruling - # uses non-standard headings or the section markers aren't bracketed - # cleanly), fall back to ALL chunks — better to over-include than to - # silently skip a ruling that has reasoning under an unexpected label. - chunks = await db.list_precedent_chunks( - case_law_id, section_types=EXTRACTABLE_SECTIONS, - ) - if not chunks: - chunks = await db.list_precedent_chunks(case_law_id) - if chunks: - logger.info( - "halacha_extractor: case_law=%s — no targeted sections, " - "falling back to all %d chunks", - case_law_id, len(chunks), - ) + # Rhetorical-role pre-filter (#81.6, INV-LRN2): only reasoning/decision + # sections are candidates. The fallback (no targeted section labeled) + # still excludes facts/arguments/intro — see _select_extractable_chunks. + chunks, used_fallback = await _select_extractable_chunks(case_law_id) + if used_fallback and chunks: + logger.info( + "halacha_extractor: case_law=%s — no targeted sections, " + "falling back to %d non-argument chunks (facts/arguments excluded)", + case_law_id, len(chunks), + ) if not chunks: await db.set_case_law_halacha_status(case_law_id, "completed") return {"status": "no_chunks", "extracted": 0, "stored": 0} @@ -655,10 +691,10 @@ async def _extract_impl(case_law_id: UUID, force: bool = False, await asyncio.gather(*[_process(c) for c in pending]) - # Decide final status from what's LEFT (re-read checkpoints). - after = await db.list_precedent_chunks(case_law_id, section_types=EXTRACTABLE_SECTIONS) - if not after: - after = await db.list_precedent_chunks(case_law_id) + # Decide final status from what's LEFT (re-read checkpoints). Use the same + # candidate-selection policy as above so the pending count matches the set + # we actually extracted from (G2 — single source of truth, no parallel path). + after, _ = await _select_extractable_chunks(case_law_id) still_pending = sum(1 for c in after if c.get("halacha_extracted_at") is None) total = len(await db.list_halachot(case_law_id=case_law_id, limit=10_000)) diff --git a/mcp-server/tests/test_halacha_rhetorical_prefilter.py b/mcp-server/tests/test_halacha_rhetorical_prefilter.py new file mode 100644 index 0000000..f09f3be --- /dev/null +++ b/mcp-server/tests/test_halacha_rhetorical_prefilter.py @@ -0,0 +1,137 @@ +"""Tests for TaskMaster #81.6 — rhetorical-role pre-filter on halacha extraction. + +Only reasoning/decision sections should feed halacha extraction (INV-LRN2 +quality-at-source). The historical bug: when the chunker labeled *nothing* as an +extractable section (non-standard headings → everything 'other'), the fallback +took ALL chunks — re-admitting the factual background and the parties' +arguments, exactly the sections the primary filter excludes. The dominant +extraction error class is Facts↔Reasoning confusion (LegalSeg), so feeding facts +into extraction directly lowers precision. + +Fix: ``_select_extractable_chunks`` — the fallback now excludes +``NON_REASONING_SECTIONS`` (facts / appellant_claims / respondent_claims / +intro) while still reaching reasoning that merely landed under 'other'. + +Runs fully OFFLINE — monkeypatches ``db.list_precedent_chunks`` so no Postgres +is needed (same style as ``test_halacha_reextract_preserves_approved.py``). +""" + +from __future__ import annotations + +import asyncio +from uuid import uuid4 + +import pytest + +from legal_mcp.services import db, halacha_extractor + + +def _chunk(idx: int, section_type: str) -> dict: + return { + "id": uuid4(), + "chunk_index": idx, + "content": f"chunk-{idx}-{section_type}", + "section_type": section_type, + "page_number": None, + "halacha_extracted_at": None, + } + + +def _patch_chunks(monkeypatch: pytest.MonkeyPatch, all_chunks: list[dict]) -> list[dict]: + """Patch db.list_precedent_chunks to filter ``all_chunks`` like Postgres would. + + Returns a list that records every call's ``section_types`` argument so a + test can assert whether the unfiltered fallback query was issued. + """ + calls: list = [] + + async def _fake(case_law_id, section_types=None): # noqa: ANN001 + calls.append(section_types) + if section_types: + return [c for c in all_chunks if c["section_type"] in section_types] + return list(all_chunks) + + monkeypatch.setattr(db, "list_precedent_chunks", _fake) + return calls + + +def _run(coro): + loop = asyncio.new_event_loop() + try: + return loop.run_until_complete(coro) + finally: + loop.close() + + +def test_primary_path_returns_only_reasoning_sections(monkeypatch: pytest.MonkeyPatch) -> None: + """When extractable sections exist, return exactly them — no fallback.""" + all_chunks = [ + _chunk(0, "facts"), + _chunk(1, "legal_analysis"), + _chunk(2, "appellant_claims"), + _chunk(3, "ruling"), + _chunk(4, "conclusion"), + ] + calls = _patch_chunks(monkeypatch, all_chunks) + + chunks, used_fallback = _run( + halacha_extractor._select_extractable_chunks(uuid4()), + ) + + assert used_fallback is False + got = sorted(c["section_type"] for c in chunks) + assert got == ["conclusion", "legal_analysis", "ruling"] + # Only the filtered query ran — the unfiltered fallback was never issued. + assert calls == [halacha_extractor.EXTRACTABLE_SECTIONS] + + +def test_fallback_excludes_facts_and_arguments(monkeypatch: pytest.MonkeyPatch) -> None: + """No targeted section → fall back, but never to facts/arguments/intro.""" + all_chunks = [ + _chunk(0, "intro"), + _chunk(1, "facts"), + _chunk(2, "appellant_claims"), + _chunk(3, "respondent_claims"), + _chunk(4, "other"), # reasoning that landed under an unexpected label + _chunk(5, "other"), + ] + calls = _patch_chunks(monkeypatch, all_chunks) + + chunks, used_fallback = _run( + halacha_extractor._select_extractable_chunks(uuid4()), + ) + + assert used_fallback is True + # Only the 'other' chunks survive — facts / arguments / intro are dropped. + assert {c["section_type"] for c in chunks} == {"other"} + assert len(chunks) == 2 + # Both queries ran: the filtered primary (empty), then the unfiltered fallback. + assert calls == [halacha_extractor.EXTRACTABLE_SECTIONS, None] + + +def test_fallback_all_nonreasoning_extracts_nothing(monkeypatch: pytest.MonkeyPatch) -> None: + """A doc that is entirely facts/arguments/intro yields zero candidates — + extraction never runs on the factual background.""" + all_chunks = [ + _chunk(0, "intro"), + _chunk(1, "facts"), + _chunk(2, "appellant_claims"), + _chunk(3, "respondent_claims"), + ] + _patch_chunks(monkeypatch, all_chunks) + + chunks, used_fallback = _run( + halacha_extractor._select_extractable_chunks(uuid4()), + ) + + assert used_fallback is True + assert chunks == [] + + +def test_non_reasoning_set_is_disjoint_from_extractable() -> None: + """The two policy sets must never overlap — a section cannot be both a + reasoning candidate and a confidently-excluded one.""" + assert not ( + set(halacha_extractor.NON_REASONING_SECTIONS) + & set(halacha_extractor.EXTRACTABLE_SECTIONS) + ) -- 2.49.1