From 1286a1e60d46937156ca3bd60c01568f37652387 Mon Sep 17 00:00:00 2001 From: Chaim Date: Sat, 6 Jun 2026 19:55:45 +0000 Subject: [PATCH] feat(halacha): application gate + lexical dedup tail + quality harnesses (#81,#82) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Halacha-extraction quality (#81) and dedup-on-insert (#82) — engine changes (pure + tested) plus measurement/ops tooling. halacha_quality.py - #81.4 application gate: is_fact_dependent() (high-precision "applied to THIS case" deixis per the strict rubric §3/§27) + FLAG_APPLICATION. compute_quality_flags now takes rule_type and flags rule_type=='application' OR fact-dependent — blocking auto-approve (an illustration is not a generalizable holding). - #82.3 lexical tail signal: jaccard_shingles / normalized_levenshtein / lexical_near_duplicate + FLAG_NEAR_DUPLICATE, for the 0.83–0.93 cosine band. halacha_extractor.py — pass rule_type to the flag computation; re-type a binding-labeled fact-application to 'application' (mirrors non_decision→obiter). db.py (store_halachot_for_chunk) — dedup now fetches the nearest same-precedent neighbor once: cosine ≥ DEDUP → skip (unchanged); cosine in [BAND, DEDUP) with high lexical overlap → FLAG_NEAR_DUPLICATE (review, not skip — never drop a possibly-distinct principle unreviewed). config.py — HALACHA_DEDUP_BAND_COSINE (0.83). Scripts: - scripts/halacha_goldset.py (#81.7) — export stratified sample for human tagging; score validators (P/R/F1) against the tags. Backbone for #81.8. - scripts/halacha_batch_reconcile.py (#82.7) — conservative cross-precedent dedup (cosine ≥0.95), dry-run report only. - scripts/calibrate_halacha_dedup.py (#82.1) — calibrate the lexical thresholds against the 2026-06-03 cleanup gold-set. 
Deferred (documented): #82.4 merge-provenance and #82.5 DB ON CONFLICT/UNIQUE on normalized quote are NOT included — the current skip+flag behavior is safe, whereas a UNIQUE on normalized_quote would fail on existing dups and a blind merge risks losing provenance; they need their own chair-reviewed migration. #82.6 over-merge guard is moot until merge lands. #81.6 full rhetorical-role classifier deferred (section pre-filter + application flag cover the practical case); #81.8 blocked on the human-tagged gold-set (harness now provided). Verified: - pytest tests/test_halacha_quality.py — 52 passed (14 new). - calibrate: configured (0.55,0.70) → precision 1.0 (zero false-merge), recall 0.30 — correct profile for an auto-approve-blocking signal. - goldset export: 15-row sample CSV. batch reconcile: 819 halachot → 5 cross-precedent candidate pairs. Invariants: G1 (normalize at source — flag at insert, not at read); §6 (no silent swallow — suspect items flagged to review, never dropped); G2 (no parallel path — same store_halachot_for_chunk / compute_quality_flags). 
Co-Authored-By: Claude Opus 4.8 (1M context) --- mcp-server/src/legal_mcp/config.py | 8 + mcp-server/src/legal_mcp/services/db.py | 32 ++-- .../legal_mcp/services/halacha_extractor.py | 6 + .../src/legal_mcp/services/halacha_quality.py | 93 +++++++++++ mcp-server/tests/test_halacha_quality.py | 72 +++++++++ scripts/SCRIPTS.md | 3 + scripts/calibrate_halacha_dedup.py | 115 ++++++++++++++ scripts/halacha_batch_reconcile.py | 106 +++++++++++++ scripts/halacha_goldset.py | 149 ++++++++++++++++++ 9 files changed, 574 insertions(+), 10 deletions(-) create mode 100644 scripts/calibrate_halacha_dedup.py create mode 100644 scripts/halacha_batch_reconcile.py create mode 100644 scripts/halacha_goldset.py diff --git a/mcp-server/src/legal_mcp/config.py b/mcp-server/src/legal_mcp/config.py index 1049531..2ad66fa 100644 --- a/mcp-server/src/legal_mcp/config.py +++ b/mcp-server/src/legal_mcp/config.py @@ -154,6 +154,14 @@ HALACHA_AUTO_APPROVE_THRESHOLD = float( # principle. Set > 1.0 to disable semantic dedup (exact-quote dedup still runs). HALACHA_DEDUP_COSINE = float(os.environ.get("HALACHA_DEDUP_COSINE", "0.93")) +# Halacha dedup TAIL band (#82.3) — the [BAND_COSINE, DEDUP_COSINE) range is too +# low to auto-skip but suspicious. A halacha whose nearest same-precedent +# neighbor sits in this band AND has high LEXICAL overlap (Jaccard/Levenshtein +# on rule_statement) is flagged 'near_duplicate' (blocks auto-approve → review), +# not skipped — catching paraphrases the cosine threshold misses without +# dropping a possibly-distinct principle unreviewed. 0.83 from the same cleanup. +HALACHA_DEDUP_BAND_COSINE = float(os.environ.get("HALACHA_DEDUP_BAND_COSINE", "0.83")) + # Halacha NLI entailment validator (#81.3) — after extraction, a claude_session # judge checks each halacha's rule_statement is entailed by its supporting_quote. 
# Non-entailed (neutral/contradiction) → quality flag 'nli_unsupported' that diff --git a/mcp-server/src/legal_mcp/services/db.py b/mcp-server/src/legal_mcp/services/db.py index 341ad2e..2ef5ffb 100644 --- a/mcp-server/src/legal_mcp/services/db.py +++ b/mcp-server/src/legal_mcp/services/db.py @@ -3699,6 +3699,7 @@ async def store_halachot_for_chunk( """ threshold = config.HALACHA_AUTO_APPROVE_THRESHOLD dedup_distance = 1.0 - config.HALACHA_DEDUP_COSINE # cosine sim → distance + band_distance = 1.0 - config.HALACHA_DEDUP_BAND_COSINE # tail-band ceiling (#82.3) pool = await get_pool() inserted = 0 skipped = 0 @@ -3722,21 +3723,32 @@ async def store_halachot_for_chunk( if norm_quote and norm_quote in existing_quotes: skipped += 1 continue - # 2) semantic near-duplicate (rule embedding cosine) + # 2) semantic near-duplicate (rule embedding cosine) — fetch the + # nearest same-precedent neighbor once so we can both auto-skip + # (cosine ≥ DEDUP) and flag the lexical tail (#82.3). emb = h.get("embedding") + flags = list(h.get("quality_flags") or []) if emb is not None and config.HALACHA_DEDUP_COSINE <= 1.0: - dup = await conn.fetchval( - "SELECT 1 FROM halachot WHERE case_law_id = $1 " - "AND embedding IS NOT NULL AND (embedding <=> $2) <= $3 " - "LIMIT 1", - case_law_id, emb, dedup_distance, + neighbor = await conn.fetchrow( + "SELECT rule_statement, (embedding <=> $2) AS dist " + "FROM halachot WHERE case_law_id = $1 " + "AND embedding IS NOT NULL " + "ORDER BY embedding <=> $2 LIMIT 1", + case_law_id, emb, ) - if dup: - skipped += 1 - continue + if neighbor is not None: + dist = float(neighbor["dist"]) + if dist <= dedup_distance: + skipped += 1 + continue + # tail band: below auto-skip but lexically near → flag. 
+ if (dist <= band_distance + and halacha_quality.FLAG_NEAR_DUPLICATE not in flags + and halacha_quality.lexical_near_duplicate( + h["rule_statement"], neighbor["rule_statement"])): + flags.append(halacha_quality.FLAG_NEAR_DUPLICATE) confidence = float(h.get("confidence", 0.0)) - flags = h.get("quality_flags") or [] auto_approve = confidence >= threshold and not flags review_status = "approved" if auto_approve else "pending_review" reviewer = ( diff --git a/mcp-server/src/legal_mcp/services/halacha_extractor.py b/mcp-server/src/legal_mcp/services/halacha_extractor.py index f365cd5..af4c22e 100644 --- a/mcp-server/src/legal_mcp/services/halacha_extractor.py +++ b/mcp-server/src/legal_mcp/services/halacha_extractor.py @@ -592,10 +592,16 @@ async def _extract_impl(case_law_id: UUID, force: bool = False, flags = halacha_quality.compute_quality_flags( coerced["rule_statement"], coerced["supporting_quote"], coerced["reasoning_summary"], coerced["quote_verified"], + coerced["rule_type"], ) coerced["quality_flags"] = flags if halacha_quality.FLAG_NON_DECISION in flags and coerced["rule_type"] != "obiter": coerced["rule_type"] = "obiter" + # #81.4 — a binding-labeled rule that reads as a case-application is + # re-typed application (it carries FLAG_APPLICATION either way). + elif (halacha_quality.FLAG_APPLICATION in flags + and coerced["rule_type"] == "binding"): + coerced["rule_type"] = "application" cleaned.append(coerced) # #81.3 NLI entailment — one batched judge call per chunk (fail-open). 
if config.HALACHA_NLI_ENABLED and cleaned: diff --git a/mcp-server/src/legal_mcp/services/halacha_quality.py b/mcp-server/src/legal_mcp/services/halacha_quality.py index 92fc906..e88eda1 100644 --- a/mcp-server/src/legal_mcp/services/halacha_quality.py +++ b/mcp-server/src/legal_mcp/services/halacha_quality.py @@ -128,6 +128,91 @@ def is_thin_restatement(rule_statement: str, supporting_quote: str) -> bool: return overlap >= _THIN_OVERLAP and len_ratio <= _THIN_LEN_RATIO +# ── Fact-dependent application: not a generalizable holding (#81.4) ── +# +# The strict rubric's cut_application (docs/halacha-strict-rubric.md §3, §27): +# a determination that rests on the case's specific facts/parties/amounts is an +# illustration, not a holding — it must not enter the corpus as a binding rule. +# The extractor already classifies ``rule_type='application'``; this is a +# HIGH-PRECISION secondary catch for rules the model mislabeled as binding, +# using only the unambiguous "applied to THIS case" deixis (bare party words +# like "המערער" appear in genuine rules too, so they are deliberately excluded). + +_FACT_DEPENDENT_MARKERS = ( + "במקרה דנן", + "במקרה שבפנינו", + "במקרה שלפנינו", + "במקרה שלפניי", + "בענייננו", + "בנדון דידן", + "בנדון דנן", + "במקרה שלנו", + "בנסיבות המקרה שלפנינו", + "בנסיבות תיק זה", + "בתיק שלפנינו", + "בערר שלפנינו", + "בערר דנן", +) + + +def is_fact_dependent(rule_statement: str) -> bool: + """True when the rule is phrased as an application to THIS case (not a holding).""" + norm = normalize_text(rule_statement) + return any(marker in norm for marker in _FACT_DEPENDENT_MARKERS) + + +# ── Lexical near-duplicate signal (the 0.83–0.90 cosine tail) — #82.3 ── +# +# Embedding cosine alone misses paraphrases that float just below the dedup +# threshold (0.93). A secondary lexical signal — Jaccard over word-shingles + +# normalized Levenshtein on the rule_statement — catches "same rule, reworded" +# in that band without lowering the global cosine threshold. 
# Hybrid lexical+semantic beats either alone (arXiv:1805.11611). Pure functions.

# Word separator for shingling: any run of characters that is neither a Hebrew
# letter (א-ת) nor an ASCII digit. Compiled once instead of on every call.
_WORD_SPLIT_RE = re.compile(r"[^א-ת0-9]+")


def _shingles(text: str, k: int = 2) -> set[str]:
    """Return the set of ``k``-word shingles of ``text``.

    The text is first passed through ``normalize_text``; words are then the
    maximal runs of Hebrew letters / digits, so punctuation, whitespace and
    Latin letters all act as separators. A text with fewer than ``k`` words
    yields one shingle holding all of its words; a text with no words yields
    the empty set.

    Raises:
        ValueError: if ``k`` is smaller than 1. (``k == 0`` used to produce the
            single shingle ``""`` for every non-empty text, which made any two
            texts look identical to ``jaccard_shingles``.)
    """
    if k < 1:
        raise ValueError(f"shingle size k must be >= 1, got {k}")
    words = [w for w in _WORD_SPLIT_RE.split(normalize_text(text)) if w]
    if len(words) < k:
        return {" ".join(words)} if words else set()
    return {" ".join(words[i : i + k]) for i in range(len(words) - k + 1)}


def jaccard_shingles(a: str, b: str, k: int = 2) -> float:
    """Jaccard similarity (0.0–1.0) of the ``k``-word shingle sets of ``a`` and ``b``.

    Returns 0.0 when either text has no shingles at all, so two empty texts
    are NOT reported as similar.

    Raises:
        ValueError: if ``k`` is smaller than 1.
    """
    sa, sb = _shingles(a, k), _shingles(b, k)
    if not sa or not sb:
        return 0.0
    return len(sa & sb) / len(sa | sb)


def normalized_levenshtein(a: str, b: str) -> float:
    """Character-level similarity: ``1 - edit_distance / max(len(a), len(b))``.

    Both inputs go through ``normalize_text`` first. 1.0 means identical after
    normalization (including both empty); 0.0 means fully different (including
    exactly one side empty).
    """
    a, b = normalize_text(a), normalize_text(b)
    if a == b:
        # Identical (or both empty) — skip the quadratic DP entirely.
        return 1.0
    if not a or not b:
        return 0.0
    # Edit distance is symmetric; keep the DP row on the shorter string so the
    # working memory is O(min(len(a), len(b))).
    if len(b) > len(a):
        a, b = b, a
    # classic DP edit distance (rule_statements are short — a few hundred chars)
    prev = list(range(len(b) + 1))
    for i, ca in enumerate(a, 1):
        cur = [i]
        for j, cb in enumerate(b, 1):
            cur.append(min(prev[j] + 1, cur[j - 1] + 1, prev[j - 1] + (ca != cb)))
        prev = cur
    # After the swap above len(a) is the longer length.
    return 1.0 - prev[-1] / len(a)


_LEX_JACCARD_MIN = 0.55
_LEX_LEVENSHTEIN_MIN = 0.70


def lexical_near_duplicate(
    a: str, b: str, jaccard_min: float = _LEX_JACCARD_MIN,
    levenshtein_min: float = _LEX_LEVENSHTEIN_MIN,
) -> bool:
    """High lexical overlap → likely the same rule reworded (for the cosine tail).

    True when EITHER signal clears its threshold: 2-word-shingle Jaccard
    >= ``jaccard_min`` or normalized Levenshtein >= ``levenshtein_min``.
    The Levenshtein pass only runs when the Jaccard test fails.
    """
    return (jaccard_shingles(a, b) >= jaccard_min
            or normalized_levenshtein(a, b) >= levenshtein_min)


# ── Aggregate ──

FLAG_NON_DECISION = "non_decision"
FLAG_TRUNCATED_QUOTE = "truncated_quote"
FLAG_THIN_RESTATEMENT = "thin_restatement"
FLAG_QUOTE_UNVERIFIED = "quote_unverified"
FLAG_NLI_UNSUPPORTED = "nli_unsupported"  # rule not entailed by its quote (#81.3)
FLAG_APPLICATION = "application"  # fact-dependent, not a holding (#81.4)
FLAG_NEAR_DUPLICATE = "near_duplicate"  # cosine-tail lexical dup (#82.3)


# ── NLI entailment check (rule_statement ⊨ supporting_quote) — #81.3
── @@ -250,6 +337,7 @@ def compute_quality_flags( supporting_quote: str, reasoning_summary: str = "", quote_verified: bool = True, + rule_type: str = "binding", ) -> list[str]: """Return the list of quality flags for one halacha (empty == clean). @@ -264,4 +352,9 @@ def compute_quality_flags( flags.append(FLAG_THIN_RESTATEMENT) if not quote_verified: flags.append(FLAG_QUOTE_UNVERIFIED) + # #81.4 — an application (fact-dependent) item is an illustration, not a + # generalizable holding: never auto-approve it. Trust the model's + # rule_type='application' and add a high-precision deixis catch. + if rule_type == "application" or is_fact_dependent(rule_statement): + flags.append(FLAG_APPLICATION) return flags diff --git a/mcp-server/tests/test_halacha_quality.py b/mcp-server/tests/test_halacha_quality.py index 7128ae0..a84a813 100644 --- a/mcp-server/tests/test_halacha_quality.py +++ b/mcp-server/tests/test_halacha_quality.py @@ -181,3 +181,75 @@ def test_consolidation_priority_prefers_approved_then_confidence(): "quote_verified": True, "rule_statement": "x"} # approved sorts before higher-confidence pending → kept as canonical assert min([approved, pending_hi], key=he._consolidation_priority)["id"] == "a" + + +# ── #81.4 fact-dependent / application ── + +@pytest.mark.parametrize("rule", [ + "במקרה דנן ועדת הערר קבעה כי ההיתר בטל", + "בענייננו אין הצדקה לפיצוי", + "בערר שלפנינו הוכח כי השומה שגויה", +]) +def test_is_fact_dependent_hits(rule): + assert hq.is_fact_dependent(rule) is True + + +@pytest.mark.parametrize("rule", [ + "ועדת הערר מוסמכת לדון בהיטל השבחה", + "נטל ההוכחה מוטל על המבקש", + "פגיעה תכנונית מזכה בפיצוי לפי סעיף 197", +]) +def test_is_fact_dependent_misses(rule): + assert hq.is_fact_dependent(rule) is False + + +def test_application_flag_from_rule_type(): + flags = hq.compute_quality_flags( + "נטל ההוכחה על המבקש", "נטל ההוכחה על המבקש כאמור", + rule_type="application", + ) + assert hq.FLAG_APPLICATION in flags + + +def 
test_application_flag_from_deixis_even_if_binding(): + flags = hq.compute_quality_flags( + "במקרה דנן נדחה הערר", "כפי שקבענו במקרה דנן נדחה הערר", + rule_type="binding", + ) + assert hq.FLAG_APPLICATION in flags + + +def test_clean_binding_rule_has_no_flags(): + flags = hq.compute_quality_flags( + "ועדת הערר מוסמכת לדון בטענות חוקתיות הנוגעות לתכנית", + "הוועדה מוסמכת לדון אף בטענות מסוג זה, ככל שהן נוגעות לתכנית שבנדון.", + rule_type="binding", + ) + assert flags == [] + + +# ── #82.3 lexical near-duplicate signal ── + +def test_jaccard_high_for_reworded_same_rule(): + a = "נטל ההוכחה בהיטל השבחה מוטל על הוועדה המקומית" + b = "נטל ההוכחה בהיטל השבחה מוטל על הוועדה המקומית בלבד" + assert hq.jaccard_shingles(a, b) >= 0.5 + + +def test_jaccard_low_for_distinct_rules(): + a = "ועדת הערר מוסמכת לדון בהיטל השבחה" + b = "המועד להגשת ערר הוא שלושים יום" + assert hq.jaccard_shingles(a, b) < 0.2 + + +def test_normalized_levenshtein_identical_and_disjoint(): + assert hq.normalized_levenshtein("אבג", "אבג") == 1.0 + assert hq.normalized_levenshtein("", "אבג") == 0.0 + + +def test_lexical_near_duplicate_band(): + a = "נטל ההוכחה בהיטל השבחה מוטל על הוועדה המקומית" + b = "נטל ההוכחה בהיטל השבחה מוטל על הוועדה המקומית, כך נפסק" + assert hq.lexical_near_duplicate(a, b) is True + c = "המועד להגשת ערר על שומה הוא שלושים ימים" + assert hq.lexical_near_duplicate(a, c) is False diff --git a/scripts/SCRIPTS.md b/scripts/SCRIPTS.md index 0bad63a..4f9d73d 100644 --- a/scripts/SCRIPTS.md +++ b/scripts/SCRIPTS.md @@ -38,6 +38,9 @@ | `rechunk_legacy_precedents.py` | python | **#57** — re-chunk + re-embed פסיקה שהוטמעה לפני תיקון ה-chunker (#55). בוחר כל `case_law` עם chunk זעיר (`length(trim(content))<50` — טביעת-האצבע של ה-chunker הישן) ומריץ `ingest.reindex_case_law` (re-chunk+re-embed מ-`full_text` שמור בלבד — ללא re-OCR/LLM, feedback_no_reocr_retrofit; idempotent DELETE-then-INSERT). idempotent ברמת-הבאטץ' (שואב מחדש את הסט המושפע בכל ריצה). דגל `--limit N`. 
רץ עם venv של mcp-server (`cd mcp-server && .venv/bin/python ../scripts/rechunk_legacy_precedents.py`) | חד-פעמי — מיגרציית-נתונים של פסיקה legacy (תוקן 2026-06-03) | | `backfill_nevo_preamble.py` | python | **#86.2** — מיגרציית-נתונים: חיתוך preamble/רציו של נבו שדלף לפסיקה שהוטמעה לפני תיקון #86.1. מאתר כל `case_law` ש-`strip_nevo_preamble(full_text)` עדיין מקצר (דליפה היסטורית), ומבצע: (1) לכידת ה-מיני-רציו ל-`case_law.nevo_ratio` (gold-set ל-#86.3); (2) שכתוב `full_text` החתוך + חישוב-מחדש של `content_hash`; (3) `reindex_case_law` (re-chunk+embed, ללא re-OCR/LLM); (4) **סימון (לא מחיקה)** הלכות ש-`supporting_quote` שלהן בתוך ה-preamble שהוסר → `pending_review` + quality_flag `nevo_preamble_leak`. **שומר-בטיחות:** שורות עם keep%<`--min-keep` (ברירת-מחדל 60) מוחרגות מ-`--apply` כחשד over-strip (אלא אם `--include-suspicious`). **dry-run כברירת-מחדל**; `--apply` כותב backup JSON + manifest CSV ל-`data/audit/` תחילה. idempotent. רץ עם venv של mcp-server. **chair-gated** (לאמת manifest לפני apply) | מיגרציית-נתונים — dry-run בוצע (19 פסקים, 27 הלכות מזוהמות); apply ממתין לאישור | | `nevo_ratio_benchmark.py` | python | **#86.3** — מדידת איכות חילוץ-הלכות מול ה-מיני-רציו של נבו (gold-set מקצועי חינמי). לכל פסק עם `nevo_ratio` (או נגזר מ-`full_text` אם טרם בוצע backfill): LLM-judge מקומי (`claude_session`, אפס עלות) ממפה סמנטית את הלכות-המערכת מול הלכות-נבו ומפיק **recall** (כיסוי הלכות-נבו), **precision** (אחוז הלכותינו הממופות), **granularity** (יחס פירוק — איתות over-extraction ל-#81.5). `--case ` / `--all [--limit N]` / `--model` / `--out`. כותב CSV ל-`data/audit/`. רץ עם venv של mcp-server (דורש Claude CLI מקומי). אומת על בג"ץ 1764/05: recall 0.875, precision 1.0, granularity 1.75x | ידני — מדידת-איכות (CI/ad-hoc) | +| `halacha_goldset.py` | python | **#81.7** — הארנס gold-set לאיכות חילוץ-הלכות. `export --n N` מייצא מדגם מרובד (לפי precedent×rule_type) ל-CSV עם עמודות-תיוג ריקות (`is_holding`/`correct_type`/`quote_complete`) לתיוג ידני (חיים/דפנה). 
`score --in ` קורא את ה-CSV המתויג ומודד כל ולידטור (`compute_quality_flags`/`is_fact_dependent`/`is_quote_truncated`/`is_thin_restatement`) מול אמת-המידה האנושית: P/R/F1 + confusion. בסיס ל-#81.8 (כיול סף האישור). מייבא את אותם ולידטורים שה-extractor מריץ. רץ עם venv של mcp-server | ידני — export→תיוג→score | +| `halacha_batch_reconcile.py` | python | **#82.7** — dedup חוצה-פסקים offline (שמרני, **dry-run בלבד**). dedup-on-insert משווה רק תוך-פסק; כאן סף מחמיר (cosine ≥0.95, `--cosine`) ולא-הרסני: מאתר זוגות הלכות near-duplicate בין פסקים שונים (pgvector `<=>` exact) עם איתות לקסיקלי (Jaccard/Levenshtein) ומדווח ל-CSV ב-`data/audit/` לסקירת היו"ר. לא מדלג/ממזג/מוחק. `--include-pending`. רץ עם venv של mcp-server. אומת: 819 הלכות → 5 זוגות מועמדים | ידני — דוח-סקירה | +| `calibrate_halacha_dedup.py` | python | **#82.1** — כיול ספי ה-dedup הלקסיקלי (#82.3) מול gold-set הניקוי. קורא `halacha-cleanup-manifest-*.csv` (זוגות duplicate↔survivor מתויגי-אדם), טוען טקסט-survivor מה-DB, ו-sweep של (jaccard_min × levenshtein_min) עם P/R/F1, מסמן את נקודת-העבודה המוגדרת. אימת ש-(0.55, 0.70) → **precision 1.0** (אפס false-merge), recall 0.30 — מתאים לאיתות-משני שחוסם auto-approve. `--manifest `. רץ עם venv של mcp-server | חד-פעמי — כיול (בוצע 2026-06-06) | | `audit_corpus_integrity.py` | python | בדיקה תקופתית של עקביות הקורפוס — 3 בדיקות SQL read-only על `case_law` ו-`cases`: (A) `external_upload` עם prefix פנימי `ערר`/`בל"מ`; (B) `internal_committee` חסר `chair_name`/`district`; (C) `cases.practice_area` מחוץ ל-{`rishuy_uvniya`, `betterment_levy`, `compensation_197`, `''`}. כותב log מצטבר ל-`data/logs/corpus_integrity_audit.log` ובמצב הפרות שולח wakeup ל-CEO ב-Paperclip (best-effort, רק אם `PAPERCLIP_API_URL`+`PAPERCLIP_API_KEY` מוגדרים). דגל: `--no-notify`. Idempotent, יוצא 0. 
**Cron יומי 07:00**: `0 7 * * * /home/chaim/legal-ai/mcp-server/.venv/bin/python /home/chaim/legal-ai/scripts/audit_corpus_integrity.py` | `0 7 * * *` (cron) | | `backfill_legal_arguments.py` | python | Backfill `legal_arguments` לתיקים עם `claims` קיימים (TaskMaster #36). מקבץ פרופוזיציות גולמיות לטיעונים משפטיים מובחנים (~6-12 לכל צד) דרך `argument_aggregator.aggregate_claims_to_arguments` (Claude CLI). תומך `--dry-run`/`--apply`/`--force`/`--case ...`. **חייב לרוץ מהמכונה המקומית** (לא קונטיינר) — `claude_session` דורש Claude CLI | ידני per-case (`python scripts/backfill_legal_arguments.py --apply --case 1017-03-26`) | | `upload_blam_decisions.py` | python | חד-פעמי (2026-05-26) — העלאת 2 החלטות בל"מ ל-`case_law` (8126/24 סופר נוח, 8047/23 הרנון) דרך `ingest_internal_decision` ישיר, עוקף MCP server שטרם נטען מחדש אחרי הוספת `proceeding_type`. **לא להריץ שוב** | חד-פעמי — להעביר ל-`.archive/` בהזדמנות | diff --git a/scripts/calibrate_halacha_dedup.py b/scripts/calibrate_halacha_dedup.py new file mode 100644 index 0000000..147ec78 --- /dev/null +++ b/scripts/calibrate_halacha_dedup.py @@ -0,0 +1,115 @@ +#!/usr/bin/env python3 +"""#82.1 — calibrate the lexical dedup thresholds against the cleanup gold-set. + +The 2026-06-03 cleanup manifest (data/audit/halacha-cleanup-manifest-*.csv) +records, for each removed halacha, a ``reason`` and a ``survivor_id`` — i.e. a +human-labeled set of TRUE duplicate pairs (deleted rule ↔ its survivor). This +script uses them to validate the lexical near-duplicate thresholds introduced +in #82.3 (``HALACHA`` Jaccard/Levenshtein), so the numbers in +``halacha_quality.lexical_near_duplicate`` are calibrated, not guessed. + +It sweeps (jaccard_min × levenshtein_min) and reports precision/recall against: + * positives — duplicate-labeled pairs (deleted rule ↔ survivor rule) + * negatives — random non-paired rules from the same manifest (≈all distinct) + +and marks the currently-configured operating point. 
+ + cd ~/legal-ai/mcp-server + .venv/bin/python ../scripts/calibrate_halacha_dedup.py \ + --manifest ../data/audit/halacha-cleanup-manifest-20260603T101747Z.csv +""" +from __future__ import annotations + +import argparse +import asyncio +import csv +import sys +from pathlib import Path +from uuid import UUID + +from legal_mcp.services import db, halacha_quality as hq + + +async def _survivor_text(survivor_id: str, manifest_map: dict) -> str: + if survivor_id in manifest_map: + return manifest_map[survivor_id] + try: + row = await db.get_halacha(UUID(survivor_id)) if hasattr(db, "get_halacha") else None + except Exception: + row = None + if row: + return row.get("rule_statement", "") + # fallback: direct query + try: + pool = await db.get_pool() + r = await pool.fetchrow("SELECT rule_statement FROM halachot WHERE id = $1", UUID(survivor_id)) + return r["rule_statement"] if r else "" + except Exception: + return "" + + +async def main(args: argparse.Namespace) -> int: + path = Path(args.manifest) + if not path.is_absolute(): + path = (Path.cwd() / path).resolve() + with path.open(encoding="utf-8") as f: + rows = list(csv.DictReader(f)) + by_id = {r["id"]: r.get("rule_statement", "") for r in rows} + + positives: list[tuple[str, str]] = [] + for r in rows: + if "duplicate" in (r.get("reason") or "").lower() and r.get("survivor_id"): + a = r.get("rule_statement", "") + b = await _survivor_text(r["survivor_id"], by_id) + if a and b: + positives.append((a, b)) + + # negatives: pair each deleted rule with a different, non-survivor rule. 
+ rules = [r.get("rule_statement", "") for r in rows if r.get("rule_statement")] + negatives: list[tuple[str, str]] = [] + for i in range(len(positives)): + a = rules[i % len(rules)] + b = rules[(i * 7 + 3) % len(rules)] # deterministic spread, no RNG + if a and b and a != b: + negatives.append((a, b)) + + print(f"positives (labeled dup pairs): {len(positives)} " + f"negatives: {len(negatives)}", flush=True) + if not positives: + print("no labeled duplicate pairs found in manifest — cannot calibrate", flush=True) + return 1 + + # precompute lexical scores per pair + def scores(pairs): + return [(hq.jaccard_shingles(a, b), hq.normalized_levenshtein(a, b)) for a, b in pairs] + pos_s, neg_s = scores(positives), scores(negatives) + + print(f"\n{'jac_min':>8}{'lev_min':>8}{'P':>8}{'R':>8}{'F1':>8}", flush=True) + best = None + for jm in (0.40, 0.45, 0.50, 0.55, 0.60, 0.65, 0.70): + for lm in (0.60, 0.65, 0.70, 0.75, 0.80, 0.85): + tp = sum(1 for j, l in pos_s if j >= jm or l >= lm) + fp = sum(1 for j, l in neg_s if j >= jm or l >= lm) + fn = len(pos_s) - tp + p = tp / (tp + fp) if (tp + fp) else 0.0 + r = tp / (tp + fn) if (tp + fn) else 0.0 + f1 = 2 * p * r / (p + r) if (p + r) else 0.0 + mark = " <- configured" if (abs(jm - hq._LEX_JACCARD_MIN) < 1e-9 + and abs(lm - hq._LEX_LEVENSHTEIN_MIN) < 1e-9) else "" + if mark: + print(f"{jm:>8.2f}{lm:>8.2f}{p:>8.3f}{r:>8.3f}{f1:>8.3f}{mark}", flush=True) + if best is None or f1 > best[0]: + best = (f1, jm, lm, p, r) + print(f"\nbest F1={best[0]:.3f} at jaccard_min={best[1]}, levenshtein_min={best[2]} " + f"(P={best[3]:.3f}, R={best[4]:.3f})", flush=True) + print("note: positives may include obiter/application cuts (not pure dups); " + "use precision as the guard against false-merges.", flush=True) + return 0 + + +if __name__ == "__main__": + ap = argparse.ArgumentParser(description=__doc__, + formatter_class=argparse.RawDescriptionHelpFormatter) + ap.add_argument("--manifest", required=True, help="path to 
halacha-cleanup-manifest-*.csv") + args = ap.parse_args() + sys.exit(asyncio.run(main(args))) diff --git a/scripts/halacha_batch_reconcile.py b/scripts/halacha_batch_reconcile.py new file mode 100644 index 0000000..7a82cf5 --- /dev/null +++ b/scripts/halacha_batch_reconcile.py @@ -0,0 +1,106 @@ +#!/usr/bin/env python3 +"""#82.7 — offline CROSS-precedent halacha dedup (conservative, dry-run reporter). + +Dedup-on-insert (db.store_halachot_for_chunk) only compares within a single +precedent — the 2026-06-03 audit showed cosine ≥0.90 is reliable only +within-precedent. Across precedents the same principle legitimately recurs, so +this batch job is deliberately STRICTER (cosine ≥0.95) and NON-DESTRUCTIVE: it +only reports candidate cross-precedent near-duplicate pairs to a CSV for the +chair to review. Nothing is skipped, merged, or deleted. + +Pairs are found with pgvector's exact cosine (``<=>``) per halacha against +halachot in OTHER precedents; a secondary lexical check (Jaccard/Levenshtein) +is reported alongside so the reviewer can tell "same rule" from "same topic". 
+ + cd ~/legal-ai/mcp-server + .venv/bin/python ../scripts/halacha_batch_reconcile.py # cosine ≥0.95 + .venv/bin/python ../scripts/halacha_batch_reconcile.py --cosine 0.97 +""" +from __future__ import annotations + +import argparse +import asyncio +import csv +import sys +from datetime import datetime, timezone +from pathlib import Path + +from legal_mcp.services import db, halacha_quality as hq + +REPO_ROOT = Path(__file__).resolve().parent.parent +AUDIT_DIR = REPO_ROOT / "data" / "audit" + + +async def main(args: argparse.Namespace) -> int: + cosine = args.cosine + max_dist = 1.0 - cosine + statuses = ("approved", "published") if not args.include_pending else ( + "approved", "published", "pending_review") + + pool = await db.get_pool() + async with pool.acquire() as conn: + rows = await conn.fetch( + "SELECT h.id, h.case_law_id, cl.case_number, h.rule_statement " + "FROM halachot h JOIN case_law cl ON cl.id = h.case_law_id " + "WHERE h.embedding IS NOT NULL AND h.review_status = ANY($1::text[]) " + "ORDER BY h.case_law_id, h.halacha_index", + list(statuses), + ) + print(f"scanning {len(rows)} halachot for cross-precedent pairs " + f"(cosine ≥ {cosine})...", flush=True) + + seen: set[frozenset] = set() + pairs: list[dict] = [] + for r in rows: + # nearest neighbor in a DIFFERENT precedent + nb = await conn.fetchrow( + "SELECT h2.id, cl2.case_number, h2.rule_statement, " + " (h2.embedding <=> (SELECT embedding FROM halachot WHERE id = $1)) AS dist " + "FROM halachot h2 JOIN case_law cl2 ON cl2.id = h2.case_law_id " + "WHERE h2.embedding IS NOT NULL AND h2.case_law_id <> $2 " + " AND h2.review_status = ANY($3::text[]) " + "ORDER BY h2.embedding <=> (SELECT embedding FROM halachot WHERE id = $1) " + "LIMIT 1", + r["id"], r["case_law_id"], list(statuses), + ) + if nb is None or float(nb["dist"]) > max_dist: + continue + key = frozenset({str(r["id"]), str(nb["id"])}) + if key in seen: + continue + seen.add(key) + pairs.append({ + "case_a": r["case_number"], "id_a": 
r["id"], "rule_a": r["rule_statement"], + "case_b": nb["case_number"], "id_b": nb["id"], "rule_b": nb["rule_statement"], + "cosine": round(1.0 - float(nb["dist"]), 4), + "jaccard": round(hq.jaccard_shingles(r["rule_statement"], nb["rule_statement"]), 3), + "levenshtein": round(hq.normalized_levenshtein(r["rule_statement"], nb["rule_statement"]), 3), + }) + + pairs.sort(key=lambda p: -p["cosine"]) + print(f"found {len(pairs)} cross-precedent candidate pair(s)", flush=True) + for p in pairs[:30]: + print(f" cos={p['cosine']} jac={p['jaccard']} lev={p['levenshtein']} " + f"{p['case_a']} ↔ {p['case_b']}: {p['rule_a'][:60]}...", flush=True) + + if pairs: + ts = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ") + AUDIT_DIR.mkdir(parents=True, exist_ok=True) + out = AUDIT_DIR / f"halacha-cross-precedent-{ts}.csv" + with out.open("w", encoding="utf-8", newline="") as f: + w = csv.DictWriter(f, fieldnames=list(pairs[0].keys())) + w.writeheader() + w.writerows(pairs) + print(f"\nreport: {out} (review-only — nothing changed)", flush=True) + return 0 + + +if __name__ == "__main__": + ap = argparse.ArgumentParser(description=__doc__, + formatter_class=argparse.RawDescriptionHelpFormatter) + ap.add_argument("--cosine", type=float, default=0.95, + help="min cosine for a cross-precedent candidate (default 0.95)") + ap.add_argument("--include-pending", action="store_true", + help="also scan pending_review halachot (default: approved/published only)") + args = ap.parse_args() + sys.exit(asyncio.run(main(args))) diff --git a/scripts/halacha_goldset.py b/scripts/halacha_goldset.py new file mode 100644 index 0000000..8c98e07 --- /dev/null +++ b/scripts/halacha_goldset.py @@ -0,0 +1,149 @@ +#!/usr/bin/env python3 +"""#81.7 — gold-set harness for halacha-extraction quality. 
+ +Two modes — the human tagging in between is the only manual step: + + export — dump a stratified sample of halachot to a CSV with EMPTY label + columns for חיים/דפנה to fill (is_holding, correct_type, + quote_complete). Stratified across precedents and rule_types so + the set isn't dominated by one ruling. + + score — read the tagged CSV back and measure each pure validator + (compute_quality_flags / is_fact_dependent / is_quote_truncated / + is_thin_restatement) against the human labels: precision, recall, + F1 per validator + a confusion summary. This is the ground-truth + #81.8 needs to recalibrate the auto-approve threshold. + +The validators here are the SAME ones the live extractor runs, imported +directly — so the score reflects production behavior, not a reimplementation. + + cd ~/legal-ai/mcp-server + .venv/bin/python ../scripts/halacha_goldset.py export --n 150 + # ... חיים/דפנה fill is_holding / correct_type / quote_complete ... + .venv/bin/python ../scripts/halacha_goldset.py score --in data/audit/halacha-goldset-.csv +""" +from __future__ import annotations + +import argparse +import asyncio +import csv +import sys +from collections import defaultdict +from datetime import datetime, timezone +from pathlib import Path + +from legal_mcp.services import db, halacha_quality as hq + +REPO_ROOT = Path(__file__).resolve().parent.parent +AUDIT_DIR = REPO_ROOT / "data" / "audit" + +# Columns the human fills. is_holding: 1 if a real generalizable holding, 0 if +# obiter/application/fact-recitation/non-rule. correct_type: binding/interpretive/ +# obiter/application. quote_complete: 1 if the quote is a whole, untruncated span. 
LABEL_COLS = ["is_holding", "correct_type", "quote_complete"]
EXPORT_COLS = [
    "id", "case_number", "halacha_index", "rule_type", "review_status",
    "confidence", "rule_statement", "supporting_quote", *LABEL_COLS,
]

# Label values (after strip + lower-casing) that mean "no".
_FALSE_LABELS = ("0", "false")


def _label(row: dict, col: str) -> str:
    """Return the human label in ``col`` stripped and lower-cased ('' if absent)."""
    return (row.get(col) or "").strip().lower()


async def _export(n: int) -> int:
    """Write a stratified sample of up to ``n`` halachot to a CSV for tagging.

    Rows come from ``db.list_halachot(limit=5000)`` and are bucketed by
    (case_law_id, rule_type). The sample is drawn round-robin — one row per
    bucket per pass, in bucket insertion order — so no single precedent or
    rule type dominates. Label columns are written empty. Always returns 0.
    """
    rows = await db.list_halachot(limit=5000)
    # stratify: round-robin across (case_law_id, rule_type) buckets.
    buckets: dict = defaultdict(list)
    for r in rows:
        buckets[(r["case_law_id"], r.get("rule_type"))].append(r)

    sample: list[dict] = []
    live = [b for b in buckets.values() if b]
    # Each pass takes one row from every non-empty bucket; exhausted buckets
    # are dropped between passes, so the loop ends exactly when the sample is
    # full or the data runs out (no arbitrary iteration cap that could cut a
    # large sample short).
    while live and len(sample) < n:
        for bucket in live:
            if len(sample) >= n:
                break
            sample.append(bucket.pop())
        live = [b for b in live if b]

    ts = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
    AUDIT_DIR.mkdir(parents=True, exist_ok=True)
    out = AUDIT_DIR / f"halacha-goldset-{ts}.csv"
    with out.open("w", encoding="utf-8", newline="") as f:
        w = csv.DictWriter(f, fieldnames=EXPORT_COLS, extrasaction="ignore")
        w.writeheader()
        for r in sample:
            w.writerow({**{k: r.get(k, "") for k in EXPORT_COLS},
                        **{lc: "" for lc in LABEL_COLS}})
    print(f"exported {len(sample)} halachot for tagging → {out}", flush=True)
    print(f"fill columns: {', '.join(LABEL_COLS)} (is_holding/quote_complete = 1/0)", flush=True)
    return 0


def _prf(tp: int, fp: int, fn: int) -> tuple[float, float, float]:
    """Return (precision, recall, F1) rounded to 3 places; 0.0 where undefined."""
    p = tp / (tp + fp) if (tp + fp) else 0.0
    r = tp / (tp + fn) if (tp + fn) else 0.0
    f1 = 2 * p * r / (p + r) if (p + r) else 0.0
    return round(p, 3), round(r, 3), round(f1, 3)


def _score(path: Path) -> int:
    """Score the validators against a human-tagged gold-set CSV.

    Only rows with a non-empty ``is_holding`` label are scored. Each validator
    flag is treated as a prediction of "NOT a clean holding" and compared with
    ``is_holding`` in ('0', 'false'); quote truncation is compared with the
    ``quote_complete`` label instead. Prints P/R/F1 and the confusion counts
    per validator. Returns 1 when there is nothing to score, otherwise 0.
    """
    # newline="" lets the csv module handle newlines embedded in quoted rule /
    # quote text; utf-8-sig tolerates a BOM added by a spreadsheet editor
    # during the manual tagging step (and reads plain UTF-8 unchanged).
    with path.open(encoding="utf-8-sig", newline="") as f:
        rows = [r for r in csv.DictReader(f) if _label(r, "is_holding") != ""]
    if not rows:
        print("no labeled rows (is_holding empty everywhere) — nothing to score", flush=True)
        return 1

    # A validator FLAG is a prediction of "NOT a clean holding" (should be
    # rejected/reviewed). Ground truth NOT-holding = is_holding == 0.
    # We score each validator as a detector of not-holding.
    counters: dict[str, dict[str, int]] = defaultdict(lambda: {"tp": 0, "fp": 0, "fn": 0, "tn": 0})

    def tally(name: str, predicted_bad: bool, truly_bad: bool) -> None:
        c = counters[name]
        if predicted_bad and truly_bad:
            c["tp"] += 1
        elif predicted_bad and not truly_bad:
            c["fp"] += 1
        elif not predicted_bad and truly_bad:
            c["fn"] += 1
        else:
            c["tn"] += 1

    for r in rows:
        rule = r.get("rule_statement", "")
        quote = r.get("supporting_quote", "")
        rtype = r.get("rule_type", "binding")
        truly_not_holding = _label(r, "is_holding") in _FALSE_LABELS

        # The human quote_complete label is ground truth, not a validator
        # input: feeding it in as quote_verified would leak the label into the
        # predicted flags. The export has no quote_verified column, so use the
        # validator's own default (True).
        flags = hq.compute_quality_flags(rule, quote, "", quote_verified=True, rule_type=rtype)
        tally("any_flag", bool(flags), truly_not_holding)
        tally("application", hq.FLAG_APPLICATION in flags, truly_not_holding)
        tally("non_decision", hq.FLAG_NON_DECISION in flags, truly_not_holding)
        tally("thin_restatement", hq.FLAG_THIN_RESTATEMENT in flags, truly_not_holding)
        # quote-truncation scored against quote_complete label specifically;
        # a row without that label carries no ground truth and is left out.
        quote_label = _label(r, "quote_complete")
        if quote_label:
            tally("truncated_quote", hq.is_quote_truncated(quote), quote_label in _FALSE_LABELS)

    print(f"scored {len(rows)} labeled halachot\n", flush=True)
    print(f"{'validator':<18}{'P':>7}{'R':>7}{'F1':>7} tp/fp/fn/tn", flush=True)
    for name, c in counters.items():
        p, rec, f1 = _prf(c["tp"], c["fp"], c["fn"])
        print(f"{name:<18}{p:>7}{rec:>7}{f1:>7} "
              f"{c['tp']}/{c['fp']}/{c['fn']}/{c['tn']}", flush=True)
    return 0


async def main(args: argparse.Namespace) -> int:
    """Dispatch to ``export`` (async, hits the DB) or ``score`` (local CSV only)."""
    if args.mode == "export":
        return await _export(args.n)
    return _score(Path(args.infile))


if __name__ == "__main__":
    ap = argparse.ArgumentParser(description=__doc__,
                                 formatter_class=argparse.RawDescriptionHelpFormatter)
    sub = ap.add_subparsers(dest="mode", required=True)
    pe = sub.add_parser("export", help="dump a sample CSV for human tagging")
    pe.add_argument("--n", type=int, default=150, help="sample size (default 150)")
    ps = sub.add_parser("score", help="measure validators against a tagged CSV")
    ps.add_argument("--in", dest="infile", required=True, help="tagged CSV path")
    args = ap.parse_args()
    sys.exit(asyncio.run(main(args)))