diff --git a/docs/spec/07-learning.md b/docs/spec/07-learning.md index 0eae7eb..8929e67 100644 --- a/docs/spec/07-learning.md +++ b/docs/spec/07-learning.md @@ -228,6 +228,27 @@ Dimensions for Data Quality* (2013) · ISO 8000 (Data quality) | סטטוס: ver (`/precedents`, PR#300) לאישור-יו"ר; CSV-audit ב-`data/audit/canonical-synthesis-*.csv`. **הפרה ידועה:** — (חדש) +### INV-LRN7: חילוץ-עקרונות מגודר-פאנל + טרמינולוגיה נכונה (#152 → G2/G10/INV-AH) +**כלל:** חילוץ עקרונות-משפטיים מפסיקה (להבא ורטרואקטיבית) עובר משטר-פאנל אחיד: +**3 מודלים עצמאיים** (Claude מקומי + DeepSeek + Gemini) מנתחים לעומק כל החלטה, +מציעים מועמדים עם ציון, המועמדים מותאמים בין-מודלית (cosine), ולכל אחד `votes` +(# מודלים) ו-`score` (ממוצע-המצביעים). **כלל-אישור:** 3 קולות→אישור · 2 וציון≥0.85→ +אישור · 2 ו<0.85→`pending_review` (יו"ר, G10) · ≤1→נדחה. **תקרה:** עד +`HALACHA_PANEL_MAX_NEW`=5 עקרונות חדשים לכל החלטה (לפי ציון); עיקרון מוכר מקושר +ל-canonical קיים (cosine, V41) ואינו נספר בתקרה. **טרמינולוגיה (מהות, לא קוסמטיקה):** +ועדת-ערר **מיישמת** דין ואינה יוצרת הלכה — עיקרון מפס"ד מחוזי/עליון מחייב = **הלכה**, +מהחלטת ועדת-ערר = **כלל פרשני**, מפסיקה משכנעת = **עיקרון**; המטרייה = **עקרונות +משפטיים**. הסיווג נגזר מ-`first_established_in` (source_kind/is_binding), ללא עמודה חדשה. +**מקור-יחיד (G2):** extractor (`_extract_via_panel`), סינון רטרואקטיבי (`cull_principles.py`), +ושניהם דרך `services/panel_extraction` + `panel_judges` — אין נתיב-פאנל מקביל. +**מקורות:** gold-set tri-model consensus (AC1=0.92, [[project_goldset_tri_model_consensus]]) · +LegalBench (gemini-2.5-flash) · Trust-or-Escalate (ICLR 2025) | סטטוס: verified +**אכיפה:** `services/panel_extraction.py` (panel_extract/panel_keep_score/classify/apply_cap), +`services/panel_judges.py`, `halacha_extractor._extract_via_panel`, `db.store_panel_principles`, +`scripts/cull_principles.py`, `services/principles.py` (תווית). config `HALACHA_PANEL_*`. 
+החלטת-יו"ר 2026-06-19; מקור-אמת: [`../legal-principles-redesign.md`](../legal-principles-redesign.md). +**הפרה ידועה:** — (חדש) + --- ## 4. הג'ובים המתוזמנים (תמיכת-תשתית ללולאה) diff --git a/mcp-server/src/legal_mcp/services/db.py b/mcp-server/src/legal_mcp/services/db.py index df5e2cc..75ad0c1 100644 --- a/mcp-server/src/legal_mcp/services/db.py +++ b/mcp-server/src/legal_mcp/services/db.py @@ -14,7 +14,7 @@ import asyncpg from pgvector.asyncpg import register_vector from legal_mcp import config -from legal_mcp.services import court_citation, halacha_quality +from legal_mcp.services import court_citation, halacha_quality, principles logger = logging.getLogger(__name__) @@ -6156,7 +6156,8 @@ async def get_canonical_halacha(canonical_id: "UUID") -> "dict | None": "SELECT ch.id::text, ch.canonical_statement, ch.rule_type, " " ch.practice_areas, ch.subject_tags, ch.review_status, " " ch.instance_count, ch.created_at, ch.updated_at, " - " cl.case_number AS first_established_case " + " cl.case_number AS first_established_case, " + " cl.source_kind, cl.is_binding " "FROM canonical_halachot ch " "LEFT JOIN case_law cl ON cl.id = ch.first_established_in " "WHERE ch.id = $1", @@ -6172,10 +6173,12 @@ async def get_canonical_halacha(canonical_id: "UUID") -> "dict | None": "WHERE h.canonical_id = $1 ORDER BY h.instance_type, cl.case_number", canonical_id, ) - return { - **dict(row), - "instances": [dict(i) for i in instances], - } + out = dict(row) + # #152: source-derived class + Hebrew label (הלכה / כלל פרשני / עיקרון). 
+ out["principle_class"] = principles.principle_class(out.get("source_kind"), out.get("is_binding")) + out["principle_label"] = principles.label_for_class(out["principle_class"]) + out["instances"] = [dict(i) for i in instances] + return out async def list_canonical_halachot( @@ -6190,24 +6193,33 @@ async def list_canonical_halachot( params: list = [] idx = 1 if practice_area: - conditions.append(f"${ idx} = ANY(practice_areas)") + conditions.append(f"${ idx} = ANY(ch.practice_areas)") params.append(practice_area) idx += 1 if review_status: - conditions.append(f"review_status = ${idx}") + conditions.append(f"ch.review_status = ${idx}") params.append(review_status) idx += 1 params += [limit, offset] rows = await pool.fetch( - f"SELECT id::text, canonical_statement, rule_type, practice_areas, " - f" subject_tags, review_status, instance_count, created_at, updated_at " - f"FROM canonical_halachot " + f"SELECT ch.id::text, ch.canonical_statement, ch.rule_type, ch.practice_areas, " + f" ch.subject_tags, ch.review_status, ch.instance_count, " + f" ch.created_at, ch.updated_at, cl.source_kind, cl.is_binding " + f"FROM canonical_halachot ch " + f"LEFT JOIN case_law cl ON cl.id = ch.first_established_in " f"WHERE {' AND '.join(conditions)} " - f"ORDER BY instance_count DESC, created_at DESC " + f"ORDER BY ch.instance_count DESC, ch.created_at DESC " f"LIMIT ${idx} OFFSET ${idx + 1}", *params, ) - return [dict(r) for r in rows] + out = [] + for r in rows: + d = dict(r) + cls = principles.principle_class(d.pop("source_kind", None), d.pop("is_binding", None)) + d["principle_class"] = cls + d["principle_label"] = principles.label_for_class(cls) + out.append(d) + return out async def update_canonical_statement( diff --git a/mcp-server/src/legal_mcp/services/panel_extraction.py b/mcp-server/src/legal_mcp/services/panel_extraction.py index e93fa74..e18a659 100644 --- a/mcp-server/src/legal_mcp/services/panel_extraction.py +++ b/mcp-server/src/legal_mcp/services/panel_extraction.py 
def apply_cap(judged: list[dict], max_new: int | None = None) -> list[dict]:
    """Apply the per-decision cap to already-judged principles (#152, Phase C).

    Args:
        judged: One decision's principles, each a dict carrying a panel
            ``verdict`` and ``score``.
        max_new: Maximum number of survivors to keep. ``None`` falls back to
            ``config.HALACHA_PANEL_MAX_NEW``. Values below zero are treated
            as zero (keep nothing).

    Returns:
        A new list, in the input order, of shallow copies of the input dicts
        with ``final_verdict`` added. Survivors (``approved`` /
        ``pending_review``) are ranked by score, highest first; those beyond
        the cap become ``"rejected"``. Every other item keeps its own verdict.
        The input list and its dicts are not modified.
    """
    cap = config.HALACHA_PANEL_MAX_NEW if max_new is None else max_new
    # A negative cap would otherwise hit slice semantics (``xs[:-1]`` keeps
    # all but the last element) and silently admit survivors.
    cap = max(0, cap)
    survivor_verdicts = ("approved", "pending_review")

    def _rank_key(item: dict) -> float:
        # A missing or ``None`` score ranks lowest instead of breaking the sort.
        value = item.get("score")
        return 0.0 if value is None else value

    # ``sorted`` is stable, so equal scores keep their original relative order.
    ranked = sorted(
        (j for j in judged if j.get("verdict") in survivor_verdicts),
        key=_rank_key,
        reverse=True,
    )
    # Identity (not equality) so two equal-looking dicts are capped separately.
    kept = {id(j) for j in ranked[:cap]}

    out = []
    for j in judged:
        verdict = j.get("verdict")
        over_cap = verdict in survivor_verdicts and id(j) not in kept
        out.append({**j, "final_verdict": "rejected" if over_cap else verdict})
    return out


def _keep_score_system(source_kind: str, is_binding: bool) -> str:
    """Build the judges' system prompt for the keep/score question.

    The wording of the "worth keeping" criterion depends on ``source_kind``:
    ``"internal_committee"`` gets the appeals-committee variant, anything else
    the general variant.

    NOTE(review): ``is_binding`` is accepted but not used in the prompt.
    NOTE(review): "הוכרע" in the prompt below looks like a typo for the
    imperative "הכרע"; left untouched because changing the prompt changes
    judge behaviour — confirm before editing.
    """
    if source_kind == "internal_committee":
        nature = ("המקור הוא החלטת ועדת-ערר (מיישמת דין, אינה יוצרת הלכה). ראוי-לשמירה = "
                  "כלל פרשני חדש ובר-הכללה שהוועדה גיבשה; לא-ראוי = יישום תלוי-עובדות, "
                  "חזרה על דין מוכר, אמרת-אגב, או חזרה מילולית על הציטוט.")
    else:
        nature = ("ראוי-לשמירה = עיקרון משפטי בר-הכללה והסתמכות (הלכה/פרשנות/כלל-פרוצדורלי); "
                  "לא-ראוי = החלה תלוית-עובדות, אמרת-אגב, או חזרה מילולית על הציטוט.")
    return (
        "אתה משפטן בכיר בוועדת ערר לתכנון ובנייה. הוכרע אם עיקרון שחולץ מפסיקה ראוי "
        f"להישמר כתקדים בר-ציטוט. {nature}\n"
        "תן גם ציון-ביטחון 0-1 לכך שזהו עיקרון בר-הסתמכות אמיתי.\n"
        'החזר JSON בלבד: {"keep": true/false, "score": 0.0-1.0, "reason": "<משפט קצר>"}. ללא markdown.'
    )


async def panel_keep_score(
    rule_statement: str,
    supporting_quote: str,
    reasoning_summary: str = "",
    *,
    source_kind: str = "external_upload",
    is_binding: bool = True,
) -> dict:
    """Run the 3-judge panel on ONE existing principle (Phase C cull, #152).

    Each judge votes keep + score. ``votes`` is the number of judges that
    voted keep; ``score`` is the mean of those keepers' scores (each clamped
    to [0, 1]), or 0.0 when nobody voted keep. The verdict comes from the
    shared :func:`classify`.

    Returns:
        ``{votes, score, verdict, voters, per_judge}`` — ``per_judge`` holds
        the raw judge replies for the active-learning round (FU-1).

    Used by the retroactive cull; the extractor uses :func:`panel_extract`.
    """
    import asyncio
    import math

    system = _keep_score_system(source_kind, is_binding)
    user = (f"ניסוח העיקרון:\n{rule_statement}\n\n"
            f"היגיון:\n{reasoning_summary}\n\nציטוט תומך:\n{supporting_quote}")
    async with httpx.AsyncClient() as client:
        c, ds, gm = await asyncio.gather(
            panel_judges.judge_claude(system, user, max_tokens=300),
            panel_judges.judge_deepseek(client, system, user, max_tokens=300),
            panel_judges.judge_gemini(client, system, user, max_tokens=2000),
        )
    raw = {"claude": c, "deepseek": ds, "gemini": gm}

    keepers, scores = [], []
    for name, reply in raw.items():
        if not panel_judges.to_bool(reply, "keep"):
            continue
        keepers.append(name)
        try:
            value = float(reply.get("score", 0.0))
        except (AttributeError, TypeError, ValueError):
            # Unparseable score (or a reply that is not a mapping) counts as 0.
            value = 0.0
        if math.isnan(value):
            # ``min(1.0, nan)`` returns 1.0, so an unguarded NaN would be
            # clamped to full confidence; treat it as no confidence instead.
            value = 0.0
        scores.append(max(0.0, min(1.0, value)))

    votes = len(keepers)
    score = round(sum(scores) / votes, 4) if votes else 0.0
    return {"votes": votes, "score": score, "verdict": classify(votes, score),
            "voters": sorted(keepers), "per_judge": raw}
# Umbrella term for the whole corpus.
UMBRELLA = "עקרונות משפטיים"

# Stable class keys (not display text).
CLASS_HALACHA = "halacha"
CLASS_INTERPRETIVE_RULE = "interpretive_rule"
CLASS_PRINCIPLE = "principle"

# Hebrew display term per class key.
_LABEL = {
    CLASS_HALACHA: "הלכה",
    CLASS_INTERPRETIVE_RULE: "כלל פרשני",
    CLASS_PRINCIPLE: "עיקרון",
}


def principle_class(source_kind: str | None, is_binding: bool | None) -> str:
    """Return the stable class key for a principle from the given source.

    A source of kind ``"internal_committee"`` always yields the
    interpretive-rule class, regardless of ``is_binding``. Otherwise a truthy
    ``is_binding`` yields the halacha class and anything else the
    persuasive-principle class.
    """
    from_committee = source_kind == "internal_committee"
    if from_committee:
        return CLASS_INTERPRETIVE_RULE
    return CLASS_HALACHA if is_binding else CLASS_PRINCIPLE


def label(source_kind: str | None, is_binding: bool | None) -> str:
    """Return the Hebrew display term for a principle from this source (#152)."""
    cls = principle_class(source_kind, is_binding)
    return _LABEL[cls]


def label_for_class(cls: str) -> str:
    """Return the Hebrew display term for a class key.

    Unknown keys fall back to the persuasive-principle term.
    """
    try:
        return _LABEL[cls]
    except KeyError:
        return _LABEL[CLASS_PRINCIPLE]
def test_apply_cap_downgrades_over_cap_survivors_by_score():
    """Survivors beyond the cap are rejected, ranked by score; rejected stay rejected."""
    verdict_scores = [
        ("approved", 0.9),
        ("approved", 0.7),
        ("pending_review", 0.8),
        ("rejected", 0.95),  # already rejected stays
    ]
    judged = [{"verdict": verdict, "score": score} for verdict, score in verdict_scores]
    capped = pe.apply_cap(judged, max_new=2)
    # top-2 survivors by score = 0.9(approved) + 0.8(pending); 0.7 → over cap → rejected
    expected = ["approved", "rejected", "pending_review", "rejected"]
    assert [row["final_verdict"] for row in capped] == expected


def test_apply_cap_keeps_all_when_under_cap():
    """With fewer survivors than the cap, every verdict passes through unchanged."""
    judged = [
        {"verdict": "approved", "score": 0.9},
        {"verdict": "pending_review", "score": 0.5},
    ]
    capped = pe.apply_cap(judged, max_new=5)
    final_verdicts = [row["final_verdict"] for row in capped]
    assert final_verdicts == ["approved", "pending_review"]
def test_binding_higher_court_is_halacha():
    """A binding external source is labelled and classed as halacha."""
    source = ("external_upload", True)
    assert pr.label(*source) == "הלכה"
    assert pr.principle_class(*source) == pr.CLASS_HALACHA


def test_committee_is_interpretive_rule():
    """A committee source is an interpretive rule whether or not it is binding."""
    # the appeals committee applies law — never makes a הלכה
    for binding in (True, False):
        assert pr.label("internal_committee", binding) == "כלל פרשני"
    assert pr.principle_class("internal_committee", False) == pr.CLASS_INTERPRETIVE_RULE


def test_non_binding_external_is_principle():
    """Non-binding or unknown sources get the persuasive-principle label."""
    for source_kind, binding in (("external_upload", False), (None, None)):
        assert pr.label(source_kind, binding) == "עיקרון"


def test_label_for_class_roundtrip():
    """label_for_class(principle_class(...)) agrees with label(...)."""
    sources = [("external_upload", True), ("internal_committee", False), (None, False)]
    for source_kind, binding in sources:
        derived = pr.label_for_class(pr.principle_class(source_kind, binding))
        assert derived == pr.label(source_kind, binding)
`--batch`/`--limit`/`--concurrency`. | ידני — לפני חיווט `--apply` (live) / תקופתי — מעקב-לולאה (captured) | | `halacha_rubric_distill.py` | python | **#133/FU-4 — זיקוק-רובריקה PROPOSE-ONLY.** מצליב `halacha_panel_rounds` (FU-1, הצבעות+נימוקים) מול הכרעות-היו"ר (FU-2, seeds ב-`halacha_goldset` batch `chair-live`) דרך `db.panel_rounds_vs_chair` (read-only), מנתח דטרמיניסטית **כשלים שיטתיים** (false-keep/false-drop, פיצולים-שהוכרעו, שיעור-מחלוקת-עם-היו"ר לכל שופט), ומציע `KEEP_SYSTEM` v2 + exemplars מופשטים (claude_session מקומי, אפס עלות) כ**דוח-diff** ל-`data/learning/rubric-proposal-.md`. **לעולם לא auto-apply** — אימוץ v2 = עריכה אנושית של הקבוע דרך PR (INV-LRN1); exemplars מופשטים בלבד (INV-LRN5); הסיגנל היחיד = הכרעת-יו"ר, לא הצבעות-פאנל (anti-echo). מתחת ל-12 זוגות → "אין מספיק נתונים". `--no-llm` (סטטיסטיקה בלבד) / `--limit N`. **חובה מקומי**. | תקופתי — אחרי שהצטברו הכרעות-יו"ר על מחלוקות-פאנל | | `backfill_canonical_halachot.py` | python | **V41 — הקמת מודל ההלכות הקנוניות (חד-פעמי + idempotent).** (1) בונה רכיבים-קשורים (connected components) מ-`equivalent_halachot` (transitive closure — union-find). (2) לכל אשכול: בוחר נציג-קנוני (הכי הרבה corroboration → confidence → earliest), יוצר שורת `canonical_halachot`, ומעדכן `canonical_id` + `instance_type` לכל חברי האשכול. (3) לסינגלטונים (ללא קישורי-שוויון): 1:1 canonical. (4) מאכלס `halacha_citation_corroboration.canonical_id` מ-`halachot.canonical_id`. `--dry-run` (ברירת-מחדל, מחשב ומדווח בלבד) / `--apply` (כותב) / `--verbose`. לאחר הרצה: `canonical_statement` = ניסוח-נציג (pending_synthesis); עוקב: `backfill_canonical_synthesis.py` (Phase 4) יסנתז ניסוח-רחב דרך LLM. הרץ: `mcp-server/.venv/bin/python scripts/backfill_canonical_halachot.py --apply`. 
| **חד-פעמי** (לאחר deploy V41) / idempotent לפי צורך | +| `cull_principles.py` | python | **#152 Phase C — סינון רטרואקטיבי של קורפוס-העקרונות דרך פאנל-3 (הפיך).** מריץ על כל עיקרון 'original' קיים את אותו משטר שה-extractor משתמש בו להבא (`services/panel_extraction.panel_keep_score`, G2): 3 שופטים (Claude מקומי + DeepSeek + Gemini) מצביעים keep+score → כלל-האישור (3 קולות→שורד · 2 וציון≥0.85→שורד · 2 ו<0.85→יו"ר · ≤1→נדחה) → תקרת `HALACHA_PANEL_MAX_NEW`=5 לכל החלטה לפי ציון (`apply_cap`). נדחה → `halachot.review_status='rejected'` + ה-canonical שלו `rejected` (הפיך, גיבוי-CSV ב-`data/audit/` לפני כל כתיבה). מרוסן ב-`usage_limits` (עוצר-רך בתקרת-שימוש, resumable). `--dry-run` (ברירת-מחדל) / `--apply` / `--sample N` (החלטות אקראיות) / `--limit N` / `--no-throttle` / `--verbose`. **חובה מקומי** (3 שופטים). הרץ: `cd mcp-server && HOME=/home/chaim .venv/bin/python ../scripts/cull_principles.py --apply`. | **חד-פעמי** (סינון ראשוני) + ניתן-לחזרה | | `backfill_canonical_synthesis.py` | python | **V41 Phase 4 — סינתזת-LLM ל-`canonical_statement` (idempotent + resumable).** עובר על canonicals ב-`review_status='pending_synthesis'` (רב-instance ראשונים) ומזקק לכל אחד ניסוח אחד כללי ומעוגן בציטוטי-המופעים (INV-AH) דרך `services/canonical_synthesis.py` (מסלול-יחיד, G2). שערים: עיגון/הימנעות, **drift-floor** (cosine מול המקור, ברירת-מחדל 0.80 — סטייה גדולה→נשמר המקור), ואיסור ציטוטי-תיק חדשים. בכל מקרה הסטטוס מתקדם ל-`pending_review` לשער-היו"ר (G10/INV-LRN6). מודל Opus (`HALACHA_CANONICAL_SYNTH_MODEL`). מרוסן ע"י `usage_limits` (עוצר-רך בתקרת-שימוש, resumable). `--dry-run` (ברירת-מחדל) / `--apply` / `--sample N` (מדגם אקראי לבדיקה) / `--limit N` / `--no-throttle` / `--verbose`. CSV-audit ל-`data/audit/canonical-synthesis-*.csv`. **חובה מקומי** (claude_session). הרץ: `cd mcp-server && HOME=/home/chaim .venv/bin/python ../scripts/backfill_canonical_synthesis.py --apply`. שוטף: כלי-MCP `canonical_synthesize_pending`. 
| **חד-פעמי** (המסה הראשונית) + idempotent לחדשים | | `halacha_batch_reconcile.py` | python | **#82.7** — dedup חוצה-פסקים offline (שמרני, **dry-run בלבד**). dedup-on-insert משווה רק תוך-פסק; כאן סף מחמיר (cosine ≥0.95, `--cosine`) ולא-הרסני: מאתר זוגות הלכות near-duplicate בין פסקים שונים (pgvector `<=>` exact) עם איתות לקסיקלי (Jaccard/Levenshtein) ומדווח ל-CSV ב-`data/audit/` לסקירת היו"ר. לא מדלג/ממזג/מוחק. `--include-pending`. **`--link`** רושם את הזוגות שנמצאו כ-`equivalent_halachot` (parallel authority, #84.2 — **deprecated post-V41** — השתמש ב-`backfill_canonical_halachot.py --apply` במקום). רץ עם venv של mcp-server. | **deprecated** — הוחלף ב-`backfill_canonical_halachot.py` (V41). נשמר לצורכי audit | | `calibrate_halacha_dedup.py` | python | **#82.1** — כיול ספי ה-dedup הלקסיקלי (#82.3) מול gold-set הניקוי. קורא `halacha-cleanup-manifest-*.csv` (זוגות duplicate↔survivor מתויגי-אדם), טוען טקסט-survivor מה-DB, ו-sweep של (jaccard_min × levenshtein_min) עם P/R/F1, מסמן את נקודת-העבודה המוגדרת. אימת ש-(0.55, 0.70) → **precision 1.0** (אפס false-merge), recall 0.30 — מתאים לאיתות-משני שחוסם auto-approve. `--manifest `. רץ עם venv של mcp-server | חד-פעמי — כיול (בוצע 2026-06-06) | diff --git a/scripts/cull_principles.py b/scripts/cull_principles.py new file mode 100644 index 0000000..0b13ddc --- /dev/null +++ b/scripts/cull_principles.py @@ -0,0 +1,204 @@ +#!/usr/bin/env python3 +"""Retroactive cull of the legal-principles corpus via the 3-model panel (#152, Phase C). + +The corpus grew to ~5,243 principles (18.8/decision) under the old single-model +auto-approve. This re-adjudicates EVERY existing 'original' principle with the +SAME regime the extractor now uses going forward (chaim 2026-06-19): + + • 3 judges (Claude local + DeepSeek + Gemini) vote keep + score on each principle. + • Approval rule: 3 votes→survive · 2 & score≥0.85→survive · 2 & <0.85→chair + (pending_review) · ≤1→reject. 
AUDIT_DIR = os.path.join(os.path.dirname(__file__), "..", "data", "audit")
# Maximum number of principles judged concurrently within one decision.
_JUDGE_CONCURRENCY = 4


async def _decisions(limit, sample):
    """Return case_law rows that still have non-rejected 'original' principles.

    Each item is a dict with ``id``, ``case_number``, ``source_kind``,
    ``is_binding`` and ``n`` (principle count), ordered by ``n`` descending.
    If ``sample`` is set and smaller than the result, a random sample of that
    size replaces the list; ``limit`` then truncates whatever remains.
    """
    pool = await db.get_pool()
    rows = await pool.fetch(
        "SELECT cl.id, cl.case_number, cl.source_kind, cl.is_binding, "
        "       count(*) AS n "
        "FROM halachot h JOIN case_law cl ON cl.id = h.case_law_id "
        "WHERE h.instance_type = 'original' AND h.review_status <> 'rejected' "
        "GROUP BY cl.id, cl.case_number, cl.source_kind, cl.is_binding "
        "ORDER BY n DESC",
    )
    items = [dict(r) for r in rows]
    if sample and sample < len(items):
        items = random.sample(items, sample)
    if limit:
        items = items[:limit]
    return items


async def _principles(case_law_id):
    """Return the non-rejected 'original' principles of one decision.

    Besides the principle's own columns, each dict carries
    ``canonical_status`` — the current ``review_status`` of the linked
    canonical (``None`` when there is no canonical) — so the audit CSV can
    record what a canonical looked like before the cull touched it.
    """
    pool = await db.get_pool()
    rows = await pool.fetch(
        "SELECT h.id, h.rule_statement, h.supporting_quote, h.reasoning_summary, "
        "       h.canonical_id, h.review_status, "
        "       ch.review_status AS canonical_status "
        "FROM halachot h "
        "LEFT JOIN canonical_halachot ch ON ch.id = h.canonical_id "
        "WHERE h.case_law_id = $1 AND h.instance_type = 'original' "
        "AND h.review_status <> 'rejected' ORDER BY h.halacha_index",
        case_law_id,
    )
    return [dict(r) for r in rows]


def _throttled():
    """Return ``(over_ceiling, detail)`` from the usage limiter.

    Fails open: when ``usage_limits`` is unavailable or the usage read
    returns ``None``, the result is ``(False, <reason>)`` and the run continues.
    """
    if usage_limits is None:
        return False, "no usage_limits"
    u = usage_limits.subscription_usage()
    if u is None:
        return False, "usage read failed"
    over, _r, detail = usage_limits.ceiling_status(u)
    return over, detail


async def _judge_decision(dec, sem):
    """Judge every principle of one decision and apply the per-decision cap.

    Principles are judged concurrently, bounded by ``sem``. Returns the
    principles merged with their panel result and ``final_verdict`` (from
    ``pe.apply_cap``), or ``[]`` when the decision has no principles.
    """
    principles = await _principles(dec["id"])
    if not principles:
        return []

    async def one(p):
        async with sem:
            v = await pe.panel_keep_score(
                p["rule_statement"], p["supporting_quote"], p.get("reasoning_summary") or "",
                source_kind=dec["source_kind"] or "external_upload",
                is_binding=bool(dec["is_binding"]),
            )
        return {**p, **v}

    judged = await asyncio.gather(*[one(p) for p in principles])
    return pe.apply_cap(list(judged))


async def _apply_decision(judged, reviewer):
    """Write one decision's final verdicts in a single transaction.

    ``approved`` and ``rejected`` also stamp ``reviewed_at``; ``pending_review``
    does not. A rejected principle's canonical (if any) is set to ``rejected``
    as well.
    """
    pool = await db.get_pool()
    async with pool.acquire() as conn:
        async with conn.transaction():
            for j in judged:
                fv = j["final_verdict"]
                if fv == "approved":
                    await conn.execute(
                        "UPDATE halachot SET review_status='approved', reviewed_at=now(), "
                        "reviewer=$2, updated_at=now() WHERE id=$1", j["id"], reviewer)
                elif fv == "pending_review":
                    await conn.execute(
                        "UPDATE halachot SET review_status='pending_review', reviewer=$2, "
                        "updated_at=now() WHERE id=$1", j["id"], reviewer)
                else:  # rejected — also reject its canonical (reversible)
                    await conn.execute(
                        "UPDATE halachot SET review_status='rejected', reviewed_at=now(), "
                        "reviewer=$2, updated_at=now() WHERE id=$1", j["id"], reviewer)
                    if j.get("canonical_id"):
                        # NOTE(review): the canonical is rejected even if other
                        # non-rejected instances still point at it — confirm
                        # this is intended for multi-instance canonicals.
                        await conn.execute(
                            "UPDATE canonical_halachot SET review_status='rejected', "
                            "updated_at=now() WHERE id=$1", j["canonical_id"])


async def _run(apply, limit, sample, throttle, verbose):
    """Judge the selected decisions, write the audit CSV and optionally apply.

    For every decision the audit rows are written — and, in apply mode,
    flushed to disk — before any DB write for that decision, so the CSV is a
    usable backup of the previous statuses. Stops early when the usage
    ceiling is reached (if ``throttle``). Returns 0.
    """
    decisions = await _decisions(limit, sample)
    mode = "APPLY" if apply else "DRY-RUN"
    print(f"[{mode}] {len(decisions)} decisions with principles "
          f"(throttle={'on' if throttle else 'off'})\n", flush=True)
    if not decisions:
        print("nothing to do.")
        return 0

    stamp = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
    os.makedirs(AUDIT_DIR, exist_ok=True)
    audit = os.path.join(AUDIT_DIR, f"principle-cull-{'apply' if apply else 'dryrun'}-{stamp}.csv")
    reviewer = f"cull:panel v{config.HALACHA_PANEL_SCORE_FLOOR} cap{config.HALACHA_PANEL_MAX_NEW}"
    sem = asyncio.Semaphore(_JUDGE_CONCURRENCY)
    tally = Counter()
    n_in = n_out = 0
    stopped = False

    with open(audit, "w", newline="", encoding="utf-8") as fh:
        w = csv.writer(fh)
        # canonical_old_status is appended last so existing column positions
        # stay the same for anything that already reads this file.
        w.writerow(["case_number", "halacha_id", "old_status", "final_verdict",
                    "votes", "score", "canonical_id", "rule", "canonical_old_status"])
        for k, dec in enumerate(decisions, 1):
            if throttle:
                over, detail = _throttled()
                if over:
                    print(f"\n⏸  usage ceiling ({detail}) — stopping at {k-1}/{len(decisions)}. "
                          f"Re-run to resume.", flush=True)
                    stopped = True
                    break
            judged = await _judge_decision(dec, sem)
            survivors = sum(1 for j in judged if j["final_verdict"] in ("approved", "pending_review"))
            n_in += len(judged)
            n_out += survivors
            for j in judged:
                tally[j["final_verdict"]] += 1
                w.writerow([dec["case_number"], str(j["id"]), j["review_status"],
                            j["final_verdict"], j["votes"], j["score"],
                            str(j.get("canonical_id") or ""), (j["rule_statement"] or "")[:160],
                            j.get("canonical_status") or ""])
            if apply and judged:
                # The backup must be on disk before the DB is modified;
                # otherwise a crash after commit could lose the old statuses
                # still sitting in the write buffer.
                fh.flush()
                os.fsync(fh.fileno())
                await _apply_decision(judged, reviewer)
            print(f"[{k}/{len(decisions)}] {dec['case_number']:<16} "
                  f"{len(judged)}→{survivors} survive", flush=True)
            if verbose:
                for j in judged:
                    mark = {"approved": "✓", "pending_review": "→chair", "rejected": "✗"}[j["final_verdict"]]
                    print(f"    {mark} v={j['votes']} s={j['score']} {(j['rule_statement'] or '')[:80]}")

    print(f"\n── {mode} summary{' (stopped early)' if stopped else ''} ──")
    print(f"  principles judged: {n_in} → survive: {n_out} ({n_in - n_out} rejected)")
    for v, c in tally.most_common():
        print(f"    {v:<16} {c}")
    print(f"\naudit CSV: {audit}")
    if not apply:
        print("dry-run — no DB writes. Re-run with --apply to commit (reversible).")
    return 0


def main(argv=None):
    """CLI entry point.

    Args:
        argv: Argument list to parse; ``None`` (the default) parses
            ``sys.argv[1:]`` as before.

    ``--apply`` and ``--dry-run`` are mutually exclusive: previously passing
    both silently applied, which is the opposite of what an explicit
    ``--dry-run`` asks for.
    """
    p = argparse.ArgumentParser(description="Retroactive principle cull via 3-model panel (#152)")
    mode = p.add_mutually_exclusive_group()
    mode.add_argument("--apply", action="store_true", help="write verdicts (reversible, CSV-backed)")
    mode.add_argument("--dry-run", action="store_true", help="explicit dry-run (default)")
    p.add_argument("--limit", type=int, default=None)
    p.add_argument("--sample", type=int, default=None, help="random sample of N decisions")
    p.add_argument("--no-throttle", action="store_true")
    p.add_argument("--verbose", action="store_true")
    a = p.parse_args(argv)
    return asyncio.run(_run(a.apply, a.limit, a.sample, not a.no_throttle, a.verbose))


if __name__ == "__main__":
    raise SystemExit(main())