From 6ff2e36bf9724f46847152ccabb0d00fb6a6b253 Mon Sep 17 00:00:00 2001 From: Chaim Date: Sun, 31 May 2026 14:58:13 +0000 Subject: [PATCH] =?UTF-8?q?feat(eval):=20FU-5=20=E2=80=94=20retrieval=20ev?= =?UTF-8?q?al=20harness=20+=20halacha=20backlog=20visibility=20(#63)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Covers GAP-11 (INV-RET4/G8) and GAP-14 (INV-QA1/G10). Retrieval quality was never measured (only telemetry observation) and the halacha review backlog was invisible (the 10/19 gap was found by accident). Unit B — backlog visibility (pure code, container): - metrics.halacha_backlog(conn) → {pending_review, approved, rejected, published, total, oldest_pending_at}; surfaced in metrics.get_dashboard() (get_metrics MCP tool) and /api/system/diagnostics. Live count revealed 178 pending / 1552 total, oldest from 2026-05-03 — previously invisible. Unit A — retrieval eval harness (host-side scripts): - scripts/eval_gold_bootstrap.py — seeds data/eval/gold-set.jsonl. Two sources: citations (cited==relevant via search_relevance_feedback — empty until decisions cite precedents) and known_item (query=case_name → relevant=self; a real citation-free signal, the methodology #52 checked by hand). Idempotent; preserves source='chair' rows. - scripts/eval_retrieval.py — runs the production retrieval path (search_library / search_internal) over the gold-set; computes precision@k, recall@k, MRR, nDCG@k (k=5,10); aggregates overall + per-corpus + per-practice_area; writes a report and a delta vs committed baseline.json (which records the retrieval_config it reflects). --self-test unit-checks the metric math offline. Gold-set strategy = hybrid (chair decision): bootstrap + chair review. The citation source is empty today (0 cited precedents in decisions), so the seed is known-item (77 queries: 54 internal_decisions + 23 precedent_library). The gold-set is PROVISIONAL until Dafna reviews it (the domain chair-gate). 
Baseline (production config: multimodal+rerank on): R@10=0.987, MRR=0.837, nDCG@10=0.872. Finding: MULTIMODAL_ENABLED=true slightly lowers known-item recall (image-page results displace exact name matches) — relevant to #15. precedent_library weaker than internal (R@10 0.957 vs 1.0) — one external precedent unfindable by name. "CI gate" realized as discipline (re-runnable harness + committed baseline + run before/after any retrieval-layer change) — retrieval needs prod DB + Voyage, no CI runner has that access. Spec: docs/superpowers/specs/2026-05-31-fu5-eval-harness-design.md Co-Authored-By: Claude Opus 4.8 (1M context) --- .gitignore | 1 + .taskmaster/tasks/tasks.json | 20 +- data/eval/baseline.json | 70 +++++ data/eval/gold-set.jsonl | 77 +++++ .../2026-05-31-fu5-eval-harness-design.md | 92 ++++++ mcp-server/src/legal_mcp/services/metrics.py | 28 ++ scripts/SCRIPTS.md | 2 + scripts/eval_gold_bootstrap.py | 196 ++++++++++++ scripts/eval_retrieval.py | 294 ++++++++++++++++++ web/app.py | 6 +- 10 files changed, 776 insertions(+), 10 deletions(-) create mode 100644 data/eval/baseline.json create mode 100644 data/eval/gold-set.jsonl create mode 100644 docs/superpowers/specs/2026-05-31-fu5-eval-harness-design.md create mode 100644 scripts/eval_gold_bootstrap.py create mode 100644 scripts/eval_retrieval.py diff --git a/.gitignore b/.gitignore index d28d085..2afbb0e 100644 --- a/.gitignore +++ b/.gitignore @@ -16,3 +16,4 @@ legacy/ kiryat-yearim/ continuation-prompt.md node_modules/ +data/eval/eval-report-* diff --git a/.taskmaster/tasks/tasks.json b/.taskmaster/tasks/tasks.json index a832d0c..c14b70f 100644 --- a/.taskmaster/tasks/tasks.json +++ b/.taskmaster/tasks/tasks.json @@ -2175,9 +2175,9 @@ "id": "63", "title": "[FU-5] eval-harness + נראות backlog", "description": "מדידת precision/recall על gold-set + חשיפת backlog הלכות בבדיקת-בריאות.", - "details": "מכסה GAP-11,14. מספק INV-RET4/G8/QA1/G10. severity: High. סוג: קוד + החלטת-יו\"ר (בניית gold-set). 
תלוי ב-FU-2.", + "details": "מכסה GAP-11,14. מספק INV-RET4/G8/QA1/G10. severity: High. סוג: קוד + החלטת-יו\"ר (בניית gold-set). תלוי ב-FU-2. | DONE 2026-05-31: Unit B (GAP-14) — halacha_backlog נחשף ב-metrics.get_dashboard + /api/system/diagnostics (גילה 178 pending_review מתוך 1552, הישן 3.5.26). Unit A (GAP-11) — scripts/eval_gold_bootstrap.py (citations+known_item) + scripts/eval_retrieval.py (P/R/MRR/nDCG@5,10, self-test, baseline+config). gold-set=77 known-item queries (citation-source ריק: 0 ציטוטים בהחלטות). baseline בייצור: R@10=0.987 MRR=0.837; ממצא: MULTIMODAL=true מוריד known-item recall קלות (relevant ל-#15). gold-set=provisional עד סקירת דפנה (chair-gate; הדומיין). spec: docs/superpowers/specs/2026-05-31-fu5-eval-harness-design.md", "testStrategy": "", - "status": "pending", + "status": "done", "dependencies": [ "60" ], @@ -2189,9 +2189,10 @@ "description": "כיום רק telemetry.log_search_bg; איכות-אחזור לא נמדדת.", "dependencies": [], "details": "INV-RET4/G8", - "status": "pending", + "status": "done", "testStrategy": "", - "parentId": "63" + "parentId": "63", + "updatedAt": "2026-05-31T14:55:38.289Z" }, { "id": 2, @@ -2199,12 +2200,13 @@ "description": "ספירת pending_review בבדיקת-בריאות (10/19 התגלה במקרה).", "dependencies": [], "details": "INV-QA1/G10", - "status": "pending", + "status": "done", "testStrategy": "", - "parentId": "63" + "parentId": "63", + "updatedAt": "2026-05-31T14:55:38.295Z" } ], - "updatedAt": "2026-05-30T17:37:34.741136+00:00" + "updatedAt": "2026-05-31T14:55:38.295Z" }, { "id": "64", @@ -2418,9 +2420,9 @@ ], "metadata": { "version": "1.0.0", - "lastModified": "2026-05-31T14:11:37.689Z", + "lastModified": "2026-05-31T14:55:38.296Z", "taskCount": 70, - "completedCount": 62, + "completedCount": 63, "tags": [ "legal-ai" ] diff --git a/data/eval/baseline.json b/data/eval/baseline.json new file mode 100644 index 0000000..5fc9e81 --- /dev/null +++ b/data/eval/baseline.json @@ -0,0 +1,70 @@ +{ + "gold_size": 77, + "retrieval_config": { 
+ "MULTIMODAL_ENABLED": true, + "VOYAGE_RERANK_ENABLED": true, + "VOYAGE_MODEL": "voyage-3", + "MULTIMODAL_TEXT_WEIGHT": 0.5, + "MULTIMODAL_RRF_K": 60, + "BM25_HYBRID_ENABLED": true + }, + "overall": { + "P@5": 0.1922, + "R@5": 0.9351, + "nDCG@5": 0.8545, + "P@10": 0.1013, + "R@10": 0.987, + "nDCG@10": 0.8718, + "MRR": 0.8367 + }, + "by_corpus": { + "internal_decisions": { + "P@5": 0.1963, + "R@5": 0.963, + "nDCG@5": 0.887, + "P@10": 0.1019, + "R@10": 1.0, + "nDCG@10": 0.899, + "MRR": 0.871 + }, + "precedent_library": { + "P@5": 0.1826, + "R@5": 0.8696, + "nDCG@5": 0.778, + "P@10": 0.1, + "R@10": 0.9565, + "nDCG@10": 0.808, + "MRR": 0.7562 + } + }, + "by_practice_area": { + "betterment_levy": { + "P@5": 0.1897, + "R@5": 0.9231, + "nDCG@5": 0.8595, + "P@10": 0.1, + "R@10": 0.9744, + "nDCG@10": 0.8761, + "MRR": 0.8432 + }, + "compensation_197": { + "P@5": 0.2, + "R@5": 1.0, + "nDCG@5": 1.0, + "P@10": 0.1, + "R@10": 1.0, + "nDCG@10": 1.0, + "MRR": 1.0 + }, + "rishuy_uvniya": { + "P@5": 0.2, + "R@5": 0.9706, + "nDCG@5": 0.861, + "P@10": 0.1029, + "R@10": 1.0, + "nDCG@10": 0.8708, + "MRR": 0.8346 + } + }, + "generated_at": "20260531T145742Z" +} \ No newline at end of file diff --git a/data/eval/gold-set.jsonl b/data/eval/gold-set.jsonl new file mode 100644 index 0000000..0024400 --- /dev/null +++ b/data/eval/gold-set.jsonl @@ -0,0 +1,77 @@ +{"id": "g-e3112d9b6a", "query": "ARAR-24-9002", "practice_area": "compensation_197", "corpus": "internal_decisions", "relevant_case_law_ids": ["730d6f21-08e4-4ae0-8b7e-017dde61003e"], "source": "bootstrap_known_item", "note": "known-item: search by case_name → expect the case itself (1 same-named)"} +{"id": "g-2ab91a37e3", "query": "אברהם אגסי", "practice_area": "betterment_levy", "corpus": "internal_decisions", "relevant_case_law_ids": ["1a87efe5-6e13-4ed4-a9ec-3f2f7d61e4ec"], "source": "bootstrap_known_item", "note": "known-item: search by case_name → expect the case itself (1 same-named)"} +{"id": "g-3572817c30", "query": "אברהם 
אנשין", "practice_area": "rishuy_uvniya", "corpus": "internal_decisions", "relevant_case_law_ids": ["8aeee5cc-26a0-475a-b4e4-c2570e4333f5"], "source": "bootstrap_known_item", "note": "known-item: search by case_name → expect the case itself (1 same-named)"} +{"id": "g-66dbb8ac16", "query": "אהרון ברק - תכנית רחביה", "practice_area": "betterment_levy", "corpus": "internal_decisions", "relevant_case_law_ids": ["e151fc25-cf12-4563-b638-a86323f8413b"], "source": "bootstrap_known_item", "note": "known-item: search by case_name → expect the case itself (1 same-named)"} +{"id": "g-3588230bc4", "query": "אואקנין", "practice_area": "rishuy_uvniya", "corpus": "internal_decisions", "relevant_case_law_ids": ["405d51ac-deef-4bdf-aaea-f39b4aaa84fd"], "source": "bootstrap_known_item", "note": "known-item: search by case_name → expect the case itself (1 same-named)"} +{"id": "g-ff905fe19d", "query": "ב.דייניש", "practice_area": "betterment_levy", "corpus": "internal_decisions", "relevant_case_law_ids": ["f3ab6507-6475-4230-ad96-70d4177a9f72"], "source": "bootstrap_known_item", "note": "known-item: search by case_name → expect the case itself (1 same-named)"} +{"id": "g-fa8f479ae1", "query": "בוטיק הנביאים", "practice_area": "betterment_levy", "corpus": "internal_decisions", "relevant_case_law_ids": ["691e8220-745b-4631-aff4-338c164ba988"], "source": "bootstrap_known_item", "note": "known-item: search by case_name → expect the case itself (1 same-named)"} +{"id": "g-4b2c6a86ec", "query": "בית אגודת ישראל", "practice_area": "betterment_levy", "corpus": "internal_decisions", "relevant_case_law_ids": ["7a71adbc-6a21-41a4-a98d-8fdd3f6e7b62"], "source": "bootstrap_known_item", "note": "known-item: search by case_name → expect the case itself (1 same-named)"} +{"id": "g-e9d5fc6d9b", "query": "בית חנינא מגרש 2010", "practice_area": "betterment_levy", "corpus": "internal_decisions", "relevant_case_law_ids": ["fa0dab0c-bafc-4239-bba4-33cc9790f69f"], "source": "bootstrap_known_item", "note": 
"known-item: search by case_name → expect the case itself (1 same-named)"} +{"id": "g-8280afc216", "query": "בית חנינא — אום כולתום", "practice_area": "betterment_levy", "corpus": "internal_decisions", "relevant_case_law_ids": ["a1e51703-474a-44d0-b8c8-5ae8bffb4782"], "source": "bootstrap_known_item", "note": "known-item: search by case_name → expect the case itself (1 same-named)"} +{"id": "g-e814cc43fa", "query": "בן זאב רמות", "practice_area": "betterment_levy", "corpus": "internal_decisions", "relevant_case_law_ids": ["53c1adb6-81fd-4d0a-b3de-ffe2e6c5b6b3"], "source": "bootstrap_known_item", "note": "known-item: search by case_name → expect the case itself (1 same-named)"} +{"id": "g-7b1ef92188", "query": "בר-און", "practice_area": "rishuy_uvniya", "corpus": "internal_decisions", "relevant_case_law_ids": ["a60dc67d-67ab-4615-b148-34794d728687"], "source": "bootstrap_known_item", "note": "known-item: search by case_name → expect the case itself (1 same-named)"} +{"id": "g-9b17fb63a3", "query": "ג'רוזלם הומס אינק", "practice_area": "betterment_levy", "corpus": "internal_decisions", "relevant_case_law_ids": ["9af224ef-5325-488c-a28c-de8ab059dfa3"], "source": "bootstrap_known_item", "note": "known-item: search by case_name → expect the case itself (1 same-named)"} +{"id": "g-c763aa9a45", "query": "גבאי וזוסמן", "practice_area": "rishuy_uvniya", "corpus": "internal_decisions", "relevant_case_law_ids": ["65065d5b-c0b2-4be3-970c-6b76842da054"], "source": "bootstrap_known_item", "note": "known-item: search by case_name → expect the case itself (1 same-named)"} +{"id": "g-ac23569fec", "query": "גפטו-פיצריה בצור הדסה", "practice_area": "rishuy_uvniya", "corpus": "internal_decisions", "relevant_case_law_ids": ["496c945a-9ab6-402c-9f9e-39f7af88b7cd"], "source": "bootstrap_known_item", "note": "known-item: search by case_name → expect the case itself (1 same-named)"} +{"id": "g-8dc2a68af8", "query": "דב ויעל ירון", "practice_area": "betterment_levy", "corpus": 
"internal_decisions", "relevant_case_law_ids": ["a4716706-b2af-424d-98d8-d7ec45f9aeea"], "source": "bootstrap_known_item", "note": "known-item: search by case_name → expect the case itself (1 same-named)"} +{"id": "g-94196a641c", "query": "דור ודורשיו 18", "practice_area": "betterment_levy", "corpus": "internal_decisions", "relevant_case_law_ids": ["a3ca3f83-3831-457d-8eed-b5654a201348"], "source": "bootstrap_known_item", "note": "known-item: search by case_name → expect the case itself (1 same-named)"} +{"id": "g-e19550a361", "query": "האורן 51 מבשרת ציון", "practice_area": "rishuy_uvniya", "corpus": "internal_decisions", "relevant_case_law_ids": ["3e112944-2a0d-4175-bcb6-69e19828b8ad"], "source": "bootstrap_known_item", "note": "known-item: search by case_name → expect the case itself (1 same-named)"} +{"id": "g-9612266af6", "query": "ההסתדרות הציונית העולמית", "practice_area": "betterment_levy", "corpus": "internal_decisions", "relevant_case_law_ids": ["20999cb0-d9bd-4c4a-a18d-304451e1a30f"], "source": "bootstrap_known_item", "note": "known-item: search by case_name → expect the case itself (1 same-named)"} +{"id": "g-c39b2a42c7", "query": "הוועדה המקומית ירושלים נ' סופר נוח", "practice_area": "betterment_levy", "corpus": "internal_decisions", "relevant_case_law_ids": ["04b2f953-efce-4e11-b9b5-e583b393c335"], "source": "bootstrap_known_item", "note": "known-item: search by case_name → expect the case itself (1 same-named)"} +{"id": "g-a145777626", "query": "הכט וסדובסקי", "practice_area": "rishuy_uvniya", "corpus": "internal_decisions", "relevant_case_law_ids": ["ffbd9963-099f-4bf5-b888-af993844e80a"], "source": "bootstrap_known_item", "note": "known-item: search by case_name → expect the case itself (1 same-named)"} +{"id": "g-33059ab228", "query": "המרכז הארצי לטהרת המשפחה", "practice_area": "betterment_levy", "corpus": "internal_decisions", "relevant_case_law_ids": ["cd815101-e153-468d-a7bc-be1ac88105ae"], "source": "bootstrap_known_item", "note": 
"known-item: search by case_name → expect the case itself (1 same-named)"} +{"id": "g-8af7c5a180", "query": "השלום 63 מבשרת ציון", "practice_area": "rishuy_uvniya", "corpus": "internal_decisions", "relevant_case_law_ids": ["ee2104c8-2d31-4173-839c-8b61dcaf2a31"], "source": "bootstrap_known_item", "note": "known-item: search by case_name → expect the case itself (1 same-named)"} +{"id": "g-0494e34a1d", "query": "וינפלד", "practice_area": "betterment_levy", "corpus": "internal_decisions", "relevant_case_law_ids": ["bd5d849c-c15f-43c3-96ab-d44337af9cb5"], "source": "bootstrap_known_item", "note": "known-item: search by case_name → expect the case itself (1 same-named)"} +{"id": "g-beca7df79f", "query": "זעיתר", "practice_area": "rishuy_uvniya", "corpus": "internal_decisions", "relevant_case_law_ids": ["098535ec-55c0-44dd-b058-ddaeac8b4cd7"], "source": "bootstrap_known_item", "note": "known-item: search by case_name → expect the case itself (1 same-named)"} +{"id": "g-f1a9633456", "query": "חוכרת הר חומה", "practice_area": "betterment_levy", "corpus": "internal_decisions", "relevant_case_law_ids": ["e40110b4-9364-4cc7-a5b8-cee9bbedb172"], "source": "bootstrap_known_item", "note": "known-item: search by case_name → expect the case itself (1 same-named)"} +{"id": "g-3d12dcc821", "query": "חלוואני", "practice_area": "rishuy_uvniya", "corpus": "internal_decisions", "relevant_case_law_ids": ["9d8da0a6-e4dc-4c9b-85ab-36fa5ecbd12f"], "source": "bootstrap_known_item", "note": "known-item: search by case_name → expect the case itself (1 same-named)"} +{"id": "g-77ae0a9368", "query": "טביסל דניאל", "practice_area": "betterment_levy", "corpus": "internal_decisions", "relevant_case_law_ids": ["f39f807d-90a6-4950-b10f-485dbf7e2ef6"], "source": "bootstrap_known_item", "note": "known-item: search by case_name → expect the case itself (1 same-named)"} +{"id": "g-4dec58a380", "query": "יסמין 54 מבשרת ציון", "practice_area": "rishuy_uvniya", "corpus": "internal_decisions", 
"relevant_case_law_ids": ["ac1a34c4-52c5-4e91-b6a7-297f11fe0460"], "source": "bootstrap_known_item", "note": "known-item: search by case_name → expect the case itself (1 same-named)"} +{"id": "g-776cecae74", "query": "ירושלים שקופה", "practice_area": "rishuy_uvniya", "corpus": "internal_decisions", "relevant_case_law_ids": ["438d693c-6dfd-4a65-a48c-f8e2011bcc10", "ecc63119-6977-4d8e-930d-609dbd990494"], "source": "bootstrap_known_item", "note": "known-item: search by case_name → expect the case itself (2 same-named)"} +{"id": "g-824f0d2ca8", "query": "ירושלים שקופה (1112/22)", "practice_area": "rishuy_uvniya", "corpus": "internal_decisions", "relevant_case_law_ids": ["446e96f1-a896-435d-bc33-a9b61b6d0b6c"], "source": "bootstrap_known_item", "note": "known-item: search by case_name → expect the case itself (1 same-named)"} +{"id": "g-454e470bb4", "query": "ליאור אהרון", "practice_area": "rishuy_uvniya", "corpus": "internal_decisions", "relevant_case_law_ids": ["a5ba233d-27aa-432b-bbef-093a2d49d80a"], "source": "bootstrap_known_item", "note": "known-item: search by case_name → expect the case itself (1 same-named)"} +{"id": "g-09c8b87f35", "query": "מוצא עילית", "practice_area": "rishuy_uvniya", "corpus": "internal_decisions", "relevant_case_law_ids": ["048af29a-d356-454f-acd6-5d1de32ecb94"], "source": "bootstrap_known_item", "note": "known-item: search by case_name → expect the case itself (1 same-named)"} +{"id": "g-5055a61633", "query": "מילי וישראל גלון", "practice_area": "rishuy_uvniya", "corpus": "internal_decisions", "relevant_case_law_ids": ["cc812e7b-cf9b-44af-8dfa-36541cb0b72d"], "source": "bootstrap_known_item", "note": "known-item: search by case_name → expect the case itself (1 same-named)"} +{"id": "g-8a15965c4f", "query": "מנץ", "practice_area": "rishuy_uvniya", "corpus": "internal_decisions", "relevant_case_law_ids": ["ed7ac419-f359-4b51-8e21-adec141629c7"], "source": "bootstrap_known_item", "note": "known-item: search by case_name → expect the case 
itself (1 same-named)"} +{"id": "g-48ae72c484", "query": "מפלגת נעם", "practice_area": "rishuy_uvniya", "corpus": "internal_decisions", "relevant_case_law_ids": ["5897b4e1-1fa2-4d83-816d-51f7cdf7cdee"], "source": "bootstrap_known_item", "note": "known-item: search by case_name → expect the case itself (1 same-named)"} +{"id": "g-ca171fdb45", "query": "מצפה בית שמש", "practice_area": "rishuy_uvniya", "corpus": "internal_decisions", "relevant_case_law_ids": ["8ba7f873-0da4-49cd-955e-98f579e61fb2"], "source": "bootstrap_known_item", "note": "known-item: search by case_name → expect the case itself (1 same-named)"} +{"id": "g-7e54e8b69b", "query": "מרדכי שטיין", "practice_area": "rishuy_uvniya", "corpus": "internal_decisions", "relevant_case_law_ids": ["228de6b5-b731-4959-a448-e9e941790420"], "source": "bootstrap_known_item", "note": "known-item: search by case_name → expect the case itself (1 same-named)"} +{"id": "g-62befb6c18", "query": "מרכז קהילתי בית הכרם", "practice_area": "rishuy_uvniya", "corpus": "internal_decisions", "relevant_case_law_ids": ["e73ec1d1-e89e-4d5b-a870-84cbf7b09106"], "source": "bootstrap_known_item", "note": "known-item: search by case_name → expect the case itself (1 same-named)"} +{"id": "g-cb0a295129", "query": "נחמיה פרומר", "practice_area": "rishuy_uvniya", "corpus": "internal_decisions", "relevant_case_law_ids": ["ab039082-47d1-4f79-9db9-d97c53e3bc80"], "source": "bootstrap_known_item", "note": "known-item: search by case_name → expect the case itself (1 same-named)"} +{"id": "g-4f9a788676", "query": "נילי אמיתי", "practice_area": "rishuy_uvniya", "corpus": "internal_decisions", "relevant_case_law_ids": ["d3fd9310-621b-4b76-a71f-729dd2044108"], "source": "bootstrap_known_item", "note": "known-item: search by case_name → expect the case itself (1 same-named)"} +{"id": "g-e9b1ce30da", "query": "סלונים", "practice_area": "betterment_levy", "corpus": "internal_decisions", "relevant_case_law_ids": ["add3da4c-fda0-48d0-8109-957fc9f924a7"], 
"source": "bootstrap_known_item", "note": "known-item: search by case_name → expect the case itself (1 same-named)"} +{"id": "g-23b50ceb0d", "query": "סקולוסקי", "practice_area": "betterment_levy", "corpus": "internal_decisions", "relevant_case_law_ids": ["18846024-d630-4a33-9024-6b2388df7007"], "source": "bootstrap_known_item", "note": "known-item: search by case_name → expect the case itself (1 same-named)"} +{"id": "g-93531bf772", "query": "עוררי רכס חלילים", "practice_area": "compensation_197", "corpus": "internal_decisions", "relevant_case_law_ids": ["288326ca-bf9c-48fe-ba6b-8ef9e65bd0a0"], "source": "bootstrap_known_item", "note": "known-item: search by case_name → expect the case itself (1 same-named)"} +{"id": "g-f1e0ebc751", "query": "עזבון אליהו הרנון ז\"ל נ' הוועדה המקומית ירושלים", "practice_area": "betterment_levy", "corpus": "internal_decisions", "relevant_case_law_ids": ["6774fe43-0ba9-4409-b128-cacbd168afc3"], "source": "bootstrap_known_item", "note": "known-item: search by case_name → expect the case itself (1 same-named)"} +{"id": "g-f3c29ce2f8", "query": "עמותת ישיבת טעלז", "practice_area": "betterment_levy", "corpus": "internal_decisions", "relevant_case_law_ids": ["30a606ac-5ba4-46d5-86d4-075564e30d2d"], "source": "bootstrap_known_item", "note": "known-item: search by case_name → expect the case itself (1 same-named)"} +{"id": "g-0a595fd872", "query": "ערן סופר", "practice_area": "betterment_levy", "corpus": "internal_decisions", "relevant_case_law_ids": ["9c63985a-211f-4af9-a145-c674bdcdb0f6"], "source": "bootstrap_known_item", "note": "known-item: search by case_name → expect the case itself (1 same-named)"} +{"id": "g-fd95fc1bc0", "query": "פייר קניג 36", "practice_area": "betterment_levy", "corpus": "internal_decisions", "relevant_case_law_ids": ["5cc53869-9e85-469e-85bb-986ac646de07"], "source": "bootstrap_known_item", "note": "known-item: search by case_name → expect the case itself (1 same-named)"} +{"id": "g-04f32ade81", "query": 
"פרויקט מגרש 902 בית שמש", "practice_area": "rishuy_uvniya", "corpus": "internal_decisions", "relevant_case_law_ids": ["810f8315-26cf-4069-be16-b5fee7f16a56"], "source": "bootstrap_known_item", "note": "known-item: search by case_name → expect the case itself (1 same-named)"} +{"id": "g-445fa07583", "query": "קו אופ ופרטוש", "practice_area": "betterment_levy", "corpus": "internal_decisions", "relevant_case_law_ids": ["62c517c8-ab8d-48b1-8472-1f6adc6e3817"], "source": "bootstrap_known_item", "note": "known-item: search by case_name → expect the case itself (1 same-named)"} +{"id": "g-9f2c58a190", "query": "קרן יעקב הלפרן", "practice_area": "betterment_levy", "corpus": "internal_decisions", "relevant_case_law_ids": ["921d36df-76be-4a53-823b-0d2ac1f79f2e"], "source": "bootstrap_known_item", "note": "known-item: search by case_name → expect the case itself (1 same-named)"} +{"id": "g-78610b8e8a", "query": "שכן הכלנית 54 מבשרת ציון", "practice_area": "rishuy_uvniya", "corpus": "internal_decisions", "relevant_case_law_ids": ["88e2d381-2e34-49b2-8225-5e72b487854d"], "source": "bootstrap_known_item", "note": "known-item: search by case_name → expect the case itself (1 same-named)"} +{"id": "g-d043d7c75f", "query": "ששת הימים 6 רמת אשכול", "practice_area": "betterment_levy", "corpus": "internal_decisions", "relevant_case_law_ids": ["a87d30d4-d3a3-439d-9909-c282024aafba"], "source": "bootstrap_known_item", "note": "known-item: search by case_name → expect the case itself (1 same-named)"} +{"id": "g-1cdefcfaba", "query": "תמ\"א רש\"י 32 תל אביב", "practice_area": "rishuy_uvniya", "corpus": "internal_decisions", "relevant_case_law_ids": ["3cbd2d6c-ff20-4af2-ab92-c105bb30fbc6"], "source": "bootstrap_known_item", "note": "known-item: search by case_name → expect the case itself (1 same-named)"} +{"id": "g-a65f37501c", "query": "אגא וכט", "practice_area": "rishuy_uvniya", "corpus": "precedent_library", "relevant_case_law_ids": ["1847e97e-6e38-494f-b079-0fc59066788a"], "source": 
"bootstrap_known_item", "note": "known-item: search by case_name → expect the case itself (1 same-named)"} +{"id": "g-10e5dca5b8", "query": "אהוד שפר", "practice_area": "rishuy_uvniya", "corpus": "precedent_library", "relevant_case_law_ids": ["9024da7b-f408-4b6f-808f-c514a83728e4"], "source": "bootstrap_known_item", "note": "known-item: search by case_name → expect the case itself (1 same-named)"} +{"id": "g-b42d0ceaaa", "query": "אירוס הגלבוע", "practice_area": "rishuy_uvniya", "corpus": "precedent_library", "relevant_case_law_ids": ["b673d649-d162-4f81-a323-c7d89e8334ce"], "source": "bootstrap_known_item", "note": "known-item: search by case_name → expect the case itself (1 same-named)"} +{"id": "g-4d50ccd2dd", "query": "אנטרים", "practice_area": "rishuy_uvniya", "corpus": "precedent_library", "relevant_case_law_ids": ["48909f09-8a65-4a2d-8697-e2f50bf9a756"], "source": "bootstrap_known_item", "note": "known-item: search by case_name → expect the case itself (1 same-named)"} +{"id": "g-bbf0e30d31", "query": "ארגון עמק שווה", "practice_area": "rishuy_uvniya", "corpus": "precedent_library", "relevant_case_law_ids": ["41d5a21c-a28a-428f-a35e-bc7d0dc89539"], "source": "bootstrap_known_item", "note": "known-item: search by case_name → expect the case itself (1 same-named)"} +{"id": "g-dac18ac10f", "query": "ב. 
דייניש", "practice_area": "betterment_levy", "corpus": "precedent_library", "relevant_case_law_ids": ["950d8c1b-4976-4a68-8b8e-7d0bdd056e1d"], "source": "bootstrap_known_item", "note": "known-item: search by case_name → expect the case itself (1 same-named)"} +{"id": "g-0d130898bb", "query": "בולקינד", "practice_area": "betterment_levy", "corpus": "precedent_library", "relevant_case_law_ids": ["e57c4a6b-66a0-4d52-85af-5018f03cf295"], "source": "bootstrap_known_item", "note": "known-item: search by case_name → expect the case itself (1 same-named)"} +{"id": "g-789c4ff1a7", "query": "בית אגודת ישראל", "practice_area": "betterment_levy", "corpus": "precedent_library", "relevant_case_law_ids": ["ced7ea50-689b-465d-bf79-99e22a72e0df", "aadedc2d-e990-4d6d-9dd1-8be4fa6dcbe2"], "source": "bootstrap_known_item", "note": "known-item: search by case_name → expect the case itself (2 same-named)"} +{"id": "g-06b07271bb", "query": "ברק - תכנית רחביה", "practice_area": "betterment_levy", "corpus": "precedent_library", "relevant_case_law_ids": ["57be0d1a-293f-481f-aa5b-bfa7dc73f99e"], "source": "bootstrap_known_item", "note": "known-item: search by case_name → expect the case itself (1 same-named)"} +{"id": "g-4160927269", "query": "גבעת האירוסים", "practice_area": "rishuy_uvniya", "corpus": "precedent_library", "relevant_case_law_ids": ["e26f2fa2-50e5-407d-8724-8c707dcda51b"], "source": "bootstrap_known_item", "note": "known-item: search by case_name → expect the case itself (1 same-named)"} +{"id": "g-4fe81acc94", "query": "הבית ברחוב שמעוני", "practice_area": "betterment_levy", "corpus": "precedent_library", "relevant_case_law_ids": ["53ccf47e-0fc7-4248-b486-02f57a9c689c"], "source": "bootstrap_known_item", "note": "known-item: search by case_name → expect the case itself (1 same-named)"} +{"id": "g-faa7cc3548", "query": "הקדש עדת הבוכרים", "practice_area": "betterment_levy", "corpus": "precedent_library", "relevant_case_law_ids": ["587381e4-d194-4d37-b00f-ccf7242ba228"], 
"source": "bootstrap_known_item", "note": "known-item: search by case_name → expect the case itself (1 same-named)"} +{"id": "g-0901d5d211", "query": "כנסייה אוונגלית אפיסקופלית", "practice_area": "betterment_levy", "corpus": "precedent_library", "relevant_case_law_ids": ["4bde8ca8-7862-4b19-9dd7-de2e31d82721"], "source": "bootstrap_known_item", "note": "known-item: search by case_name → expect the case itself (1 same-named)"} +{"id": "g-62fd2080df", "query": "לויתן אדיב שמואל", "practice_area": "betterment_levy", "corpus": "precedent_library", "relevant_case_law_ids": ["b80d94a0-b836-44f5-8cc6-18d8cf26e41d"], "source": "bootstrap_known_item", "note": "known-item: search by case_name → expect the case itself (1 same-named)"} +{"id": "g-9f934d9159", "query": "לויתן וקלמנוביץ", "practice_area": "betterment_levy", "corpus": "precedent_library", "relevant_case_law_ids": ["436efd48-c8ab-49f0-b3a9-52bf15ea806d"], "source": "bootstrap_known_item", "note": "known-item: search by case_name → expect the case itself (1 same-named)"} +{"id": "g-9e829d5277", "query": "מועצה אזורית מטה בנימין", "practice_area": "", "corpus": "precedent_library", "relevant_case_law_ids": ["d7b635b1-6607-46ac-9868-44e4fd598e5a"], "source": "bootstrap_known_item", "note": "known-item: search by case_name → expect the case itself (1 same-named)"} +{"id": "g-b3acf850af", "query": "משה ירושלמי", "practice_area": "betterment_levy", "corpus": "precedent_library", "relevant_case_law_ids": ["e18aa906-e0f5-452f-a17a-f1c299095340"], "source": "bootstrap_known_item", "note": "known-item: search by case_name → expect the case itself (1 same-named)"} +{"id": "g-631a47d8b0", "query": "משרד התחבורה נ' גלר", "practice_area": "betterment_levy", "corpus": "precedent_library", "relevant_case_law_ids": ["8bfcd217-cde3-4930-a058-c9a59182c338"], "source": "bootstrap_known_item", "note": "known-item: search by case_name → expect the case itself (1 same-named)"} +{"id": "g-f8aaaa60d7", "query": "נווה שלום", 
"practice_area": "betterment_levy", "corpus": "precedent_library", "relevant_case_law_ids": ["4f85e3f1-237a-4dac-b949-87a43ee6f633"], "source": "bootstrap_known_item", "note": "known-item: search by case_name → expect the case itself (1 same-named)"} +{"id": "g-dbb1358ccf", "query": "ניצני עוז", "practice_area": "rishuy_uvniya", "corpus": "precedent_library", "relevant_case_law_ids": ["e08f81d3-6183-494c-aec3-f20d39e2755e"], "source": "bootstrap_known_item", "note": "known-item: search by case_name → expect the case itself (1 same-named)"} +{"id": "g-ae5917860b", "query": "סרוזברג ואח'", "practice_area": "", "corpus": "precedent_library", "relevant_case_law_ids": ["d9772726-9766-4509-8067-b20fa625a1a9"], "source": "bootstrap_known_item", "note": "known-item: search by case_name → expect the case itself (1 same-named)"} +{"id": "g-e1e175248c", "query": "עמותת העצמאים באילת", "practice_area": "rishuy_uvniya", "corpus": "precedent_library", "relevant_case_law_ids": ["f59e74c2-6433-47c9-bd0e-580cf4171fbb"], "source": "bootstrap_known_item", "note": "known-item: search by case_name → expect the case itself (1 same-named)"} +{"id": "g-86116ced86", "query": "שמי אשקלוני", "practice_area": "betterment_levy", "corpus": "precedent_library", "relevant_case_law_ids": ["7352e510-c769-45e4-b4ef-d85271743506"], "source": "bootstrap_known_item", "note": "known-item: search by case_name → expect the case itself (1 same-named)"} diff --git a/docs/superpowers/specs/2026-05-31-fu5-eval-harness-design.md b/docs/superpowers/specs/2026-05-31-fu5-eval-harness-design.md new file mode 100644 index 0000000..63515b2 --- /dev/null +++ b/docs/superpowers/specs/2026-05-31-fu5-eval-harness-design.md @@ -0,0 +1,92 @@ +# FU-5 — Retrieval Eval Harness + Backlog Visibility (design) + +**Task:** #63 (legal-ai tag) · **Covers:** GAP-11, GAP-14 · **Provides:** INV-RET4, G8, INV-QA1, G10 +**Status:** approved 2026-05-31 (gold-set strategy = hybrid, chair decision). 
Technical architecture +decided per `feedback_research_architecture_decisions` (chair adjudicates domain, not architecture). + +## Problem + +1. **GAP-11 (INV-RET4/G8):** retrieval quality is never measured. Only `telemetry.log_search_bg` + records queries (observation, not evaluation). No gold-set, no precision/recall. Every RRF-weight + / `k` / embedder change is tuned "by feel". +2. **GAP-14 (INV-QA1/G10):** the halacha review backlog (`review_status='pending_review'`) is + invisible — the 10/19-approved gap was found by accident. The human gate has no visibility. + +## Two independent units + +### Unit A — Retrieval eval harness (GAP-11) + +**Existing leverage:** `search_relevance_feedback` already captures a real ground-truth signal — +when a finalized decision cites a precedent, `infer_relevance_from_citations` marks it +`relevance_score=3` against the `search_logs` where it appeared (telemetry.py). This bootstraps the +gold-set without hand-labeling. + +**A1. Gold-set — versioned file `data/eval/gold-set.jsonl`** (single SoT; reviewable/diffable/ +chair-editable). One JSON object per line: +```json +{"id":"g001","query":"...","practice_area":"betterment_levy", + "corpus":"precedent_library|internal_decisions", + "relevant_case_law_ids":["uuid",...],"source":"bootstrap|chair","note":""} +``` + +**A2. Bootstrap generator — `scripts/eval_gold_bootstrap.py`** (host-side, mcp-server venv): +reads `search_relevance_feedback` (score=3) ⨝ `search_logs`, groups by normalized query → +relevant `case_law_id` set, emits `source=bootstrap` entries. Idempotent: re-run regenerates the +bootstrap section; never overwrites `source=chair` rows. **Chair gate:** Dafna reviews the file, +corrects/augments, promotes entries to `source=chair`. + +**A3. 
Harness — `scripts/eval_retrieval.py`** (host-side, mcp-server venv; needs POSTGRES + VOYAGE): +runs the **production retrieval path** (same service functions the MCP search tools call) for each +gold query, computes per-query **precision@k, recall@k, MRR, nDCG@k** (k∈{5,10}); relevant = gold +ids. Aggregates mean overall + per corpus + per practice_area. Writes +`data/eval/eval-report-<ts>.{json,md}`, prints a summary, and a delta vs the committed +`data/eval/baseline.json`. `--update-baseline` rewrites the snapshot. + +**"CI gate" — realized as discipline, not automation.** Retrieval needs the prod DB + Voyage API; +no CI runner has that access. The gate is: re-runnable harness + committed `baseline.json` + a +documented "run before/after any retrieval-layer change, attach the delta" rule (SCRIPTS.md). A true +automated CI gate would require a separate frozen corpus fixture — out of scope, noted as future. + +**Scope:** the two precedent corpora (`search_precedent_library` + `search_internal_decisions`), +where the citation signal exists. `search_decisions`/`search_case_documents` return case-document +chunks (not `case_law`) and carry no citation ground-truth — deliberately out of scope. + +**Metrics rationale:** precision@k + recall@k are spec-required (INV-RET4). MRR (first-relevant +rank) and nDCG@k (graded, position-weighted) are standard IR complements (Manning et al., 2008) — +nDCG matches the telemetry docstring's stated nDCG@10 aspiration. + +### Unit B — Backlog visibility (GAP-14) — pure code + +Expose the halacha review backlog where health is already surfaced: +- **`metrics.get_dashboard()`** (mcp-server/src/legal_mcp/services/metrics.py) — add + `halacha_backlog: {pending_review, approved, rejected, published, total, oldest_pending_at}` from + `halachot.review_status` + `min(created_at) where pending_review`. Surfaces through the + `get_metrics` MCP tool (agents + dashboard). 
+- **`/api/system/diagnostics`** (web/app.py) — add the same `halacha_backlog` block to the health + snapshot. + +## Files + +| File | Unit | Kind | Deploy | +|------|------|------|--------| +| `scripts/eval_gold_bootstrap.py` | A2 | new, host-side | none | +| `scripts/eval_retrieval.py` | A3 | new, host-side | none | +| `data/eval/gold-set.jsonl` | A1 | data (on disk; chair-reviewed) | none | +| `data/eval/baseline.json` | A3 | committed snapshot | none | +| `mcp-server/src/legal_mcp/services/metrics.py` | B | edit `get_dashboard` | Coolify | +| `web/app.py` | B | edit diagnostics | Coolify | +| `scripts/SCRIPTS.md` | A | doc | none | + +## Test strategy + +- Bootstrap: idempotent (re-run = same bootstrap rows; chair rows untouched); 0 chair rows clobbered. +- Harness: metric math unit-verified offline on a synthetic (ranking, relevant-set) fixture + (precision@k / recall@k / MRR / nDCG@k against hand-computed values) before any DB run. +- Unit B: `get_metrics` (no case_number) returns `halacha_backlog` with counts summing to total; + diagnostics endpoint returns the same block. Verified against prod counts. + +## Chair gate (domain — the only thing requiring Dafna) + +After bootstrap produces `gold-set.jsonl`, Dafna reviews: are these queries representative, and are +the marked precedents the *correct* answers? Her edits make the gold-set authoritative. Until then +the baseline is "provisional (bootstrap-only)". diff --git a/mcp-server/src/legal_mcp/services/metrics.py b/mcp-server/src/legal_mcp/services/metrics.py index 230beee..306ce69 100644 --- a/mcp-server/src/legal_mcp/services/metrics.py +++ b/mcp-server/src/legal_mcp/services/metrics.py @@ -103,6 +103,30 @@ async def get_case_metrics(case_id: UUID) -> dict: return metrics +async def halacha_backlog(conn) -> dict: + """תור אישור-ההלכות (GAP-14 / INV-QA1 / G10) — נראות ה-backlog האנושי. 
+ + הלכות נכנסות כ-`pending_review` ובלתי-נראות לחיפוש עד אישור היו"ר; בלי ספירה + גלויה, אישור-חסר נשאר סמוי (10/19 התגלה במקרה). מקבל connection פתוח כדי + שאפשר יהיה לשלב בסנאפ-שוט קיים (get_dashboard, /api/system/diagnostics). + """ + rows = await conn.fetch( + "SELECT review_status, COUNT(*) AS n FROM halachot GROUP BY review_status" + ) + counts = {r["review_status"]: r["n"] for r in rows} + oldest = await conn.fetchval( + "SELECT MIN(created_at) FROM halachot WHERE review_status = 'pending_review'" + ) + return { + "pending_review": counts.get("pending_review", 0), + "approved": counts.get("approved", 0), + "rejected": counts.get("rejected", 0), + "published": counts.get("published", 0), + "total": sum(counts.values()), + "oldest_pending_at": oldest.isoformat() if oldest else None, + } + + async def get_dashboard() -> dict: """דשבורד כולל — סיכום מדדים על כל התיקים.""" pool = await db.get_pool() @@ -152,6 +176,9 @@ async def get_dashboard() -> dict: "SELECT AVG(total_words) FROM decisions WHERE total_words > 0" ) + # Halacha review backlog (GAP-14 / INV-QA1 / G10) + backlog = await halacha_backlog(conn) + return { "summary": { "total_cases": total_cases, @@ -168,6 +195,7 @@ async def get_dashboard() -> dict: "stale_embedding_case_law": stale_embedding_case_law, }, "cases_by_status": cases_by_status, + "halacha_backlog": backlog, "qa": { "cases_validated": qa_total, "cases_passed": qa_passed, diff --git a/scripts/SCRIPTS.md b/scripts/SCRIPTS.md index 4a8f936..225dfa7 100644 --- a/scripts/SCRIPTS.md +++ b/scripts/SCRIPTS.md @@ -15,6 +15,8 @@ | `test_retrieval_by_name.py` | python | בדיקת אחזור-לפי-שם (#52/RC-A) — מאמת ש`search_precedent_library`/`search_internal_decisions` מדרגים את ההחלטה עצמה (אגסי) מעל מי שמצטט אותה, + רגרסיות לשאילתות מהותיות. הרצה: `DOTENV_PATH=/home/chaim/.env DATA_DIR=.../data mcp-server/.venv/bin/python scripts/test_retrieval_by_name.py` (exit 0 = עבר). 
| ידני אחרי שינוי שכבת חיפוש | | `fu2b_reconcile_internal_case_numbers.py` | python | **FU-2b (GAP-07/08) — תיאום `case_number` של `internal_committee`** מציטוט-מלא למספר-בסיס קנוני (X1: trim·prefix-strip·`/`→`-`, חודש נשמר). דטרמיניסטי (token יחיד; 0/>1 → flag). `--dry-run` (ברירת-מחדל) מפיק טבלת-תיאום ל-`data/audit/fu2b-reconciliation-*.{csv,md}` עם flags (DUP_CHECK / PROC_MISMATCH / MISMATCH). `--apply --approved ` מגבה ואז מעדכן רק שורות שאושרו ע"י היו"ר. scope: internal בלבד (external → #68). FK-safe. | חד-פעמי, **chair-gated** (apply רק אחרי אישור דפנה) | | `fu2c_reconcile_external_case_numbers.py` | python | **FU-2c (GAP-08, #68) — תיאום `case_number` של פסיקה חיצונית** (`source_kind <> internal_committee`) מציטוט-מלא לצורה קנונית **מציין-הליך + docket** (החלטת-יו"ר 2026-05-31, Option A: `/` נשמר, *לא* `-`; תואם db.py:369 ו-INV-ID2). דטרמיניסטי (designator+docket; 0/>1 docket → flag). `--dry-run` (ברירת-מחדל) מפיק `data/audit/fu2c-reconciliation-*.{csv,md}` עם flags (MISMATCH / NO_CITATION / CIT_NO_DOCKET / DESIG_MISMATCH / DUP_CHECK). `--apply --approved ` מגבה ואז מעדכן שורות לא-חוסמות (כולל ADVISORY/NO_CITATION). `--overrides ` (id,proposed_canonical,reason) פותח שורות-חוסמות בהכרעת-יו"ר מפורשת (למשל פס"ד מאוחד — ראה `data/audit/fu2c-overrides.csv` לרשומת לויתן/קלמנוביץ). לוגיקת-החילוץ + פיצול flags אומתו offline על 24 רשומות. scope: external בלבד (internal = FU-2b). FK-safe. | חד-פעמי, **chair-gated** (apply רק אחרי אישור דפנה) | +| `eval_gold_bootstrap.py` | python | **FU-5 (GAP-11) — bootstrap ל-gold-set** של הערכת-אחזור ל-`data/eval/gold-set.jsonl`. שני מקורות: `--source citations` (cited==relevant מ-`search_relevance_feedback`; ריק עד שייצברו ציטוטים) ו-`--source known_item` (query=שם-תיק → relevant=עצמו; אות אמיתי היום). Idempotent — שומר שורות `source=chair`, מחדש `bootstrap_*`. דורש POSTGRES. 
| לפני eval; חוזר כשנצבר ground-truth | +| `eval_retrieval.py` | python | **FU-5 (GAP-11, INV-RET4/G8) — harness הערכת-אחזור** — מריץ את מסלול-האחזור בייצור (`search_library`/`search_internal`) על ה-gold-set, מחשב precision@k/recall@k/MRR/nDCG@k (k=5,10), מצרף overall+per-corpus+per-PA ל-`data/eval/eval-report-<ts>.{json,md}` + delta מול `data/eval/baseline.json` (מתעד retrieval_config). `--self-test` בודק את המטריקות offline; `--update-baseline` מאמץ snapshot. **שער-CI במשמעת:** הרץ לפני/אחרי כל שינוי בשכבת-האחזור באותו קונפיג. דורש POSTGRES+VOYAGE_API_KEY. | לפני/אחרי שינוי RRF/k/embedder/rerank | | `auto-sync-cases.sh` | bash | סנכרון תיקי ערר ל-Gitea — רץ כל דקה | `* * * * *` (cron) | | `backup-db.sh` | bash | גיבוי PostgreSQL יומי ל-`data/backups/` (gzip) | לתזמן: `0 2 * * *` | | `restore-db.sh` | bash | שחזור DB מגיבוי (companion ל-backup-db.sh) | ידני | diff --git a/scripts/eval_gold_bootstrap.py b/scripts/eval_gold_bootstrap.py new file mode 100644 index 0000000..8f821ac --- /dev/null +++ b/scripts/eval_gold_bootstrap.py @@ -0,0 +1,196 @@ +#!/usr/bin/env python3 +"""FU-5 (GAP-11) — bootstrap a retrieval gold-set into data/eval/gold-set.jsonl. + +The gold-set is the labeled (query → relevant case_law_ids) set the eval harness +(scripts/eval_retrieval.py) measures precision/recall against. This script SEEDS it +automatically; the chair then reviews/augments (rows with source='chair' are never +clobbered). Two seed sources: + + --source citations : the chosen hybrid signal — "cited == relevant". Reads + search_relevance_feedback (populated by telemetry.infer_relevance_from_citations + once decisions cite precedents) ⨝ search_logs, groups by query. Yields nothing + until decisions accumulate citations + searches are logged with case context. + + --source known_item : known-item retrieval (Manning et al. 2008, ch. 8) — query = + a precedent's case_name, relevant = that precedent (and any same-named sibling + in the same corpus). 
A real, citation-free precision/recall signal available + TODAY; this is what #52 (test_retrieval_by_name) checked by hand. Use this to + get a baseline before the citation signal exists. + + --source both (default): emit both. Sources are tagged (bootstrap_known_item / + bootstrap_citation) so the chair can tell them apart. + +Idempotent: regenerates the bootstrap_* rows each run; preserves source='chair' rows. +Merge key = (corpus, normalized query). + +Usage (mcp-server venv; needs POSTGRES): + PY=/home/chaim/legal-ai/mcp-server/.venv/bin/python + POSTGRES_PASSWORD=… POSTGRES_HOST=127.0.0.1 POSTGRES_PORT=5433 \ + $PY scripts/eval_gold_bootstrap.py --source both +""" +from __future__ import annotations + +import argparse +import asyncio +import hashlib +import json +import os +import sys +from pathlib import Path + +REPO_ROOT = Path(__file__).resolve().parent.parent +sys.path.insert(0, str(REPO_ROOT / "mcp-server" / "src")) + +if "POSTGRES_URL" not in os.environ: + os.environ["POSTGRES_URL"] = ( + f"postgres://{os.environ.get('POSTGRES_USER','legal_ai')}:" + f"{os.environ.get('POSTGRES_PASSWORD','')}@" + f"{os.environ.get('POSTGRES_HOST','127.0.0.1')}:" + f"{os.environ.get('POSTGRES_PORT','5433')}/" + f"{os.environ.get('POSTGRES_DB','legal_ai')}" + ) + +GOLD_PATH = REPO_ROOT / "data" / "eval" / "gold-set.jsonl" + +# search_type (telemetry) → eval corpus name +_TYPE_TO_CORPUS = {"precedent_library": "precedent_library", "internal_decisions": "internal_decisions"} +# case_law.source_kind → eval corpus (which retrieval tool searches it) +_KIND_TO_CORPUS = {"external_upload": "precedent_library", "internal_committee": "internal_decisions"} + + +def _norm_query(q: str) -> str: + return " ".join((q or "").split()).strip() + + +def _entry_id(corpus: str, query: str) -> str: + h = hashlib.sha1(f"{corpus}|{_norm_query(query)}".encode("utf-8")).hexdigest()[:10] + return f"g-{h}" + + +async def _known_item_rows(conn, sample: int | None) -> list[dict]: + """query = 
case_name, relevant = all same-named precedents in the same corpus.""" + rows = await conn.fetch( + "SELECT id, coalesce(case_name,'') AS case_name, coalesce(practice_area,'') AS pa, " + "source_kind FROM case_law " + "WHERE source_kind IN ('external_upload','internal_committee') " + "AND coalesce(searchable, true) AND length(trim(coalesce(case_name,''))) >= 2 " + "ORDER BY source_kind, case_name") + # group by (corpus, normalized case_name) → relevant ids + groups: dict[tuple[str, str], dict] = {} + for r in rows: + corpus = _KIND_TO_CORPUS[r["source_kind"]] + key = (corpus, _norm_query(r["case_name"])) + g = groups.setdefault(key, {"pa": r["pa"], "ids": []}) + g["ids"].append(str(r["id"])) + out: list[dict] = [] + for (corpus, name), g in groups.items(): + out.append({ + "id": _entry_id(corpus, name), + "query": name, + "practice_area": g["pa"], + "corpus": corpus, + "relevant_case_law_ids": g["ids"], + "source": "bootstrap_known_item", + "note": f"known-item: search by case_name → expect the case itself ({len(g['ids'])} same-named)", + }) + out.sort(key=lambda e: (e["corpus"], e["query"])) + if sample is not None and sample > 0: + out = out[:sample] + return out + + +async def _citation_rows(conn) -> list[dict]: + """query → relevant case_law_ids, from the cited==relevant signal in + search_relevance_feedback ⨝ search_logs (score >= 2).""" + rows = await conn.fetch( + "SELECT sl.query, sl.search_type, coalesce(sl.practice_area,'') AS pa, " + " rf.case_law_id " + "FROM search_relevance_feedback rf " + "JOIN search_logs sl ON sl.id = rf.search_log_id " + "WHERE rf.relevance_score >= 2 AND sl.search_type IN ('precedent_library','internal_decisions')") + groups: dict[tuple[str, str], dict] = {} + for r in rows: + corpus = _TYPE_TO_CORPUS[r["search_type"]] + key = (corpus, _norm_query(r["query"])) + g = groups.setdefault(key, {"pa": r["pa"], "ids": set()}) + g["ids"].add(str(r["case_law_id"])) + if not g["pa"]: + g["pa"] = r["pa"] + out: list[dict] = [] + for 
(corpus, query), g in groups.items(): + out.append({ + "id": _entry_id(corpus, query), + "query": query, + "practice_area": g["pa"], + "corpus": corpus, + "relevant_case_law_ids": sorted(g["ids"]), + "source": "bootstrap_citation", + "note": "cited == relevant (auto-inferred from finalized decisions)", + }) + out.sort(key=lambda e: (e["corpus"], e["query"])) + return out + + +def _load_existing() -> list[dict]: + if not GOLD_PATH.exists(): + return [] + out = [] + for line in GOLD_PATH.read_text(encoding="utf-8").splitlines(): + line = line.strip() + if line: + out.append(json.loads(line)) + return out + + +def _merge(existing: list[dict], fresh: list[dict]) -> tuple[list[dict], dict]: + """Keep all source='chair' rows; replace bootstrap_* rows with fresh ones. + Merge key = (corpus, normalized query). Chair rows win on key conflict.""" + chair = [e for e in existing if e.get("source") == "chair"] + chair_keys = {(e["corpus"], _norm_query(e["query"])) for e in chair} + kept_fresh = [e for e in fresh if (e["corpus"], _norm_query(e["query"])) not in chair_keys] + merged = chair + kept_fresh + merged.sort(key=lambda e: (e["corpus"], e["source"] != "chair", e["query"])) + stats = { + "chair_rows_preserved": len(chair), + "bootstrap_rows": len(kept_fresh), + "total": len(merged), + } + return merged, stats + + +async def main() -> int: + ap = argparse.ArgumentParser(description="FU-5 gold-set bootstrap") + ap.add_argument("--source", choices=["citations", "known_item", "both"], default="both") + ap.add_argument("--sample", type=int, default=None, help="cap known-item queries (default: all named)") + args = ap.parse_args() + + from legal_mcp.services import db + pool = await db.get_pool() + fresh: list[dict] = [] + async with pool.acquire() as conn: + if args.source in ("citations", "both"): + cit = await _citation_rows(conn) + fresh += cit + print(f"citation source: {len(cit)} queries") + if args.source in ("known_item", "both"): + ki = await _known_item_rows(conn, 
args.sample) + fresh += ki + print(f"known-item source: {len(ki)} queries") + + existing = _load_existing() + merged, stats = _merge(existing, fresh) + GOLD_PATH.parent.mkdir(parents=True, exist_ok=True) + with GOLD_PATH.open("w", encoding="utf-8") as f: + for e in merged: + f.write(json.dumps(e, ensure_ascii=False) + "\n") + print(f"wrote {GOLD_PATH}") + print(f" chair rows preserved: {stats['chair_rows_preserved']}") + print(f" bootstrap rows: {stats['bootstrap_rows']}") + print(f" total gold queries: {stats['total']}") + if stats["total"] == 0: + print(" NOTE: gold-set empty — no citation signal yet and no named precedents found.") + return 0 + + +if __name__ == "__main__": + sys.exit(asyncio.run(main())) diff --git a/scripts/eval_retrieval.py b/scripts/eval_retrieval.py new file mode 100644 index 0000000..78e3a79 --- /dev/null +++ b/scripts/eval_retrieval.py @@ -0,0 +1,294 @@ +#!/usr/bin/env python3 +"""FU-5 (GAP-11, INV-RET4/G8) — retrieval eval harness: precision/recall/MRR/nDCG. + +Runs the PRODUCTION retrieval path (the same service functions the MCP search tools +call) over the labeled gold-set (data/eval/gold-set.jsonl, built by +scripts/eval_gold_bootstrap.py) and reports retrieval quality. This is the empirical +measurement INV-RET4 requires: no more tuning RRF weights / k / embedder "by feel". + +Metrics per query (relevant = gold case_law_ids; ranked = retrieved case_law_ids): + • precision@k = |top-k ∩ relevant| / k + • recall@k = |top-k ∩ relevant| / |relevant| + • MRR = 1 / rank-of-first-relevant (0 if none retrieved) + • nDCG@k = DCG@k / IDCG@k (binary gains, log2 discount) +Aggregated as the mean overall, per corpus, and per practice_area. + +"CI gate" by discipline: run before AND after any retrieval-layer change (RRF weights, +k, chunk threshold, embedder, rerank) and compare to the committed data/eval/baseline.json. +Retrieval needs the prod DB + Voyage, so this is a re-runnable script, not automated CI. 
+ +Usage (mcp-server venv; needs POSTGRES + VOYAGE_API_KEY for live runs): + PY=/home/chaim/legal-ai/mcp-server/.venv/bin/python + $PY scripts/eval_retrieval.py --self-test # offline metric unit tests (no DB) + POSTGRES_PASSWORD=… VOYAGE_API_KEY=… POSTGRES_HOST=127.0.0.1 POSTGRES_PORT=5433 \ + $PY scripts/eval_retrieval.py # run eval, write report + baseline delta + … $PY scripts/eval_retrieval.py --update-baseline # adopt current run as the new baseline +""" +from __future__ import annotations + +import argparse +import asyncio +import json +import math +import os +import sys +from datetime import datetime, timezone +from pathlib import Path + +REPO_ROOT = Path(__file__).resolve().parent.parent +sys.path.insert(0, str(REPO_ROOT / "mcp-server" / "src")) + +if "POSTGRES_URL" not in os.environ: + os.environ["POSTGRES_URL"] = ( + f"postgres://{os.environ.get('POSTGRES_USER','legal_ai')}:" + f"{os.environ.get('POSTGRES_PASSWORD','')}@" + f"{os.environ.get('POSTGRES_HOST','127.0.0.1')}:" + f"{os.environ.get('POSTGRES_PORT','5433')}/" + f"{os.environ.get('POSTGRES_DB','legal_ai')}" + ) + +EVAL_DIR = REPO_ROOT / "data" / "eval" +GOLD_PATH = EVAL_DIR / "gold-set.jsonl" +BASELINE_PATH = EVAL_DIR / "baseline.json" +K_VALUES = (5, 10) + + +# ── metrics (pure, unit-tested offline) ────────────────────────────────────── +def precision_at_k(ranked: list[str], relevant: set[str], k: int) -> float: + if k <= 0: + return 0.0 + topk = ranked[:k] + return sum(1 for r in topk if r in relevant) / k + + +def recall_at_k(ranked: list[str], relevant: set[str], k: int) -> float: + if not relevant: + return 0.0 + topk = ranked[:k] + return sum(1 for r in topk if r in relevant) / len(relevant) + + +def mrr(ranked: list[str], relevant: set[str]) -> float: + for i, r in enumerate(ranked, start=1): + if r in relevant: + return 1.0 / i + return 0.0 + + +def ndcg_at_k(ranked: list[str], relevant: set[str], k: int) -> float: + if not relevant: + return 0.0 + dcg = sum((1.0 / math.log2(i + 1)) for 
i, r in enumerate(ranked[:k], start=1) if r in relevant) + ideal_hits = min(len(relevant), k) + idcg = sum(1.0 / math.log2(i + 1) for i in range(1, ideal_hits + 1)) + return dcg / idcg if idcg else 0.0 + + +def _self_test() -> int: + # ranked positions: 1 2 3 4 + ranked = ["A", "B", "C", "D"] + rel = {"B", "D"} # relevant at ranks 2 and 4 + ok = True + + def chk(name, got, exp): + nonlocal ok + good = abs(got - exp) < 1e-9 + ok = ok and good + print(f" {name:14} got={got:.6f} exp={exp:.6f} {'ok' if good else 'FAIL'}") + + chk("P@2", precision_at_k(ranked, rel, 2), 1 / 2) # B hit → 1/2 + chk("P@4", precision_at_k(ranked, rel, 4), 2 / 4) # B,D → 2/4 + chk("R@2", recall_at_k(ranked, rel, 2), 1 / 2) # 1 of 2 found + chk("R@4", recall_at_k(ranked, rel, 4), 2 / 2) # both found + chk("MRR", mrr(ranked, rel), 1 / 2) # first rel at rank 2 + # nDCG@4: DCG = 1/log2(3) + 1/log2(5); IDCG = 1/log2(2)+1/log2(3) + dcg = 1 / math.log2(3) + 1 / math.log2(5) + idcg = 1 / math.log2(2) + 1 / math.log2(3) + chk("nDCG@4", ndcg_at_k(ranked, rel, 4), dcg / idcg) + chk("MRR-none", mrr(ranked, {"Z"}), 0.0) + chk("R@k-empty", recall_at_k(ranked, set(), 4), 0.0) + print("ALL PASS" if ok else "*** FAILURES ***") + return 0 if ok else 1 + + +# ── retrieval (production path) ────────────────────────────────────────────── +def _ranked_ids(results: list[dict]) -> list[str]: + """Ranked, de-duplicated case_law_ids from a result list (order = ranking).""" + out: list[str] = [] + seen: set[str] = set() + for r in results or []: + if not isinstance(r, dict): + continue + cid = r.get("case_law_id") + if cid is None: + continue + s = str(cid) + if s not in seen: + seen.add(s) + out.append(s) + return out + + +async def _retrieve(corpus: str, query: str, practice_area: str, limit: int) -> list[str]: + from legal_mcp.services import precedent_library, internal_decisions + if corpus == "precedent_library": + res = await precedent_library.search_library(query=query, practice_area=practice_area, limit=limit) 
+ elif corpus == "internal_decisions": + res = await internal_decisions.search_internal(query=query, practice_area=practice_area, limit=limit) + else: + return [] + return _ranked_ids(res) + + +def _retrieval_config() -> dict: + """Capture the retrieval knobs the run reflects — a baseline is only comparable + to another run under the SAME config (multimodal/rerank/weights change results).""" + from legal_mcp import config as cfg + return { + "MULTIMODAL_ENABLED": cfg.MULTIMODAL_ENABLED, + "VOYAGE_RERANK_ENABLED": cfg.VOYAGE_RERANK_ENABLED, + "VOYAGE_MODEL": cfg.VOYAGE_MODEL, + "MULTIMODAL_TEXT_WEIGHT": cfg.MULTIMODAL_TEXT_WEIGHT, + "MULTIMODAL_RRF_K": cfg.MULTIMODAL_RRF_K, + "BM25_HYBRID_ENABLED": cfg.BM25_HYBRID_ENABLED, + } + + +def _load_gold() -> list[dict]: + if not GOLD_PATH.exists(): + return [] + out = [] + for line in GOLD_PATH.read_text(encoding="utf-8").splitlines(): + line = line.strip() + if line: + out.append(json.loads(line)) + return out + + +def _mean(vals: list[float]) -> float: + return sum(vals) / len(vals) if vals else 0.0 + + +def _aggregate(per_query: list[dict]) -> dict: + """Mean of every metric across the given per-query records.""" + agg: dict[str, float] = {} + if not per_query: + return agg + keys = [k for k in per_query[0]["metrics"]] + for mk in keys: + agg[mk] = round(_mean([q["metrics"][mk] for q in per_query]), 4) + return agg + + +async def _run() -> dict: + gold = _load_gold() + kmax = max(K_VALUES) + per_query: list[dict] = [] + for g in gold: + relevant = set(g.get("relevant_case_law_ids") or []) + ranked = await _retrieve(g["corpus"], g["query"], g.get("practice_area", ""), kmax) + m: dict[str, float] = {} + for k in K_VALUES: + m[f"P@{k}"] = precision_at_k(ranked, relevant, k) + m[f"R@{k}"] = recall_at_k(ranked, relevant, k) + m[f"nDCG@{k}"] = ndcg_at_k(ranked, relevant, k) + m["MRR"] = mrr(ranked, relevant) + per_query.append({ + "id": g["id"], "corpus": g["corpus"], "practice_area": g.get("practice_area", ""), + "query": 
g["query"], "n_relevant": len(relevant), "n_retrieved": len(ranked), + "first_rank": next((i for i, r in enumerate(ranked, 1) if r in relevant), None), + "metrics": m, + }) + + corpora = sorted({q["corpus"] for q in per_query}) + pas = sorted({q["practice_area"] for q in per_query if q["practice_area"]}) + return { + "gold_size": len(gold), + "retrieval_config": _retrieval_config(), + "overall": _aggregate(per_query), + "by_corpus": {c: _aggregate([q for q in per_query if q["corpus"] == c]) for c in corpora}, + "by_practice_area": {p: _aggregate([q for q in per_query if q["practice_area"] == p]) for p in pas}, + "per_query": per_query, + } + + +def _ts() -> str: + return datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ") + + +def _delta_table(cur: dict, base: dict | None) -> str: + lines = ["| metric | current | baseline | Δ |", "|---|---|---|---|"] + base_overall = (base or {}).get("overall", {}) + for mk, cv in cur["overall"].items(): + bv = base_overall.get(mk) + d = f"{cv - bv:+.4f}" if isinstance(bv, (int, float)) else "—" + lines.append(f"| {mk} | {cv:.4f} | {bv if bv is not None else '—'} | {d} |") + return "\n".join(lines) + + +def _write_report(result: dict, base: dict | None, ts: str) -> tuple[Path, Path]: + EVAL_DIR.mkdir(parents=True, exist_ok=True) + jp = EVAL_DIR / f"eval-report-{ts}.json" + mp = EVAL_DIR / f"eval-report-{ts}.md" + jp.write_text(json.dumps(result, ensure_ascii=False, indent=2), encoding="utf-8") + cfg = result.get("retrieval_config", {}) + cfg_line = " · ".join(f"{k}={v}" for k, v in cfg.items()) + base_cfg = (base or {}).get("retrieval_config") + cfg_warn = "" + if base_cfg and base_cfg != cfg: + cfg_warn = "\n> ⚠ retrieval_config differs from baseline — deltas are NOT apples-to-apples.\n" + lines = [f"# FU-5 — דוח הערכת-אחזור — {ts}\n", + f"- gold queries: {result['gold_size']}", + f"- retrieval_config: {cfg_line}", + f"- baseline: {'data/eval/baseline.json' if base else '(none yet)'}", + cfg_warn, + "## Overall (mean) — delta vs 
baseline\n", _delta_table(result, base), "", + "## Per corpus\n"] + if result["by_corpus"]: + metric_keys = list(next(iter(result["by_corpus"].values())).keys()) + lines.append("| corpus | " + " | ".join(metric_keys) + " |") + lines.append("|" + "---|" * (len(metric_keys) + 1)) + for c, agg in result["by_corpus"].items(): + lines.append(f"| {c} | " + " | ".join(f"{agg[k]:.4f}" for k in metric_keys) + " |") + else: + lines.append("(none)") + mp.write_text("\n".join(lines) + "\n", encoding="utf-8") + return jp, mp + + +async def main() -> int: + ap = argparse.ArgumentParser(description="FU-5 retrieval eval harness") + ap.add_argument("--self-test", action="store_true", help="run offline metric unit tests and exit") + ap.add_argument("--update-baseline", action="store_true", help="write current run as data/eval/baseline.json") + args = ap.parse_args() + + if args.self_test: + return _self_test() + + gold = _load_gold() + if not gold: + print(f"gold-set empty ({GOLD_PATH}). Run scripts/eval_gold_bootstrap.py first.", file=sys.stderr) + return 2 + + result = await _run() + base = json.loads(BASELINE_PATH.read_text(encoding="utf-8")) if BASELINE_PATH.exists() else None + ts = _ts() + jp, mp = _write_report(result, base, ts) + + print(f"EVAL: {result['gold_size']} queries") + for mk, v in result["overall"].items(): + bv = (base or {}).get("overall", {}).get(mk) + d = f" (Δ {v - bv:+.4f})" if isinstance(bv, (int, float)) else "" + print(f" {mk:8} {v:.4f}{d}") + print(f" report: {mp}") + + if args.update_baseline: + snapshot = {k: result[k] for k in ("gold_size", "retrieval_config", "overall", "by_corpus", "by_practice_area")} + snapshot["generated_at"] = ts + BASELINE_PATH.write_text(json.dumps(snapshot, ensure_ascii=False, indent=2), encoding="utf-8") + print(f" baseline updated: {BASELINE_PATH}") + return 0 + + +if __name__ == "__main__": + sys.exit(asyncio.run(main())) diff --git a/web/app.py b/web/app.py index 3f241d7..364c54d 100644 --- a/web/app.py +++ b/web/app.py 
@@ -30,7 +30,7 @@ import asyncpg import httpx from legal_mcp import config -from legal_mcp.services import chunker, db, embeddings, extractor, git_sync, processor, proofreader, research_md +from legal_mcp.services import chunker, db, embeddings, extractor, git_sync, metrics as metrics_service, processor, proofreader, research_md from legal_mcp.tools import cases as cases_tools, search as search_tools, workflow as workflow_tools, drafting as drafting_tools, precedents as precedents_tools # Import integration clients (same directory) @@ -2210,6 +2210,9 @@ async def system_diagnostics(): "ORDER BY d.created_at DESC LIMIT 20" ) + # Halacha review backlog (GAP-14 / INV-QA1 / G10) — human gate visibility + halacha_backlog = await metrics_service.halacha_backlog(conn) + active_tasks = [ {"task_id": tid, "filename": d.get("filename", ""), "status": d.get("status", ""), "step": d.get("step", "")} @@ -2219,6 +2222,7 @@ async def system_diagnostics(): return { "db_ok": db_ok, "tables": tables, + "halacha_backlog": halacha_backlog, "failed_documents": [ { "id": str(r["id"]), -- 2.49.1