chore(eval): add 9 chair-approved semantic queries to gold-set (FU-5)

The gold-set was 77 known-item probes (query=case_name). Added 9 chair-approved SEMANTIC queries (S1–S9) — one real legal question per row, where relevant = the precedents that should surface (drawn from subject_tags, chair-confirmed). These test what matters: does retrieval answer a legal issue, not just find a case by name.

The new rows use source='chair' (preserved across re-bootstrap). practice_area is left empty so the filter never excludes a cross-tagged precedent (s.197 rulings sit under betterment_levy). Baseline is now 86 queries.

Finding from the 9 semantic queries: MRR ≈ 1.0 — the system surfaces a relevant lead precedent at rank 1 for nearly every question — but R@10 ranges 0.5–1.0: for broad questions with many co-relevant precedents (e.g. נטרול תמ"א 38 = 5 relevant → R@10 0.60; שמאי מכריע = 2 → 0.50) some co-relevant rulings miss the top-10. Lead-precedent retrieval is strong; exhaustive multi-precedent recall is the gap.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-05-31 15:57:45 +00:00
parent eef04b0f09
commit 7161c3d010
2 changed files with 29 additions and 20 deletions
--- a/data/eval/baseline.json
+++ b/data/eval/baseline.json
@@ -1,5 +1,5 @@
 {
-  "gold_size": 77,
+  "gold_size": 86,
  "retrieval_config": {
    "MULTIMODAL_ENABLED": true,
    "VOYAGE_RERANK_ENABLED": true,
@@ -9,13 +9,13 @@
    "BM25_HYBRID_ENABLED": true
  },
  "overall": {
-    "P@5": 0.1922,
-    "R@5": 0.9351,
-    "nDCG@5": 0.8545,
-    "P@10": 0.1013,
-    "R@10": 0.987,
-    "nDCG@10": 0.8718,
-    "MRR": 0.8367
+    "P@5": 0.214,
+    "R@5": 0.899,
+    "nDCG@5": 0.8311,
+    "P@10": 0.1163,
+    "R@10": 0.9649,
+    "nDCG@10": 0.8554,
+    "MRR": 0.8482
  },
  "by_corpus": {
    "internal_decisions": {
@@ -24,17 +24,17 @@
      "nDCG@5": 0.887,
      "P@10": 0.1019,
      "R@10": 1.0,
-      "nDCG@10": 0.899,
-      "MRR": 0.871
+      "nDCG@10": 0.8994,
+      "MRR": 0.8713
    },
    "precedent_library": {
-      "P@5": 0.1826,
-      "R@5": 0.8696,
-      "nDCG@5": 0.778,
-      "P@10": 0.1,
-      "R@10": 0.9565,
-      "nDCG@10": 0.808,
-      "MRR": 0.7562
+      "P@5": 0.2438,
+      "R@5": 0.7911,
+      "nDCG@5": 0.7367,
+      "P@10": 0.1406,
+      "R@10": 0.9057,
+      "nDCG@10": 0.7813,
+      "MRR": 0.8092
    }
  },
  "by_practice_area": {
@@ -44,8 +44,8 @@
      "nDCG@5": 0.8595,
      "P@10": 0.1,
      "R@10": 0.9744,
-      "nDCG@10": 0.8761,
-      "MRR": 0.8432
+      "nDCG@10": 0.8766,
+      "MRR": 0.8437
    },
    "compensation_197": {
      "P@5": 0.2,
@@ -66,5 +66,5 @@
      "MRR": 0.8346
    }
  },
-  "generated_at": "20260531T154736Z"
+  "generated_at": "20260531T155717Z"
 }