chore(eval): add 9 chair-approved semantic queries to gold-set (FU-5)

The gold-set previously held 77 known-item probes (query=case_name). This commit
adds 9 chair-approved SEMANTIC queries (S1–S9): each row is a real legal
question, and its relevant set is the precedents that should surface (drawn from
subject_tags, chair-confirmed). These test what matters: whether retrieval
answers a legal issue, not just whether it finds a case by name. The new rows
are marked source='chair' (preserved across re-bootstrap). practice_area is left
empty so the filter never excludes a cross-tagged precedent (s.197 rulings sit
under betterment_levy).

Baseline now 86 queries. Finding from the 9 semantic queries: MRR ≈ 1.0 — the
system surfaces a relevant lead precedent at rank 1 for nearly every question —
but R@10 ranges 0.5–1.0: for broad questions with multiple co-relevant
precedents (e.g. נטרול תמ"א 38 = 5 relevant → R@10 0.60; שמאי מכריע = 2
relevant → R@10 0.50) some co-relevant rulings miss the top-10. Lead-precedent
retrieval is strong; exhaustive multi-precedent recall is the gap.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-31 15:57:45 +00:00
parent eef04b0f09
commit 7161c3d010
2 changed files with 29 additions and 20 deletions

View File

@@ -1,5 +1,5 @@
{
"gold_size": 77,
"gold_size": 86,
"retrieval_config": {
"MULTIMODAL_ENABLED": true,
"VOYAGE_RERANK_ENABLED": true,
@@ -9,13 +9,13 @@
"BM25_HYBRID_ENABLED": true
},
"overall": {
"P@5": 0.1922,
"R@5": 0.9351,
"nDCG@5": 0.8545,
"P@10": 0.1013,
"R@10": 0.987,
"nDCG@10": 0.8718,
"MRR": 0.8367
"P@5": 0.214,
"R@5": 0.899,
"nDCG@5": 0.8311,
"P@10": 0.1163,
"R@10": 0.9649,
"nDCG@10": 0.8554,
"MRR": 0.8482
},
"by_corpus": {
"internal_decisions": {
@@ -24,17 +24,17 @@
"nDCG@5": 0.887,
"P@10": 0.1019,
"R@10": 1.0,
"nDCG@10": 0.899,
"MRR": 0.871
"nDCG@10": 0.8994,
"MRR": 0.8713
},
"precedent_library": {
"P@5": 0.1826,
"R@5": 0.8696,
"nDCG@5": 0.778,
"P@10": 0.1,
"R@10": 0.9565,
"nDCG@10": 0.808,
"MRR": 0.7562
"P@5": 0.2438,
"R@5": 0.7911,
"nDCG@5": 0.7367,
"P@10": 0.1406,
"R@10": 0.9057,
"nDCG@10": 0.7813,
"MRR": 0.8092
}
},
"by_practice_area": {
@@ -44,8 +44,8 @@
"nDCG@5": 0.8595,
"P@10": 0.1,
"R@10": 0.9744,
"nDCG@10": 0.8761,
"MRR": 0.8432
"nDCG@10": 0.8766,
"MRR": 0.8437
},
"compensation_197": {
"P@5": 0.2,
@@ -66,5 +66,5 @@
"MRR": 0.8346
}
},
"generated_at": "20260531T154736Z"
"generated_at": "20260531T155717Z"
}