chore(#15): adopt MULTIMODAL_TEXT_WEIGHT=0.65 + close #15, open #80

An A/B eval (eval_retrieval.py, 86-query gold-set) showed the 0.5 default was
mis-tuned: the image side was weighted too heavily and dragged
precedent_library recall down from 0.971 to 0.885. A sweep over 0.5..0.75
showed that at 0.65 multimodal beats text-only on every overall metric AND
every corpus (R@5 0.994 vs 0.989, nDCG@5 0.960 vs 0.944, MRR 0.954 vs 0.936).
Dafna approved.

- MULTIMODAL_TEXT_WEIGHT=0.65 set in Coolify (legal-ai, runtime) + redeploy.
- baseline.json updated to the 0.65 config (future regression reference); the
  new baseline also records VOYAGE_RERANK_ENABLED=false (previously true).
- #15 done: its premise was stale — multimodal was already the default on
  110 docs, so the win came from tuning the weight, not from the backfill.
- #80 opened: the costly 140-doc legacy backfill is deferred until a targeted
  image-answer gold-set proves the table/image value proposition (untested
  here).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-06-03 08:45:06 +00:00
parent bb42aeeff4
commit 4debe9995b
2 changed files with 59 additions and 44 deletions

View File

@@ -2,50 +2,50 @@
"gold_size": 86,
"retrieval_config": {
"MULTIMODAL_ENABLED": true,
"VOYAGE_RERANK_ENABLED": true,
"VOYAGE_RERANK_ENABLED": false,
"VOYAGE_MODEL": "voyage-3",
"MULTIMODAL_TEXT_WEIGHT": 0.5,
"MULTIMODAL_TEXT_WEIGHT": 0.65,
"MULTIMODAL_RRF_K": 60,
"BM25_HYBRID_ENABLED": true
},
"overall": {
"P@5": 0.214,
"R@5": 0.899,
"nDCG@5": 0.8311,
"P@10": 0.1163,
"R@10": 0.9649,
"nDCG@10": 0.8554,
"MRR": 0.8482
"P@5": 0.2465,
"R@5": 0.9938,
"nDCG@5": 0.9597,
"P@10": 0.1244,
"R@10": 0.9961,
"nDCG@10": 0.9611,
"MRR": 0.9535
},
"by_corpus": {
"internal_decisions": {
"P@5": 0.1963,
"R@5": 0.963,
"nDCG@5": 0.887,
"P@5": 0.2037,
"R@5": 1.0,
"nDCG@5": 0.978,
"P@10": 0.1019,
"R@10": 1.0,
"nDCG@10": 0.8994,
"MRR": 0.8713
"nDCG@10": 0.978,
"MRR": 0.9722
},
"precedent_library": {
"P@5": 0.2438,
"R@5": 0.7911,
"nDCG@5": 0.7367,
"P@10": 0.1406,
"R@10": 0.9057,
"nDCG@10": 0.7813,
"MRR": 0.8092
"P@5": 0.3188,
"R@5": 0.9833,
"nDCG@5": 0.9288,
"P@10": 0.1625,
"R@10": 0.9896,
"nDCG@10": 0.9326,
"MRR": 0.9219
}
},
"by_practice_area": {
"betterment_levy": {
"P@5": 0.1897,
"R@5": 0.9231,
"nDCG@5": 0.8595,
"P@10": 0.1,
"R@10": 0.9744,
"nDCG@10": 0.8766,
"MRR": 0.8437
"P@5": 0.2051,
"R@5": 1.0,
"nDCG@5": 0.9621,
"P@10": 0.1026,
"R@10": 1.0,
"nDCG@10": 0.9621,
"MRR": 0.9487
},
"compensation_197": {
"P@5": 0.2,
@@ -57,14 +57,14 @@
"MRR": 1.0
},
"rishuy_uvniya": {
"P@5": 0.2,
"R@5": 0.9706,
"nDCG@5": 0.861,
"P@5": 0.2059,
"R@5": 1.0,
"nDCG@5": 0.9976,
"P@10": 0.1029,
"R@10": 1.0,
"nDCG@10": 0.8708,
"MRR": 0.8346
"nDCG@10": 0.9976,
"MRR": 1.0
}
},
"generated_at": "20260531T155717Z"
"generated_at": "20260603T084350Z"
}