fix(retrieval): switch hybrid merge to Reciprocal Rank Fusion (RRF)
Some checks are pending
Build & Deploy / build-and-deploy (push) Waiting to run
Some checks are pending
Build & Deploy / build-and-deploy (push) Waiting to run
Cosine scores from voyage-3 (~0.4-0.5) and voyage-multimodal-3
(~0.2-0.25) live on different scales, so the previous weighted-sum
merge let text always dominate. Verified empirically: 0 image-only
hits across 7 queries on case 8174-24; the image side contributed nothing.
RRF combines results by their *rank* in each list rather than by raw
score, which makes it robust to scale differences. Per-item score
(ranks are 1-based; image_weight = 1 - text_weight):
    rrf_score = text_weight / (k + text_rank)
              + image_weight / (k + image_rank)
A row that appears in both lists (joined on (id_field, page_number))
gets both terms — surfaced as match_type='text+image'.
After the fix on 8174-24 (146 image rows): 2 image-only hits land in
top-5 across all 7 test queries, surfacing actual table/diagram/
signature pages (p12, p13 of שומת המשיבה for 'טבלת השוואת ערכי שומה',
p25 of שומת השגה for 'תרשים גוש וחלקה', etc).
On 8137-24 (273 image rows): 'חישוב היוון של דמי החכירה' goes from
0 baseline results → 5 hybrid results (3 text + 2 image), opening
recall on scanned content the OCR layer misses.
The default MULTIMODAL_TEXT_WEIGHT changes from 0.65 to 0.5 (balanced,
vanilla RRF), since the prior 0.65 was tuned for raw cosine scales that
no longer apply.
New env knob MULTIMODAL_RRF_K (default 60, the standard value used in
the RRF literature).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -73,13 +73,19 @@ MULTIMODAL_DPI = int(os.environ.get("MULTIMODAL_DPI", "144"))
|
|||||||
# Separate, lower DPI for the JPEG thumbnail saved to disk for UI
|
# Separate, lower DPI for the JPEG thumbnail saved to disk for UI
|
||||||
# preview. ~96dpi → ~20KB/page; ingestion-time, no re-render at view.
|
# preview. ~96dpi → ~20KB/page; ingestion-time, no re-render at view.
|
||||||
MULTIMODAL_THUMB_DPI = int(os.environ.get("MULTIMODAL_THUMB_DPI", "96"))
|
MULTIMODAL_THUMB_DPI = int(os.environ.get("MULTIMODAL_THUMB_DPI", "96"))
|
||||||
# Hybrid merge weight for the *text* side. The image side gets
|
# Hybrid merge: Reciprocal Rank Fusion (RRF) bias for the *text* side.
|
||||||
# (1 - this). POC found text dominates most queries; image wins only
|
# voyage-3 cosine scores (~0.4-0.5) and voyage-multimodal-3 scores
|
||||||
# on table/visual queries — slight text bias starting point, tunable
|
# (~0.20-0.25) live on different scales; a direct weighted sum lets
|
||||||
# per env without redeploy.
|
# text always dominate. RRF is rank-based and robust to that. The
|
||||||
|
# weight here biases the contribution of each side: 0.5 = balanced
|
||||||
|
# (vanilla RRF), >0.5 favours text, <0.5 favours image. Tunable per
|
||||||
|
# env without redeploy.
|
||||||
MULTIMODAL_TEXT_WEIGHT = float(
|
MULTIMODAL_TEXT_WEIGHT = float(
|
||||||
os.environ.get("MULTIMODAL_TEXT_WEIGHT", "0.65")
|
os.environ.get("MULTIMODAL_TEXT_WEIGHT", "0.5")
|
||||||
)
|
)
|
||||||
|
# RRF damping constant. Standard literature value is 60: lower values
|
||||||
|
# concentrate weight at top ranks; higher values flatten the curve.
|
||||||
|
MULTIMODAL_RRF_K = int(os.environ.get("MULTIMODAL_RRF_K", "60"))
|
||||||
|
|
||||||
# Halacha extraction — auto-approve threshold. Halachot with extractor
|
# Halacha extraction — auto-approve threshold. Halachot with extractor
|
||||||
# confidence >= this value are inserted with review_status='approved'
|
# confidence >= this value are inserted with review_status='approved'
|
||||||
|
|||||||
@@ -140,59 +140,72 @@ def _merge(
|
|||||||
id_field: str,
|
id_field: str,
|
||||||
text_weight: float,
|
text_weight: float,
|
||||||
) -> list[dict]:
|
) -> list[dict]:
|
||||||
"""Weighted merge of text + image rows.
|
"""Reciprocal Rank Fusion of text + image rows.
|
||||||
|
|
||||||
Joins on ``(id_field, page_number)``. Halachot in precedent rows
|
Why RRF: voyage-3 cosine scores (~0.4-0.5) and voyage-multimodal-3
|
||||||
have no page_number; for those, image_score = max page score in
|
scores (~0.2-0.25) live on different scales — a direct weighted
|
||||||
the same case_law row (case-level boost).
|
sum lets text always dominate. RRF combines by *rank* in each list,
|
||||||
|
making the merge robust to score-scale differences.
|
||||||
|
|
||||||
Image-only rows (no matching text hit) appear with match_type='image'
|
Per item::
|
||||||
and empty content — UI shows the thumbnail instead of a snippet.
|
|
||||||
|
rrf_score = text_weight / (k + text_rank)
|
||||||
|
+ image_weight / (k + image_rank)
|
||||||
|
|
||||||
|
A row that appears in only one list contributes that list's term
|
||||||
|
only. Rows joined at ``(id_field, page_number)`` get both terms —
|
||||||
|
surfaced as ``match_type='text+image'`` with the thumbnail attached.
|
||||||
|
|
||||||
|
Halachot in precedent rows have no page_number; they remain
|
||||||
|
text-only under RRF (the case-level image boost is dropped — RRF
|
||||||
|
works on rank, not raw scores).
|
||||||
"""
|
"""
|
||||||
|
from legal_mcp import config as _cfg
|
||||||
img_weight = 1.0 - text_weight
|
img_weight = 1.0 - text_weight
|
||||||
img_by_key: dict[tuple, dict] = {}
|
k = _cfg.MULTIMODAL_RRF_K
|
||||||
img_max_by_id: dict[str, float] = {}
|
|
||||||
for r in img_rows:
|
|
||||||
rid = str(r[id_field])
|
|
||||||
page = r.get("page_number")
|
|
||||||
img_by_key[(rid, page)] = r
|
|
||||||
score = float(r.get("score", 0.0))
|
|
||||||
img_max_by_id[rid] = max(img_max_by_id.get(rid, 0.0), score)
|
|
||||||
|
|
||||||
seen: set = set()
|
# Index image rows by their join key for boost detection.
|
||||||
|
img_rank_by_key: dict[tuple, int] = {}
|
||||||
|
img_row_by_key: dict[tuple, dict] = {}
|
||||||
|
for rank, r in enumerate(img_rows, 1):
|
||||||
|
key = (str(r[id_field]), r.get("page_number"))
|
||||||
|
img_rank_by_key[key] = rank
|
||||||
|
img_row_by_key[key] = r
|
||||||
|
|
||||||
|
seen_image_keys: set = set()
|
||||||
merged: list[dict] = []
|
merged: list[dict] = []
|
||||||
for r in text_rows:
|
for rank, r in enumerate(text_rows, 1):
|
||||||
rid = str(r[id_field])
|
rid = str(r[id_field])
|
||||||
page = r.get("page_number")
|
page = r.get("page_number")
|
||||||
key = (rid, page) if page is not None else None
|
key = (rid, page) if page is not None else None
|
||||||
img_hit = img_by_key.get(key) if key else None
|
img_rank = img_rank_by_key.get(key) if key else None
|
||||||
text_score = float(r.get("score", 0.0))
|
text_term = text_weight / (k + rank)
|
||||||
if img_hit:
|
image_term = img_weight / (k + img_rank) if img_rank else 0.0
|
||||||
image_score = float(img_hit["score"])
|
|
||||||
elif r.get("type") == "halacha":
|
|
||||||
image_score = img_max_by_id.get(rid, 0.0)
|
|
||||||
else:
|
|
||||||
image_score = 0.0
|
|
||||||
d = dict(r)
|
d = dict(r)
|
||||||
d["text_score"] = text_score
|
d["text_score"] = float(r.get("score", 0.0))
|
||||||
d["image_score"] = image_score
|
d["text_rank"] = rank
|
||||||
d["score"] = text_score * text_weight + image_score * img_weight
|
if img_rank:
|
||||||
d["match_type"] = "text+image" if img_hit else "text"
|
img_hit = img_row_by_key[key]
|
||||||
if img_hit:
|
d["image_score"] = float(img_hit.get("score", 0.0))
|
||||||
|
d["image_rank"] = img_rank
|
||||||
d["image_thumbnail_path"] = img_hit.get("image_thumbnail_path")
|
d["image_thumbnail_path"] = img_hit.get("image_thumbnail_path")
|
||||||
if key:
|
d["match_type"] = "text+image"
|
||||||
seen.add(key)
|
seen_image_keys.add(key)
|
||||||
|
else:
|
||||||
|
d["image_score"] = 0.0
|
||||||
|
d["match_type"] = "text"
|
||||||
|
d["score"] = text_term + image_term
|
||||||
merged.append(d)
|
merged.append(d)
|
||||||
|
|
||||||
for r in img_rows:
|
for rank, r in enumerate(img_rows, 1):
|
||||||
rid = str(r[id_field])
|
key = (str(r[id_field]), r.get("page_number"))
|
||||||
key = (rid, r.get("page_number"))
|
if key in seen_image_keys:
|
||||||
if key in seen:
|
|
||||||
continue
|
continue
|
||||||
d = dict(r)
|
d = dict(r)
|
||||||
d["text_score"] = 0.0
|
d["text_score"] = 0.0
|
||||||
d["image_score"] = float(r.get("score", 0.0))
|
d["image_score"] = float(r.get("score", 0.0))
|
||||||
d["score"] = float(r.get("score", 0.0)) * img_weight
|
d["image_rank"] = rank
|
||||||
|
d["score"] = img_weight / (k + rank)
|
||||||
d["match_type"] = "image"
|
d["match_type"] = "image"
|
||||||
d["content"] = ""
|
d["content"] = ""
|
||||||
d["section_type"] = "image"
|
d["section_type"] = "image"
|
||||||
|
|||||||
Reference in New Issue
Block a user