feat(retrieval): add voyage-multimodal-3 page-image embeddings (feature flag)
All checks were successful
Build & Deploy / build-and-deploy (push) Successful in 1m50s

Stage C: per-page image embeddings via voyage-multimodal-3 + hybrid
text+image search. Off by default; enable with MULTIMODAL_ENABLED=true.

- Schema V9: document_image_embeddings + precedent_image_embeddings
  (vector(1024), page_number, image_thumbnail_path)
- extractor.render_pages_for_multimodal renders PDF pages at
  MULTIMODAL_DPI (144) for embedding + JPEG thumbnails at
  MULTIMODAL_THUMB_DPI (96) for UI preview, in one pass
- embeddings.embed_images calls voyage-multimodal-3 in 50-page batches
- services/hybrid_search.py orchestrator: rerank is applied to the text
  side first (rerank-2 is text-only); the image side is scored by cosine
  similarity; the two sides are combined in a weighted merge with
  text_weight 0.65 (env-tunable); image-only pages surface as
  match_type='image' so dense scanned content still appears
- processor.process_document and precedent_library.ingest_precedent are
  gated by the flag; a multimodal failure is non-fatal
- scripts/multimodal_backfill.py — idempotent per-case CLI to embed
  existing documents without re-extracting text

Validated locally on a 5-page response brief: render 0.31s, embed 8.32s,
hybrid merge surfaces image rows correctly. Production rollout starts
with flag=false (no behavior change), then per-case A/B.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-03 19:24:52 +00:00
parent b9cdcf980d
commit 242f668319
10 changed files with 1038 additions and 40 deletions

View File

@@ -6,7 +6,7 @@ import json
import logging
from uuid import UUID
from legal_mcp.services import db, embeddings, rerank
from legal_mcp.services import db, embeddings, hybrid_search
logger = logging.getLogger(__name__)
@@ -43,9 +43,9 @@ async def search_decisions(
)
query_emb = await embeddings.embed_query(query)
results = await rerank.maybe_rerank(
results = await hybrid_search.search_documents_hybrid(
query=query,
base_search=lambda **kw: db.search_similar(query_embedding=query_emb, **kw),
query_text_embedding=query_emb,
limit=limit,
section_type=section_type or None,
practice_area=practice_area or None,
@@ -59,11 +59,13 @@ async def search_decisions(
for r in results:
formatted.append({
"score": round(float(r["score"]), 4),
"case_number": r["case_number"],
"document": r["document_title"],
"section": r["section_type"],
"page": r["page_number"],
"content": r["content"],
"case_number": r.get("case_number"),
"document": r.get("document_title"),
"section": r.get("section_type"),
"page": r.get("page_number"),
"content": r.get("content", ""),
"match_type": r.get("match_type", "text"),
"image_thumbnail": r.get("image_thumbnail_path"),
})
return json.dumps(formatted, ensure_ascii=False, indent=2)
@@ -87,9 +89,9 @@ async def search_case_documents(
query_emb = await embeddings.embed_query(query)
# Restricted to case_id — practice_area filter would be redundant.
results = await rerank.maybe_rerank(
results = await hybrid_search.search_documents_hybrid(
query=query,
base_search=lambda **kw: db.search_similar(query_embedding=query_emb, **kw),
query_text_embedding=query_emb,
limit=limit,
case_id=UUID(case["id"]),
)
@@ -101,10 +103,12 @@ async def search_case_documents(
for r in results:
formatted.append({
"score": round(float(r["score"]), 4),
"document": r["document_title"],
"section": r["section_type"],
"page": r["page_number"],
"content": r["content"],
"document": r.get("document_title"),
"section": r.get("section_type"),
"page": r.get("page_number"),
"content": r.get("content", ""),
"match_type": r.get("match_type", "text"),
"image_thumbnail": r.get("image_thumbnail_path"),
})
return json.dumps(formatted, ensure_ascii=False, indent=2)
@@ -139,12 +143,11 @@ async def find_similar_cases(
)
query_emb = await embeddings.embed_query(description)
# Use description as the query text for rerank too.
# Note: even with rerank we ask for ``limit*3`` so the dedup-by-case
# Even with rerank we ask for ``limit*3`` so the dedup-by-case
# step downstream still has enough rows to pick the best per case.
results = await rerank.maybe_rerank(
results = await hybrid_search.search_documents_hybrid(
query=description,
base_search=lambda **kw: db.search_similar(query_embedding=query_emb, **kw),
query_text_embedding=query_emb,
limit=limit * 3,
practice_area=practice_area or None,
appeal_subtype=appeal_subtype or None,
@@ -153,14 +156,16 @@ async def find_similar_cases(
if not results:
return "לא נמצאו תיקים דומים."
# Deduplicate by case_number, keep best score per case
# Deduplicate by case_number, keep best score per case.
# image-only rows still carry case_number from the join.
seen_cases = {}
for r in results:
cn = r["case_number"]
cn = r.get("case_number")
if not cn:
continue
if cn not in seen_cases or r["score"] > seen_cases[cn]["score"]:
seen_cases[cn] = r
# Sort by score and limit
top_cases = sorted(seen_cases.values(), key=lambda x: x["score"], reverse=True)[:limit]
formatted = []
@@ -168,8 +173,9 @@ async def find_similar_cases(
formatted.append({
"score": round(float(r["score"]), 4),
"case_number": r["case_number"],
"document": r["document_title"],
"relevant_section": r["content"][:500],
"document": r.get("document_title"),
"relevant_section": (r.get("content") or "")[:500],
"match_type": r.get("match_type", "text"),
})
return json.dumps(formatted, ensure_ascii=False, indent=2)