feat(retrieval): add voyage rerank-2 cross-encoder stage (feature flag)
All checks were successful
Build & Deploy / build-and-deploy (push) Successful in 1m29s

Stage B of voyage-upgrades-plan rewritten: instead of context-3 (for which
4 POCs showed inconsistent improvement), add a cross-encoder rerank
layer on top of voyage-3. Default off (VOYAGE_RERANK_ENABLED=false).

POC validation (785-doc corpus, 12 queries, claude-haiku-4-5 judge):
- mean@3 +4.5% (4.306 → 4.500)
- practical-category queries +11.6% (3.78 → 4.22)
- latency +702ms per query
- no schema change, no re-embed, no double storage

Plumbing:
- config: VOYAGE_RERANK_ENABLED / _MODEL / _FETCH_K env vars
- embeddings.voyage_rerank() wraps voyageai client.rerank
- services/rerank.py: maybe_rerank() helper — fetches FETCH_K candidates
  via the bi-encoder then reranks to top-K. Fail-open if Voyage rerank is
  unavailable.
- tools/search.py: search_decisions, search_case_documents,
  find_similar_cases all wrapped
- services/precedent_library.search_library wrapped

Smoke-tested locally with flag on/off — produces expected behaviour and
latency profile. Ready for production rollout via Coolify env flip after
deploy.

POCs (kept under scripts/ for reference):
- voyage_context3_poc{_long}.py — context-3 evaluation (rejected)
- voyage_multimodal_poc.py — multimodal-3 (stage C, deferred)
- voyage_rerank_judge_poc.py — single-case rerank benchmark
- voyage_rerank_corpus_poc.py — full-corpus rerank validation

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-03 18:43:41 +00:00
parent 688ba37d9c
commit 26c3fddf41
13 changed files with 1578 additions and 100 deletions

View File

@@ -0,0 +1,213 @@
"""POC #3: voyage-3 (text) vs voyage-multimodal-3.5 (page images) on a
real appraisal PDF (89 pages, full of tables / signatures / numerical
data — the corpus class where multimodal should help most).
Document under test:
baf10153-d2fc-4481-b250-9fe87440ce69
"נספח - שומה מכרעת (אבלין דוידזון שמאמא) - 15.09.24"
case 8137-24, 89 pages, 2.1 MB
The pipeline:
1. Pull the existing voyage-3 text-chunk embeddings from `document_chunks`.
2. Render each PDF page → PNG (PyMuPDF, dpi=144).
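3. Embed all pages via voyage-multimodal-3 (the model the script actually calls; 3.5 was the intended target but its availability is unconfirmed).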
4. Run benchmark queries (mix of generic + table-specific + visual)
against both: text top-K and page top-K.
The comparison is *qualitative* — text and image embeddings are
different "spaces" returning different ID types (chunk_id vs page_num).
What we look at is whether image-based retrieval surfaces tables,
signatures, or numerical data that text-only OCR loses.
No DB writes.
Usage:
/home/chaim/legal-ai/mcp-server/.venv/bin/python \\
/home/chaim/legal-ai/scripts/voyage_multimodal_poc.py
"""
from __future__ import annotations
import asyncio
import io
import math
import os
import time
ENV_PATH = os.path.expanduser("~/.env")


def load_env_file(path: str) -> None:
    """Load ``KEY=VALUE`` pairs from *path* into ``os.environ``.

    Blank lines, ``#`` comment lines and lines without ``=`` are ignored.
    Variables that are already set in the environment are left untouched,
    so real environment variables always win over the file. A missing
    file is silently skipped.
    """
    if not os.path.isfile(path):
        return
    # Explicit encoding: the default is locale-dependent and the file may
    # contain non-ASCII values.
    with open(path, encoding="utf-8") as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line or line.startswith("#") or "=" not in line:
                continue
            key, value = line.split("=", 1)
            # Tolerate "KEY = value" by trimming around the separator.
            key = key.strip()
            if not key:
                # "=value" has no usable name; skip instead of passing an
                # empty key to os.environ.
                continue
            os.environ.setdefault(key, value.strip())


# Must run before the third-party imports / main() so that the API key and
# database password are available from the environment.
load_env_file(ENV_PATH)
import asyncpg # noqa: E402
import voyageai # noqa: E402
import fitz # PyMuPDF # noqa: E402
from PIL import Image # noqa: E402
# UUID of the document under test; used to select its rows from
# `document_chunks`.
DOCUMENT_ID = "baf10153-d2fc-4481-b250-9fe87440ce69"
# Absolute path of the original PDF that is rendered page by page.
PDF_PATH = (
    "/home/chaim/legal-ai/data/cases/8137-24/documents/originals/"
    "נספח - שומה מכרעת (אבלין דוידזון שמאמא) - 15.09.24.pdf"
)
# Model used to embed the text queries (the chunk embeddings stored in the
# DB are described as voyage-3 as well).
TEXT_MODEL = "voyage-3"
# Multimodal model name for the page-image side. The module docstring
# targets 3.5; this is pinned to multimodal-3 until 3.5 is confirmed.
MULTIMODAL_MODEL = "voyage-multimodal-3" # check supported: 3.5 may not exist yet
# Resolution (dots per inch) at which PDF pages are rasterised.
DPI = 144
# voyage-multimodal: max 1000 inputs/call, 320M pixels/call (rough),
# so 89 pages at 1240×1750 ≈ 192M pixels = single call.
# Benchmark queries (Hebrew), grouped by the kind of content they target.
QUERIES = [
    # generic-textual (both should handle)
    "שיטת ההיוון בשומה",
    "מתודולוגיית הערכת שווי",
    # table/numerical (multimodal should help)
    "טבלת השוואת ערכים לפני ואחרי התכנית",
    "שווי המקרקעין במצב הקודם",
    "שווי המקרקעין במצב החדש",
    "ירידת ערך באחוזים",
    # visual elements (text-only loses)
    "חתימת השמאי",
    "תרשים גוש וחלקה",
    "מפת מיקום הנכס",
    # context-heavy
    "מסקנת השמאי המכריע",
    "עקרון הצפיפות בתכנית",
]
def cosine(a: list[float], b: list[float]) -> float:
    """Return the cosine similarity of vectors *a* and *b*.

    The result is 0.0 when either vector has zero length (including the
    empty vector), which avoids a division by zero.
    """
    norm_a = math.sqrt(sum(v * v for v in a))
    norm_b = math.sqrt(sum(w * w for w in b))
    # Guard clause: a zero norm makes the similarity undefined.
    if not norm_a or not norm_b:
        return 0.0
    inner = sum(p * q for p, q in zip(a, b))
    return inner / (norm_a * norm_b)
def parse_pgvector(s: str) -> list[float]:
    """Parse the text form of a pgvector value (``"[0.1,0.2,...]"``).

    Surrounding whitespace is tolerated, and an empty vector (``"[]"`` or
    an empty string) yields an empty list instead of raising on
    ``float("")``.

    Raises:
        ValueError: if any component is not a valid float literal.
    """
    # Trim outer whitespace first so the brackets are actually at the ends.
    body = s.strip().strip("[]").strip()
    if not body:
        return []
    return [float(component) for component in body.split(",")]
def render_pdf_pages(pdf_path: str, dpi: int) -> list[Image.Image]:
    """Render each page of the PDF at *pdf_path* to an RGB ``PIL.Image``.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF.
        dpi: Rasterisation resolution passed to ``page.get_pixmap``.

    Returns:
        One image per page, in page order.
    """
    images: list[Image.Image] = []
    # The context manager closes the document even if rendering a page
    # raises, so the file handle is never leaked.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            png_bytes = page.get_pixmap(dpi=dpi).tobytes("png")
            # Round-trip through PNG bytes to obtain a PIL image, then
            # normalise the colour mode to RGB.
            images.append(Image.open(io.BytesIO(png_bytes)).convert("RGB"))
    return images
async def main():
    """Run the text-vs-multimodal retrieval comparison and print the results.

    Steps:
      1. Render the PDF at ``PDF_PATH`` to one image per page.
      2. Load the stored text-chunk embeddings for ``DOCUMENT_ID`` from
         ``document_chunks`` (read-only; nothing is written to the DB).
      3. Embed the page images with ``MULTIMODAL_MODEL``.
      4. For every entry in ``QUERIES`` print the top-5 text chunks and the
         top-5 pages by cosine similarity, followed by a short summary.

    Raises:
        KeyError: if ``VOYAGE_API_KEY`` or ``POSTGRES_PASSWORD`` is unset.
        RuntimeError: if the number of returned embeddings differs from the
            number of rendered pages.
    """
    api_key = os.environ["VOYAGE_API_KEY"]
    pg_pw = os.environ["POSTGRES_PASSWORD"]
    voyage = voyageai.Client(api_key=api_key)
    top_k = 5

    # 1. Render PDF pages
    print(f"[render] {PDF_PATH}")
    start = time.time()
    images = render_pdf_pages(PDF_PATH, DPI)
    render_elapsed = time.time() - start
    if not images:
        # Nothing to embed; also avoids indexing images[0] below.
        print("[render] PDF has no pages — nothing to embed")
        return
    print(f"[render] {len(images)} pages in {render_elapsed:.1f}s, "
          f"{images[0].size}px @ {DPI}dpi")

    # 2. Pull existing text chunks + voyage-3 embeddings. The pool is only
    #    needed for this one query, so it is closed right away (and on error).
    pool = await asyncpg.create_pool(
        host="127.0.0.1", port=5433, user="legal_ai",
        password=pg_pw, database="legal_ai",
        min_size=1, max_size=2,
    )
    try:
        all_rows = await pool.fetch("""
            SELECT id, chunk_index, page_number, content,
                   embedding::text AS emb_text
            FROM document_chunks
            WHERE document_id = $1
            ORDER BY chunk_index
        """, DOCUMENT_ID)
    finally:
        await pool.close()

    # Chunks whose embedding is NULL cannot be scored; drop them instead of
    # crashing while parsing.
    rows = [r for r in all_rows if r["emb_text"] is not None]
    print(f"[text] {len(rows)} text chunks loaded (voyage-3 in DB)")
    if len(rows) != len(all_rows):
        print(f"[text] skipped {len(all_rows) - len(rows)} chunks "
              f"without an embedding")
    text_contents = [r["content"] for r in rows]
    text_chunk_pages = [r["page_number"] for r in rows]
    text_embs = [parse_pgvector(r["emb_text"]) for r in rows]

    # 3. Multimodal embed of all pages in a single call. There is no
    #    fallback model: an invalid request is reported and the run stops.
    target_model = MULTIMODAL_MODEL
    print(f"[multimodal] embedding {len(images)} pages with {target_model}")
    start = time.time()
    try:
        mm_result = voyage.multimodal_embed(
            inputs=[[img] for img in images],  # list of single-image inputs
            model=target_model,
            input_type="document",
            truncation=True,
        )
    except voyageai.error.InvalidRequestError as e:
        print(f" [error] {e}")
        return
    embed_elapsed = time.time() - start
    image_embs = mm_result.embeddings
    # Usage counters are read defensively; "?" is printed when absent.
    mm_tokens = getattr(mm_result, "total_tokens", "?")
    image_pixels = getattr(mm_result, "image_pixels", "?")
    text_tokens_mm = getattr(mm_result, "text_tokens", "?")
    print(f"[multimodal] done in {embed_elapsed:.1f}s — "
          f"total_tokens={mm_tokens} text_tokens={text_tokens_mm} "
          f"image_pixels={image_pixels}")
    # Explicit check rather than `assert`, which is stripped under -O.
    if len(image_embs) != len(images):
        raise RuntimeError("embedding count mismatch")
    print(f"[multimodal] embedding dim = {len(image_embs[0])}")

    # 4. Run queries
    print("\n" + "=" * 100)
    print(f"QUERY RESULTS — top-{top_k} chunks (text/voyage-3) "
          f"vs top-{top_k} pages (multimodal)")
    print("=" * 100)
    for q_idx, query in enumerate(QUERIES, 1):
        # Text-side: voyage-3 query embedding
        q_text = voyage.embed(
            [query], model=TEXT_MODEL, input_type="query"
        ).embeddings[0]
        # Multimodal-side: same model as the pages, query input_type
        q_mm = voyage.multimodal_embed(
            inputs=[[query]],
            model=target_model,
            input_type="query",
        ).embeddings[0]
        text_scores = sorted(
            [(cosine(q_text, e), i) for i, e in enumerate(text_embs)],
            reverse=True,
        )[:top_k]
        mm_scores = sorted(
            [(cosine(q_mm, e), i) for i, e in enumerate(image_embs)],
            reverse=True,
        )[:top_k]
        print(f"\n[Q{q_idx}] {query}")
        print(f" --- text (voyage-3) top-{top_k} ---")
        for s, i in text_scores:
            page = text_chunk_pages[i] or "?"
            preview = text_contents[i].replace("\n", " ").strip()[:70]
            print(f" {s:.3f} page={page:>3} chunk={i:>3} {preview}")
        print(f" --- multimodal (image-only) top-{top_k} ---")
        for s, i in mm_scores:
            # Page images are embedded in page order, so index + 1 is the
            # 1-based page number.
            print(f" {s:.3f} page={i+1:>3} (image)")

    # Token / cost summary
    print("\n" + "=" * 100)
    print("SUMMARY")
    print("=" * 100)
    print(f"PDF: {len(images)} pages @ {DPI}dpi → {target_model}")
    print(f"Total multimodal tokens: {mm_tokens}")
    print(f"Embedding dim: {len(image_embs[0])}")
    print(f"Time: {embed_elapsed:.1f}s for full doc")


if __name__ == "__main__":
    asyncio.run(main())