Files
legal-ai/scripts/voyage_multimodal_poc.py
Chaim 26c3fddf41
All checks were successful
Build & Deploy / build-and-deploy (push) Successful in 1m29s
feat(retrieval): add voyage rerank-2 cross-encoder stage (feature flag)
Stage B of voyage-upgrades-plan rewritten: instead of context-3 (which
4 POCs showed inconsistent improvement), add a cross-encoder rerank
layer on top of voyage-3. Default off (VOYAGE_RERANK_ENABLED=false).

POC validation (785-doc corpus, 12 queries, claude-haiku-4-5 judge):
- mean@3 +4.5% (4.306 → 4.500)
- practical-category queries +11.6% (3.78 → 4.22)
- latency +702ms per query
- no schema change, no re-embed, no double storage

Plumbing:
- config: VOYAGE_RERANK_ENABLED / _MODEL / _FETCH_K env vars
- embeddings.voyage_rerank() wraps voyageai client.rerank
- services/rerank.py: maybe_rerank() helper — fetches FETCH_K candidates
  via the bi-encoder then reranks to top-K. Fail-open if Voyage rerank is
  unavailable.
- tools/search.py: search_decisions, search_case_documents,
  find_similar_cases all wrapped
- services/precedent_library.search_library wrapped

Smoke-tested locally with flag on/off — produces expected behaviour and
latency profile. Ready for production rollout via Coolify env flip after
deploy.

POCs (kept under scripts/ for reference):
- voyage_context3_poc{_long}.py — context-3 evaluation (rejected)
- voyage_multimodal_poc.py — multimodal-3 (stage C, deferred)
- voyage_rerank_judge_poc.py — single-case rerank benchmark
- voyage_rerank_corpus_poc.py — full-corpus rerank validation

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-03 18:43:41 +00:00

214 lines
7.3 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""POC #3: voyage-3 (text) vs voyage-multimodal-3.5 (page images) on a
real appraisal PDF (89 pages, full of tables / signatures / numerical
data — the corpus class where multimodal should help most).
Document under test:
baf10153-d2fc-4481-b250-9fe87440ce69
"נספח - שומה מכרעת (אבלין דוידזון שמאמא) - 15.09.24"
case 8137-24, 89 pages, 2.1 MB
The pipeline:
1. Pull the existing voyage-3 text-chunk embeddings from `document_chunks`.
2. Render each PDF page → PNG (PyMuPDF, dpi=144).
3. Embed all pages via the multimodal model (the code currently requests
voyage-multimodal-3, not 3.5 — see MULTIMODAL_MODEL).
4. Run benchmark queries (mix of generic + table-specific + visual)
against both: text top-K and page top-K.
The comparison is *qualitative* — text and image embeddings are
different "spaces" returning different ID types (chunk_id vs page_num).
What we look at is whether image-based retrieval surfaces tables,
signatures, or numerical data that text-only OCR loses.
No DB writes.
Usage:
/home/chaim/legal-ai/mcp-server/.venv/bin/python \\
/home/chaim/legal-ai/scripts/voyage_multimodal_poc.py
"""
from __future__ import annotations
import asyncio
import io
import math
import os
import time
ENV_PATH = os.path.expanduser("~/.env")


def load_env_file(path: str) -> None:
    """Load ``KEY=VALUE`` lines from *path* into ``os.environ``.

    Blank lines, lines starting with ``#`` and lines without ``=`` are
    ignored. Only the first ``=`` splits key from value, so values may
    themselves contain ``=``. Variables that are already set in the
    environment are left untouched (``setdefault``). A missing file is
    silently skipped.

    Args:
        path: Location of the env file to read.
    """
    if not os.path.isfile(path):
        return
    # Explicit encoding: do not depend on the process locale.
    with open(path, encoding="utf-8") as env_file:
        for raw_line in env_file:
            line = raw_line.strip()
            if not line or line.startswith("#") or "=" not in line:
                continue
            key, _, value = line.partition("=")
            # Tolerate "KEY = value"; without stripping, the variable
            # would be registered under the name "KEY " (trailing space).
            key = key.strip()
            if not key:
                # "=value" has no usable name; an empty key is rejected
                # by os.environ, so skip it instead of crashing.
                continue
            os.environ.setdefault(key, value.strip())


# Must run before the imports below so the env values are already in place.
load_env_file(ENV_PATH)
import asyncpg # noqa: E402
import voyageai # noqa: E402
import fitz # PyMuPDF # noqa: E402
from PIL import Image # noqa: E402
# Document under test: the id used to look up its rows in `document_chunks`
# and the location of the original PDF on disk.
DOCUMENT_ID = "baf10153-d2fc-4481-b250-9fe87440ce69"
_ORIGINALS_DIR = "/home/chaim/legal-ai/data/cases/8137-24/documents/originals"
PDF_PATH = _ORIGINALS_DIR + "/" + "נספח - שומה מכרעת (אבלין דוידזון שמאמא) - 15.09.24.pdf"

# Embedding models for the two sides of the comparison.
TEXT_MODEL = "voyage-3"
MULTIMODAL_MODEL = "voyage-multimodal-3"  # check supported: 3.5 may not exist yet

# Resolution used when rasterising PDF pages.
DPI = 144
# Author's sizing note (not verified here): voyage-multimodal allows
# max 1000 inputs/call and roughly 320M pixels/call, so 89 pages at
# 1240×1750 ≈ 192M pixels should fit in a single call.

# Benchmark queries, grouped by what they are meant to probe.
_GENERIC_QUERIES = [
    # generic-textual (both should handle)
    "שיטת ההיוון בשומה",
    "מתודולוגיית הערכת שווי",
]
_TABLE_QUERIES = [
    # table/numerical (multimodal should help)
    "טבלת השוואת ערכים לפני ואחרי התכנית",
    "שווי המקרקעין במצב הקודם",
    "שווי המקרקעין במצב החדש",
    "ירידת ערך באחוזים",
]
_VISUAL_QUERIES = [
    # visual elements (text-only loses)
    "חתימת השמאי",
    "תרשים גוש וחלקה",
    "מפת מיקום הנכס",
]
_CONTEXT_QUERIES = [
    # context-heavy
    "מסקנת השמאי המכריע",
    "עקרון הצפיפות בתכנית",
]
QUERIES = _GENERIC_QUERIES + _TABLE_QUERIES + _VISUAL_QUERIES + _CONTEXT_QUERIES
def cosine(a: list[float], b: list[float]) -> float:
    """Return the cosine similarity between vectors *a* and *b*.

    The dot product is taken over ``zip(a, b)``, i.e. over the common
    prefix when the lengths differ, while each norm uses its full vector.
    If either vector has zero norm (including the empty vector) the
    result is ``0.0`` rather than a division error.
    """
    norm_a = math.sqrt(sum([x * x for x in a]))
    norm_b = math.sqrt(sum([y * y for y in b]))
    if not (norm_a and norm_b):
        return 0.0
    inner = sum([x * y for x, y in zip(a, b)])
    return inner / (norm_a * norm_b)
def parse_pgvector(s: str) -> list[float]:
    """Parse a pgvector text literal such as ``"[0.1,0.2]"`` into floats.

    Surrounding whitespace (around the literal, inside the brackets, or
    around individual components) is tolerated, and an empty vector
    (``"[]"`` or an empty string) yields ``[]`` instead of failing on
    ``float("")``.

    Args:
        s: Text form of the vector, as produced by ``embedding::text``.

    Returns:
        The vector components in order.

    Raises:
        ValueError: If a component is not a valid float literal.
    """
    inner = s.strip().strip("[]").strip()
    if not inner:
        return []
    return [float(part) for part in inner.split(",")]
def render_pdf_pages(pdf_path: str, dpi: int) -> list[Image.Image]:
    """Render each page of a PDF to a PIL image in RGB mode.

    Every page is rasterised with PyMuPDF at the requested resolution,
    encoded as PNG in memory, and decoded again with Pillow.

    Args:
        pdf_path: Path of the PDF file to open.
        dpi: Rendering resolution passed to ``page.get_pixmap``.

    Returns:
        One RGB image per page, in page order.
    """
    images: list[Image.Image] = []
    # The context manager closes the document even if rendering a page
    # raises, so the file handle is never leaked.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            pix = page.get_pixmap(dpi=dpi)
            png_bytes = pix.tobytes("png")
            # convert() forces the lazy PNG decode and returns a new image.
            images.append(Image.open(io.BytesIO(png_bytes)).convert("RGB"))
    return images
def _top_k(
    query_emb: list[float],
    candidate_embs: list[list[float]],
    k: int = 5,
) -> list[tuple[float, int]]:
    """Return the *k* best ``(cosine score, index)`` pairs, best first.

    Pairs are sorted in descending tuple order, so equal scores are
    ordered by the higher index first.
    """
    scored = [
        (cosine(query_emb, emb), idx)
        for idx, emb in enumerate(candidate_embs)
    ]
    return sorted(scored, reverse=True)[:k]


async def main():
    """Run the text-vs-multimodal retrieval comparison and print the results.

    Steps:
      1. Render the PDF at ``PDF_PATH`` into page images.
      2. Load the stored text chunks and their embeddings for
         ``DOCUMENT_ID`` from ``document_chunks``.
      3. Embed every page image with ``MULTIMODAL_MODEL``.
      4. For each entry in ``QUERIES`` print the top-5 text chunks and the
         top-5 pages by cosine similarity.

    Reads ``VOYAGE_API_KEY`` and ``POSTGRES_PASSWORD`` from the
    environment.

    Raises:
        KeyError: If a required environment variable is missing.
        RuntimeError: If the number of returned page embeddings does not
            match the number of rendered pages.
    """
    api_key = os.environ["VOYAGE_API_KEY"]
    pg_pw = os.environ["POSTGRES_PASSWORD"]
    voyage = voyageai.Client(api_key=api_key)

    # 1. Render PDF pages
    print(f"[render] {PDF_PATH}")
    start = time.time()
    images = render_pdf_pages(PDF_PATH, DPI)
    render_elapsed = time.time() - start
    if not images:
        # images[0] below would raise IndexError on an empty document.
        print("[render] PDF has no pages — nothing to embed")
        return
    print(f"[render] {len(images)} pages in {render_elapsed:.1f}s, "
          f"{images[0].size}px @ {DPI}dpi")

    # 2. Pull existing text chunks + voyage-3 embeddings
    pool = await asyncpg.create_pool(
        host="127.0.0.1", port=5433, user="legal_ai",
        password=pg_pw, database="legal_ai",
        min_size=1, max_size=2,
    )
    try:
        rows = await pool.fetch("""
            SELECT id, chunk_index, page_number, content,
                   embedding::text AS emb_text
            FROM document_chunks
            WHERE document_id = $1
            ORDER BY chunk_index
        """, DOCUMENT_ID)
    finally:
        # This is the only query; release the pool right away (and on
        # failure) instead of holding it through the embedding calls.
        await pool.close()
    print(f"[text] {len(rows)} text chunks loaded (voyage-3 in DB)")
    text_contents = [r["content"] for r in rows]
    text_chunk_pages = [r["page_number"] for r in rows]
    text_embs = [parse_pgvector(r["emb_text"]) for r in rows]

    # 3. Multimodal embed of all page images in one request
    print(f"[multimodal] embedding {len(images)} pages with {MULTIMODAL_MODEL}")
    start = time.time()
    try:
        mm_result = voyage.multimodal_embed(
            inputs=[[img] for img in images],  # list of single-image inputs
            model=MULTIMODAL_MODEL,
            input_type="document",
            truncation=True,
        )
    except voyageai.error.InvalidRequestError as e:
        print(f" [error] {e}")
        return
    embed_elapsed = time.time() - start
    image_embs = mm_result.embeddings
    # Usage counters are read defensively; "?" is shown when the client
    # result does not expose an attribute.
    mm_tokens = getattr(mm_result, "total_tokens", "?")
    image_pixels = getattr(mm_result, "image_pixels", "?")
    text_tokens_mm = getattr(mm_result, "text_tokens", "?")
    print(f"[multimodal] done in {embed_elapsed:.1f}s — "
          f"total_tokens={mm_tokens} text_tokens={text_tokens_mm} "
          f"image_pixels={image_pixels}")
    # Explicit check instead of `assert`, which is stripped under -O.
    if len(image_embs) != len(images):
        raise RuntimeError("embedding count mismatch")
    print(f"[multimodal] embedding dim = {len(image_embs[0])}")

    # 4. Run queries
    print("\n" + "=" * 100)
    print("QUERY RESULTS — top-5 chunks (text/voyage-3) "
          "vs top-5 pages (multimodal)")
    print("=" * 100)
    for q_idx, query in enumerate(QUERIES, 1):
        # Text-side: voyage-3 query embedding
        q_text = voyage.embed(
            [query], model=TEXT_MODEL, input_type="query"
        ).embeddings[0]
        # Multimodal-side: same model as the pages, query input_type
        q_mm = voyage.multimodal_embed(
            inputs=[[query]],
            model=MULTIMODAL_MODEL,
            input_type="query",
        ).embeddings[0]
        text_scores = _top_k(q_text, text_embs)
        mm_scores = _top_k(q_mm, image_embs)

        print(f"\n[Q{q_idx}] {query}")
        print(" --- text (voyage-3) top-5 ---")
        for s, i in text_scores:
            page = text_chunk_pages[i] if text_chunk_pages[i] else "?"
            preview = text_contents[i].replace("\n", " ").strip()[:70]
            print(f" {s:.3f} page={page:>3} chunk={i:>3} {preview}")
        print(" --- multimodal (image-only) top-5 ---")
        for s, i in mm_scores:
            # Page embeddings are in render order, so index + 1 is the page.
            print(f" {s:.3f} page={i+1:>3} (image)")

    # Token / cost summary
    print("\n" + "=" * 100)
    print("SUMMARY")
    print("=" * 100)
    print(f"PDF: {len(images)} pages @ {DPI}dpi → {MULTIMODAL_MODEL}")
    print(f"Total multimodal tokens: {mm_tokens}")
    print(f"Embedding dim: {len(image_embs[0])}")
    print(f"Time: {embed_elapsed:.1f}s for full doc")


if __name__ == "__main__":
    asyncio.run(main())