All checks were successful
Build & Deploy / build-and-deploy (push) Successful in 1m29s
Stage B of voyage-upgrades-plan rewritten: instead of adopting context-3
(for which 4 POCs showed inconsistent improvement), add a cross-encoder
rerank layer on top of voyage-3. Default off (VOYAGE_RERANK_ENABLED=false).
POC validation (785-doc corpus, 12 queries, claude-haiku-4-5 judge):
- mean@3 +4.5% (4.306 → 4.500)
- practical-category queries +11.6% (3.78 → 4.22)
- latency +702ms per query
- no schema change, no re-embed, no double storage
Plumbing:
- config: VOYAGE_RERANK_ENABLED / _MODEL / _FETCH_K env vars
- embeddings.voyage_rerank() wraps voyageai client.rerank
- services/rerank.py: maybe_rerank() helper — fetches FETCH_K candidates
via the bi-encoder then reranks to top-K. Fail-open if Voyage rerank is
unavailable.
- tools/search.py: search_decisions, search_case_documents,
find_similar_cases all wrapped
- services/precedent_library.search_library wrapped
Smoke-tested locally with flag on/off — produces expected behaviour and
latency profile. Ready for production rollout via Coolify env flip after
deploy.
POCs (kept under scripts/ for reference):
- voyage_context3_poc{_long}.py — context-3 evaluation (rejected)
- voyage_multimodal_poc.py — multimodal-3 (stage C, deferred)
- voyage_rerank_judge_poc.py — single-case rerank benchmark
- voyage_rerank_corpus_poc.py — full-corpus rerank validation
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
214 lines
7.3 KiB
Python
214 lines
7.3 KiB
Python
"""POC #3: voyage-3 (text) vs voyage-multimodal-3.5 (page images) on a
|
||
real appraisal PDF (89 pages, full of tables / signatures / numerical
|
||
data — the corpus class where multimodal should help most).
|
||
|
||
Document under test:
|
||
baf10153-d2fc-4481-b250-9fe87440ce69
|
||
"נספח - שומה מכרעת (אבלין דוידזון שמאמא) - 15.09.24"
|
||
case 8137-24, 89 pages, 2.1 MB
|
||
|
||
The pipeline:
|
||
1. Pull the existing voyage-3 text-chunk embeddings from `document_chunks`.
|
||
2. Render each PDF page → PNG (PyMuPDF, dpi=144).
|
||
3. Embed all pages via MULTIMODAL_MODEL (currently voyage-multimodal-3; 3.5 was the original target).
|
||
4. Run benchmark queries (mix of generic + table-specific + visual)
|
||
against both: text top-K and page top-K.
|
||
|
||
The comparison is *qualitative* — text and image embeddings are
|
||
different "spaces" returning different ID types (chunk_id vs page_num).
|
||
What we look at is whether image-based retrieval surfaces tables,
|
||
signatures, or numerical data that text-only OCR loses.
|
||
|
||
No DB writes.
|
||
|
||
Usage:
|
||
/home/chaim/legal-ai/mcp-server/.venv/bin/python \\
|
||
/home/chaim/legal-ai/scripts/voyage_multimodal_poc.py
|
||
"""
|
||
from __future__ import annotations
|
||
|
||
import asyncio
|
||
import io
|
||
import math
|
||
import os
|
||
import time
|
||
|
||
ENV_PATH = os.path.expanduser("~/.env")


def load_env_file(path: str) -> None:
    """Load ``KEY=VALUE`` lines from *path* into ``os.environ``.

    Existing environment variables win: values are applied with
    ``setdefault`` and never overwrite. Blank lines, ``#`` comments and
    lines without ``=`` are skipped. A missing file is a no-op.

    Args:
        path: Location of the dotenv-style file.
    """
    if not os.path.isfile(path):
        return
    # Explicit encoding so parsing does not depend on the process locale.
    with open(path, encoding="utf-8") as env_file:
        for raw in env_file:
            entry = raw.strip()
            if not entry or entry.startswith("#") or "=" not in entry:
                continue
            key, _, value = entry.partition("=")
            # "KEY = value" must define "KEY", not "KEY " with a padded value.
            key = key.strip()
            value = value.strip()
            # Drop one pair of matching surrounding quotes (KEY="value").
            if len(value) >= 2 and value[0] == value[-1] and value[0] in "\"'":
                value = value[1:-1]
            # A line such as "=value" has no usable name; an empty key is
            # rejected by os.environ, so skip it instead of crashing.
            if key:
                os.environ.setdefault(key, value)


load_env_file(ENV_PATH)
|
||
|
||
import asyncpg # noqa: E402
|
||
import voyageai # noqa: E402
|
||
import fitz # PyMuPDF # noqa: E402
|
||
from PIL import Image # noqa: E402
|
||
|
||
|
||
# --- Fixed inputs for this POC run -------------------------------------

# Embedding models compared by the benchmark.
TEXT_MODEL = "voyage-3"
MULTIMODAL_MODEL = "voyage-multimodal-3"  # check supported: 3.5 may not exist yet

# Resolution used when rasterising PDF pages.
DPI = 144
# voyage-multimodal: max 1000 inputs/call, 320M pixels/call (rough),
# so 89 pages at 1240×1750 ≈ 192M pixels = single call.

# The document under test: its row id in the database and its file on disk.
DOCUMENT_ID = "baf10153-d2fc-4481-b250-9fe87440ce69"
_PDF_DIR = "/home/chaim/legal-ai/data/cases/8137-24/documents/originals/"
_PDF_NAME = "נספח - שומה מכרעת (אבלין דוידזון שמאמא) - 15.09.24.pdf"
PDF_PATH = _PDF_DIR + _PDF_NAME
|
||
|
||
# Benchmark queries, grouped by the retrieval behaviour each group probes.

# generic-textual (both should handle)
_GENERIC_TEXT_QUERIES = [
    "שיטת ההיוון בשומה",
    "מתודולוגיית הערכת שווי",
]

# table/numerical (multimodal should help)
_TABLE_QUERIES = [
    "טבלת השוואת ערכים לפני ואחרי התכנית",
    "שווי המקרקעין במצב הקודם",
    "שווי המקרקעין במצב החדש",
    "ירידת ערך באחוזים",
]

# visual elements (text-only loses)
_VISUAL_QUERIES = [
    "חתימת השמאי",
    "תרשים גוש וחלקה",
    "מפת מיקום הנכס",
]

# context-heavy
_CONTEXT_QUERIES = [
    "מסקנת השמאי המכריע",
    "עקרון הצפיפות בתכנית",
]

# Flat list in the same order the groups are declared above.
QUERIES = [
    *_GENERIC_TEXT_QUERIES,
    *_TABLE_QUERIES,
    *_VISUAL_QUERIES,
    *_CONTEXT_QUERIES,
]
|
||
|
||
|
||
def cosine(a: list[float], b: list[float]) -> float:
    """Return the cosine similarity of vectors *a* and *b*.

    Args:
        a: First vector.
        b: Second vector; must have the same length as *a*.

    Returns:
        ``dot(a, b) / (|a| * |b|)``, or ``0.0`` when either vector has zero
        norm (which includes two empty vectors).

    Raises:
        ValueError: If the vectors differ in length. ``zip`` would otherwise
            truncate the dot product while the norms still cover the full
            vectors, producing a number that is not a cosine of anything.
    """
    if len(a) != len(b):
        raise ValueError(
            f"cosine: dimension mismatch ({len(a)} vs {len(b)})"
        )
    dot = sum(x * y for x, y in zip(a, b))
    na = math.sqrt(sum(x * x for x in a))
    nb = math.sqrt(sum(y * y for y in b))
    if not na or not nb:
        return 0.0
    return dot / (na * nb)
|
||
|
||
|
||
def parse_pgvector(s: str) -> list[float]:
    """Parse a bracketed, comma-separated vector string into a list of floats.

    This is the shape produced by the ``embedding::text`` cast used in this
    script's query, e.g. ``"[0.1,0.2,0.3]"``.

    Args:
        s: Bracketed, comma-separated numbers. Surrounding whitespace is
            tolerated.

    Returns:
        The parsed components; an empty list for ``"[]"`` or a blank string.

    Raises:
        ValueError: If a component is not a valid float literal.
    """
    # Strip whitespace first so the bracket strip is not defeated by padding.
    body = s.strip().strip("[]").strip()
    if not body:
        # "".split(",") yields [""], and float("") would raise.
        return []
    return [float(part) for part in body.split(",")]
|
||
|
||
|
||
def render_pdf_pages(pdf_path: str, dpi: int) -> list[Image.Image]:
    """Render every page of a PDF to an RGB ``PIL.Image``.

    Args:
        pdf_path: Path of the PDF to open with PyMuPDF.
        dpi: Rasterisation resolution passed to ``Page.get_pixmap``.

    Returns:
        One image per page, in page order. Empty if the document has no
        pages.
    """
    images: list[Image.Image] = []
    # Context manager so the document handle is released even when a page
    # fails to render or decode.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            pix = page.get_pixmap(dpi=dpi)
            # Round-trip through PNG bytes, then normalise to RGB.
            png_bytes = pix.tobytes("png")
            images.append(Image.open(io.BytesIO(png_bytes)).convert("RGB"))
    return images
|
||
|
||
|
||
def _top_k(query_emb, embs, k=5):
    """Return the *k* best ``(cosine score, index)`` pairs for *query_emb*, best first."""
    return sorted(
        ((cosine(query_emb, e), i) for i, e in enumerate(embs)),
        reverse=True,
    )[:k]


async def main():
    """Run the text-vs-multimodal retrieval comparison and print the results.

    Steps:
        1. Render ``PDF_PATH`` to page images.
        2. Load the stored chunk rows (content, page number, embedding) for
           ``DOCUMENT_ID`` from ``document_chunks``.
        3. Embed every page image with ``MULTIMODAL_MODEL``.
        4. For each entry in ``QUERIES`` print the top-5 text chunks next to
           the top-5 pages, then a short summary.

    Only a ``SELECT`` is issued against the database.

    Raises:
        KeyError: If ``VOYAGE_API_KEY`` or ``POSTGRES_PASSWORD`` is not set.
        RuntimeError: If the number of page embeddings returned does not
            match the number of rendered pages.
    """
    api_key = os.environ["VOYAGE_API_KEY"]
    pg_pw = os.environ["POSTGRES_PASSWORD"]

    voyage = voyageai.Client(api_key=api_key)

    # 1. Render PDF pages
    print(f"[render] {PDF_PATH}")
    start = time.perf_counter()
    images = render_pdf_pages(PDF_PATH, DPI)
    render_elapsed = time.perf_counter() - start
    if not images:
        # Nothing to embed; stop before images[0] would raise IndexError.
        print("[render] no pages rendered — aborting")
        return
    print(f"[render] {len(images)} pages in {render_elapsed:.1f}s, "
          f"{images[0].size}px @ {DPI}dpi")

    # 2. Pull existing text chunks + voyage-3 embeddings
    pool = await asyncpg.create_pool(
        host="127.0.0.1", port=5433, user="legal_ai",
        password=pg_pw, database="legal_ai",
        min_size=1, max_size=2,
    )
    try:
        rows = await pool.fetch("""
            SELECT id, chunk_index, page_number, content,
                   embedding::text AS emb_text
            FROM document_chunks
            WHERE document_id = $1
            ORDER BY chunk_index
        """, DOCUMENT_ID)
    finally:
        # This is the only query, so release the pool now; it is closed even
        # if the fetch raises.
        await pool.close()
    print(f"[text] {len(rows)} text chunks loaded (voyage-3 in DB)")
    text_contents = [r["content"] for r in rows]
    text_chunk_pages = [r["page_number"] for r in rows]
    text_embs = [parse_pgvector(r["emb_text"]) for r in rows]

    # 3. Multimodal embed of every page image
    print(f"[multimodal] embedding {len(images)} pages with {MULTIMODAL_MODEL}…")
    start = time.perf_counter()
    try:
        mm_result = voyage.multimodal_embed(
            inputs=[[img] for img in images],  # list of single-image inputs
            model=MULTIMODAL_MODEL,
            input_type="document",
            truncation=True,
        )
    except voyageai.error.InvalidRequestError as e:
        print(f" [error] {e}")
        return
    embed_elapsed = time.perf_counter() - start
    image_embs = mm_result.embeddings
    # Usage counters are read defensively; "?" is shown when absent.
    mm_tokens = getattr(mm_result, "total_tokens", "?")
    image_pixels = getattr(mm_result, "image_pixels", "?")
    text_tokens_mm = getattr(mm_result, "text_tokens", "?")
    print(f"[multimodal] done in {embed_elapsed:.1f}s — "
          f"total_tokens={mm_tokens} text_tokens={text_tokens_mm} "
          f"image_pixels={image_pixels}")
    # Explicit raise rather than assert, which is stripped under -O.
    if len(image_embs) != len(images):
        raise RuntimeError("embedding count mismatch")
    print(f"[multimodal] embedding dim = {len(image_embs[0])}")

    # 4. Run queries
    print("\n" + "=" * 100)
    print("QUERY RESULTS — top-5 chunks (text/voyage-3) "
          "vs top-5 pages (multimodal)")
    print("=" * 100)

    # Embed all queries in one request per model instead of two requests per
    # query; results come back in input order.
    text_query_embs = voyage.embed(
        QUERIES, model=TEXT_MODEL, input_type="query"
    ).embeddings
    mm_query_embs = voyage.multimodal_embed(
        inputs=[[query] for query in QUERIES],
        model=MULTIMODAL_MODEL,
        input_type="query",
    ).embeddings

    for q_idx, (query, q_text, q_mm) in enumerate(
        zip(QUERIES, text_query_embs, mm_query_embs), 1
    ):
        text_scores = _top_k(q_text, text_embs)
        mm_scores = _top_k(q_mm, image_embs)

        print(f"\n[Q{q_idx}] {query}")
        print(" --- text (voyage-3) top-5 ---")
        for s, i in text_scores:
            page = text_chunk_pages[i] or "?"
            preview = text_contents[i].replace("\n", " ").strip()[:70]
            print(f" {s:.3f} page={page:>3} chunk={i:>3} {preview}")
        print(" --- multimodal (image-only) top-5 ---")
        for s, i in mm_scores:
            # image_embs is in page order, so index i is page i+1.
            print(f" {s:.3f} page={i+1:>3} (image)")

    # Token / cost summary
    print("\n" + "=" * 100)
    print("SUMMARY")
    print("=" * 100)
    print(f"PDF: {len(images)} pages @ {DPI}dpi → {MULTIMODAL_MODEL}")
    print(f"Total multimodal tokens: {mm_tokens}")
    print(f"Embedding dim: {len(image_embs[0])}")
    print(f"Time: {embed_elapsed:.1f}s for full doc")
|
||
|
||
|
||
# Entry point: run the async benchmark only when executed as a script.
if __name__ == "__main__":
    asyncio.run(main())
|