"""POC #3: voyage-3 (text) vs voyage-multimodal (page images) on a real appraisal PDF.

The document under test has 89 pages full of tables, signatures and numerical
data — the corpus class where multimodal retrieval should help most.

Document under test:
    baf10153-d2fc-4481-b250-9fe87440ce69
    "נספח - שומה מכרעת (אבלין דוידזון שמאמא) - 15.09.24"
    (appendix: decisive appraisal, dated 15.09.24)
    case 8137-24, 89 pages, 2.1 MB

The pipeline:
    1. Pull the existing voyage-3 text-chunk embeddings from `document_chunks`.
    2. Render each PDF page → PNG (PyMuPDF, dpi=144).
    3. Embed all pages via the Voyage multimodal model. The goal is to evaluate
       voyage-multimodal-3.5; the script currently requests
       voyage-multimodal-3 (see the note next to ``MULTIMODAL_MODEL``).
    4. Run benchmark queries (mix of generic + table-specific + visual)
       against both: text top-K and page top-K.

The comparison is *qualitative* — text and image embeddings are different
"spaces" returning different ID types (chunk_id vs page_num). What we look at
is whether image-based retrieval surfaces tables, signatures, or numerical
data that text-only OCR loses.

No DB writes.

Usage:
    /home/chaim/legal-ai/mcp-server/.venv/bin/python \\
        /home/chaim/legal-ai/scripts/voyage_multimodal_poc.py
"""
from __future__ import annotations

import asyncio
import io
import math
import os
import time

ENV_PATH = os.path.expanduser("~/.env")


def _load_env_file(path: str) -> None:
    """Populate ``os.environ`` from a simple ``KEY=VALUE`` file, if it exists.

    Blank lines, ``#`` comment lines and lines without ``=`` are skipped.
    A leading ``export `` on the key, whitespace around key/value and one pair
    of matching surrounding quotes on the value are removed. Variables that
    are already set in the environment are left untouched (``setdefault``).
    """
    if not os.path.isfile(path):
        return
    with open(path, encoding="utf-8") as env_file:
        for raw_line in env_file:
            entry = raw_line.strip()
            if not entry or entry.startswith("#") or "=" not in entry:
                continue
            key, value = entry.split("=", 1)
            key = key.strip()
            if key.startswith("export "):
                # Tolerate shell-style files that are also meant to be sourced.
                key = key[len("export "):].strip()
            value = value.strip()
            if len(value) >= 2 and value[0] == value[-1] and value[0] in "\"'":
                # KEY="value" / KEY='value' — the quotes are not part of the value.
                value = value[1:-1]
            if key:
                os.environ.setdefault(key, value)


# Must run before main() reads VOYAGE_API_KEY / POSTGRES_PASSWORD.
_load_env_file(ENV_PATH)

import asyncpg  # noqa: E402
import voyageai  # noqa: E402
import fitz  # PyMuPDF  # noqa: E402
from PIL import Image  # noqa: E402

DOCUMENT_ID = "baf10153-d2fc-4481-b250-9fe87440ce69"
PDF_PATH = (
    "/home/chaim/legal-ai/data/cases/8137-24/documents/originals/"
    "נספח - שומה מכרעת (אבלין דוידזון שמאמא) - 15.09.24.pdf"
)

TEXT_MODEL = "voyage-3"
MULTIMODAL_MODEL = "voyage-multimodal-3"  # check supported: 3.5 may not exist yet
DPI = 144

# voyage-multimodal: max 1000 inputs/call, 320M pixels/call (rough),
# so 89 pages at 1240×1750 ≈ 192M pixels = single call.
# Number of hits printed per query on each side (text chunks / page images).
TOP_K = 5

QUERIES = [
    # generic-textual (both should handle)
    "שיטת ההיוון בשומה",
    "מתודולוגיית הערכת שווי",
    # table/numerical (multimodal should help)
    "טבלת השוואת ערכים לפני ואחרי התכנית",
    "שווי המקרקעין במצב הקודם",
    "שווי המקרקעין במצב החדש",
    "ירידת ערך באחוזים",
    # visual elements (text-only loses)
    "חתימת השמאי",
    "תרשים גוש וחלקה",
    "מפת מיקום הנכס",
    # context-heavy
    "מסקנת השמאי המכריע",
    "עקרון הצפיפות בתכנית",
]


def cosine(a: list[float], b: list[float]) -> float:
    """Return the cosine similarity of vectors *a* and *b*.

    Returns 0.0 when either vector has zero norm (this includes empty vectors).

    Raises:
        ValueError: if the vectors have different lengths. ``zip`` would
            otherwise silently truncate the dot product while the norms are
            taken over the full vectors, producing a meaningless score.
    """
    if len(a) != len(b):
        raise ValueError(f"dimension mismatch: {len(a)} vs {len(b)}")
    dot = sum(x * y for x, y in zip(a, b))
    norm_a = math.sqrt(sum(x * x for x in a))
    norm_b = math.sqrt(sum(y * y for y in b))
    if not norm_a or not norm_b:
        return 0.0
    return dot / (norm_a * norm_b)


def parse_pgvector(s: str) -> list[float]:
    """Parse a bracketed, comma-separated vector literal (``"[0.1,0.2]"``).

    Surrounding whitespace is tolerated and an empty vector (``"[]"``) yields
    an empty list.

    Raises:
        ValueError: if a component is not a valid float.
    """
    body = s.strip().strip("[]").strip()
    if not body:
        return []
    return [float(component) for component in body.split(",")]


def render_pdf_pages(pdf_path: str, dpi: int) -> "list[Image.Image]":
    """Render each page of the PDF at *pdf_path* to an RGB ``PIL.Image``.

    Pages are rasterised with PyMuPDF at *dpi* and round-tripped through PNG
    bytes. The document handle is released even if rendering a page fails.
    """
    images = []
    with fitz.open(pdf_path) as doc:
        for page in doc:
            pix = page.get_pixmap(dpi=dpi)
            png_bytes = pix.tobytes("png")
            # convert() forces a full decode, so the image no longer depends
            # on lazily reading from the temporary buffer.
            images.append(Image.open(io.BytesIO(png_bytes)).convert("RGB"))
    return images


async def main():
    """Run the text-vs-multimodal retrieval comparison and print the results.

    Reads ``VOYAGE_API_KEY`` and ``POSTGRES_PASSWORD`` from the environment
    (``KeyError`` if missing), renders the PDF, loads the stored text-chunk
    embeddings, embeds the page images, then prints the top-``TOP_K`` hits of
    every benchmark query for both sides. Nothing is written to the database.
    """
    api_key = os.environ["VOYAGE_API_KEY"]
    pg_pw = os.environ["POSTGRES_PASSWORD"]
    voyage = voyageai.Client(api_key=api_key)

    # 1. Render PDF pages
    print(f"[render] {PDF_PATH}")
    start = time.perf_counter()
    images = render_pdf_pages(PDF_PATH, DPI)
    render_elapsed = time.perf_counter() - start
    if not images:
        print("[render] PDF has no pages — nothing to embed")
        return
    print(f"[render] {len(images)} pages in {render_elapsed:.1f}s, "
          f"{images[0].size}px @ {DPI}dpi")

    # 2. Pull existing text chunks + voyage-3 embeddings.
    # The DB is only needed for this one read, so the pool is closed right
    # away (and on any failure) instead of being kept open for the whole run.
    pool = await asyncpg.create_pool(
        host="127.0.0.1",
        port=5433,
        user="legal_ai",
        password=pg_pw,
        database="legal_ai",
        min_size=1,
        max_size=2,
    )
    try:
        rows = await pool.fetch(
            """
            SELECT id, chunk_index, page_number, content,
                   embedding::text AS emb_text
            FROM document_chunks
            WHERE document_id = $1
            ORDER BY chunk_index
            """,
            DOCUMENT_ID,
        )
    finally:
        await pool.close()

    # Chunks without a stored embedding cannot be scored; skip them.
    embedded_rows = [r for r in rows if r["emb_text"] is not None]
    skipped = len(rows) - len(embedded_rows)
    print(f"[text] {len(rows)} text chunks loaded (voyage-3 in DB)")
    if skipped:
        print(f"[text] skipped {skipped} chunks with no stored embedding")
    text_contents = [r["content"] for r in embedded_rows]
    text_chunk_pages = [r["page_number"] for r in embedded_rows]
    text_embs = [parse_pgvector(r["emb_text"]) for r in embedded_rows]

    # 3. Multimodal embed — one call for all pages; on an invalid request
    #    (e.g. unsupported model) report the error and stop.
    print(f"[multimodal] embedding {len(images)} pages with {MULTIMODAL_MODEL}…")
    start = time.perf_counter()
    try:
        mm_result = voyage.multimodal_embed(
            inputs=[[img] for img in images],  # list of single-image inputs
            model=MULTIMODAL_MODEL,
            input_type="document",
            truncation=True,
        )
    except voyageai.error.InvalidRequestError as e:
        print(f" [error] {e}")
        return
    embed_elapsed = time.perf_counter() - start

    image_embs = mm_result.embeddings
    # Usage counters are read defensively; "?" is printed for any the result
    # object does not expose.
    mm_tokens = getattr(mm_result, "total_tokens", "?")
    image_pixels = getattr(mm_result, "image_pixels", "?")
    text_tokens_mm = getattr(mm_result, "text_tokens", "?")
    print(f"[multimodal] done in {embed_elapsed:.1f}s — "
          f"total_tokens={mm_tokens} text_tokens={text_tokens_mm} "
          f"image_pixels={image_pixels}")
    if len(image_embs) != len(images):
        # Page numbers below are derived from list positions, so a count
        # mismatch would silently mislabel every hit.
        raise RuntimeError(
            f"embedding count mismatch: {len(image_embs)} embeddings "
            f"for {len(images)} pages"
        )
    print(f"[multimodal] embedding dim = {len(image_embs[0])}")

    # 4. Run queries
    print("\n" + "=" * 100)
    print(f"QUERY RESULTS — top-{TOP_K} chunks (text/voyage-3) "
          f"vs top-{TOP_K} pages (multimodal)")
    print("=" * 100)

    # Embed all queries in one batch per model rather than two API calls per
    # query inside the loop.
    text_query_embs = voyage.embed(
        QUERIES, model=TEXT_MODEL, input_type="query"
    ).embeddings
    mm_query_embs = voyage.multimodal_embed(
        inputs=[[query] for query in QUERIES],
        model=MULTIMODAL_MODEL,
        input_type="query",
    ).embeddings

    for q_idx, (query, q_text, q_mm) in enumerate(
        zip(QUERIES, text_query_embs, mm_query_embs), 1
    ):
        text_scores = sorted(
            ((cosine(q_text, emb), i) for i, emb in enumerate(text_embs)),
            reverse=True,
        )[:TOP_K]
        mm_scores = sorted(
            ((cosine(q_mm, emb), i) for i, emb in enumerate(image_embs)),
            reverse=True,
        )[:TOP_K]

        print(f"\n[Q{q_idx}] {query}")
        print(f" --- text (voyage-3) top-{TOP_K} ---")
        for score, i in text_scores:
            page_number = text_chunk_pages[i]
            # Only a missing page number becomes "?"; 0 is printed as-is.
            page = "?" if page_number is None else page_number
            preview = text_contents[i].replace("\n", " ").strip()[:70]
            print(f" {score:.3f} page={page:>3} chunk={i:>3} {preview}")
        print(f" --- multimodal (image-only) top-{TOP_K} ---")
        for score, i in mm_scores:
            # image_embs is in page order, so list index + 1 is the page.
            print(f" {score:.3f} page={i+1:>3} (image)")

    # Token / cost summary
    print("\n" + "=" * 100)
    print("SUMMARY")
    print("=" * 100)
    print(f"PDF: {len(images)} pages @ {DPI}dpi → {MULTIMODAL_MODEL}")
    print(f"Total multimodal tokens: {mm_tokens}")
    print(f"Embedding dim: {len(image_embs[0])}")
    print(f"Time: {embed_elapsed:.1f}s for full doc")


if __name__ == "__main__":
    asyncio.run(main())