legal-ai/scripts/voyage_multimodal_poc.py

"""POC #3: voyage-3 (text) vs voyage-multimodal-3.5 (page images) on a
real appraisal PDF (89 pages, full of tables / signatures / numerical
data — the corpus class where multimodal should help most).

Document under test:
  baf10153-d2fc-4481-b250-9fe87440ce69
  "נספח - שומה מכרעת (אבלין דוידזון שמאמא) - 15.09.24"
  case 8137-24, 89 pages, 2.1 MB

The pipeline:
  1. Pull the existing voyage-3 text-chunk embeddings from `document_chunks`.
  2. Render each PDF page → PNG (PyMuPDF, dpi=144).
  3. Embed all pages via the Voyage multimodal model (the code currently
     requests voyage-multimodal-3, not 3.5).
  4. Run benchmark queries (mix of generic + table-specific + visual)
     against both: text top-K and page top-K.

The comparison is *qualitative* — text and image embeddings are
different "spaces" returning different ID types (chunk_id vs page_num).
What we look at is whether image-based retrieval surfaces tables,
signatures, or numerical data that text-only OCR loses.

No DB writes.

Usage:
    /home/chaim/legal-ai/mcp-server/.venv/bin/python \\
        /home/chaim/legal-ai/scripts/voyage_multimodal_poc.py
"""
from __future__ import annotations

import asyncio
import io
import math
import os
import time

# Location of the optional dotenv-style file with KEY=VALUE lines.
ENV_PATH = os.path.expanduser("~/.env")

# Populate os.environ from ENV_PATH, if the file exists.  Variables that are
# already set in the real environment win (setdefault), blank lines and
# `#` comments are skipped, and lines without `=` are ignored.
if os.path.isfile(ENV_PATH):
    # Explicit encoding: the default would depend on the current locale.
    with open(ENV_PATH, encoding="utf-8") as env_file:
        for raw_line in env_file:
            entry = raw_line.strip()
            if not entry or entry.startswith("#") or "=" not in entry:
                continue
            # Split on the first `=` only, so values may contain `=`.
            key, _, value = entry.partition("=")
            # Tolerate `KEY = value`: without stripping, the variable would
            # be registered under the name "KEY " (trailing space).
            key = key.strip()
            if key:
                os.environ.setdefault(key, value.strip())

import asyncpg  # noqa: E402
import voyageai  # noqa: E402
import fitz  # PyMuPDF  # noqa: E402
from PIL import Image  # noqa: E402


# Identifier of the document under test; main() selects `document_chunks`
# rows whose document_id equals this value.
DOCUMENT_ID = "baf10153-d2fc-4481-b250-9fe87440ce69"
# Absolute path of the source PDF that is rendered page by page.
PDF_PATH = (
    "/home/chaim/legal-ai/data/cases/8137-24/documents/originals/"
    "נספח - שומה מכרעת (אבלין דוידזון שמאמא) - 15.09.24.pdf"
)
# Voyage model used to embed the text queries.
TEXT_MODEL = "voyage-3"
# Voyage multimodal model identifier for the page-image side of the comparison.
MULTIMODAL_MODEL = "voyage-multimodal-3"  # check supported: 3.5 may not exist yet
# Rendering resolution handed to render_pdf_pages().
DPI = 144
# voyage-multimodal: max 1000 inputs/call, 320M pixels/call (rough),
# so 89 pages at 1240×1750 ≈ 192M pixels = single call.
# NOTE(review): the per-request cap may be stated in tokens rather than raw
# pixels in Voyage's API reference — TODO confirm the figures above against
# the current docs before relying on the single-call assumption.

# Benchmark queries (Hebrew), grouped by the kind of content they target.
# main() runs every query against both the text-chunk embeddings and the
# page-image embeddings and prints the top hits of each side.
QUERIES = [
    # generic-textual (both should handle)
    "שיטת ההיוון בשומה",
    "מתודולוגיית הערכת שווי",
    # table/numerical (multimodal should help)
    "טבלת השוואת ערכים לפני ואחרי התכנית",
    "שווי המקרקעין במצב הקודם",
    "שווי המקרקעין במצב החדש",
    "ירידת ערך באחוזים",
    # visual elements (text-only loses)
    "חתימת השמאי",
    "תרשים גוש וחלקה",
    "מפת מיקום הנכס",
    # context-heavy
    "מסקנת השמאי המכריע",
    "עקרון הצפיפות בתכנית",
]


def cosine(a: list[float], b: list[float]) -> float:
    """Return the cosine similarity of vectors *a* and *b*.

    Yields 0.0 when either vector has zero magnitude (including empty
    vectors).  The dot product pairs elements positionally via ``zip``, so
    when the lengths differ only the common prefix contributes to it.
    """
    norm_a = math.sqrt(sum(v * v for v in a))
    norm_b = math.sqrt(sum(v * v for v in b))
    # Guard against division by zero for all-zero / empty vectors.
    if not norm_a or not norm_b:
        return 0.0
    inner = sum(p * q for p, q in zip(a, b))
    return inner / (norm_a * norm_b)


def parse_pgvector(s: str) -> list[float]:
    """Parse a bracketed, comma-separated vector string into floats.

    Accepts text such as ``"[0.1,0.2,0.3]"``.  Surrounding whitespace is
    tolerated and an empty vector (``"[]"``) yields an empty list.

    Raises:
        ValueError: if any component is not a valid float literal.
    """
    # Trim outer whitespace first; otherwise a trailing newline or leading
    # space would stop strip("[]") from removing the brackets.
    body = s.strip().strip("[]").strip()
    if not body:
        # "".split(",") returns [""], and float("") raises — short-circuit.
        return []
    return [float(component) for component in body.split(",")]


def render_pdf_pages(pdf_path: str, dpi: int) -> list[Image.Image]:
    """Render every page of a PDF to an RGB ``PIL.Image``.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF.
        dpi: Resolution passed to ``page.get_pixmap``.

    Returns:
        One image per page, in document order.
    """
    images: list[Image.Image] = []
    # Context manager closes the document even if rendering a page raises;
    # the previous explicit close() was skipped on any exception.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            pix = page.get_pixmap(dpi=dpi)
            png_bytes = pix.tobytes("png")
            # convert() forces the lazily-opened PNG to be fully decoded, so
            # the returned image no longer depends on the BytesIO buffer.
            img = Image.open(io.BytesIO(png_bytes)).convert("RGB")
            images.append(img)
    return images


async def main():
    """Compare voyage-3 text retrieval with multimodal page retrieval.

    Renders every page of ``PDF_PATH``, loads the stored text-chunk
    embeddings for ``DOCUMENT_ID`` from Postgres, embeds the page images with
    ``MULTIMODAL_MODEL`` and prints, for each entry of ``QUERIES``, the
    top-scoring text chunks next to the top-scoring pages.  The database is
    only read (a single SELECT).

    Raises:
        KeyError: if ``VOYAGE_API_KEY`` or ``POSTGRES_PASSWORD`` is missing
            from the environment.
        RuntimeError: if the API returns a different number of embeddings
            than the number of pages sent.
    """
    api_key = os.environ["VOYAGE_API_KEY"]
    pg_pw = os.environ["POSTGRES_PASSWORD"]
    top_k = 5

    # NOTE: this client is used synchronously (its calls are not awaited), so
    # they block the event loop; main() is the only coroutine being run.
    voyage = voyageai.Client(api_key=api_key)

    # 1. Render PDF pages
    print(f"[render] {PDF_PATH}")
    render_start = time.perf_counter()
    images = render_pdf_pages(PDF_PATH, DPI)
    render_elapsed = time.perf_counter() - render_start
    if not images:
        # images[0] below would raise IndexError on an empty document.
        print("[render] PDF has no pages — nothing to embed")
        return
    print(f"[render] {len(images)} pages in {render_elapsed:.1f}s, "
          f"{images[0].size}px @ {DPI}dpi")

    # 2. Pull existing text chunks + voyage-3 embeddings.  The pool is only
    #    needed for this one query, so close it right away — and in a
    #    `finally`, so a failing query or a later error cannot leak it.
    pool = await asyncpg.create_pool(
        host="127.0.0.1", port=5433, user="legal_ai",
        password=pg_pw, database="legal_ai",
        min_size=1, max_size=2,
    )
    try:
        # Rows without an embedding are excluded: their `emb_text` would be
        # NULL and could not be parsed.
        rows = await pool.fetch("""
            SELECT id, chunk_index, page_number, content,
                   embedding::text AS emb_text
            FROM document_chunks
            WHERE document_id = $1
              AND embedding IS NOT NULL
            ORDER BY chunk_index
        """, DOCUMENT_ID)
    finally:
        await pool.close()
    print(f"[text] {len(rows)} text chunks loaded (voyage-3 in DB)")
    text_contents = [r["content"] for r in rows]
    text_chunk_pages = [r["page_number"] for r in rows]
    # Report the stored chunk_index (falling back to the list position if it
    # is NULL) so the printed id can be looked up in the table.
    text_chunk_ids = [
        r["chunk_index"] if r["chunk_index"] is not None else pos
        for pos, r in enumerate(rows)
    ]
    text_embs = [parse_pgvector(r["emb_text"]) for r in rows]

    # 3. Multimodal embed — one request for all pages; there is no fallback
    #    model, an invalid request just aborts the run.
    print(f"[multimodal] embedding {len(images)} pages with {MULTIMODAL_MODEL}…")
    embed_start = time.perf_counter()
    try:
        mm_result = voyage.multimodal_embed(
            inputs=[[img] for img in images],  # list of single-image inputs
            model=MULTIMODAL_MODEL,
            input_type="document",
            truncation=True,
        )
    except voyageai.error.InvalidRequestError as e:
        print(f"  [error] {e}")
        return
    embed_elapsed = time.perf_counter() - embed_start
    image_embs = mm_result.embeddings
    # Usage fields are read defensively; "?" is printed when one is absent.
    mm_tokens = getattr(mm_result, "total_tokens", "?")
    image_pixels = getattr(mm_result, "image_pixels", "?")
    text_tokens_mm = getattr(mm_result, "text_tokens", "?")
    print(f"[multimodal] done in {embed_elapsed:.1f}s — "
          f"total_tokens={mm_tokens}  text_tokens={text_tokens_mm}  "
          f"image_pixels={image_pixels}")
    # Explicit raise instead of `assert`, which is stripped under `python -O`.
    if len(image_embs) != len(images):
        raise RuntimeError("embedding count mismatch")
    print(f"[multimodal] embedding dim = {len(image_embs[0])}")

    # 4. Embed all queries up front: two requests in total instead of two
    #    per query.
    if QUERIES:
        text_query_embs = voyage.embed(
            list(QUERIES), model=TEXT_MODEL, input_type="query"
        ).embeddings
        mm_query_embs = voyage.multimodal_embed(
            inputs=[[q] for q in QUERIES],
            model=MULTIMODAL_MODEL,
            input_type="query",
        ).embeddings
    else:
        text_query_embs, mm_query_embs = [], []

    print("\n" + "=" * 100)
    print(f"QUERY RESULTS — top-{top_k} chunks (text/voyage-3) "
          f"vs top-{top_k} pages (multimodal)")
    print("=" * 100)

    per_query = zip(QUERIES, text_query_embs, mm_query_embs)
    for q_idx, (query, q_text, q_mm) in enumerate(per_query, 1):
        text_scores = _top_k(q_text, text_embs, top_k)
        mm_scores = _top_k(q_mm, image_embs, top_k)

        print(f"\n[Q{q_idx}] {query}")
        print(f"  --- text (voyage-3) top-{top_k} ---")
        for s, i in text_scores:
            page = text_chunk_pages[i] or "?"
            preview = text_contents[i].replace("\n", " ").strip()[:70]
            print(f"    {s:.3f}  page={page:>3}  "
                  f"chunk={text_chunk_ids[i]:>3}  {preview}")
        print(f"  --- multimodal (image-only) top-{top_k} ---")
        for s, i in mm_scores:
            # image_embs is in page order, so index i is page i + 1.
            print(f"    {s:.3f}  page={i+1:>3}  (image)")

    # Token / cost summary
    print("\n" + "=" * 100)
    print("SUMMARY")
    print("=" * 100)
    print(f"PDF: {len(images)} pages @ {DPI}dpi → {MULTIMODAL_MODEL}")
    print(f"Total multimodal tokens: {mm_tokens}")
    print(f"Embedding dim: {len(image_embs[0])}")
    print(f"Time: {embed_elapsed:.1f}s for full doc")


def _top_k(query_emb, doc_embs, k):
    """Return the *k* best ``(cosine score, index)`` pairs, highest first."""
    scored = [(cosine(query_emb, emb), idx) for idx, emb in enumerate(doc_embs)]
    scored.sort(reverse=True)
    return scored[:k]


# Script entry point: run the async pipeline when executed directly (not on import).
if __name__ == "__main__":
    asyncio.run(main())