feat(retrieval): add voyage rerank-2 cross-encoder stage (feature flag)

Stage B of voyage-upgrades-plan rewritten: instead of context-3 (which 4 POCs showed inconsistent improvement), add a cross-encoder rerank layer on top of voyage-3. Default off (VOYAGE_RERANK_ENABLED=false). POC validation (785-doc corpus, 12 queries, claude-haiku-4-5 judge): - mean@3 +4.5% (4.306 → 4.500) - practical-category queries +11.6% (3.78 → 4.22) - latency +702ms per query - no schema change, no re-embed, no double storage Plumbing: - config: VOYAGE_RERANK_ENABLED / _MODEL / _FETCH_K env vars - embeddings.voyage_rerank() wraps voyageai client.rerank - services/rerank.py: maybe_rerank() helper — fetches FETCH_K candidates via the bi-encoder then reranks to top-K. Fail-open if Voyage rerank is unavailable. - tools/search.py: search_decisions, search_case_documents, find_similar_cases all wrapped - services/precedent_library.search_library wrapped Smoke-tested locally with flag on/off — produces expected behaviour and latency profile. Ready for production rollout via Coolify env flip after deploy. POCs (kept under scripts/ for reference): - voyage_context3_poc{_long}.py — context-3 evaluation (rejected) - voyage_multimodal_poc.py — multimodal-3 (stage C, deferred) - voyage_rerank_judge_poc.py — single-case rerank benchmark - voyage_rerank_corpus_poc.py — full-corpus rerank validation Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-03 18:43:41 +00:00
parent 688ba37d9c
commit 26c3fddf41
13 changed files with 1578 additions and 100 deletions
--- a/scripts/voyage_multimodal_poc.py
+++ b/scripts/voyage_multimodal_poc.py
@@ -0,0 +1,213 @@
+"""POC #3: voyage-3 (text) vs voyage-multimodal-3 (page images) on a
+real appraisal PDF (89 pages, full of tables / signatures / numerical
+data — the corpus class where multimodal should help most).
+
+Document under test:
+  baf10153-d2fc-4481-b250-9fe87440ce69
+  "נספח - שומה מכרעת (אבלין דוידזון שמאמא) - 15.09.24"
+  case 8137-24, 89 pages, 2.1 MB
+
+The pipeline:
+  1. Pull the existing voyage-3 text-chunk embeddings from `document_chunks`.
+  2. Render each PDF page → PNG (PyMuPDF, dpi=144).
+  3. Embed all pages via voyage-multimodal-3.
+  4. Run benchmark queries (mix of generic + table-specific + visual)
+     against both: text top-K and page top-K.
+
+The comparison is *qualitative* — text and image embeddings are
+different "spaces" returning different ID types (chunk_id vs page_num).
+What we look at is whether image-based retrieval surfaces tables,
+signatures, or numerical data that text-only OCR loses.
+
+No DB writes.
+
+Usage:
+    /home/chaim/legal-ai/mcp-server/.venv/bin/python \\
+        /home/chaim/legal-ai/scripts/voyage_multimodal_poc.py
+"""
+from __future__ import annotations
+
+import asyncio
+import io
+import math
+import os
+import time
+
+# Load KEY=VALUE pairs from ~/.env into the process environment. Variables
+# that are already set win (setdefault); blank lines, `#` comments and
+# lines without `=` are skipped.
+ENV_PATH = os.path.expanduser("~/.env")
+if os.path.isfile(ENV_PATH):
+    # Explicit UTF-8 so parsing does not depend on the locale's default codec.
+    with open(ENV_PATH, encoding="utf-8") as f:
+        for line in f:
+            line = line.strip()
+            if line and not line.startswith("#") and "=" in line:
+                k, v = line.split("=", 1)
+                # Tolerate `KEY = value` spacing around the separator.
+                os.environ.setdefault(k.strip(), v.strip())
+
+import asyncpg  # noqa: E402
+import voyageai  # noqa: E402
+import fitz  # PyMuPDF  # noqa: E402
+from PIL import Image  # noqa: E402
+
+
+# UUID of the document under test; used to select its rows from
+# `document_chunks`.
+DOCUMENT_ID = "baf10153-d2fc-4481-b250-9fe87440ce69"
+# Local path of the PDF that gets rendered page by page. The Hebrew file
+# name reads roughly "Appendix - decisive appraisal (...) - 15.09.24".
+PDF_PATH = (
+    "/home/chaim/legal-ai/data/cases/8137-24/documents/originals/"
+    "נספח - שומה מכרעת (אבלין דוידזון שמאמא) - 15.09.24.pdf"
+)
+# Model used to embed the text queries.
+TEXT_MODEL = "voyage-3"
+# NOTE(review): voyage-multimodal-3.5 may not be available yet — confirm
+# support before switching this value.
+MULTIMODAL_MODEL = "voyage-multimodal-3"
+# Resolution used when rasterising PDF pages.
+DPI = 144
+# voyage-multimodal: max 1000 inputs/call, 320M pixels/call (rough figures —
+# verify against the current Voyage limits), so 89 pages at
+# 1240×1750 ≈ 192M pixels fit in a single call.
+
+# Benchmark queries (Hebrew), grouped by the kind of content they target.
+# The English glosses in the trailing comments are approximate.
+QUERIES = [
+    # generic-textual (both should handle)
+    "שיטת ההיוון בשומה",  # capitalization method in the appraisal
+    "מתודולוגיית הערכת שווי",  # valuation methodology
+    # table/numerical (multimodal should help)
+    "טבלת השוואת ערכים לפני ואחרי התכנית",  # table comparing values before/after the plan
+    "שווי המקרקעין במצב הקודם",  # land value in the previous state
+    "שווי המקרקעין במצב החדש",  # land value in the new state
+    "ירידת ערך באחוזים",  # decline in value, in percent
+    # visual elements (text-only loses)
+    "חתימת השמאי",  # the appraiser's signature
+    "תרשים גוש וחלקה",  # block-and-parcel diagram
+    "מפת מיקום הנכס",  # property location map
+    # context-heavy
+    "מסקנת השמאי המכריע",  # conclusion of the deciding appraiser
+    "עקרון הצפיפות בתכנית",  # the density principle in the plan
+]
+
+
+def cosine(a: list[float], b: list[float]) -> float:
+    """Return the cosine similarity of *a* and *b*.
+
+    Yields 0.0 when either vector has zero norm (including empty vectors),
+    so callers never hit a division by zero.
+    """
+    norm_a = math.sqrt(sum(v * v for v in a))
+    norm_b = math.sqrt(sum(v * v for v in b))
+    if not (norm_a and norm_b):
+        return 0.0
+    inner = sum(p * q for p, q in zip(a, b))
+    return inner / (norm_a * norm_b)
+
+
+def parse_pgvector(s: str) -> list[float]:
+    """Parse a pgvector text literal such as ``"[0.1,0.2]"`` into floats.
+
+    Surrounding whitespace is ignored, and an empty vector (``"[]"`` or an
+    empty string) yields ``[]`` instead of raising ``ValueError``.
+
+    Raises:
+        ValueError: if any component is not a valid float literal.
+    """
+    body = s.strip().strip("[]").strip()
+    if not body:
+        return []
+    return [float(x) for x in body.split(",")]
+
+
+def render_pdf_pages(pdf_path: str, dpi: int) -> list[Image.Image]:
+    """Render every page of the PDF at *pdf_path* to an RGB ``PIL.Image``.
+
+    Args:
+        pdf_path: path of the PDF to open with PyMuPDF.
+        dpi: resolution passed to ``page.get_pixmap``.
+
+    Returns:
+        One image per page, in document order.
+    """
+    images: list[Image.Image] = []
+    # The context manager closes the document even if rendering a page raises.
+    with fitz.open(pdf_path) as doc:
+        for page in doc:
+            pix = page.get_pixmap(dpi=dpi)
+            png_bytes = pix.tobytes("png")
+            img = Image.open(io.BytesIO(png_bytes)).convert("RGB")
+            images.append(img)
+    return images
+
+
+async def _fetch_text_chunks(pg_pw: str) -> list[asyncpg.Record]:
+    """Return the chunk rows of DOCUMENT_ID, ordered by chunk_index.
+
+    The pool is closed before returning (also when the query raises), so
+    no connection outlives this read-only fetch.
+    """
+    pool = await asyncpg.create_pool(
+        host="127.0.0.1", port=5433, user="legal_ai",
+        password=pg_pw, database="legal_ai",
+        min_size=1, max_size=2,
+    )
+    try:
+        return await pool.fetch("""
+            SELECT id, chunk_index, page_number, content,
+                   embedding::text AS emb_text
+            FROM document_chunks
+            WHERE document_id = $1
+            ORDER BY chunk_index
+        """, DOCUMENT_ID)
+    finally:
+        await pool.close()
+
+
+async def main():
+    """Run the text-vs-multimodal retrieval comparison and print the results.
+
+    Steps: render the PDF pages, load the stored chunk embeddings, embed
+    the page images with MULTIMODAL_MODEL, then print the top-5 text chunks
+    and the top-5 pages for every entry in QUERIES. Only a SELECT is issued
+    against the database.
+
+    Raises:
+        KeyError: if VOYAGE_API_KEY or POSTGRES_PASSWORD is not set.
+        RuntimeError: if the API returns a different number of embeddings
+            than pages submitted.
+    """
+    api_key = os.environ["VOYAGE_API_KEY"]
+    pg_pw = os.environ["POSTGRES_PASSWORD"]
+
+    voyage = voyageai.Client(api_key=api_key)
+
+    # 1. Render PDF pages
+    print(f"[render] {PDF_PATH}")
+    start = time.time()
+    images = render_pdf_pages(PDF_PATH, DPI)
+    render_elapsed = time.time() - start
+    if not images:
+        # Guard the images[0] / image_embs[0] accesses below.
+        print("[render] PDF has no pages — nothing to embed")
+        return
+    print(f"[render] {len(images)} pages in {render_elapsed:.1f}s, "
+          f"{images[0].size}px @ {DPI}dpi")
+
+    # 2. Pull existing text chunks + voyage-3 embeddings
+    rows = await _fetch_text_chunks(pg_pw)
+    print(f"[text] {len(rows)} text chunks loaded (voyage-3 in DB)")
+    text_contents = [r["content"] for r in rows]
+    text_chunk_pages = [r["page_number"] for r in rows]
+    text_embs = [parse_pgvector(r["emb_text"]) for r in rows]
+
+    # 3. Multimodal embed. There is no fallback model: an invalid request
+    #    is reported and the run stops.
+    target_model = MULTIMODAL_MODEL
+    print(f"[multimodal] embedding {len(images)} pages with {target_model}…")
+    start = time.time()
+    try:
+        mm_result = voyage.multimodal_embed(
+            inputs=[[img] for img in images],  # list of single-image inputs
+            model=target_model,
+            input_type="document",
+            truncation=True,
+        )
+    except voyageai.error.InvalidRequestError as e:
+        print(f"  [error] {e}")
+        return
+    embed_elapsed = time.time() - start
+    image_embs = mm_result.embeddings
+    # Usage counters are read defensively; "?" is printed when absent.
+    mm_tokens = getattr(mm_result, "total_tokens", "?")
+    image_pixels = getattr(mm_result, "image_pixels", "?")
+    text_tokens_mm = getattr(mm_result, "text_tokens", "?")
+    print(f"[multimodal] done in {embed_elapsed:.1f}s — "
+          f"total_tokens={mm_tokens}  text_tokens={text_tokens_mm}  "
+          f"image_pixels={image_pixels}")
+    # Explicit raise instead of assert, which is stripped under `python -O`.
+    if len(image_embs) != len(images):
+        raise RuntimeError("embedding count mismatch")
+    print(f"[multimodal] embedding dim = {len(image_embs[0])}")
+
+    # 4. Run queries
+    print("\n" + "=" * 100)
+    print("QUERY RESULTS — top-5 chunks (text/voyage-3) "
+          "vs top-5 pages (multimodal)")
+    print("=" * 100)
+
+    # Embed all queries in one request per model rather than two requests
+    # per query; each query's embedding is independent of the batch.
+    q_text_embs = voyage.embed(
+        QUERIES, model=TEXT_MODEL, input_type="query"
+    ).embeddings
+    q_mm_embs = voyage.multimodal_embed(
+        inputs=[[q] for q in QUERIES],
+        model=target_model,
+        input_type="query",
+    ).embeddings
+
+    for q_idx, (query, q_text, q_mm) in enumerate(
+        zip(QUERIES, q_text_embs, q_mm_embs), 1
+    ):
+        text_scores = sorted(
+            [(cosine(q_text, e), i) for i, e in enumerate(text_embs)],
+            reverse=True,
+        )[:5]
+        mm_scores = sorted(
+            [(cosine(q_mm, e), i) for i, e in enumerate(image_embs)],
+            reverse=True,
+        )[:5]
+
+        print(f"\n[Q{q_idx}] {query}")
+        print("  --- text (voyage-3) top-5 ---")
+        for s, i in text_scores:
+            page = text_chunk_pages[i] or "?"
+            preview = text_contents[i].replace("\n", " ").strip()[:70]
+            print(f"    {s:.3f}  page={page:>3}  chunk={i:>3}  {preview}")
+        print("  --- multimodal (image-only) top-5 ---")
+        for s, i in mm_scores:
+            # Image index i is zero-based; pages are reported one-based.
+            print(f"    {s:.3f}  page={i+1:>3}  (image)")
+
+    # Token / cost summary
+    print("\n" + "=" * 100)
+    print("SUMMARY")
+    print("=" * 100)
+    print(f"PDF: {len(images)} pages @ {DPI}dpi → {target_model}")
+    print(f"Total multimodal tokens: {mm_tokens}")
+    print(f"Embedding dim: {len(image_embs[0])}")
+    print(f"Time: {embed_elapsed:.1f}s for full doc")
+
+
+# Script entry point: run the async main() to completion when executed directly.
+if __name__ == "__main__":
+    asyncio.run(main())