"""POC: compare voyage-3 vs voyage-context-3 retrieval on a single case.

Pulls all chunks of the case selected by ``CASE_ID``, runs them through
voyage-context-3 in a single contextualized_embed call, then runs benchmark
queries and compares rankings against the existing voyage-3 embeddings
(already in the DB).

The POC was first aimed at case 403/17 ("אהרון ברק - תכנית רחביה",
case_law_id=e151fc25-...), but that document is too large for one
contextualized call, so a smaller case is used instead (see the note at the
end of the import section).

No DB writes — all comparisons in memory.

Output: for each query, the top-5 chunks from both models side by side,
plus the top-10 overlap and the mean top-3 score of each model.

Usage:
    /home/chaim/legal-ai/mcp-server/.venv/bin/python \\
        /home/chaim/legal-ai/scripts/voyage_context3_poc.py
"""
from __future__ import annotations

import asyncio
import math
import os
import sys
import time

# Load ~/.env before the third-party imports below. Variables that are
# already set in the process environment win (setdefault), so an explicit
# `VAR=... python script.py` still overrides the file.
ENV_PATH = os.path.expanduser("~/.env")
if os.path.isfile(ENV_PATH):
    with open(ENV_PATH, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            # Skip blanks, comments and anything that is not KEY=VALUE.
            if not line or line.startswith("#") or "=" not in line:
                continue
            k, v = line.split("=", 1)
            k = k.strip()
            # Accept shell-style `export KEY=VALUE` lines.
            if k.startswith("export "):
                k = k[len("export "):].strip()
            v = v.strip()
            # Drop one pair of matching surrounding quotes so that
            # KEY="secret" yields `secret`, not `"secret"`.
            if len(v) >= 2 and v[0] == v[-1] and v[0] in "\"'":
                v = v[1:-1]
            if k:
                os.environ.setdefault(k, v)

import asyncpg  # noqa: E402
import voyageai  # noqa: E402

# Case selection: using קלמנוביץ/לויתן (52K chars, 63 chunks, ~18K tokens),
# which fits in a single context-3 call (32K token limit per inner list).
# אהרון ברק (60K tokens) requires splitting; we'll handle that after the POC.
CASE_ID = "436efd48-c8ab-49f0-b3a9-52bf15ea806d"  # docket: בר"מ 25226-04-25
CONTEXT_MODEL = "voyage-context-3"
BASELINE_MODEL = "voyage-3"  # embeddings for this model are already in the DB

# Benchmark queries (Hebrew, sent to the models verbatim).
# English glosses are approximate and for the reader only.
QUERIES = [
    "סמכות ועדת ערר",            # authority of the appeals committee
    "פיצויים לפי סעיף 197",      # compensation under section 197
    "ירידת ערך מקרקעין",         # decline in real-estate value
    "תכנית פוגעת",               # injurious plan
    "שיקול דעת ועדה מקומית",     # local committee discretion
    "חוות דעת שמאי מכריע",       # opinion of a deciding appraiser
    "מקרקעין גובלים",            # adjoining land
    "תקופת התיישנות תביעה",      # limitation period for a claim
    "אינטרס ציבורי בתכנון",      # public interest in planning
    "דחיית תביעת פיצויים",       # rejection of a compensation claim
]


def cosine(a: list[float], b: list[float]) -> float:
    """Return the cosine similarity of vectors *a* and *b*.

    Returns 0.0 when either vector has zero norm (including empty input).
    If the vectors differ in length, the dot product only covers the
    shorter length (``zip`` truncation) while each norm uses its full vector.
    """
    dot = sum(x * y for x, y in zip(a, b))
    na = math.sqrt(sum(x * x for x in a))
    nb = math.sqrt(sum(y * y for y in b))
    return dot / (na * nb) if na and nb else 0.0


def parse_pgvector(s: str) -> list[float]:
    """Parse the pgvector text format ``'[0.1,0.2,...]'`` into floats.

    Surrounding whitespace is ignored and an empty vector (``'[]'``) yields
    an empty list instead of raising.
    """
    body = s.strip().strip("[]").strip()
    if not body:
        return []
    return [float(x) for x in body.split(",")]


def _rank_chunks(
    query_emb: list[float], chunk_embs: list[list[float]]
) -> list[tuple[float, int]]:
    """Return ``(score, position)`` for every chunk, best match first."""
    return sorted(
        ((cosine(query_emb, emb), pos) for pos, emb in enumerate(chunk_embs)),
        reverse=True,
    )


def _mean_top_score(ranked: list[tuple[float, int]], k: int = 3) -> float:
    """Return the mean score of the first *k* ranked entries (0.0 if none)."""
    top = ranked[:k]
    return sum(score for score, _ in top) / len(top) if top else 0.0


def _preview(text: str, width: int = 50) -> str:
    """Return a single-line preview of *text*, at most *width* characters."""
    return text.replace("\n", " ").strip()[:width]


async def main() -> None:
    """Run the voyage-3 vs voyage-context-3 comparison and print the results.

    Reads ``VOYAGE_API_KEY`` and ``POSTGRES_PASSWORD`` from the environment,
    loads every chunk of ``CASE_ID`` with its stored baseline embedding,
    embeds the chunks with the contextual model in one call, then scores
    each query in ``QUERIES`` against both embedding sets.

    Raises:
        KeyError: if a required environment variable is missing.
        RuntimeError: if a chunk has no stored baseline embedding, or the
            contextual model returns a different number of embeddings than
            chunks submitted.
    """
    api_key = os.environ["VOYAGE_API_KEY"]
    pg_pw = os.environ["POSTGRES_PASSWORD"]

    # NOTE: the Voyage client calls below are not awaited, so they block the
    # event loop while they run.
    voyage = voyageai.Client(api_key=api_key)
    pool = await asyncpg.create_pool(
        host="127.0.0.1",
        port=5433,
        user="legal_ai",
        password=pg_pw,
        database="legal_ai",
        min_size=1,
        max_size=2,
    )

    # 1. Pull all chunks + their existing voyage-3 embeddings. The database
    #    is only needed for this one read, so the pool is released right
    #    away — including when the query fails.
    try:
        rows = await pool.fetch(
            """
            SELECT chunk_index, content, embedding::text AS emb_text
            FROM precedent_chunks
            WHERE case_law_id = $1
            ORDER BY chunk_index
            """,
            CASE_ID,
        )
    finally:
        await pool.close()

    if not rows:
        print(f"[load] no chunks found for case_law_id={CASE_ID} — nothing to compare")
        return
    print(f"[load] {len(rows)} chunks from case_law_id={CASE_ID}")

    missing = [r["chunk_index"] for r in rows if r["emb_text"] is None]
    if missing:
        raise RuntimeError(
            f"chunks without a stored {BASELINE_MODEL} embedding: {missing}"
        )

    chunks = [r["content"] for r in rows]
    indices = [r["chunk_index"] for r in rows]
    baseline_embs = [parse_pgvector(r["emb_text"]) for r in rows]

    # 2. Embed all chunks with voyage-context-3 — single contextualized call
    total_chars = sum(len(c) for c in chunks)
    print(f"[context] embedding {len(chunks)} chunks, {total_chars:,} chars total")
    start = time.perf_counter()
    result = voyage.contextualized_embed(
        inputs=[chunks],  # one document = one inner list
        model=CONTEXT_MODEL,
        input_type="document",
    )
    elapsed = time.perf_counter() - start
    # ContextualizedEmbeddingsObject: result.results = list of per-document
    # embeddings. result.results[0].embeddings = list of chunk embeddings.
    context_embs = result.results[0].embeddings
    total_tokens = getattr(result, "total_tokens", "?")
    print(f"[context] done in {elapsed:.1f}s — total_tokens={total_tokens}")
    # Explicit raise rather than assert: asserts are stripped under `python -O`.
    if len(context_embs) != len(chunks):
        raise RuntimeError(
            f"embedding count mismatch: got {len(context_embs)} embeddings "
            f"for {len(chunks)} chunks"
        )

    # 3. For each query — embed twice and compare top-10
    print("\n" + "=" * 100)
    print(f"{'Q':<3} {'baseline (voyage-3)':<48} {'context-3':<48}")
    print("=" * 100)

    rank_overlaps = []
    score_lifts = []
    shown = min(5, len(chunks))  # short documents may have fewer than 5 chunks

    for q_idx, query in enumerate(QUERIES, 1):
        # Baseline query embedding (regular embed)
        q_baseline = voyage.embed(
            [query], model=BASELINE_MODEL, input_type="query"
        ).embeddings[0]

        # Context query embedding — must use contextualized_embed even for
        # single-string queries (regular embed() rejects voyage-context-3).
        q_context = voyage.contextualized_embed(
            inputs=[[query]],
            model=CONTEXT_MODEL,
            input_type="query",
        ).results[0].embeddings[0]

        # Score every chunk under both models
        scores_b = _rank_chunks(q_baseline, baseline_embs)
        scores_c = _rank_chunks(q_context, context_embs)

        top10_b = {pos for _, pos in scores_b[:10]}
        top10_c = {pos for _, pos in scores_c[:10]}

        # Compute overlap and avg score in top-3
        overlap = len(top10_b & top10_c)
        avg_b_top3 = _mean_top_score(scores_b)
        avg_c_top3 = _mean_top_score(scores_c)
        lift = avg_c_top3 - avg_b_top3

        rank_overlaps.append(overlap)
        score_lifts.append(lift)

        print(f"\n[Q{q_idx}] {query}")
        print(f" overlap top-10: {overlap}/10 | avg score top-3: "
              f"baseline={avg_b_top3:.3f} context-3={avg_c_top3:.3f} "
              f"Δ={lift:+.3f}")

        for rank in range(shown):
            sb, ib = scores_b[rank]
            sc, ic = scores_c[rank]
            cb = _preview(chunks[ib])
            cc = _preview(chunks[ic])
            print(f" #{rank+1} [{indices[ib]:3d}] {sb:.3f} {cb:<55} "
                  f"| [{indices[ic]:3d}] {sc:.3f} {cc}")

    # Summary
    print("\n" + "=" * 100)
    print("SUMMARY")
    print("=" * 100)
    avg_overlap = sum(rank_overlaps) / len(rank_overlaps)
    avg_lift = sum(score_lifts) / len(score_lifts)
    print(f"Avg overlap top-10: {avg_overlap:.1f}/10 "
          f"(higher = models agree more)")
    print(f"Avg score lift top-3 (context - baseline): {avg_lift:+.4f}")
    print("\nNote: cosine scores are not directly comparable across models.")
    print("What matters more is which CHUNKS bubble to the top —")
    print("reading the actual content above tells the real story.")


if __name__ == "__main__":
    asyncio.run(main())