"""POC #2: voyage-3 vs voyage-context-3 on a LONG case (אהרון ברק 403/17).

Case is 178K chars / 219 chunks / ~60K tokens — too big for a single
contextualized_embed call (32K token limit per inner list). We split the
chunks into overlapping sliding windows (~80 chunks each, ~22K tokens) and
merge: each chunk gets the embedding from the window where it sits *most
centrally* (max symmetric context on both sides).

The hypothesis: voyage-context-3 should shine here because the case is full
of internal references ("ראה לעיל סעיף 13", "להבדיל מעניין X",
"תוצאת הבחינה ב-בר"מ 1975/24 שנידונה לעיל"). voyage-3 embeds chunks in
isolation; context-3 sees ~80 surrounding chunks per embedding.

No DB writes. Output: side-by-side ranking comparison + summary.

Usage:
    /home/chaim/legal-ai/mcp-server/.venv/bin/python \\
        /home/chaim/legal-ai/scripts/voyage_context3_poc_long.py
"""
from __future__ import annotations

import asyncio
import math
import os
import sys
import time

ENV_PATH = os.path.expanduser("~/.env")


def _load_env_file(path: str) -> None:
    """Populate ``os.environ`` from a simple ``KEY=VALUE`` file, if it exists.

    Blank lines, ``#`` comment lines and lines without ``=`` are skipped.
    Variables already present in the environment win (``setdefault``).
    Whitespace around keys and values, an optional leading ``export ``, and
    one pair of matching surrounding quotes on the value are removed, so
    ``export KEY="secret"`` yields ``secret`` rather than ``"secret"``.
    """
    if not os.path.isfile(path):
        return
    # Explicit encoding: do not depend on the locale of the invoking shell.
    with open(path, encoding="utf-8") as env_file:
        for raw_line in env_file:
            line = raw_line.strip()
            if not line or line.startswith("#") or "=" not in line:
                continue
            key, value = line.split("=", 1)
            key = key.strip()
            if key.startswith("export "):
                key = key[len("export "):].strip()
            if not key:
                continue
            value = value.strip()
            if len(value) >= 2 and value[0] == value[-1] and value[0] in "\"'":
                value = value[1:-1]
            os.environ.setdefault(key, value)


# Load credentials before the third-party imports below (hence the E402
# suppressions on those imports).
_load_env_file(ENV_PATH)

import asyncpg  # noqa: E402
import voyageai  # noqa: E402

CASE_ID = "e151fc25-cf12-4563-b638-a86323f8413b"  # 403/17 אהרון ברק (178K chars)
CONTEXT_MODEL = "voyage-context-3"
BASELINE_MODEL = "voyage-3"

# Sliding-window split params. With 219 chunks and ~60K tokens total
# (~275 tokens/chunk average), 3 windows of 80 chunks each is ~22K tokens
# per call — comfortably under 32K.
WINDOW_SIZE = 80  # chunks sent per contextualized_embed call
WINDOW_STRIDE = 70  # overlap = WINDOW_SIZE - WINDOW_STRIDE = 10

# Mix of:
#   (a) generic queries (also tested in POC #1)
#   (b) queries that require *internal* document context
QUERIES = [
    # generic
    "תכנית רחביה הוראות בנייה",
    "פיצויים לפי סעיף 197 ירידת ערך",
    "השפעת תכנית על שווי מקרקעין",
    "סמכות ועדת ערר לדון בפיצויים",
    "תוספת זכויות בנייה כפיצוי",
    # internal-context — should benefit context-3
    "ההבחנה בין השבחה לפיצויים",
    "מה נקבע לגבי תמ\"א 38 בפסק הדין",
    "ההלכה שנקבעה בעניין רובע 3",
    "כלל הנטרול של זכויות תכנוניות",
    "הסכמת השופט אלרון לחוות הדעת",
]


def cosine(a: list[float], b: list[float]) -> float:
    """Return the cosine similarity of *a* and *b*.

    Returns 0.0 when either vector has zero norm, so callers never divide
    by zero.
    """
    dot = sum(x * y for x, y in zip(a, b))
    na = math.sqrt(sum(x * x for x in a))
    nb = math.sqrt(sum(y * y for y in b))
    return dot / (na * nb) if na and nb else 0.0


def parse_pgvector(s: str) -> list[float]:
    """Parse a bracketed, comma-separated vector string into floats.

    Accepts the ``[x1,x2,...]`` text form read from ``embedding::text``.
    Surrounding whitespace is tolerated and an empty vector (``"[]"``)
    yields ``[]`` instead of raising.
    """
    body = s.strip().strip("[]").strip()
    if not body:
        return []
    return [float(x) for x in body.split(",")]


def build_windows(n: int, size: int, stride: int) -> list[tuple[int, int]]:
    """Return list of (start, end) ranges (end exclusive) covering 0..n.

    Consecutive windows start ``stride`` apart and span at most ``size``
    items; the last window is clipped so it ends at ``n`` exactly.
    Overlap = size - stride.

    Raises:
        ValueError: if ``size`` or ``stride`` is not positive (the loop
            could not make progress), or if ``stride > size`` (the windows
            would leave some indices uncovered).
    """
    if size <= 0 or stride <= 0:
        raise ValueError(
            f"size and stride must be positive (size={size}, stride={stride})"
        )
    if stride > size:
        raise ValueError(
            f"stride ({stride}) larger than size ({size}) leaves gaps"
        )
    windows = []
    start = 0
    while start < n:
        end = min(start + size, n)
        windows.append((start, end))
        if end == n:
            break
        start += stride
    return windows


def assign_chunk_to_window(
    chunk_idx: int,
    windows: list[tuple[int, int]],
) -> int:
    """Pick the window where chunk_idx sits most centrally.

    "Centrality" is the distance from the chunk to the nearer edge of the
    window, i.e. ``min(chunk_idx - start, (end - 1) - chunk_idx)``. The
    window with the largest such distance wins. Ties are broken by the
    larger window; if still tied, the earliest window is kept.

    Returns:
        Index into ``windows``, or -1 if no window contains ``chunk_idx``.
    """
    best = -1
    best_key = (-1, -1)  # (distance to nearer edge, window length)
    for w_idx, (s, e) in enumerate(windows):
        if not (s <= chunk_idx < e):
            continue
        # Comparing (distance, length) tuples gives the documented
        # "larger window wins" tie-break in a single comparison.
        key = (min(chunk_idx - s, (e - 1) - chunk_idx), e - s)
        if key > best_key:
            best_key = key
            best = w_idx
    return best


def _rank(
    query_emb: list[float],
    chunk_embs: list[list[float]],
) -> list[tuple[float, int]]:
    """Return (score, chunk position) pairs sorted by descending cosine."""
    return sorted(
        ((cosine(query_emb, emb), i) for i, emb in enumerate(chunk_embs)),
        reverse=True,
    )


async def main():
    """Run the windowed voyage-context-3 vs stored voyage-3 comparison.

    Loads the chunks and their stored embeddings for ``CASE_ID``, embeds the
    chunks with ``CONTEXT_MODEL`` one sliding window at a time, keeps for
    each chunk the embedding from its most central window, then prints a
    side-by-side top-5 ranking per query plus a top-10 overlap summary.

    Raises:
        KeyError: if ``VOYAGE_API_KEY`` or ``POSTGRES_PASSWORD`` is unset.
        RuntimeError: if the API returns a different number of embeddings
            than chunks sent, or a chunk is not covered by any window.
    """
    api_key = os.environ["VOYAGE_API_KEY"]
    pg_pw = os.environ["POSTGRES_PASSWORD"]
    voyage = voyageai.Client(api_key=api_key)

    # The database is needed only for this one read; the context manager
    # closes the pool even if the query raises.
    async with asyncpg.create_pool(
        host="127.0.0.1",
        port=5433,
        user="legal_ai",
        password=pg_pw,
        database="legal_ai",
        min_size=1,
        max_size=2,
    ) as pool:
        rows = await pool.fetch("""
            SELECT chunk_index, content, embedding::text AS emb_text
            FROM precedent_chunks
            WHERE case_law_id = $1
            ORDER BY chunk_index
        """, CASE_ID)

    n = len(rows)
    print(f"[load] {n} chunks from אהרון ברק 403/17")
    if n == 0:
        # Nothing to rank; bail out before spending API calls on queries.
        print("[load] no chunks found for this case — nothing to compare")
        return

    chunks = [r["content"] for r in rows]
    indices = [r["chunk_index"] for r in rows]
    baseline_embs = [parse_pgvector(r["emb_text"]) for r in rows]

    # Build windows
    windows = build_windows(n, WINDOW_SIZE, WINDOW_STRIDE)
    print(f"[windows] {len(windows)} windows: "
          f"{', '.join(f'[{s}:{e})' for s, e in windows)}")

    # Embed each window with context-3
    window_embs: list[list[list[float]]] = []  # [window][chunk_in_window][dim]
    total_call_tokens = 0
    total_start = time.perf_counter()
    for w_idx, (s, e) in enumerate(windows):
        sub_chunks = chunks[s:e]
        sub_chars = sum(len(c) for c in sub_chunks)
        start = time.perf_counter()
        # One inner list == one "document": every chunk in it is embedded
        # with the rest of the window as context.
        result = voyage.contextualized_embed(
            inputs=[sub_chunks],
            model=CONTEXT_MODEL,
            input_type="document",
        )
        elapsed = time.perf_counter() - start
        toks = getattr(result, "total_tokens", 0)
        total_call_tokens += toks
        print(f"  [window {w_idx}] [{s}:{e}) — {len(sub_chunks)} chunks, "
              f"{sub_chars:,} chars, {toks} tokens — {elapsed:.1f}s")
        embs = result.results[0].embeddings
        if len(embs) != len(sub_chunks):
            # The merge step indexes by position inside the window.
            raise RuntimeError(
                f"window {w_idx}: got {len(embs)} embeddings "
                f"for {len(sub_chunks)} chunks"
            )
        window_embs.append(embs)
    total_elapsed = time.perf_counter() - total_start
    print(f"[context] all windows done in {total_elapsed:.1f}s, "
          f"{total_call_tokens} total tokens")

    # Merge: for each chunk, pick the embedding from its most-central window
    context_embs: list[list[float]] = []
    chunk_window_choice = []
    for i in range(n):
        w_idx = assign_chunk_to_window(i, windows)
        if w_idx < 0:
            # Without this guard -1 would silently select the last window.
            raise RuntimeError(f"chunk {i} is not covered by any window")
        chunk_window_choice.append(w_idx)
        s, _ = windows[w_idx]
        context_embs.append(window_embs[w_idx][i - s])
    print(f"[merge] window distribution: "
          f"{[chunk_window_choice.count(j) for j in range(len(windows))]}")

    # Run queries
    print("\n" + "=" * 100)
    print(f"{'Q':<3} {'baseline (voyage-3)':<48} {'context-3 (windowed)':<48}")
    print("=" * 100)

    shown = min(5, n)  # rows printed per query
    top_avg = min(3, n)  # items averaged for the "top-3" score
    rank_overlaps = []
    for q_idx, query in enumerate(QUERIES, 1):
        q_baseline = voyage.embed(
            [query], model=BASELINE_MODEL, input_type="query"
        ).embeddings[0]
        q_context = voyage.contextualized_embed(
            inputs=[[query]],
            model=CONTEXT_MODEL,
            input_type="query",
        ).results[0].embeddings[0]

        scores_b = _rank(q_baseline, baseline_embs)
        scores_c = _rank(q_context, context_embs)

        top10_b = [i for _, i in scores_b[:10]]
        top10_c = [i for _, i in scores_c[:10]]
        overlap = len(set(top10_b) & set(top10_c))
        rank_overlaps.append(overlap)

        print(f"\n[Q{q_idx}] {query}")
        print(f"  overlap top-10: {overlap}/10   |   "
              f"avg score top-3: baseline="
              f"{sum(s for s, _ in scores_b[:top_avg])/top_avg:.3f}  "
              f"context-3={sum(s for s, _ in scores_c[:top_avg])/top_avg:.3f}")
        for rank in range(shown):
            sb, ib = scores_b[rank]
            sc, ic = scores_c[rank]
            cb = chunks[ib].replace("\n", " ").strip()[:50]
            cc = chunks[ic].replace("\n", " ").strip()[:50]
            print(f"  #{rank+1} [{indices[ib]:3d}] {sb:.3f} {cb:<55} "
                  f"| [{indices[ic]:3d}] {sc:.3f} {cc}")

    print("\n" + "=" * 100)
    print("SUMMARY")
    print("=" * 100)
    avg = sum(rank_overlaps) / len(rank_overlaps)
    print(f"Avg overlap top-10: {avg:.1f}/10")
    print(f"Per-query overlap: {rank_overlaps}")
    print(f"Total context-3 tokens used: {total_call_tokens:,} "
          f"(in {len(windows)} calls)")
    print("\nNote: cosine across models not directly comparable. The")
    print("meaningful test is *which chunks bubble to the top* — read")
    print("the actual text above to judge relevance.")


if __name__ == "__main__":
    asyncio.run(main())