"""Benchmark embedding models on case 1130-25 documents. Compares voyage-3-large (current), voyage-4-large, and voyage-law-2 on Hebrew legal text retrieval quality, timing, and cost. """ import json import os import time import sys from pathlib import Path import voyageai API_KEY = os.environ.get("VOYAGE_API_KEY", "pa-qbfhBDxW0tVtgzr_abMyw_AJO2gli9w3nnqyHuQOW-e") client = voyageai.Client(api_key=API_KEY) MODELS = [ "voyage-3-large", # current "voyage-4-large", # upgrade candidate "voyage-law-2", # legal specialist ] # Pricing per 1M tokens (from Voyage AI docs) PRICING = { "voyage-3-large": 0.06, "voyage-4-large": 0.12, "voyage-law-2": 0.12, } DOCS_DIR = Path("/home/chaim/legal-ai/data/cases/1130-25/documents") DOCUMENTS = { "כתב ערר קובר": DOCS_DIR / "2025-08-14-כתב-ערר-קובר.md", "כתב ערר מטמון": DOCS_DIR / "2025-10-22-כתב-ערר-מטמון.md", "תשובת ועדת הראל": DOCS_DIR / "2025-09-02-כתב-תשובה-ועדת-הראל-לערר.md", "תשובת ליבמן": DOCS_DIR / "2025-09-01-כתב-תשובה-ליבמן-לערר.md", } # Test queries — real questions a judge would ask about this case QUERIES = [ "מהי הטענה המרכזית של העוררים בנוגע לחניה?", "מה עמדת הוועדה המקומית לגבי התכנית?", "האם יש פגיעה בזכויות הבנייה של השכנים?", "מהם התנאים שנקבעו בהיתר הבנייה?", "האם התכנית עומדת בתקן החניה?", "מה טענות המשיבים לגבי הגובה והצפיפות?", "האם נערך שימוע כדין לפני מתן ההחלטה?", "מהם הנימוקים לאישור התכנית על ידי הוועדה המקומית?", ] def chunk_text(text: str, chunk_size: int = 600, overlap: int = 100) -> list[str]: """Simple word-based chunking.""" words = text.split() chunks = [] i = 0 while i < len(words): chunk = " ".join(words[i:i + chunk_size]) chunks.append(chunk) i += chunk_size - overlap return chunks def cosine_sim(a: list[float], b: list[float]) -> float: dot = sum(x * y for x, y in zip(a, b)) norm_a = sum(x * x for x in a) ** 0.5 norm_b = sum(x * x for x in b) ** 0.5 return dot / (norm_a * norm_b) if norm_a and norm_b else 0.0 def main(): # Load and chunk documents print("=" * 70) print("Loading and chunking documents...") print("=" * 70) all_chunks = [] # (doc_name, chunk_index, text) for doc_name, doc_path in DOCUMENTS.items(): text = doc_path.read_text(encoding="utf-8") chunks = chunk_text(text) for i, chunk in enumerate(chunks): all_chunks.append((doc_name, i, chunk)) print(f" {doc_name}: {len(text):,} chars, {len(text.split()):,} words -> {len(chunks)} chunks") chunk_texts = [c[2] for c in all_chunks] total_chunks = len(chunk_texts) print(f"\nTotal: {total_chunks} chunks") # Estimate tokens (rough: 1 Hebrew word ~ 2-3 tokens) total_words = sum(len(t.split()) for t in chunk_texts) est_tokens_docs = int(total_words * 2.5) total_query_words = sum(len(q.split()) for q in QUERIES) est_tokens_queries = int(total_query_words * 2.5) print(f"Estimated tokens per model: ~{est_tokens_docs:,} (docs) + ~{est_tokens_queries:,} (queries)") results = {} for model in MODELS: print(f"\n{'=' * 70}") print(f"Model: {model}") print(f"{'=' * 70}") # Embed documents print(f" Embedding {total_chunks} chunks...") t0 = time.time() doc_embeddings = client.embed( chunk_texts, model=model, input_type="document", ) doc_time = time.time() - t0 doc_usage = doc_embeddings.total_tokens doc_embs = doc_embeddings.embeddings print(f" Done in {doc_time:.1f}s — {doc_usage:,} tokens used") # Embed queries print(f" Embedding {len(QUERIES)} queries...") t0 = time.time() query_embeddings = client.embed( QUERIES, model=model, input_type="query", ) query_time = time.time() - t0 query_usage = query_embeddings.total_tokens query_embs = query_embeddings.embeddings print(f" Done in {query_time:.1f}s — {query_usage:,} tokens used") total_tokens = doc_usage + query_usage cost = total_tokens / 1_000_000 * PRICING[model] # Search: for each query, rank chunks by similarity print(f"\n Search results:") query_results = [] for qi, query in enumerate(QUERIES): scores = [] for ci, doc_emb in enumerate(doc_embs): sim = cosine_sim(query_embs[qi], doc_emb) scores.append((sim, all_chunks[ci][0], all_chunks[ci][1], all_chunks[ci][2][:80])) scores.sort(reverse=True) top5 = scores[:5] query_results.append({ "query": query, "top5": [(s[0], s[1], s[2], s[3]) for s in top5], }) print(f"\n Q{qi+1}: {query}") for rank, (score, doc_name, chunk_idx, preview) in enumerate(top5): print(f" #{rank+1} [{score:.4f}] {doc_name} (chunk {chunk_idx}): {preview}...") results[model] = { "doc_time": doc_time, "query_time": query_time, "doc_tokens": doc_usage, "query_tokens": query_usage, "total_tokens": total_tokens, "cost_usd": cost, "dimensions": len(doc_embs[0]), "query_results": query_results, } # Summary comparison print(f"\n{'=' * 70}") print("SUMMARY") print(f"{'=' * 70}") print(f"\n{'Model':<25} {'Tokens':>10} {'Time':>8} {'Cost':>10} {'Dims':>6}") print("-" * 65) for model in MODELS: r = results[model] print(f"{model:<25} {r['total_tokens']:>10,} {r['doc_time']+r['query_time']:>7.1f}s ${r['cost_usd']:>8.5f} {r['dimensions']:>6}") # Compare top-1 agreement between models print(f"\n{'=' * 70}") print("TOP-1 AGREEMENT (which doc is ranked #1 for each query)") print(f"{'=' * 70}") print(f"\n{'Query':<50}", end="") for model in MODELS: print(f" {model.split('-')[-1]:>10}", end="") print() print("-" * 85) for qi, query in enumerate(QUERIES): short_q = query[:48] print(f"{short_q:<50}", end="") for model in MODELS: top1_doc = results[model]["query_results"][qi]["top5"][0][1] # Shorten doc name short_doc = top1_doc[:10] print(f" {short_doc:>10}", end="") print() # Score distribution comparison print(f"\n{'=' * 70}") print("AVERAGE TOP-5 SCORES PER MODEL") print(f"{'=' * 70}") for model in MODELS: all_top5_scores = [] for qr in results[model]["query_results"]: for score, _, _, _ in qr["top5"]: all_top5_scores.append(score) avg = sum(all_top5_scores) / len(all_top5_scores) top1_scores = [qr["top5"][0][0] for qr in results[model]["query_results"]] avg_top1 = sum(top1_scores) / len(top1_scores) print(f" {model:<25} avg top-1: {avg_top1:.4f} avg top-5: {avg:.4f}") # Save full results output_path = Path("/home/chaim/legal-ai/data/benchmark-embeddings.json") serializable = {} for model, r in results.items(): serializable[model] = { "doc_time": r["doc_time"], "query_time": r["query_time"], "doc_tokens": r["doc_tokens"], "query_tokens": r["query_tokens"], "total_tokens": r["total_tokens"], "cost_usd": r["cost_usd"], "dimensions": r["dimensions"], "queries": [ { "query": qr["query"], "top5": [{"score": s, "doc": d, "chunk": c, "preview": p} for s, d, c, p in qr["top5"]], } for qr in r["query_results"] ], } output_path.write_text(json.dumps(serializable, ensure_ascii=False, indent=2)) print(f"\nFull results saved to {output_path}") if __name__ == "__main__": main()