Active scripts (5): auto-sync-cases.sh, backup-db.sh, restore-db.sh, notify.py, bidi_table.py.
Archived (17): one-time migration/seeding scripts whose functionality is now in the MCP server or web API; moved to scripts/.archive/.
Deleted (5): zero-value scripts (duplicates, hardcoded single-case, debug scripts).
Added scripts/SCRIPTS.md — a registry of all scripts with purpose, status, and what superseded them.
CLAUDE.md updated with a rule: any script change requires a SCRIPTS.md update.
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
233 lines
8.2 KiB
Python
233 lines
8.2 KiB
Python
"""Benchmark embedding models on case 1130-25 documents.

Compares voyage-3-large (current), voyage-4-large, and voyage-law-2
on Hebrew legal text retrieval quality, timing, and cost.
"""
|
|
|
|
import json
|
|
import os
|
|
import time
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
import voyageai
|
|
|
|
# Read the Voyage AI key from the environment only. A literal fallback key must
# never live in source control: anyone with read access to the repository (or
# its history) could use it.
# NOTE(review): the key that used to be hard-coded on this line should be
# treated as leaked and rotated.
API_KEY = os.environ.get("VOYAGE_API_KEY")
if not API_KEY:
    raise RuntimeError(
        "VOYAGE_API_KEY is not set; export it before running this benchmark."
    )

# Shared client used by main() for every embedding request.
client = voyageai.Client(api_key=API_KEY)
|
|
|
|
# Embedding models under comparison, in the order they are benchmarked
# and reported.
MODELS = [
    # current
    "voyage-3-large",
    # upgrade candidate
    "voyage-4-large",
    # legal specialist
    "voyage-law-2",
]
|
|
|
|
# Price per one million tokens for each model (taken from the Voyage AI docs).
# NOTE(review): prices change over time — confirm against the current
# pricing page before relying on the computed cost figures.
PRICING = {
    "voyage-3-large": 0.06,
    "voyage-4-large": 0.12,
    "voyage-law-2": 0.12,
}
|
|
|
|
# Directory holding the markdown documents of case 1130-25.
DOCS_DIR = Path("/home/chaim/legal-ai/data/cases/1130-25/documents")

# Display label -> markdown file for every document in the benchmark corpus.
# Insertion order is the order in which documents are loaded and chunked.
DOCUMENTS = {
    label: DOCS_DIR / filename
    for label, filename in (
        ("כתב ערר קובר", "2025-08-14-כתב-ערר-קובר.md"),
        ("כתב ערר מטמון", "2025-10-22-כתב-ערר-מטמון.md"),
        ("תשובת ועדת הראל", "2025-09-02-כתב-תשובה-ועדת-הראל-לערר.md"),
        ("תשובת ליבמן", "2025-09-01-כתב-תשובה-ליבמן-לערר.md"),
    )
}
|
|
|
|
# Retrieval test queries: realistic questions a judge might ask about this
# case. Each one is embedded as a query and matched against every chunk.
QUERIES = [
    "מהי הטענה המרכזית של העוררים בנוגע לחניה?",
    "מה עמדת הוועדה המקומית לגבי התכנית?",
    "האם יש פגיעה בזכויות הבנייה של השכנים?",
    "מהם התנאים שנקבעו בהיתר הבנייה?",
    "האם התכנית עומדת בתקן החניה?",
    "מה טענות המשיבים לגבי הגובה והצפיפות?",
    "האם נערך שימוע כדין לפני מתן ההחלטה?",
    "מהם הנימוקים לאישור התכנית על ידי הוועדה המקומית?",
]
|
|
|
|
|
|
def chunk_text(text: str, chunk_size: int = 600, overlap: int = 100) -> list[str]:
    """Split *text* into overlapping, whitespace-delimited word chunks.

    Args:
        text: Text to split; words are whatever ``str.split()`` yields.
        chunk_size: Maximum number of words per chunk. Must be positive.
        overlap: Number of words shared by consecutive chunks. Must satisfy
            ``0 <= overlap < chunk_size``.

    Returns:
        The chunks in document order, each a single-space-joined run of
        words. Empty or whitespace-only text yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range. Without
            this check a non-positive step would never advance and the loop
            would not terminate.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    words = text.split()
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(words), step):
        chunks.append(" ".join(words[start:start + chunk_size]))
        # Once a chunk reaches the end of the text, any further window would
        # only repeat words already covered by this chunk, so stop here
        # instead of emitting a redundant tail chunk.
        if start + chunk_size >= len(words):
            break
    return chunks
|
|
|
|
|
|
def cosine_sim(a: list[float], b: list[float]) -> float:
    """Return the cosine similarity of vectors *a* and *b*.

    The dot product pairs components with ``zip``, so it only covers the
    shorter of the two vectors. If either vector has zero magnitude
    (including the empty vector) the result is ``0.0`` rather than a
    division error.
    """
    inner = sum(left * right for left, right in zip(a, b))
    magnitude_a = sum(component * component for component in a) ** 0.5
    magnitude_b = sum(component * component for component in b) ** 0.5

    # A zero-length vector has no direction; report "no similarity".
    if not (magnitude_a and magnitude_b):
        return 0.0
    return inner / (magnitude_a * magnitude_b)
|
|
|
|
|
|
def main():
    """Run the embedding benchmark end to end and save the results as JSON.

    Steps:
      1. Read every file in ``DOCUMENTS`` and split it with ``chunk_text``.
      2. For each model in ``MODELS``, embed all chunks and all ``QUERIES``
         through the module-level ``client``, timing both calls and pricing
         the reported token usage with ``PRICING``.
      3. Rank the chunks for each query by ``cosine_sim`` and print the
         top five.
      4. Print a summary table, the top-ranked document per query per
         model, and the average top-1 / top-5 scores.
      5. Write the full results to ``benchmark-embeddings.json``.
    """
    # Load and chunk documents
    print("=" * 70)
    print("Loading and chunking documents...")
    print("=" * 70)

    all_chunks = []  # (doc_name, chunk_index, text)
    for doc_name, doc_path in DOCUMENTS.items():
        text = doc_path.read_text(encoding="utf-8")
        chunks = chunk_text(text)
        all_chunks.extend((doc_name, i, chunk) for i, chunk in enumerate(chunks))
        print(f" {doc_name}: {len(text):,} chars, {len(text.split()):,} words -> {len(chunks)} chunks")

    chunk_texts = [c[2] for c in all_chunks]
    total_chunks = len(chunk_texts)
    print(f"\nTotal: {total_chunks} chunks")

    # Estimate tokens (rough: 1 Hebrew word ~ 2-3 tokens)
    total_words = sum(len(t.split()) for t in chunk_texts)
    est_tokens_docs = int(total_words * 2.5)
    total_query_words = sum(len(q.split()) for q in QUERIES)
    est_tokens_queries = int(total_query_words * 2.5)

    print(f"Estimated tokens per model: ~{est_tokens_docs:,} (docs) + ~{est_tokens_queries:,} (queries)")

    results = {}

    for model in MODELS:
        print(f"\n{'=' * 70}")
        print(f"Model: {model}")
        print(f"{'=' * 70}")

        # Embed documents. perf_counter() is monotonic, so the measured
        # interval cannot be skewed by wall-clock adjustments.
        print(f" Embedding {total_chunks} chunks...")
        t0 = time.perf_counter()
        doc_embeddings = client.embed(
            chunk_texts,
            model=model,
            input_type="document",
        )
        doc_time = time.perf_counter() - t0
        doc_usage = doc_embeddings.total_tokens
        doc_embs = doc_embeddings.embeddings
        print(f" Done in {doc_time:.1f}s — {doc_usage:,} tokens used")

        # Embed queries
        print(f" Embedding {len(QUERIES)} queries...")
        t0 = time.perf_counter()
        query_embeddings = client.embed(
            QUERIES,
            model=model,
            input_type="query",
        )
        query_time = time.perf_counter() - t0
        query_usage = query_embeddings.total_tokens
        query_embs = query_embeddings.embeddings
        print(f" Done in {query_time:.1f}s — {query_usage:,} tokens used")

        total_tokens = doc_usage + query_usage
        cost = total_tokens / 1_000_000 * PRICING[model]

        # Search: for each query, rank chunks by similarity
        print("\n Search results:")
        query_results = []
        for qi, query in enumerate(QUERIES):
            scores = []
            for ci, doc_emb in enumerate(doc_embs):
                sim = cosine_sim(query_embs[qi], doc_emb)
                chunk_doc, chunk_idx, chunk_body = all_chunks[ci]
                # Keep only an 80-character preview of the chunk text.
                scores.append((sim, chunk_doc, chunk_idx, chunk_body[:80]))
            scores.sort(reverse=True)
            top5 = scores[:5]
            query_results.append({
                "query": query,
                "top5": top5,
            })
            print(f"\n Q{qi+1}: {query}")
            for rank, (score, hit_doc, hit_idx, preview) in enumerate(top5):
                print(f" #{rank+1} [{score:.4f}] {hit_doc} (chunk {hit_idx}): {preview}...")

        results[model] = {
            "doc_time": doc_time,
            "query_time": query_time,
            "doc_tokens": doc_usage,
            "query_tokens": query_usage,
            "total_tokens": total_tokens,
            "cost_usd": cost,
            "dimensions": len(doc_embs[0]),
            "query_results": query_results,
        }

    # Summary comparison
    print(f"\n{'=' * 70}")
    print("SUMMARY")
    print(f"{'=' * 70}")
    print(f"\n{'Model':<25} {'Tokens':>10} {'Time':>8} {'Cost':>10} {'Dims':>6}")
    print("-" * 65)
    for model in MODELS:
        r = results[model]
        print(f"{model:<25} {r['total_tokens']:>10,} {r['doc_time']+r['query_time']:>7.1f}s ${r['cost_usd']:>8.5f} {r['dimensions']:>6}")

    # Compare top-1 agreement between models
    print(f"\n{'=' * 70}")
    print("TOP-1 AGREEMENT (which doc is ranked #1 for each query)")
    print(f"{'=' * 70}")
    print(f"\n{'Query':<50}", end="")
    for model in MODELS:
        # Column header is the last dash-separated part of the model name.
        print(f" {model.split('-')[-1]:>10}", end="")
    print()
    print("-" * 85)

    for qi, query in enumerate(QUERIES):
        short_q = query[:48]
        print(f"{short_q:<50}", end="")
        for model in MODELS:
            top1_doc = results[model]["query_results"][qi]["top5"][0][1]
            # Shorten doc name
            short_doc = top1_doc[:10]
            print(f" {short_doc:>10}", end="")
        print()

    # Score distribution comparison
    print(f"\n{'=' * 70}")
    print("AVERAGE TOP-5 SCORES PER MODEL")
    print(f"{'=' * 70}")
    for model in MODELS:
        model_queries = results[model]["query_results"]
        all_top5_scores = [score for qr in model_queries for score, _, _, _ in qr["top5"]]
        avg = sum(all_top5_scores) / len(all_top5_scores)
        top1_scores = [qr["top5"][0][0] for qr in model_queries]
        avg_top1 = sum(top1_scores) / len(top1_scores)
        print(f" {model:<25} avg top-1: {avg_top1:.4f} avg top-5: {avg:.4f}")

    # Save full results
    output_path = Path("/home/chaim/legal-ai/data/benchmark-embeddings.json")
    serializable = {}
    for model, r in results.items():
        serializable[model] = {
            "doc_time": r["doc_time"],
            "query_time": r["query_time"],
            "doc_tokens": r["doc_tokens"],
            "query_tokens": r["query_tokens"],
            "total_tokens": r["total_tokens"],
            "cost_usd": r["cost_usd"],
            "dimensions": r["dimensions"],
            "queries": [
                {
                    "query": qr["query"],
                    "top5": [{"score": s, "doc": d, "chunk": c, "preview": p} for s, d, c, p in qr["top5"]],
                }
                for qr in r["query_results"]
            ],
        }
    # ensure_ascii=False emits raw Hebrew, so the file encoding must be pinned
    # to UTF-8 instead of inheriting the locale default.
    output_path.write_text(
        json.dumps(serializable, ensure_ascii=False, indent=2),
        encoding="utf-8",
    )
    print(f"\nFull results saved to {output_path}")
|
|
|
|
|
|
# Call main() only when this file is executed as a script.
if __name__ == "__main__":
    main()
|