Clean up scripts/: archive 17, delete 5, add SCRIPTS.md registry

Active scripts (5): auto-sync-cases.sh, backup-db.sh, restore-db.sh,
notify.py, bidi_table.py

Archived (17): one-time migration/seeding scripts whose functionality
is now in the MCP server or the web API. Moved to scripts/.archive/

Deleted (5): zero-value scripts (duplicates, hardcoded single-case,
debug scripts)

Added scripts/SCRIPTS.md — registry of all scripts with purpose,
status, and what superseded them. CLAUDE.md updated with rule:
any script change requires SCRIPTS.md update.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-04-14 16:30:19 +00:00
parent 38e79bbf92
commit 5c9a5d702a
24 changed files with 62 additions and 578 deletions

View File

@@ -0,0 +1,232 @@
"""Benchmark embedding models on case 1130-25 documents.
Compares voyage-3-large (current), voyage-4-large, and voyage-law-2
on Hebrew legal text retrieval quality, timing, and cost.
"""
import json
import os
import time
import sys
from pathlib import Path
import voyageai
# The Voyage API key is read from the environment only. Never hard-code a
# fallback key: anything committed to the repository must be treated as leaked
# and rotated.
API_KEY = os.environ.get("VOYAGE_API_KEY")
if not API_KEY:
    raise SystemExit(
        "VOYAGE_API_KEY is not set; export it before running this benchmark."
    )
client = voyageai.Client(api_key=API_KEY)
# Embedding models under comparison; main() benchmarks and reports them in
# this order.
MODELS = [
    "voyage-3-large",  # current
    "voyage-4-large",  # upgrade candidate
    "voyage-law-2",  # legal specialist
]
# Pricing per 1M tokens (from Voyage AI docs)
# main() multiplies each model's reported token usage by this rate to get the
# USD cost column.
# NOTE(review): these rates are hard-coded snapshots — confirm them against the
# current Voyage AI pricing page before relying on the cost comparison.
PRICING = {
    "voyage-3-large": 0.06,
    "voyage-4-large": 0.12,
    "voyage-law-2": 0.12,
}
# Directory holding the source documents for case 1130-25. This is an
# absolute, machine-specific path.
DOCS_DIR = Path("/home/chaim/legal-ai/data/cases/1130-25/documents")
# Display label -> Markdown file for each document that is chunked and
# embedded. The label is what appears in the printed and saved results.
DOCUMENTS = {
    "כתב ערר קובר": DOCS_DIR / "2025-08-14-כתב-ערר-קובר.md",
    "כתב ערר מטמון": DOCS_DIR / "2025-10-22-כתב-ערר-מטמון.md",
    "תשובת ועדת הראל": DOCS_DIR / "2025-09-02-כתב-תשובה-ועדת-הראל-לערר.md",
    "תשובת ליבמן": DOCS_DIR / "2025-09-01-כתב-תשובה-ליבמן-לערר.md",
}
# Test queries — real questions a judge would ask about this case
# Each query is embedded with input_type="query" and scored against every
# document chunk; the list order defines the Q1..Qn numbering in the output.
QUERIES = [
    "מהי הטענה המרכזית של העוררים בנוגע לחניה?",
    "מה עמדת הוועדה המקומית לגבי התכנית?",
    "האם יש פגיעה בזכויות הבנייה של השכנים?",
    "מהם התנאים שנקבעו בהיתר הבנייה?",
    "האם התכנית עומדת בתקן החניה?",
    "מה טענות המשיבים לגבי הגובה והצפיפות?",
    "האם נערך שימוע כדין לפני מתן ההחלטה?",
    "מהם הנימוקים לאישור התכנית על ידי הוועדה המקומית?",
]
def chunk_text(text: str, chunk_size: int = 600, overlap: int = 100) -> list[str]:
    """Split *text* into overlapping word-based chunks.

    Words are obtained with ``str.split()`` (any whitespace) and re-joined with
    single spaces, so the original whitespace is not preserved.

    Args:
        text: The text to split.
        chunk_size: Maximum number of words per chunk; must be positive.
        overlap: Number of words shared between consecutive chunks; must be
            smaller than ``chunk_size`` so the window always advances.

    Returns:
        The chunks in document order; an empty list when *text* has no words.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is not
            smaller than ``chunk_size`` (the window would never advance).
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if overlap >= chunk_size:
        raise ValueError(
            f"overlap ({overlap}) must be smaller than chunk_size ({chunk_size})"
        )

    words = text.split()
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(words), step):
        chunks.append(" ".join(words[start:start + chunk_size]))
        # Once a window reaches the last word, any further window would lie
        # entirely inside this chunk's overlap and add only duplicated text.
        if start + chunk_size >= len(words):
            break
    return chunks
def cosine_sim(a: list[float], b: list[float]) -> float:
    """Return the cosine similarity of vectors *a* and *b*.

    Args:
        a: First vector.
        b: Second vector; must have the same number of components as *a*.

    Returns:
        ``dot(a, b) / (|a| * |b|)``, or ``0.0`` when either vector has zero
        norm (including empty vectors).

    Raises:
        ValueError: If the vectors have different lengths. ``zip`` would
            otherwise truncate the dot product while the norms still used the
            full vectors, giving a meaningless score.
    """
    if len(a) != len(b):
        raise ValueError(
            f"vector dimensions differ: {len(a)} != {len(b)}"
        )
    dot = sum(x * y for x, y in zip(a, b))
    norm_a = sum(x * x for x in a) ** 0.5
    norm_b = sum(x * x for x in b) ** 0.5
    if not norm_a or not norm_b:
        return 0.0
    return dot / (norm_a * norm_b)
def _embed_timed(texts, model, input_type):
    """Embed *texts* with *model* in one ``client.embed`` call and time it.

    Args:
        texts: Strings to embed.
        model: Model name passed to the API.
        input_type: ``"document"`` or ``"query"``, forwarded to the API.

    Returns:
        Tuple ``(embeddings, total_tokens, elapsed_seconds)`` taken from the
        response's ``embeddings`` and ``total_tokens`` attributes.
    """
    started = time.time()
    response = client.embed(texts, model=model, input_type=input_type)
    elapsed = time.time() - started
    return response.embeddings, response.total_tokens, elapsed


def _top_matches(query_emb, doc_embs, all_chunks, k=5):
    """Return the *k* best-scoring chunks for one query embedding.

    Each result is ``(score, doc_name, chunk_index, preview)`` where the
    preview is the first 80 characters of the chunk text.
    """
    scored = [
        (cosine_sim(query_emb, doc_emb), doc_name, chunk_idx, text[:80])
        for doc_emb, (doc_name, chunk_idx, text) in zip(doc_embs, all_chunks)
    ]
    # Tuples compare by score first; ties fall back to doc name, then index.
    scored.sort(reverse=True)
    return scored[:k]


def main():
    """Benchmark every model in MODELS on the configured documents and queries.

    Steps: read and chunk each file in DOCUMENTS, embed all chunks and all
    QUERIES with each model, rank chunks per query by cosine similarity, print
    per-query top-5 results plus summary tables (tokens, time, cost,
    dimensions, top-1 agreement, average scores), and save the full results as
    UTF-8 JSON.

    Raises:
        SystemExit: If the documents yield no chunks at all.
    """
    rule = "=" * 70

    # Load and chunk documents
    print(rule)
    print("Loading and chunking documents...")
    print(rule)
    all_chunks = []  # (doc_name, chunk_index, text)
    for doc_name, doc_path in DOCUMENTS.items():
        text = doc_path.read_text(encoding="utf-8")
        chunks = chunk_text(text)
        all_chunks.extend((doc_name, i, chunk) for i, chunk in enumerate(chunks))
        print(f" {doc_name}: {len(text):,} chars, {len(text.split()):,} words -> {len(chunks)} chunks")
    chunk_texts = [chunk for _, _, chunk in all_chunks]
    total_chunks = len(chunk_texts)
    print(f"\nTotal: {total_chunks} chunks")
    if not chunk_texts:
        # Without chunks the dimension lookup and score averages below would
        # fail with IndexError / ZeroDivisionError; stop with a clear message.
        raise SystemExit("No chunks produced from the input documents; nothing to benchmark.")

    # Estimate tokens (rough: 1 Hebrew word ~ 2-3 tokens)
    total_words = sum(len(t.split()) for t in chunk_texts)
    est_tokens_docs = int(total_words * 2.5)
    total_query_words = sum(len(q.split()) for q in QUERIES)
    est_tokens_queries = int(total_query_words * 2.5)
    print(f"Estimated tokens per model: ~{est_tokens_docs:,} (docs) + ~{est_tokens_queries:,} (queries)")

    results = {}
    for model in MODELS:
        print(f"\n{rule}")
        print(f"Model: {model}")
        print(rule)

        # Embed documents
        print(f" Embedding {total_chunks} chunks...")
        doc_embs, doc_usage, doc_time = _embed_timed(chunk_texts, model, "document")
        print(f" Done in {doc_time:.1f}s — {doc_usage:,} tokens used")

        # Embed queries
        print(f" Embedding {len(QUERIES)} queries...")
        query_embs, query_usage, query_time = _embed_timed(QUERIES, model, "query")
        print(f" Done in {query_time:.1f}s — {query_usage:,} tokens used")

        total_tokens = doc_usage + query_usage
        cost = total_tokens / 1_000_000 * PRICING[model]

        # Search: for each query, rank chunks by similarity
        print("\n Search results:")
        query_results = []
        for qi, query in enumerate(QUERIES):
            top5 = _top_matches(query_embs[qi], doc_embs, all_chunks)
            query_results.append({"query": query, "top5": top5})
            print(f"\n Q{qi+1}: {query}")
            for rank, (score, doc_name, chunk_idx, preview) in enumerate(top5):
                print(f" #{rank+1} [{score:.4f}] {doc_name} (chunk {chunk_idx}): {preview}...")

        results[model] = {
            "doc_time": doc_time,
            "query_time": query_time,
            "doc_tokens": doc_usage,
            "query_tokens": query_usage,
            "total_tokens": total_tokens,
            "cost_usd": cost,
            "dimensions": len(doc_embs[0]),
            "query_results": query_results,
        }

    # Summary comparison
    print(f"\n{rule}")
    print("SUMMARY")
    print(rule)
    print(f"\n{'Model':<25} {'Tokens':>10} {'Time':>8} {'Cost':>10} {'Dims':>6}")
    print("-" * 65)
    for model in MODELS:
        r = results[model]
        print(f"{model:<25} {r['total_tokens']:>10,} {r['doc_time']+r['query_time']:>7.1f}s ${r['cost_usd']:>8.5f} {r['dimensions']:>6}")

    # Compare top-1 agreement between models
    print(f"\n{rule}")
    print("TOP-1 AGREEMENT (which doc is ranked #1 for each query)")
    print(rule)
    print(f"\n{'Query':<50}", end="")
    for model in MODELS:
        # Strip only the shared "voyage-" prefix: taking the last dash-separated
        # part would label both voyage-3-large and voyage-4-large as "large".
        print(f" {model.removeprefix('voyage-'):>10}", end="")
    print()
    print("-" * 85)
    for qi, query in enumerate(QUERIES):
        short_q = query[:48]
        print(f"{short_q:<50}", end="")
        for model in MODELS:
            top1_doc = results[model]["query_results"][qi]["top5"][0][1]
            # Shorten doc name
            short_doc = top1_doc[:10]
            print(f" {short_doc:>10}", end="")
        print()

    # Score distribution comparison
    print(f"\n{rule}")
    print("AVERAGE TOP-5 SCORES PER MODEL")
    print(rule)
    for model in MODELS:
        model_queries = results[model]["query_results"]
        all_top5_scores = [score for qr in model_queries for score, _, _, _ in qr["top5"]]
        avg = sum(all_top5_scores) / len(all_top5_scores)
        top1_scores = [qr["top5"][0][0] for qr in model_queries]
        avg_top1 = sum(top1_scores) / len(top1_scores)
        print(f" {model:<25} avg top-1: {avg_top1:.4f} avg top-5: {avg:.4f}")

    # Save full results
    output_path = Path("/home/chaim/legal-ai/data/benchmark-embeddings.json")
    scalar_keys = (
        "doc_time",
        "query_time",
        "doc_tokens",
        "query_tokens",
        "total_tokens",
        "cost_usd",
        "dimensions",
    )
    serializable = {
        model: {
            **{key: r[key] for key in scalar_keys},
            "queries": [
                {
                    "query": qr["query"],
                    "top5": [
                        {"score": s, "doc": d, "chunk": c, "preview": p}
                        for s, d, c, p in qr["top5"]
                    ],
                }
                for qr in r["query_results"]
            ],
        }
        for model, r in results.items()
    }
    # The payload keeps Hebrew text unescaped (ensure_ascii=False), so the file
    # encoding must be explicit rather than the platform/locale default.
    output_path.write_text(
        json.dumps(serializable, ensure_ascii=False, indent=2),
        encoding="utf-8",
    )
    print(f"\nFull results saved to {output_path}")
if __name__ == "__main__":
main()