Clean up scripts/: archive 17, delete 5, add SCRIPTS.md registry

Active scripts (5): auto-sync-cases.sh, backup-db.sh, restore-db.sh, notify.py, bidi_table.py. Archived (17): one-time migration/seeding scripts whose functionality is now in the MCP server or web API; moved to scripts/.archive/. Deleted (5): zero-value scripts (duplicates, hardcoded single-case scripts, debug scripts). Added scripts/SCRIPTS.md — a registry of all scripts with purpose, status, and what superseded them. CLAUDE.md updated with a rule: any script change requires a SCRIPTS.md update. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-14 16:30:19 +00:00
parent 38e79bbf92
commit 5c9a5d702a
24 changed files with 62 additions and 578 deletions
--- a/scripts/.archive/benchmark_embeddings.py
+++ b/scripts/.archive/benchmark_embeddings.py
@@ -0,0 +1,232 @@
+"""Benchmark embedding models on case 1130-25 documents.
+
+Compares voyage-3-large (current), voyage-4-large, and voyage-law-2
+on Hebrew legal text retrieval quality, timing, and cost.
+"""
+
+import json
+import os
+import time
+import sys
+from pathlib import Path
+
+import voyageai
+
+# The Voyage API key is read from the environment only. Credentials must never
+# be committed as a fallback default: anyone with repository access could use
+# them, and they remain in history even after the file is archived.
+API_KEY = os.environ.get("VOYAGE_API_KEY")
+if not API_KEY:
+    raise SystemExit("VOYAGE_API_KEY is not set; export it before running this benchmark.")
+
+# Shared client used by main() for every embedding request.
+client = voyageai.Client(api_key=API_KEY)
+
+# Embedding models to compare; main() runs the full benchmark once per entry,
+# in this order.
+MODELS = [
+    "voyage-3-large",    # current
+    "voyage-4-large",    # upgrade candidate
+    "voyage-law-2",      # legal specialist
+]
+
+# Pricing per 1M tokens (from Voyage AI docs)
+# main() multiplies these by the token counts reported by the API and stores
+# the result as "cost_usd". Every name in MODELS needs an entry here.
+# NOTE(review): hard-coded snapshot — confirm against current Voyage pricing.
+PRICING = {
+    "voyage-3-large": 0.06,
+    "voyage-4-large": 0.12,
+    "voyage-law-2": 0.12,
+}
+
+# Absolute, machine-specific directory holding the markdown documents of
+# case 1130-25.
+DOCS_DIR = Path("/home/chaim/legal-ai/data/cases/1130-25/documents")
+
+# Display label -> markdown file. The label is what main() prints and stores
+# as the document name for every chunk taken from that file.
+DOCUMENTS = {
+    "כתב ערר קובר": DOCS_DIR / "2025-08-14-כתב-ערר-קובר.md",
+    "כתב ערר מטמון": DOCS_DIR / "2025-10-22-כתב-ערר-מטמון.md",
+    "תשובת ועדת הראל": DOCS_DIR / "2025-09-02-כתב-תשובה-ועדת-הראל-לערר.md",
+    "תשובת ליבמן": DOCS_DIR / "2025-09-01-כתב-תשובה-ליבמן-לערר.md",
+}
+
+# Test queries — real questions a judge would ask about this case
+# Each query is embedded with input_type="query" and ranked against every
+# document chunk.
+QUERIES = [
+    "מהי הטענה המרכזית של העוררים בנוגע לחניה?",
+    "מה עמדת הוועדה המקומית לגבי התכנית?",
+    "האם יש פגיעה בזכויות הבנייה של השכנים?",
+    "מהם התנאים שנקבעו בהיתר הבנייה?",
+    "האם התכנית עומדת בתקן החניה?",
+    "מה טענות המשיבים לגבי הגובה והצפיפות?",
+    "האם נערך שימוע כדין לפני מתן ההחלטה?",
+    "מהם הנימוקים לאישור התכנית על ידי הוועדה המקומית?",
+]
+
+
+def chunk_text(text: str, chunk_size: int = 600, overlap: int = 100) -> list[str]:
+    """Split *text* into overlapping chunks of whitespace-separated words.
+
+    Consecutive chunks start ``chunk_size - overlap`` words apart, so each
+    chunk repeats the last ``overlap`` words of the previous one. The final
+    chunk may be shorter than ``chunk_size`` and may consist only of words
+    already present in the previous chunk.
+
+    Args:
+        text: Source text; split on any whitespace.
+        chunk_size: Maximum number of words per chunk; must be positive.
+        overlap: Number of words shared between neighbouring chunks; must
+            satisfy ``0 <= overlap < chunk_size``.
+
+    Returns:
+        The chunks in document order, words re-joined with single spaces.
+        Empty or whitespace-only text yields an empty list.
+
+    Raises:
+        ValueError: If ``chunk_size`` or ``overlap`` is out of range.
+    """
+    if chunk_size <= 0:
+        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
+    # A step of zero or less would never advance through the text, so the
+    # chunking loop would run forever.
+    if not 0 <= overlap < chunk_size:
+        raise ValueError(
+            f"overlap must satisfy 0 <= overlap < chunk_size, got overlap={overlap}, chunk_size={chunk_size}"
+        )
+
+    words = text.split()
+    step = chunk_size - overlap
+    return [" ".join(words[start:start + chunk_size]) for start in range(0, len(words), step)]
+
+
+def cosine_sim(a: list[float], b: list[float]) -> float:
+    """Return the cosine similarity of vectors *a* and *b*.
+
+    Components are paired with ``zip``, so if the vectors differ in length
+    the extra components of the longer one do not contribute to the dot
+    product (they still count towards that vector's own norm). Returns 0.0
+    when either vector has zero magnitude.
+    """
+    magnitude_a = sum(component * component for component in a) ** 0.5
+    magnitude_b = sum(component * component for component in b) ** 0.5
+    # Guard against dividing by zero for all-zero (or empty) vectors.
+    if not magnitude_a or not magnitude_b:
+        return 0.0
+    inner_product = sum(left * right for left, right in zip(a, b))
+    return inner_product / (magnitude_a * magnitude_b)
+
+
+def main():
+    """Run the embedding benchmark end to end and save the results as JSON.
+
+    Steps, in order:
+      1. Read every file in DOCUMENTS and split it with chunk_text().
+      2. For each model in MODELS, embed all chunks and all QUERIES through
+         the module-level ``client``, timing both calls and estimating cost
+         from PRICING.
+      3. Rank the chunks for each query by cosine similarity and print the
+         five best matches.
+      4. Print a per-model summary, the top-ranked document per query for
+         each model, and the average top-1 / top-5 scores.
+      5. Write everything to data/benchmark-embeddings.json.
+
+    Raises:
+        SystemExit: If the documents contain no text to embed.
+    """
+    # Load and chunk documents
+    print("=" * 70)
+    print("Loading and chunking documents...")
+    print("=" * 70)
+
+    all_chunks = []  # (doc_name, chunk_index, text)
+    for doc_name, doc_path in DOCUMENTS.items():
+        text = doc_path.read_text(encoding="utf-8")
+        chunks = chunk_text(text)
+        all_chunks.extend((doc_name, i, chunk) for i, chunk in enumerate(chunks))
+        print(f"  {doc_name}: {len(text):,} chars, {len(text.split()):,} words -> {len(chunks)} chunks")
+
+    chunk_texts = [c[2] for c in all_chunks]
+    total_chunks = len(chunk_texts)
+    print(f"\nTotal: {total_chunks} chunks")
+
+    # Without chunks there is nothing to embed, and the dimension lookup
+    # below (doc_embs[0]) would fail with an unhelpful IndexError.
+    if not chunk_texts:
+        raise SystemExit("No text found in DOCUMENTS; nothing to benchmark.")
+
+    # Estimate tokens (rough: 1 Hebrew word ~ 2-3 tokens)
+    total_words = sum(len(t.split()) for t in chunk_texts)
+    est_tokens_docs = int(total_words * 2.5)
+    total_query_words = sum(len(q.split()) for q in QUERIES)
+    est_tokens_queries = int(total_query_words * 2.5)
+
+    print(f"Estimated tokens per model: ~{est_tokens_docs:,} (docs) + ~{est_tokens_queries:,} (queries)")
+
+    results = {}
+
+    for model in MODELS:
+        print(f"\n{'=' * 70}")
+        print(f"Model: {model}")
+        print(f"{'=' * 70}")
+
+        # Embed documents
+        # NOTE(review): all chunks go out in a single request — confirm this
+        # stays within the API's per-request limits for larger corpora.
+        print(f"  Embedding {total_chunks} chunks...")
+        t0 = time.time()
+        doc_embeddings = client.embed(
+            chunk_texts,
+            model=model,
+            input_type="document",
+        )
+        doc_time = time.time() - t0
+        doc_usage = doc_embeddings.total_tokens
+        doc_embs = doc_embeddings.embeddings
+        print(f"  Done in {doc_time:.1f}s — {doc_usage:,} tokens used")
+
+        # Embed queries
+        print(f"  Embedding {len(QUERIES)} queries...")
+        t0 = time.time()
+        query_embeddings = client.embed(
+            QUERIES,
+            model=model,
+            input_type="query",
+        )
+        query_time = time.time() - t0
+        query_usage = query_embeddings.total_tokens
+        query_embs = query_embeddings.embeddings
+        print(f"  Done in {query_time:.1f}s — {query_usage:,} tokens used")
+
+        total_tokens = doc_usage + query_usage
+        cost = total_tokens / 1_000_000 * PRICING[model]
+
+        # Search: for each query, rank chunks by similarity
+        print("\n  Search results:")
+        query_results = []
+        for qi, query in enumerate(QUERIES):
+            # Each entry: (similarity, doc_name, chunk_index, 80-char preview)
+            scores = [
+                (cosine_sim(query_embs[qi], doc_emb), all_chunks[ci][0], all_chunks[ci][1], all_chunks[ci][2][:80])
+                for ci, doc_emb in enumerate(doc_embs)
+            ]
+            scores.sort(reverse=True)
+            top5 = scores[:5]
+            query_results.append({"query": query, "top5": top5})
+            print(f"\n  Q{qi+1}: {query}")
+            for rank, (score, doc_name, chunk_idx, preview) in enumerate(top5):
+                print(f"    #{rank+1} [{score:.4f}] {doc_name} (chunk {chunk_idx}): {preview}...")
+
+        results[model] = {
+            "doc_time": doc_time,
+            "query_time": query_time,
+            "doc_tokens": doc_usage,
+            "query_tokens": query_usage,
+            "total_tokens": total_tokens,
+            "cost_usd": cost,
+            "dimensions": len(doc_embs[0]),
+            "query_results": query_results,
+        }
+
+    # Summary comparison
+    print(f"\n{'=' * 70}")
+    print("SUMMARY")
+    print(f"{'=' * 70}")
+    print(f"\n{'Model':<25} {'Tokens':>10} {'Time':>8} {'Cost':>10} {'Dims':>6}")
+    print("-" * 65)
+    for model in MODELS:
+        r = results[model]
+        print(f"{model:<25} {r['total_tokens']:>10,} {r['doc_time']+r['query_time']:>7.1f}s ${r['cost_usd']:>8.5f} {r['dimensions']:>6}")
+
+    # Compare top-1 agreement between models
+    print(f"\n{'=' * 70}")
+    print("TOP-1 AGREEMENT (which doc is ranked #1 for each query)")
+    print(f"{'=' * 70}")
+    print(f"\n{'Query':<50}", end="")
+    for model in MODELS:
+        # Strip only the common "voyage-" prefix so the column headers stay
+        # distinguishable ("3-large", "4-large", "law-2"); taking the last
+        # dash-separated part would label two columns "large".
+        print(f" {model.removeprefix('voyage-'):>10}", end="")
+    print()
+    print("-" * 85)
+
+    for qi, query in enumerate(QUERIES):
+        short_q = query[:48]
+        print(f"{short_q:<50}", end="")
+        for model in MODELS:
+            top1_doc = results[model]["query_results"][qi]["top5"][0][1]
+            # Shorten doc name
+            short_doc = top1_doc[:10]
+            print(f" {short_doc:>10}", end="")
+        print()
+
+    # Score distribution comparison
+    print(f"\n{'=' * 70}")
+    print("AVERAGE TOP-5 SCORES PER MODEL")
+    print(f"{'=' * 70}")
+    for model in MODELS:
+        model_queries = results[model]["query_results"]
+        all_top5_scores = [score for qr in model_queries for score, _, _, _ in qr["top5"]]
+        avg = sum(all_top5_scores) / len(all_top5_scores)
+        top1_scores = [qr["top5"][0][0] for qr in model_queries]
+        avg_top1 = sum(top1_scores) / len(top1_scores)
+        print(f"  {model:<25} avg top-1: {avg_top1:.4f}  avg top-5: {avg:.4f}")
+
+    # Save full results
+    output_path = Path("/home/chaim/legal-ai/data/benchmark-embeddings.json")
+    serializable = {}
+    for model, r in results.items():
+        # Copy the scalar metrics as-is and turn each ranked tuple into a
+        # labelled dict so the JSON is self-describing.
+        entry = {key: value for key, value in r.items() if key != "query_results"}
+        entry["queries"] = [
+            {
+                "query": qr["query"],
+                "top5": [{"score": s, "doc": d, "chunk": c, "preview": p} for s, d, c, p in qr["top5"]],
+            }
+            for qr in r["query_results"]
+        ]
+        serializable[model] = entry
+    # The JSON holds raw Hebrew text (ensure_ascii=False), so the encoding is
+    # pinned to UTF-8 instead of depending on the platform locale.
+    output_path.write_text(json.dumps(serializable, ensure_ascii=False, indent=2), encoding="utf-8")
+    print(f"\nFull results saved to {output_path}")
+
+
+# Run the benchmark only when this file is executed as a script.
+if __name__ == "__main__":
+    main()