Clean up scripts/: archive 17, delete 5, add SCRIPTS.md registry
Active scripts (5): auto-sync-cases.sh, backup-db.sh, restore-db.sh, notify.py, bidi_table.py.
Archived (17): one-time migration/seeding scripts whose functionality is now in the MCP server or web API. Moved to scripts/.archive/.
Deleted (5): zero-value scripts (duplicates, hardcoded single-case scripts, debug scripts).
Added scripts/SCRIPTS.md — a registry of all scripts with their purpose, status, and what superseded them.
CLAUDE.md updated with a rule: any script change requires a SCRIPTS.md update.
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
203
scripts/.archive/benchmark_new_vs_old.py
Normal file
203
scripts/.archive/benchmark_new_vs_old.py
Normal file
@@ -0,0 +1,203 @@
|
||||
"""Compare Google Vision extractions vs existing MDs, then benchmark voyage-law-2."""
|
||||
|
||||
import json
import os
import time
from pathlib import Path

import voyageai
|
||||
|
||||
# SECURITY: the Voyage AI key used to be hard-coded on this line and was
# committed to version control. Treat that key as compromised and rotate it.
# The key is now taken from the VOYAGE_API_KEY environment variable.
# NOTE(review): when api_key is None the voyageai client is documented to
# look the key up itself (including VOYAGE_API_KEY) — confirm for the
# installed client version.
API_KEY = os.environ.get("VOYAGE_API_KEY")
client = voyageai.Client(api_key=API_KEY)

# Embedding model exercised by the benchmark.
MODEL = "voyage-law-2"

# Existing Markdown documents for the case live directly in DOCS_DIR; the
# Google Vision extractions being compared against them live in "extracted".
DOCS_DIR = Path("/home/chaim/legal-ai/data/cases/1130-25/documents")
GOOGLE_DIR = DOCS_DIR / "extracted"
|
||||
|
||||
# Document pairs to compare. Each entry is
#   (file name under GOOGLE_DIR — new Google Vision extraction,
#    file name under DOCS_DIR   — existing Markdown for the same document)
PAIRS = [
    # Pleadings and responses
    ("מרק קובר-כתב ערר.md", "2025-08-14-כתב-ערר-קובר.md"),
    ("תשובה לערר מטעם המשיבים.md", "2025-09-01-כתב-תשובה-ליבמן-לערר.md"),
    ("תשובת הועדה המרחבית לערר.md", "2025-09-02-כתב-תשובה-ועדת-הראל-לערר.md"),
    ("תשובת המשיב-יצחק מטמון.md", "2025-10-22-כתב-ערר-מטמון.md"),
    # Supplementary arguments and the replies to them
    ("השלמת טיעון מטעם משיבים 2-3.md", "2025-12-23-השלמת-טיעון-ליבמן.md"),
    ("תשובה מטעם העורר להשלמת טיעון.md", "2025-12-08-תגובת-קובר-לבקשת-השלמת-טיעון.md"),
    ("בקשה להשלמת טיעון ממשיבים 2-3.md", "2025-12-03-בקשה-להשלמת-טיעון-ליבמן.md"),
    ("השלמת טיעון מטעם הוועדה המקומית.md", "2026-02-04-השלמת-טיעון-ועדת-הראל.md"),
    (
        "תגובת העורר לתשובת ועדת הראל להשלמת הטיעון ערר.md",
        "2026-02-10-תגובת-קובר-להשלמת-טיעון-הראל.md",
    ),
    (
        "כתב תשובה-השלמת טיעון מטעם המשיב יצחק מטמון.md",
        "2026-02-12-כתב-תשובה-השלמת-טיעון-מטמון.md",
    ),
    (
        "בקשת העורר לדחיית השלמת הטיעון במלואה.md",
        "2026-01-13-תגובת-קובר-לדחיית-השלמת-טיעון.md",
    ),
    # Decisions and protocols
    ("1130-25-החלטה לתיקון פרוטוקול.md", "2025-11-27-החלטה-לתיקון-פרוטוקול.md"),
    ("החלטת ביניים 1130-25.md", "2025-12-31-החלטת-ביניים.md"),
    ("1130-25-פרוטוקול ועדת ערר והחלטה.md", "2025-10-27-פרוטוקול-דיון-ועדת-ערר.md"),
    (
        "פרוטוקול ועדה מקומית לדיון בתכנית 152-1257682.md",
        "2025-07-23-פרוטוקול-ועדה-מקומית-הראל.md",
    ),
]
|
||||
|
||||
# Hebrew search queries; each one is embedded and scored against every chunk
# of both document sets in the search-quality comparison.
QUERIES = [
    # Parking
    "מהי הטענה המרכזית של העוררים בנוגע לחניה?",
    "מה עמדת הוועדה המקומית לגבי התכנית?",
    "האם יש פגיעה בזכויות הבנייה של השכנים?",
    "מהם התנאים שנקבעו בהיתר הבנייה?",
    "האם התכנית עומדת בתקן החניה?",
    "מה טענות המשיבים לגבי הגובה והצפיפות?",
    "האם נערך שימוע כדין לפני מתן ההחלטה?",
    "מהם הנימוקים לאישור התכנית על ידי הוועדה המקומית?",
]
|
||||
|
||||
|
||||
def cosine_sim(a, b):
    """Return the cosine similarity of two numeric sequences.

    The dot product pairs elements with ``zip``, so extra trailing elements
    of the longer sequence are ignored there. If either vector has zero
    magnitude the result is ``0.0`` instead of a division by zero.
    """
    norm_a = sum(v * v for v in a) ** 0.5
    norm_b = sum(v * v for v in b) ** 0.5
    if not norm_a or not norm_b:
        return 0.0
    inner = sum(p * q for p, q in zip(a, b))
    return inner / (norm_a * norm_b)
|
||||
|
||||
|
||||
def chunk_text(text, chunk_size=600, overlap=100):
    """Split *text* into overlapping chunks of whitespace-separated words.

    Chunks start every ``chunk_size - overlap`` words and hold at most
    ``chunk_size`` words, re-joined with single spaces. The final chunk may
    be shorter than the others.

    Args:
        text: The text to split; it is tokenized with ``str.split()``.
        chunk_size: Maximum number of words per chunk.
        overlap: Number of words shared by consecutive chunks.

    Returns:
        A list of chunk strings; empty when *text* contains no words.

    Raises:
        ValueError: If ``overlap`` is not smaller than ``chunk_size``. The
            window would then never advance, which previously made this
            function loop forever.
    """
    step = chunk_size - overlap
    if step <= 0:
        raise ValueError(
            f"overlap ({overlap}) must be smaller than chunk_size ({chunk_size})"
        )

    words = text.split()
    return [
        " ".join(words[start:start + chunk_size])
        for start in range(0, len(words), step)
    ]
|
||||
|
||||
|
||||
def word_overlap(a, b):
    """Return the share of distinct words that two texts have in common.

    Each text is reduced to its set of whitespace-separated words. The result
    is the size of the intersection divided by the size of the larger set, or
    ``0.0`` when either text contains no words.
    """
    vocab_a = set(a.split())
    vocab_b = set(b.split())
    if not (vocab_a and vocab_b):
        return 0.0
    shared = vocab_a.intersection(vocab_b)
    return len(shared) / max(len(vocab_a), len(vocab_b))
|
||||
|
||||
|
||||
def _embed_batched(texts, label, batch_size=20):
    """Embed *texts* as documents in batches and print a one-line report.

    Args:
        texts: List of strings to embed with ``MODEL``.
        label: Short tag used in the printed report line.
        batch_size: Number of texts sent per ``client.embed`` call.

    Returns:
        A tuple ``(embeddings, total_tokens, elapsed_seconds)``.
    """
    embeddings = []
    total_tokens = 0
    started = time.perf_counter()
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        result = client.embed(batch, model=MODEL, input_type="document")
        embeddings.extend(result.embeddings)
        total_tokens += result.total_tokens
    elapsed = time.perf_counter() - started
    print(f" {label}: {len(texts)} chunks, {total_tokens:,} tokens, {elapsed:.1f}s")
    return embeddings, total_tokens, elapsed


def _best_match(query_emb, chunk_embs, chunks):
    """Return ``(score, document name)`` of the chunk most similar to the query."""
    return max(
        (cosine_sim(query_emb, emb), chunk[0])
        for emb, chunk in zip(chunk_embs, chunks)
    )


def main():
    """Compare the two document sets and benchmark retrieval on each.

    Part 1 prints word counts and word overlap for every pair in ``PAIRS``
    whose two files both exist. Part 2 chunks and embeds both sets with
    ``MODEL``. Part 3 embeds ``QUERIES`` and prints, per query, the best
    scoring chunk from each set, followed by a summary table and an
    estimated embedding cost.
    """
    rule = "=" * 70

    # ── Part 1: Document comparison ──
    print(rule)
    print("PART 1: DOCUMENT COMPARISON (Google Vision vs Existing)")
    print(rule)

    comparison_results = []
    all_new_chunks = []
    all_old_chunks = []

    for new_name, old_name in PAIRS:
        new_path = GOOGLE_DIR / new_name
        old_path = DOCS_DIR / old_name

        # Report both kinds of missing file; a missing extraction used to be
        # skipped without any output.
        if not new_path.exists():
            print(f" SKIP (no extraction): {new_name}")
            continue
        if not old_path.exists():
            print(f" SKIP (no existing): {old_name}")
            continue

        new_text = new_path.read_text(encoding="utf-8")
        old_text = old_path.read_text(encoding="utf-8")

        new_words = len(new_text.split())
        old_words = len(old_text.split())
        diff = new_words - old_words
        short_name = old_name[:40]

        comparison_results.append({
            "name": short_name,
            "old_words": old_words,
            "new_words": new_words,
            "diff": diff,
            "diff_pct": (diff / old_words * 100) if old_words else 0,
            "overlap": word_overlap(new_text, old_text),
        })

        # Chunk for embedding
        all_new_chunks.extend(
            (short_name, i, c) for i, c in enumerate(chunk_text(new_text))
        )
        all_old_chunks.extend(
            (short_name, i, c) for i, c in enumerate(chunk_text(old_text))
        )

    print(f"\n{'Document':<42} {'Old':>6} {'New':>6} {'Diff':>8} {'Overlap':>8}")
    print("-" * 72)
    for r in comparison_results:
        print(f" {r['name']:<40} {r['old_words']:>6} {r['new_words']:>6} {r['diff']:>+7} ({r['diff_pct']:>+.0f}%) {r['overlap']:>7.0%}")

    # ── Part 2: Embedding benchmark ──
    print(f"\n{rule}")
    print("PART 2: VOYAGE-LAW-2 EMBEDDING BENCHMARK")
    print(rule)

    new_texts = [c[2] for c in all_new_chunks]
    old_texts = [c[2] for c in all_old_chunks]

    print(f"\nNew chunks: {len(new_texts)}, Old chunks: {len(old_texts)}")

    # Without chunks on both sides there is nothing to rank; stop before any
    # API call is made (the top-1 lookup below would otherwise fail).
    if not new_texts or not old_texts:
        print("No chunks to compare; skipping embedding benchmark.")
        return

    # Embed new
    print("Embedding NEW (Google Vision) chunks...")
    new_embs, new_tokens, new_time = _embed_batched(new_texts, "NEW")

    # Embed old
    print("Embedding OLD (existing) chunks...")
    old_embs, old_tokens, old_time = _embed_batched(old_texts, "OLD")

    # Embed queries
    print(f"Embedding {len(QUERIES)} queries...")
    q_embs = client.embed(QUERIES, model=MODEL, input_type="query").embeddings

    # Search and compare
    print(f"\n{rule}")
    print("PART 3: SEARCH QUALITY COMPARISON")
    print(rule)

    # Best score per query, kept so the summary does not have to recompute
    # every query/chunk similarity.
    new_best = []
    old_best = []

    for number, (query, q_emb) in enumerate(zip(QUERIES, q_embs), start=1):
        new_score, new_doc = _best_match(q_emb, new_embs, all_new_chunks)
        old_score, old_doc = _best_match(q_emb, old_embs, all_old_chunks)
        new_best.append(new_score)
        old_best.append(old_score)

        print(f"\nQ{number}: {query}")
        print(f" {'NEW top-1':>10}: [{new_score:.4f}] {new_doc}")
        print(f" {'OLD top-1':>10}: [{old_score:.4f}] {old_doc}")
        if new_score > old_score:
            print(f" >> NEW better by {new_score - old_score:.4f}")
        else:
            print(f" >> OLD better by {old_score - new_score:.4f}")

    # Summary
    new_avg = sum(new_best) / len(QUERIES)
    old_avg = sum(old_best) / len(QUERIES)

    print(f"\n{rule}")
    print("SUMMARY")
    print(rule)
    print(f" {'Metric':<30} {'Old (existing)':>15} {'New (Google Vision)':>20}")
    print(f" {'-' * 65}")
    print(f" {'Total chunks':<30} {len(old_texts):>15} {len(new_texts):>20}")
    print(f" {'Total tokens':<30} {old_tokens:>15,} {new_tokens:>20,}")
    print(f" {'Embed time':<30} {old_time:>14.1f}s {new_time:>19.1f}s")
    print(f" {'Avg top-1 score':<30} {old_avg:>15.4f} {new_avg:>20.4f}")
    print(f" {'Score difference':<30} {'':>15} {new_avg - old_avg:>+20.4f}")

    # Covers document tokens only; query tokens are not included.
    # NOTE(review): 0.12 is presumably USD per million tokens — confirm
    # against current voyage-law-2 pricing.
    est_cost = (new_tokens + old_tokens) / 1_000_000 * 0.12
    print(f"\n Embedding cost: ${est_cost:.3f}")


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user