Files
legal-ai/scripts/benchmark_new_vs_old.py
Chaim 22e819363e Flatten cases directory structure and unify paths
- Remove cases/new|in-progress|completed subdivision (status managed in DB)
- Rename documents/original → documents/originals (consistent plural)
- Move exports from global data/exports/ into cases/{num}/exports/
- Add documents/research/ for case law and analysis files
- Update all agents, scripts, config, web API endpoints, and DB paths

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-09 14:33:27 +00:00

204 lines
8.3 KiB
Python

"""Compare Google Vision extractions vs existing MDs, then benchmark voyage-law-2."""
import json
import os
import time
from pathlib import Path

import voyageai
# The Voyage AI key is read from the environment so the secret never lives in
# source control. When VOYAGE_API_KEY is unset, API_KEY is None and the client
# is left to resolve credentials through its own default lookup.
# NOTE(review): a literal key was previously committed on this line; it should
# be treated as leaked and revoked/rotated.
API_KEY = os.environ.get("VOYAGE_API_KEY")
client = voyageai.Client(api_key=API_KEY)

# Embedding model used for every document chunk and query in this benchmark.
MODEL = "voyage-law-2"

# Folder holding the existing Markdown files of the case, and the sub-folder
# holding the new Google Vision extractions they are compared against.
DOCS_DIR = Path("/home/chaim/legal-ai/data/cases/1130-25/documents")
GOOGLE_DIR = DOCS_DIR / "extracted"
# Pairs each new (Google Vision) extraction with the existing Markdown file it
# is compared against. Every entry is (new file name, existing file name); the
# first is looked up under GOOGLE_DIR and the second under DOCS_DIR.
PAIRS = [
    ("מרק קובר-כתב ערר.md", "2025-08-14-כתב-ערר-קובר.md"),
    ("תשובה לערר מטעם המשיבים.md", "2025-09-01-כתב-תשובה-ליבמן-לערר.md"),
    ("תשובת הועדה המרחבית לערר.md", "2025-09-02-כתב-תשובה-ועדת-הראל-לערר.md"),
    ("תשובת המשיב-יצחק מטמון.md", "2025-10-22-כתב-ערר-מטמון.md"),
    ("השלמת טיעון מטעם משיבים 2-3.md", "2025-12-23-השלמת-טיעון-ליבמן.md"),
    ("תשובה מטעם העורר להשלמת טיעון.md", "2025-12-08-תגובת-קובר-לבקשת-השלמת-טיעון.md"),
    ("בקשה להשלמת טיעון ממשיבים 2-3.md", "2025-12-03-בקשה-להשלמת-טיעון-ליבמן.md"),
    ("השלמת טיעון מטעם הוועדה המקומית.md", "2026-02-04-השלמת-טיעון-ועדת-הראל.md"),
    ("תגובת העורר לתשובת ועדת הראל להשלמת הטיעון ערר.md", "2026-02-10-תגובת-קובר-להשלמת-טיעון-הראל.md"),
    ("כתב תשובה-השלמת טיעון מטעם המשיב יצחק מטמון.md", "2026-02-12-כתב-תשובה-השלמת-טיעון-מטמון.md"),
    ("בקשת העורר לדחיית השלמת הטיעון במלואה.md", "2026-01-13-תגובת-קובר-לדחיית-השלמת-טיעון.md"),
    ("1130-25-החלטה לתיקון פרוטוקול.md", "2025-11-27-החלטה-לתיקון-פרוטוקול.md"),
    ("החלטת ביניים 1130-25.md", "2025-12-31-החלטת-ביניים.md"),
    ("1130-25-פרוטוקול ועדת ערר והחלטה.md", "2025-10-27-פרוטוקול-דיון-ועדת-ערר.md"),
    ("פרוטוקול ועדה מקומית לדיון בתכנית 152-1257682.md", "2025-07-23-פרוטוקול-ועדה-מקומית-הראל.md"),
]
# Natural-language questions (in Hebrew) used as retrieval probes: each one is
# embedded as a query and scored against every document chunk.
QUERIES = [
    "מהי הטענה המרכזית של העוררים בנוגע לחניה?",
    "מה עמדת הוועדה המקומית לגבי התכנית?",
    "האם יש פגיעה בזכויות הבנייה של השכנים?",
    "מהם התנאים שנקבעו בהיתר הבנייה?",
    "האם התכנית עומדת בתקן החניה?",
    "מה טענות המשיבים לגבי הגובה והצפיפות?",
    "האם נערך שימוע כדין לפני מתן ההחלטה?",
    "מהם הנימוקים לאישור התכנית על ידי הוועדה המקומית?",
]
def cosine_sim(a, b):
    """Return the cosine similarity between two numeric vectors.

    The dot product pairs components with ``zip``, so any surplus trailing
    components of the longer vector are ignored there, while each norm is
    taken over its full vector. If either norm is zero the result is 0.0
    instead of a division error.
    """
    norm_a = sum(v * v for v in a) ** 0.5
    norm_b = sum(v * v for v in b) ** 0.5
    if not norm_a or not norm_b:
        return 0.0
    inner = sum(p * q for p, q in zip(a, b))
    return inner / (norm_a * norm_b)
def chunk_text(text, chunk_size=600, overlap=100):
    """Split *text* into overlapping chunks of whitespace-separated words.

    Args:
        text: The string to split; words are obtained with ``str.split()``.
        chunk_size: Maximum number of words per chunk.
        overlap: Number of words shared between consecutive chunks.

    Returns:
        A list of chunk strings, each with its words joined by single spaces.
        Window starts advance by ``chunk_size - overlap`` words, so the final
        chunk may be shorter than ``chunk_size``. Empty or whitespace-only
        text yields an empty list.

    Raises:
        ValueError: If ``overlap`` is not smaller than ``chunk_size``. The
            window would then never advance and chunking could not finish.
    """
    step = chunk_size - overlap
    if step <= 0:
        raise ValueError(
            f"overlap ({overlap}) must be smaller than chunk_size ({chunk_size})"
        )
    words = text.split()
    return [
        " ".join(words[start:start + chunk_size])
        for start in range(0, len(words), step)
    ]
def word_overlap(a, b):
    """Return the vocabulary overlap ratio of two texts.

    Each text is reduced to its set of distinct whitespace-separated words.
    The result is the number of shared words divided by the size of the
    larger set, or 0.0 when either text contains no words.
    """
    vocab_a = set(a.split())
    vocab_b = set(b.split())
    if not (vocab_a and vocab_b):
        return 0.0
    shared = vocab_a.intersection(vocab_b)
    larger = max(len(vocab_a), len(vocab_b))
    return len(shared) / larger
def main():
    """Compare the two extractions of each document and benchmark retrieval.

    Part 1 reads every pair in PAIRS whose two files both exist, prints word
    counts and vocabulary overlap, and chunks both versions. Part 2 embeds
    all chunks and all QUERIES with MODEL. Part 3 prints, per query, the
    best-scoring chunk of each version, followed by a summary table.

    Everything is reported on stdout; the function returns None. If no pair
    produces any chunks the embedding parts are skipped.
    """
    # ── Part 1: Document comparison ──
    print("=" * 70)
    print("PART 1: DOCUMENT COMPARISON (Google Vision vs Existing)")
    print("=" * 70)
    comparison_results = []
    all_new_chunks = []  # (document label, chunk index, chunk text)
    all_old_chunks = []
    for new_name, old_name in PAIRS:
        new_path = GOOGLE_DIR / new_name
        old_path = DOCS_DIR / old_name
        if not new_path.exists():
            # Report this the same way as a missing existing file, so a
            # shorter table is never a silent surprise.
            print(f" SKIP (no extraction): {new_name}")
            continue
        if not old_path.exists():
            print(f" SKIP (no existing): {old_name}")
            continue
        new_text = new_path.read_text(encoding="utf-8")
        old_text = old_path.read_text(encoding="utf-8")
        new_words = len(new_text.split())
        old_words = len(old_text.split())
        # The label is cut to 40 characters to fit the table column.
        short_name = old_name[:40]
        diff = new_words - old_words
        comparison_results.append({
            "name": short_name,
            "old_words": old_words,
            "new_words": new_words,
            "diff": diff,
            "diff_pct": (diff / old_words * 100) if old_words else 0,
            "overlap": word_overlap(new_text, old_text),
        })
        # Chunk for embedding
        all_new_chunks.extend(
            (short_name, i, c) for i, c in enumerate(chunk_text(new_text))
        )
        all_old_chunks.extend(
            (short_name, i, c) for i, c in enumerate(chunk_text(old_text))
        )
    print(f"\n{'Document':<42} {'Old':>6} {'New':>6} {'Diff':>8} {'Overlap':>8}")
    print("-" * 72)
    for r in comparison_results:
        print(f" {r['name']:<40} {r['old_words']:>6} {r['new_words']:>6} {r['diff']:>+7} ({r['diff_pct']:>+.0f}%) {r['overlap']:>7.0%}")

    # Without chunks on both sides there is nothing to rank, and the top-1
    # lookups below would fail on empty sequences.
    if not all_new_chunks or not all_old_chunks:
        print("\nNo chunks to embed on one or both sides; skipping the embedding benchmark.")
        return

    # ── Part 2: Embedding benchmark ──
    print(f"\n{'=' * 70}")
    print("PART 2: VOYAGE-LAW-2 EMBEDDING BENCHMARK")
    print(f"{'=' * 70}")
    new_texts = [c[2] for c in all_new_chunks]
    old_texts = [c[2] for c in all_old_chunks]
    print(f"\nNew chunks: {len(new_texts)}, Old chunks: {len(old_texts)}")
    # Embed new
    print("Embedding NEW (Google Vision) chunks...")
    new_embs, new_tokens, new_time = _embed_batched(new_texts, "NEW")
    # Embed old
    print("Embedding OLD (existing) chunks...")
    old_embs, old_tokens, old_time = _embed_batched(old_texts, "OLD")
    # Embed queries
    print(f"Embedding {len(QUERIES)} queries...")
    q_result = client.embed(QUERIES, model=MODEL, input_type="query")
    q_embs = q_result.embeddings

    # Search and compare
    print(f"\n{'=' * 70}")
    print("PART 3: SEARCH QUALITY COMPARISON")
    print(f"{'=' * 70}")
    # Top-1 scores are kept per query so the summary can average them
    # without recomputing every similarity.
    new_top_scores = []
    old_top_scores = []
    for qi, query in enumerate(QUERIES):
        new_best = _top_hit(q_embs[qi], all_new_chunks, new_embs)
        old_best = _top_hit(q_embs[qi], all_old_chunks, old_embs)
        new_top_scores.append(new_best[0])
        old_top_scores.append(old_best[0])
        print(f"\nQ{qi+1}: {query}")
        print(f" {'NEW top-1':>10}: [{new_best[0]:.4f}] {new_best[1]}")
        print(f" {'OLD top-1':>10}: [{old_best[0]:.4f}] {old_best[1]}")
        if new_best[0] > old_best[0]:
            print(f" >> NEW better by {new_best[0] - old_best[0]:.4f}")
        else:
            print(f" >> OLD better by {old_best[0] - new_best[0]:.4f}")

    # Summary
    new_avg = sum(new_top_scores) / len(QUERIES)
    old_avg = sum(old_top_scores) / len(QUERIES)
    print(f"\n{'=' * 70}")
    print("SUMMARY")
    print(f"{'=' * 70}")
    print(f" {'Metric':<30} {'Old (existing)':>15} {'New (Google Vision)':>20}")
    print(f" {'-' * 65}")
    print(f" {'Total chunks':<30} {len(old_texts):>15} {len(new_texts):>20}")
    print(f" {'Total tokens':<30} {old_tokens:>15,} {new_tokens:>20,}")
    print(f" {'Embed time':<30} {old_time:>14.1f}s {new_time:>19.1f}s")
    print(f" {'Avg top-1 score':<30} {old_avg:>15.4f} {new_avg:>20.4f}")
    print(f" {'Score difference':<30} {'':>15} {new_avg - old_avg:>+20.4f}")
    # Covers document-chunk tokens only; query tokens are not included.
    # NOTE(review): 0.12 is presumably the USD price per 1M tokens — verify
    # against current pricing.
    est_cost = (new_tokens + old_tokens) / 1_000_000 * 0.12
    print(f"\n Embedding cost: ${est_cost:.3f}")


def _embed_batched(texts, label):
    """Embed *texts* as documents in batches of 20 and print a one-line report.

    Returns a tuple ``(embeddings, total_tokens, elapsed_seconds)``.
    """
    batch_size = 20
    all_embs = []
    total_tokens = 0
    # perf_counter is monotonic, so the measured duration cannot be distorted
    # by system clock adjustments.
    t0 = time.perf_counter()
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        result = client.embed(batch, model=MODEL, input_type="document")
        all_embs.extend(result.embeddings)
        total_tokens += result.total_tokens
    elapsed = time.perf_counter() - t0
    print(f" {label}: {len(texts)} chunks, {total_tokens:,} tokens, {elapsed:.1f}s")
    return all_embs, total_tokens, elapsed


def _top_hit(query_emb, chunks, embs):
    """Return ``(score, document label, 60-char snippet)`` of the best chunk.

    The snippet stays in the tuple so ties on score are broken the same way
    as a descending sort of these tuples would break them.
    """
    return max(
        (cosine_sim(query_emb, emb), name, text[:60])
        for (name, _idx, text), emb in zip(chunks, embs)
    )
# Run the benchmark only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()