Flatten cases directory structure and unify paths

- Remove cases/new|in-progress|completed subdivision (status managed in DB)
- Rename documents/original → documents/originals (consistent plural)
- Move exports from global data/exports/ into cases/{num}/exports/
- Add documents/research/ for case law and analysis files
- Update all agents, scripts, config, web API endpoints, and DB paths

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-04-09 14:33:27 +00:00
parent 4d674bf475
commit 22e819363e
17 changed files with 1203 additions and 62 deletions

View File

@@ -0,0 +1,232 @@
"""Benchmark embedding models on case 1130-25 documents.
Compares voyage-3-large (current), voyage-4-large, and voyage-law-2
on Hebrew legal text retrieval quality, timing, and cost.
"""
import json
import os
import time
import sys
from pathlib import Path
import voyageai
# The Voyage API key must be supplied through the environment. A real key must
# never be committed as a fallback default; a missing variable now fails fast
# with a KeyError naming VOYAGE_API_KEY.
# NOTE(review): the key that used to be the default here is in version-control
# history and should be revoked.
API_KEY = os.environ["VOYAGE_API_KEY"]
client = voyageai.Client(api_key=API_KEY)
# Embedding models to compare; main() processes them in this order.
MODELS = [
"voyage-3-large", # current
"voyage-4-large", # upgrade candidate
"voyage-law-2", # legal specialist
]
# Pricing per 1M tokens (from Voyage AI docs); main() multiplies it by the
# token usage reported for each model to get the run cost in USD.
PRICING = {
"voyage-3-large": 0.06,
"voyage-4-large": 0.12,
"voyage-law-2": 0.12,
}
# Directory holding the markdown documents of case 1130-25.
DOCS_DIR = Path("/home/chaim/legal-ai/data/cases/1130-25/documents")
# Display name -> markdown file of every document in the benchmark corpus.
DOCUMENTS = {
"כתב ערר קובר": DOCS_DIR / "2025-08-14-כתב-ערר-קובר.md",
"כתב ערר מטמון": DOCS_DIR / "2025-10-22-כתב-ערר-מטמון.md",
"תשובת ועדת הראל": DOCS_DIR / "2025-09-02-כתב-תשובה-ועדת-הראל-לערר.md",
"תשובת ליבמן": DOCS_DIR / "2025-09-01-כתב-תשובה-ליבמן-לערר.md",
}
# Test queries — real questions a judge would ask about this case
QUERIES = [
"מהי הטענה המרכזית של העוררים בנוגע לחניה?",
"מה עמדת הוועדה המקומית לגבי התכנית?",
"האם יש פגיעה בזכויות הבנייה של השכנים?",
"מהם התנאים שנקבעו בהיתר הבנייה?",
"האם התכנית עומדת בתקן החניה?",
"מה טענות המשיבים לגבי הגובה והצפיפות?",
"האם נערך שימוע כדין לפני מתן ההחלטה?",
"מהם הנימוקים לאישור התכנית על ידי הוועדה המקומית?",
]
def chunk_text(text: str, chunk_size: int = 600, overlap: int = 100) -> list[str]:
    """Split *text* into overlapping chunks of whitespace-separated words.

    Args:
        text: Text to split; words are whatever ``str.split()`` yields.
        chunk_size: Maximum number of words per chunk; must be positive.
        overlap: Number of words shared by consecutive chunks; must satisfy
            ``0 <= overlap < chunk_size`` so the window always advances.

    Returns:
        The chunks in document order, or an empty list when *text* contains
        no words. The last chunk is the first one that reaches the end of the
        text, so no chunk consists solely of words already covered.

    Raises:
        ValueError: If *chunk_size* / *overlap* would keep the window from
            moving forward (previously this looped forever).
    """
    if chunk_size <= 0 or not 0 <= overlap < chunk_size:
        raise ValueError(
            f"need chunk_size > 0 and 0 <= overlap < chunk_size, "
            f"got chunk_size={chunk_size}, overlap={overlap}"
        )
    words = text.split()
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(words), step):
        chunks.append(" ".join(words[start:start + chunk_size]))
        # Once a chunk reaches the final word, any further window would only
        # repeat the overlap region of this chunk.
        if start + chunk_size >= len(words):
            break
    return chunks
def cosine_sim(a: list[float], b: list[float]) -> float:
    """Return the cosine similarity between vectors *a* and *b*.

    Components are paired with ``zip``, so any extra trailing components of
    the longer vector are left out of the dot product. The result is 0.0 when
    either vector has zero magnitude.
    """
    inner = sum(left * right for left, right in zip(a, b))
    magnitude_a = sum(component * component for component in a) ** 0.5
    magnitude_b = sum(component * component for component in b) ** 0.5
    if not (magnitude_a and magnitude_b):
        return 0.0
    return inner / (magnitude_a * magnitude_b)
def main():
    """Benchmark every model in MODELS on the configured case documents.

    Reads and chunks each file in DOCUMENTS, embeds the chunks and QUERIES
    with each model, ranks the chunks for every query by cosine similarity,
    prints timing / token / cost / agreement tables, and writes the full
    results as JSON.

    Raises:
        SystemExit: If the documents produce no chunks at all.
    """
    # Load and chunk documents
    print("=" * 70)
    print("Loading and chunking documents...")
    print("=" * 70)

    all_chunks = []  # (doc_name, chunk_index, text)
    for doc_name, doc_path in DOCUMENTS.items():
        text = doc_path.read_text(encoding="utf-8")
        chunks = chunk_text(text)
        all_chunks.extend((doc_name, i, chunk) for i, chunk in enumerate(chunks))
        print(f" {doc_name}: {len(text):,} chars, {len(text.split()):,} words -> {len(chunks)} chunks")

    chunk_texts = [c[2] for c in all_chunks]
    total_chunks = len(chunk_texts)
    print(f"\nTotal: {total_chunks} chunks")
    if not chunk_texts:
        # The ranking and the dimension lookup below need at least one chunk.
        raise SystemExit("No chunks to embed - check DOCUMENTS")

    # Estimate tokens (rough: 1 Hebrew word ~ 2-3 tokens)
    total_words = sum(len(t.split()) for t in chunk_texts)
    est_tokens_docs = int(total_words * 2.5)
    total_query_words = sum(len(q.split()) for q in QUERIES)
    est_tokens_queries = int(total_query_words * 2.5)
    print(f"Estimated tokens per model: ~{est_tokens_docs:,} (docs) + ~{est_tokens_queries:,} (queries)")

    results = {}
    for model in MODELS:
        print(f"\n{'=' * 70}")
        print(f"Model: {model}")
        print(f"{'=' * 70}")

        # Embed documents
        print(f" Embedding {total_chunks} chunks...")
        t0 = time.time()
        doc_embeddings = client.embed(
            chunk_texts,
            model=model,
            input_type="document",
        )
        doc_time = time.time() - t0
        doc_usage = doc_embeddings.total_tokens
        doc_embs = doc_embeddings.embeddings
        print(f" Done in {doc_time:.1f}s — {doc_usage:,} tokens used")

        # Embed queries
        print(f" Embedding {len(QUERIES)} queries...")
        t0 = time.time()
        query_embeddings = client.embed(
            QUERIES,
            model=model,
            input_type="query",
        )
        query_time = time.time() - t0
        query_usage = query_embeddings.total_tokens
        query_embs = query_embeddings.embeddings
        print(f" Done in {query_time:.1f}s — {query_usage:,} tokens used")

        total_tokens = doc_usage + query_usage
        cost = total_tokens / 1_000_000 * PRICING[model]

        # Search: for each query, rank chunks by similarity
        print(f"\n Search results:")
        query_results = []
        for qi, query in enumerate(QUERIES):
            scores = []
            for ci, doc_emb in enumerate(doc_embs):
                sim = cosine_sim(query_embs[qi], doc_emb)
                scores.append((sim, all_chunks[ci][0], all_chunks[ci][1], all_chunks[ci][2][:80]))
            scores.sort(reverse=True)
            top5 = scores[:5]
            query_results.append({
                "query": query,
                "top5": list(top5),
            })
            print(f"\n Q{qi+1}: {query}")
            for rank, (score, doc_name, chunk_idx, preview) in enumerate(top5):
                print(f" #{rank+1} [{score:.4f}] {doc_name} (chunk {chunk_idx}): {preview}...")

        results[model] = {
            "doc_time": doc_time,
            "query_time": query_time,
            "doc_tokens": doc_usage,
            "query_tokens": query_usage,
            "total_tokens": total_tokens,
            "cost_usd": cost,
            "dimensions": len(doc_embs[0]),
            "query_results": query_results,
        }

    # Summary comparison
    print(f"\n{'=' * 70}")
    print("SUMMARY")
    print(f"{'=' * 70}")
    print(f"\n{'Model':<25} {'Tokens':>10} {'Time':>8} {'Cost':>10} {'Dims':>6}")
    print("-" * 65)
    for model in MODELS:
        r = results[model]
        print(f"{model:<25} {r['total_tokens']:>10,} {r['doc_time']+r['query_time']:>7.1f}s ${r['cost_usd']:>8.5f} {r['dimensions']:>6}")

    # Compare top-1 agreement between models
    print(f"\n{'=' * 70}")
    print("TOP-1 AGREEMENT (which doc is ranked #1 for each query)")
    print(f"{'=' * 70}")
    print(f"\n{'Query':<50}", end="")
    for model in MODELS:
        # Strip only the shared "voyage-" prefix: taking the last dash-separated
        # piece labelled two different columns "large" and the third "2".
        print(f" {model.removeprefix('voyage-'):>10}", end="")
    print()
    print("-" * 85)
    for qi, query in enumerate(QUERIES):
        short_q = query[:48]
        print(f"{short_q:<50}", end="")
        for model in MODELS:
            top1_doc = results[model]["query_results"][qi]["top5"][0][1]
            # Shorten doc name
            short_doc = top1_doc[:10]
            print(f" {short_doc:>10}", end="")
        print()

    # Score distribution comparison
    print(f"\n{'=' * 70}")
    print("AVERAGE TOP-5 SCORES PER MODEL")
    print(f"{'=' * 70}")
    for model in MODELS:
        all_top5_scores = []
        for qr in results[model]["query_results"]:
            for score, _, _, _ in qr["top5"]:
                all_top5_scores.append(score)
        avg = sum(all_top5_scores) / len(all_top5_scores)
        top1_scores = [qr["top5"][0][0] for qr in results[model]["query_results"]]
        avg_top1 = sum(top1_scores) / len(top1_scores)
        print(f" {model:<25} avg top-1: {avg_top1:.4f} avg top-5: {avg:.4f}")

    # Save full results
    output_path = Path("/home/chaim/legal-ai/data/benchmark-embeddings.json")
    serializable = {}
    for model, r in results.items():
        serializable[model] = {
            "doc_time": r["doc_time"],
            "query_time": r["query_time"],
            "doc_tokens": r["doc_tokens"],
            "query_tokens": r["query_tokens"],
            "total_tokens": r["total_tokens"],
            "cost_usd": r["cost_usd"],
            "dimensions": r["dimensions"],
            "queries": [
                {
                    "query": qr["query"],
                    "top5": [{"score": s, "doc": d, "chunk": c, "preview": p} for s, d, c, p in qr["top5"]],
                }
                for qr in r["query_results"]
            ],
        }
    # ensure_ascii=False leaves the Hebrew text unescaped, so the file encoding
    # must be pinned instead of depending on the locale default.
    output_path.write_text(
        json.dumps(serializable, ensure_ascii=False, indent=2),
        encoding="utf-8",
    )
    print(f"\nFull results saved to {output_path}")


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,203 @@
"""Compare Google Vision extractions vs existing MDs, then benchmark voyage-law-2."""
import json
import time
from pathlib import Path
import voyageai
import os

# The Voyage API key must be supplied through the environment instead of being
# embedded in source; a missing variable fails fast with a KeyError naming
# VOYAGE_API_KEY.
# NOTE(review): the key that used to be hard-coded here is in version-control
# history and should be revoked.
API_KEY = os.environ["VOYAGE_API_KEY"]
client = voyageai.Client(api_key=API_KEY)
# The single embedding model exercised by this script.
MODEL = "voyage-law-2"
# Directory holding the existing markdown documents of case 1130-25.
DOCS_DIR = Path("/home/chaim/legal-ai/data/cases/1130-25/documents")
# Subdirectory the Google Vision extractions are read from.
GOOGLE_DIR = DOCS_DIR / "extracted"
# Map new (Google Vision) files to existing MDs
# Each entry is (file name under GOOGLE_DIR, file name under DOCS_DIR).
PAIRS = [
("מרק קובר-כתב ערר.md", "2025-08-14-כתב-ערר-קובר.md"),
("תשובה לערר מטעם המשיבים.md", "2025-09-01-כתב-תשובה-ליבמן-לערר.md"),
("תשובת הועדה המרחבית לערר.md", "2025-09-02-כתב-תשובה-ועדת-הראל-לערר.md"),
("תשובת המשיב-יצחק מטמון.md", "2025-10-22-כתב-ערר-מטמון.md"),
("השלמת טיעון מטעם משיבים 2-3.md", "2025-12-23-השלמת-טיעון-ליבמן.md"),
("תשובה מטעם העורר להשלמת טיעון.md", "2025-12-08-תגובת-קובר-לבקשת-השלמת-טיעון.md"),
("בקשה להשלמת טיעון ממשיבים 2-3.md", "2025-12-03-בקשה-להשלמת-טיעון-ליבמן.md"),
("השלמת טיעון מטעם הוועדה המקומית.md", "2026-02-04-השלמת-טיעון-ועדת-הראל.md"),
("תגובת העורר לתשובת ועדת הראל להשלמת הטיעון ערר.md", "2026-02-10-תגובת-קובר-להשלמת-טיעון-הראל.md"),
("כתב תשובה-השלמת טיעון מטעם המשיב יצחק מטמון.md", "2026-02-12-כתב-תשובה-השלמת-טיעון-מטמון.md"),
("בקשת העורר לדחיית השלמת הטיעון במלואה.md", "2026-01-13-תגובת-קובר-לדחיית-השלמת-טיעון.md"),
("1130-25-החלטה לתיקון פרוטוקול.md", "2025-11-27-החלטה-לתיקון-פרוטוקול.md"),
("החלטת ביניים 1130-25.md", "2025-12-31-החלטת-ביניים.md"),
("1130-25-פרוטוקול ועדת ערר והחלטה.md", "2025-10-27-פרוטוקול-דיון-ועדת-ערר.md"),
("פרוטוקול ועדה מקומית לדיון בתכנית 152-1257682.md", "2025-07-23-פרוטוקול-ועדה-מקומית-הראל.md"),
]
# Hebrew test queries; main() embeds them with input_type="query" and scores
# them against both the new and the old chunk sets.
QUERIES = [
"מהי הטענה המרכזית של העוררים בנוגע לחניה?",
"מה עמדת הוועדה המקומית לגבי התכנית?",
"האם יש פגיעה בזכויות הבנייה של השכנים?",
"מהם התנאים שנקבעו בהיתר הבנייה?",
"האם התכנית עומדת בתקן החניה?",
"מה טענות המשיבים לגבי הגובה והצפיפות?",
"האם נערך שימוע כדין לפני מתן ההחלטה?",
"מהם הנימוקים לאישור התכנית על ידי הוועדה המקומית?",
]
def cosine_sim(a, b):
    """Return the cosine similarity of the numeric sequences *a* and *b*.

    The dot product runs over ``zip(a, b)``, i.e. up to the shorter length.
    If either sequence has zero magnitude the similarity is defined as 0.0.
    """
    dot_product = sum(p * q for p, q in zip(a, b))
    length_a = sum(p * p for p in a) ** 0.5
    length_b = sum(q * q for q in b) ** 0.5
    if length_a and length_b:
        return dot_product / (length_a * length_b)
    return 0.0
def chunk_text(text, chunk_size=600, overlap=100):
    """Split *text* into overlapping chunks of whitespace-separated words.

    Args:
        text: Text to split; words are whatever ``str.split()`` yields.
        chunk_size: Maximum number of words per chunk; must be positive.
        overlap: Number of words shared by consecutive chunks; must satisfy
            ``0 <= overlap < chunk_size`` so the window always advances.

    Returns:
        The chunks in document order, or an empty list when *text* contains
        no words. The last chunk is the first one that reaches the end of the
        text, so no chunk consists solely of words already covered.

    Raises:
        ValueError: If *chunk_size* / *overlap* would keep the window from
            moving forward (previously this looped forever).
    """
    if chunk_size <= 0 or not 0 <= overlap < chunk_size:
        raise ValueError(
            f"need chunk_size > 0 and 0 <= overlap < chunk_size, "
            f"got chunk_size={chunk_size}, overlap={overlap}"
        )
    words = text.split()
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(words), step):
        chunks.append(" ".join(words[start:start + chunk_size]))
        # Once a chunk reaches the final word, any further window would only
        # repeat the overlap region of this chunk.
        if start + chunk_size >= len(words):
            break
    return chunks
def word_overlap(a, b):
    """Return the share of distinct words that *a* and *b* have in common.

    The number of shared unique whitespace-separated words is divided by the
    size of the larger of the two vocabularies. Returns 0.0 when either text
    contains no words.
    """
    vocab_a = set(a.split())
    vocab_b = set(b.split())
    if vocab_a and vocab_b:
        shared = vocab_a.intersection(vocab_b)
        return len(shared) / max(len(vocab_a), len(vocab_b))
    return 0.0
def main():
    """Compare Google Vision extractions with the existing MDs and benchmark both.

    Part 1 prints word counts and word overlap for every PAIRS entry whose two
    files exist. Part 2 embeds the chunks of both versions and the QUERIES
    with MODEL. Part 3 prints, per query, the best-scoring chunk of each
    version, followed by a summary table and the embedding cost.

    Raises:
        SystemExit: If no pair produced chunks for both versions.
    """
    # ── Part 1: Document comparison ──
    print("=" * 70)
    print("PART 1: DOCUMENT COMPARISON (Google Vision vs Existing)")
    print("=" * 70)

    comparison_results = []
    all_new_chunks = []
    all_old_chunks = []
    for new_name, old_name in PAIRS:
        new_path = GOOGLE_DIR / new_name
        old_path = DOCS_DIR / old_name
        if not new_path.exists():
            # Report this skip too, so a missing extraction is not mistaken
            # for a document that was compared.
            print(f" SKIP (no extraction): {new_name}")
            continue
        if not old_path.exists():
            print(f" SKIP (no existing): {old_name}")
            continue

        new_text = new_path.read_text(encoding="utf-8")
        old_text = old_path.read_text(encoding="utf-8")
        new_words = len(new_text.split())
        old_words = len(old_text.split())
        overlap = word_overlap(new_text, old_text)
        short_name = old_name[:40]
        diff = new_words - old_words
        diff_pct = (diff / old_words * 100) if old_words else 0
        comparison_results.append({
            "name": short_name,
            "old_words": old_words,
            "new_words": new_words,
            "diff": diff,
            "diff_pct": diff_pct,
            "overlap": overlap,
        })

        # Chunk for embedding
        new_chunks = chunk_text(new_text)
        old_chunks = chunk_text(old_text)
        for i, c in enumerate(new_chunks):
            all_new_chunks.append((short_name, i, c))
        for i, c in enumerate(old_chunks):
            all_old_chunks.append((short_name, i, c))

    print(f"\n{'Document':<42} {'Old':>6} {'New':>6} {'Diff':>8} {'Overlap':>8}")
    print("-" * 72)
    for r in comparison_results:
        print(f" {r['name']:<40} {r['old_words']:>6} {r['new_words']:>6} {r['diff']:>+7} ({r['diff_pct']:>+.0f}%) {r['overlap']:>7.0%}")

    # ── Part 2: Embedding benchmark ──
    print(f"\n{'=' * 70}")
    print("PART 2: VOYAGE-LAW-2 EMBEDDING BENCHMARK")
    print(f"{'=' * 70}")
    new_texts = [c[2] for c in all_new_chunks]
    old_texts = [c[2] for c in all_old_chunks]
    print(f"\nNew chunks: {len(new_texts)}, Old chunks: {len(old_texts)}")
    if not new_texts or not old_texts:
        # The top-1 lookups in Part 3 need at least one chunk on each side.
        raise SystemExit("No document pairs found - nothing to embed")

    def embed_batched(texts, label):
        """Embed *texts* in batches of 20; return (embeddings, tokens, seconds)."""
        BATCH = 20
        all_embs = []
        total_tokens = 0
        t0 = time.time()
        for i in range(0, len(texts), BATCH):
            batch = texts[i:i+BATCH]
            result = client.embed(batch, model=MODEL, input_type="document")
            all_embs.extend(result.embeddings)
            total_tokens += result.total_tokens
        elapsed = time.time() - t0
        print(f" {label}: {len(texts)} chunks, {total_tokens:,} tokens, {elapsed:.1f}s")
        return all_embs, total_tokens, elapsed

    # Embed new
    print("Embedding NEW (Google Vision) chunks...")
    new_embs, new_tokens, new_time = embed_batched(new_texts, "NEW")
    # Embed old
    print("Embedding OLD (existing) chunks...")
    old_embs, old_tokens, old_time = embed_batched(old_texts, "OLD")
    # Embed queries
    print(f"Embedding {len(QUERIES)} queries...")
    q_result = client.embed(QUERIES, model=MODEL, input_type="query")
    q_embs = q_result.embeddings

    # Search and compare
    print(f"\n{'=' * 70}")
    print("PART 3: SEARCH QUALITY COMPARISON")
    print(f"{'=' * 70}")
    # Best score per query, kept for the summary so the similarities are not
    # all computed a second time.
    new_top1 = []
    old_top1 = []
    for qi, query in enumerate(QUERIES):
        # Score against new
        new_scores = [(cosine_sim(q_embs[qi], e), all_new_chunks[i][0], all_new_chunks[i][2][:60]) for i, e in enumerate(new_embs)]
        new_scores.sort(reverse=True)
        # Score against old
        old_scores = [(cosine_sim(q_embs[qi], e), all_old_chunks[i][0], all_old_chunks[i][2][:60]) for i, e in enumerate(old_embs)]
        old_scores.sort(reverse=True)
        new_top1.append(new_scores[0][0])
        old_top1.append(old_scores[0][0])

        print(f"\nQ{qi+1}: {query}")
        print(f" {'NEW top-1':>10}: [{new_scores[0][0]:.4f}] {new_scores[0][1]}")
        print(f" {'OLD top-1':>10}: [{old_scores[0][0]:.4f}] {old_scores[0][1]}")
        if new_scores[0][0] > old_scores[0][0]:
            print(f" >> NEW better by {new_scores[0][0] - old_scores[0][0]:.4f}")
        else:
            print(f" >> OLD better by {old_scores[0][0] - new_scores[0][0]:.4f}")

    # Summary
    new_avg = sum(new_top1) / len(QUERIES)
    old_avg = sum(old_top1) / len(QUERIES)
    print(f"\n{'=' * 70}")
    print("SUMMARY")
    print(f"{'=' * 70}")
    print(f" {'Metric':<30} {'Old (existing)':>15} {'New (Google Vision)':>20}")
    print(f" {'-' * 65}")
    print(f" {'Total chunks':<30} {len(old_texts):>15} {len(new_texts):>20}")
    print(f" {'Total tokens':<30} {old_tokens:>15,} {new_tokens:>20,}")
    print(f" {'Embed time':<30} {old_time:>14.1f}s {new_time:>19.1f}s")
    print(f" {'Avg top-1 score':<30} {old_avg:>15.4f} {new_avg:>20.4f}")
    print(f" {'Score difference':<30} {'':>15} {new_avg - old_avg:>+20.4f}")
    # Document tokens only; 0.12 is the per-1M-token rate used for this model.
    est_cost = (new_tokens + old_tokens) / 1_000_000 * 0.12
    print(f"\n Embedding cost: ${est_cost:.3f}")


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,126 @@
"""Compare existing MD files with freshly extracted text from PDFs."""
import difflib
from pathlib import Path
# Directory holding the existing markdown documents of case 1130-25.
DOCS_DIR = Path("/home/chaim/legal-ai/data/cases/1130-25/documents")
# Subdirectory the freshly extracted markdown files are read from.
EXTRACTED_DIR = DOCS_DIR / "extracted"
# Map: existing MD -> extracted MD
# Each entry is (file name under DOCS_DIR, file name under EXTRACTED_DIR,
# English label printed in the report).
PAIRS = [
("2025-08-14-כתב-ערר-קובר.md", "מרק קובר-כתב ערר.md", "Appeal - Kuber"),
("2025-09-01-כתב-תשובה-ליבמן-לערר.md", "תשובה לערר מטעם המשיבים.md", "Response - Livman"),
("2025-09-02-כתב-תשובה-ועדת-הראל-לערר.md", "תשובת הועדה המרחבית לערר.md", "Response - Committee"),
("2025-10-22-כתב-ערר-מטמון.md", "תשובת המשיב-יצחק מטמון.md", "Response - Matmon"),
]
def normalize(text: str) -> str:
    """Return *text* with every line trimmed and blank lines removed.

    Surrounding whitespace is stripped from the text as a whole and from each
    line; lines that are empty after trimming are dropped. Markdown markup
    itself is left untouched.
    """
    trimmed_lines = (raw.strip() for raw in text.strip().split("\n"))
    return "\n".join(line for line in trimmed_lines if line)
def word_overlap(a: str, b: str) -> float:
    """Return the fraction of the larger vocabulary that both texts share.

    Each text is reduced to its set of whitespace-separated words; the result
    is ``len(common) / max(len(set_a), len(set_b))``, or 0.0 if either set is
    empty.
    """
    unique_a, unique_b = set(a.split()), set(b.split())
    if not (unique_a and unique_b):
        return 0.0
    larger = max(len(unique_a), len(unique_b))
    return len(unique_a & unique_b) / larger
def main():
    """Print a comparison of each existing MD with its fresh extraction.

    For every PAIRS entry whose two files exist, prints character, word and
    line counts, the word overlap, a SequenceMatcher ratio over the first
    5000 normalized characters, and up to three sample lines unique to each
    side. A summary table over all compared pairs follows.
    """
    print(f"{'=' * 70}")
    print("COMPARISON: Existing MD vs Fresh PDF Extraction")
    print(f"{'=' * 70}\n")

    summary = []
    for existing_name, extracted_name, label in PAIRS:
        existing_path = DOCS_DIR / existing_name
        extracted_path = EXTRACTED_DIR / extracted_name
        if not existing_path.exists():
            print(f"SKIP: {existing_name} not found")
            continue
        if not extracted_path.exists():
            print(f"SKIP: {extracted_name} not found")
            continue

        existing_text = existing_path.read_text(encoding="utf-8")
        extracted_text = extracted_path.read_text(encoding="utf-8")
        existing_norm = normalize(existing_text)
        extracted_norm = normalize(extracted_text)

        # Stats
        existing_chars = len(existing_text)
        extracted_chars = len(extracted_text)
        existing_words = len(existing_text.split())
        extracted_words = len(extracted_text.split())

        # Similarity
        overlap = word_overlap(existing_norm, extracted_norm)
        # Sequence matcher ratio (slower but more accurate)
        # Use first 5000 chars for speed
        sm = difflib.SequenceMatcher(None, existing_norm[:5000], extracted_norm[:5000])
        seq_ratio = sm.ratio()

        # The "Lines" row counts every normalized line; the sets are used only
        # for the difference computations. Taking len() of the sets reported
        # the number of distinct lines and under-counted repeated ones.
        existing_line_list = existing_norm.split("\n")
        extracted_line_list = extracted_norm.split("\n")
        existing_line_count = len(existing_line_list)
        extracted_line_count = len(extracted_line_list)

        # Find lines in extracted but not in existing (new content)
        existing_lines = set(existing_line_list)
        extracted_lines = set(extracted_line_list)
        new_lines = extracted_lines - existing_lines
        missing_lines = existing_lines - extracted_lines

        print(f"{'=' * 70}")
        print(f" {label}")
        print(f" Existing: {existing_name}")
        print(f" Extracted: {extracted_name}")
        print(f"{'=' * 70}")
        print(f" {'Metric':<30} {'Existing MD':>15} {'Fresh PDF':>15} {'Diff':>10}")
        print(f" {'-' * 70}")
        print(f" {'Characters':<30} {existing_chars:>15,} {extracted_chars:>15,} {extracted_chars - existing_chars:>+10,}")
        print(f" {'Words':<30} {existing_words:>15,} {extracted_words:>15,} {extracted_words - existing_words:>+10,}")
        print(f" {'Lines':<30} {existing_line_count:>15,} {extracted_line_count:>15,} {extracted_line_count - existing_line_count:>+10,}")
        print(f" {'Word overlap':<30} {overlap:>15.1%}")
        print(f" {'Sequence similarity':<30} {seq_ratio:>15.1%}")
        print(f" {'Lines only in fresh PDF':<30} {len(new_lines):>15}")
        print(f" {'Lines only in existing MD':<30} {len(missing_lines):>15}")

        # Show sample differences
        if new_lines:
            print(f"\n Sample lines ONLY in fresh extraction (first 3):")
            for line in sorted(new_lines)[:3]:
                print(f" + {line[:100]}")
        if missing_lines:
            print(f"\n Sample lines ONLY in existing MD (first 3):")
            for line in sorted(missing_lines)[:3]:
                print(f" - {line[:100]}")
        print()

        summary.append({
            "label": label,
            "existing_words": existing_words,
            "extracted_words": extracted_words,
            "word_overlap": overlap,
            "seq_similarity": seq_ratio,
        })

    # Summary table
    print(f"\n{'=' * 70}")
    print("SUMMARY")
    print(f"{'=' * 70}")
    print(f" {'Document':<25} {'Existing':>10} {'Fresh':>10} {'Overlap':>10} {'Similarity':>12}")
    print(f" {'-' * 67}")
    for s in summary:
        print(f" {s['label']:<25} {s['existing_words']:>10,} {s['extracted_words']:>10,} {s['word_overlap']:>10.1%} {s['seq_similarity']:>12.1%}")


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,128 @@
"""Extract ALL PDFs from originals using Google Cloud Vision OCR.
Forces OCR on all pages (ignoring broken text layers).
Then runs voyage-law-2 embedding benchmark comparing old vs new.
"""
import asyncio
import json
import sys
import time
from pathlib import Path
sys.path.insert(0, str(Path(__file__).resolve().parent.parent / "mcp-server" / "src"))
from dotenv import load_dotenv
load_dotenv(Path.home() / ".env")
import fitz
from google.cloud import vision
from legal_mcp import config
# The Vision API key is taken from the project's config module.
API_KEY = config.GOOGLE_CLOUD_VISION_API_KEY
# Module-level Vision client shared by ocr_page(); authenticated with the API key.
client = vision.ImageAnnotatorClient(client_options={"api_key": API_KEY})
# Source PDFs of case 1130-25.
ORIGINALS_DIR = Path("/home/chaim/legal-ai/data/cases/1130-25/documents/originals")
# Extracted text is written next to (not inside) the originals directory.
OUTPUT_DIR = ORIGINALS_DIR.parent / "extracted"
# Hebrew abbreviation quote fixer
import re
# Mis-read form -> correct abbreviation. In every key the gershayim (") of the
# abbreviation appears as two yods (יי).
_ABBREV_FIXES = {
    'עוהייד': 'עוה"ד', 'עוייד': 'עו"ד', 'הנייל': 'הנ"ל',
    'מצייב': 'מצ"ב', 'ביהמייש': 'ביהמ"ש', 'תייז': 'ת"ז',
    'עייי': 'ע"י', 'אחייכ': 'אח"כ', 'סייק': 'ס"ק',
    'דייר': 'ד"ר', 'כדוייח': 'כדו"ח', 'חווייד': 'חוו"ד',
    'מייר': 'מ"ר', 'יחייד': 'יח"ד', 'בייכ': 'ב"כ',
}
# Longest keys are tried first so a longer abbreviation wins over a shorter
# one. The trailing negative lookahead requires the match to end its word:
# without it, ordinary words that merely start with a key (e.g. דיירים) were
# rewritten as well. No leading boundary is required, so one-letter prefixes
# attached to an abbreviation still match.
_ABBREV_PAT = re.compile(
    '(?:'
    + '|'.join(re.escape(k) for k in sorted(_ABBREV_FIXES, key=len, reverse=True))
    + ')(?![א-ת])'
)


def fix_quotes(text):
    """Restore the gershayim in known Hebrew abbreviations found in *text*.

    Every occurrence of a key of ``_ABBREV_FIXES`` that is not immediately
    followed by another Hebrew letter is replaced by its quoted form; all
    other text is returned unchanged.
    """
    return _ABBREV_PAT.sub(lambda m: _ABBREV_FIXES[m.group()], text)
def ocr_page(image_bytes, page_num):
    """Run Google Vision document text detection on one page image.

    Args:
        image_bytes: Encoded image content passed to ``vision.Image``.
        page_num: Page number, used only in the error message.

    Returns:
        The detected text after ``fix_quotes``; an empty string when the
        response carries an error message (which is printed) or has no
        full-text annotation.
    """
    hebrew_hint = vision.ImageContext(language_hints=["he"])
    response = client.document_text_detection(
        image=vision.Image(content=image_bytes),
        image_context=hebrew_hint,
    )
    if response.error.message:
        print(f" ERROR page {page_num}: {response.error.message}")
        return ""
    annotation = response.full_text_annotation
    if annotation:
        recognized = annotation.text
    else:
        recognized = ""
    return fix_quotes(recognized)
def process_pdf(pdf_path):
    """Render every page of *pdf_path* at 300 DPI and OCR it with ocr_page().

    Args:
        pdf_path: Path of the PDF to process.

    Returns:
        A ``(text, page_count, seconds)`` tuple: the page texts joined by
        blank lines, the number of pages, and the wall-clock time spent on
        rendering plus OCR.
    """
    doc = fitz.open(str(pdf_path))
    pages_text = []
    try:
        page_count = len(doc)
        t0 = time.time()
        for i in range(page_count):
            page = doc[i]
            pix = page.get_pixmap(dpi=300)
            img_bytes = pix.tobytes("png")
            pt = time.time()
            text = ocr_page(img_bytes, i + 1)
            elapsed = time.time() - pt
            pages_text.append(text)
            print(f" Page {i+1}/{page_count}: {len(text):,} chars, {elapsed:.1f}s")
    finally:
        # Release the document even when rendering or the OCR call raises.
        doc.close()
    total_time = time.time() - t0
    return "\n\n".join(pages_text), page_count, total_time
def main():
    """OCR every PDF in ORIGINALS_DIR into OUTPUT_DIR and record run statistics.

    A PDF whose output file already exists and is larger than 100 bytes is not
    OCR'd again; it is reported as skipped with time 0. Per-document
    statistics are saved as JSON at the end.
    """
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    pdfs = sorted(ORIGINALS_DIR.glob("*.pdf"))
    print(f"Found {len(pdfs)} PDFs\n")

    results = []
    total_pages = 0
    total_time = 0.0
    for pdf in pdfs:
        out_file = OUTPUT_DIR / f"{pdf.stem}.md"
        # Skip already extracted
        if out_file.exists() and out_file.stat().st_size > 100:
            text = out_file.read_text(encoding="utf-8")
            # The PDF is opened only to count its pages for the totals.
            doc = fitz.open(str(pdf))
            try:
                pages = len(doc)
            finally:
                doc.close()
            print(f"SKIP (exists): {pdf.name} ({pages} pages, {len(text):,} chars)")
            results.append({"name": pdf.stem, "pages": pages, "chars": len(text), "words": len(text.split()), "time": 0, "skipped": True})
            total_pages += pages
            continue

        print(f"{'=' * 60}")
        print(f" {pdf.name} ({pdf.stat().st_size:,} bytes)")
        text, pages, elapsed = process_pdf(pdf)
        total_pages += pages
        total_time += elapsed
        out_file.write_text(text, encoding="utf-8")
        words = len(text.split())
        print(f" Result: {pages} pages, {len(text):,} chars, {words:,} words, {elapsed:.1f}s")
        print(f" Saved: {out_file.name}\n")
        results.append({"name": pdf.stem, "pages": pages, "chars": len(text), "words": words, "time": elapsed, "skipped": False})

    print(f"\n{'=' * 60}")
    print(f"TOTAL: {len(pdfs)} docs, {total_pages} pages, {total_time:.1f}s")
    # total_pages also includes the pages of skipped documents.
    est_cost = total_pages * 0.0015
    print(f"Estimated cost: ${est_cost:.2f}")

    # Save results
    # ensure_ascii=False keeps the file names unescaped, so the encoding is
    # pinned instead of depending on the locale default.
    Path("/home/chaim/legal-ai/data/google-vision-extraction.json").write_text(
        json.dumps(results, ensure_ascii=False, indent=2),
        encoding="utf-8",
    )


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,66 @@
"""Extract text from PDF using Google Cloud Vision API."""
import io
import time
from pathlib import Path
import fitz # PyMuPDF for rendering pages to images
from google.cloud import vision
import os

# The Google Cloud Vision API key must be supplied through the environment
# instead of being embedded in source; a missing variable fails fast with a
# KeyError naming GOOGLE_CLOUD_VISION_API_KEY.
# NOTE(review): the key that used to be hard-coded here is in version-control
# history and should be revoked.
API_KEY = os.environ["GOOGLE_CLOUD_VISION_API_KEY"]
PDF_PATH = Path("/home/chaim/legal-ai/data/cases/1130-25/documents/originals/מרק קובר-כתב ערר.pdf")
OUTPUT_DIR = Path("/home/chaim/legal-ai/data/cases/1130-25/documents/extracted")
def main():
    """OCR every page of PDF_PATH with Google Vision and save the text.

    Each page is rendered at 300 DPI and sent to document text detection with
    a Hebrew language hint. A page whose response carries an error message
    contributes an empty string. The page texts, joined by blank lines, are
    written to ``OUTPUT_DIR/<pdf stem>.md``.
    """
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    client = vision.ImageAnnotatorClient(
        client_options={"api_key": API_KEY}
    )
    doc = fitz.open(str(PDF_PATH))
    pages_text = []
    total_time = 0.0
    try:
        page_count = len(doc)
        print(f"Processing: {PDF_PATH.name} ({page_count} pages)\n")
        for i in range(page_count):
            page = doc[i]
            pix = page.get_pixmap(dpi=300)
            img_bytes = pix.tobytes("png")
            image = vision.Image(content=img_bytes)
            print(f" Page {i+1}/{page_count}...", end=" ", flush=True)
            t0 = time.time()
            response = client.document_text_detection(
                image=image,
                image_context={"language_hints": ["he"]}
            )
            elapsed = time.time() - t0
            total_time += elapsed
            if response.error.message:
                print(f"ERROR: {response.error.message}")
                # Keep a placeholder so the failed page still occupies its slot.
                pages_text.append("")
                continue
            text = response.full_text_annotation.text if response.full_text_annotation else ""
            pages_text.append(text)
            print(f"{len(text):,} chars, {elapsed:.1f}s")
    finally:
        # Release the document even when rendering or the API call raises.
        doc.close()

    full_text = "\n\n".join(pages_text)
    out_file = OUTPUT_DIR / f"{PDF_PATH.stem}.md"
    out_file.write_text(full_text, encoding="utf-8")
    print(f"\nTotal: {len(full_text):,} chars, {len(full_text.split()):,} words, {total_time:.1f}s")
    print(f"Saved: {out_file}")


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,54 @@
"""Extract text from a single PDF using Google Cloud Vision API."""
import sys
import time
from pathlib import Path
import fitz
from google.cloud import vision
import os

# The Google Cloud Vision API key must be supplied through the environment
# instead of being embedded in source; a missing variable fails fast with a
# KeyError naming GOOGLE_CLOUD_VISION_API_KEY.
# NOTE(review): the key that used to be hard-coded here is in version-control
# history and should be revoked.
API_KEY = os.environ["GOOGLE_CLOUD_VISION_API_KEY"]
OUTPUT_DIR = Path("/home/chaim/legal-ai/data/cases/1130-25/documents/extracted")
def main():
    """OCR the PDF named on the command line with Google Vision and save the text.

    Each page is rendered at 300 DPI and sent to document text detection with
    a Hebrew language hint. A page whose response carries an error message
    contributes an empty string. The page texts, joined by blank lines, are
    written to ``OUTPUT_DIR/<pdf stem>.md``.

    Raises:
        SystemExit: If no PDF path was given as the first argument.
    """
    if len(sys.argv) < 2:
        # Show a usage line rather than a bare IndexError.
        raise SystemExit(f"Usage: {sys.argv[0]} <pdf-path>")
    pdf_path = Path(sys.argv[1])
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    client = vision.ImageAnnotatorClient(client_options={"api_key": API_KEY})
    doc = fitz.open(str(pdf_path))
    pages_text = []
    total_time = 0.0
    try:
        page_count = len(doc)
        print(f"Processing: {pdf_path.name} ({page_count} pages)\n")
        for i in range(page_count):
            page = doc[i]
            pix = page.get_pixmap(dpi=300)
            img_bytes = pix.tobytes("png")
            image = vision.Image(content=img_bytes)
            print(f" Page {i+1}/{page_count}...", end=" ", flush=True)
            t0 = time.time()
            response = client.document_text_detection(image=image, image_context={"language_hints": ["he"]})
            elapsed = time.time() - t0
            total_time += elapsed
            if response.error.message:
                print(f"ERROR: {response.error.message}")
                # Keep a placeholder so the failed page still occupies its slot.
                pages_text.append("")
                continue
            text = response.full_text_annotation.text if response.full_text_annotation else ""
            pages_text.append(text)
            print(f"{len(text):,} chars, {elapsed:.1f}s")
    finally:
        # Release the document even when rendering or the API call raises.
        doc.close()

    full_text = "\n\n".join(pages_text)
    out_file = OUTPUT_DIR / f"{pdf_path.stem}.md"
    out_file.write_text(full_text, encoding="utf-8")
    print(f"\nTotal: {len(full_text):,} chars, {len(full_text.split()):,} words, {total_time:.1f}s")
    print(f"Saved: {out_file}")


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,66 @@
"""Extract text from original PDF files using Claude Opus Vision OCR."""
import asyncio
import sys
import time
from pathlib import Path
sys.path.insert(0, str(Path(__file__).resolve().parent.parent / "mcp-server" / "src"))
from dotenv import load_dotenv
load_dotenv(Path.home() / ".env")
from legal_mcp.services import extractor
# Source PDFs of case 1130-25.
ORIGINALS_DIR = Path("/home/chaim/legal-ai/data/cases/1130-25/documents/originals")
# Write next to the originals (documents/extracted), not inside them.
# NOTE(review): this was ORIGINALS_DIR / "extracted", a directory that no
# path written elsewhere in this file reads from - confirm nothing relies on
# the old location.
OUTPUT_DIR = ORIGINALS_DIR.parent / "extracted"
async def main():
    """Extract text from every PDF in ORIGINALS_DIR via ``extractor.extract_text``.

    The text of each PDF is written to ``OUTPUT_DIR/<pdf stem>.md``. Page
    counts, elapsed time and a rough cost estimate of $0.05 per page are
    printed per document and in total.
    """
    OUTPUT_DIR.mkdir(exist_ok=True)
    pdfs = sorted(ORIGINALS_DIR.glob("*.pdf"))
    print(f"Found {len(pdfs)} PDFs\n")

    separator = "=" * 60
    total_cost, total_pages, total_time = 0.0, 0, 0.0
    for pdf in pdfs:
        print(separator)
        print(f"Processing: {pdf.name}")
        print(f" Size: {pdf.stat().st_size:,} bytes")

        started = time.time()
        text, page_count = await extractor.extract_text(str(pdf))
        elapsed = time.time() - started

        # Estimate cost (Opus: $15/M input, $75/M output, ~1000 tokens per image)
        # Rough: ~$0.05 per page for image input + output
        est_cost = page_count * 0.05
        total_pages += page_count
        total_time += elapsed
        total_cost += est_cost

        # Save extracted text
        out_file = OUTPUT_DIR / f"{pdf.stem}.md"
        out_file.write_text(text, encoding="utf-8")

        per_page = elapsed / max(page_count, 1)
        print(f" Pages: {page_count}")
        print(f" Extracted: {len(text):,} chars, {len(text.split()):,} words")
        print(f" Time: {elapsed:.1f}s ({per_page:.1f}s/page)")
        print(f" Est. cost: ${est_cost:.3f}")
        print(f" Saved to: {out_file.name}")
        print()

    print(separator)
    print("TOTAL")
    print(f" Documents: {len(pdfs)}")
    print(f" Pages: {total_pages}")
    print(f" Time: {total_time:.1f}s")
    print(f" Est. cost: ${total_cost:.3f}")


if __name__ == "__main__":
    asyncio.run(main())

View File

@@ -0,0 +1,113 @@
"""Extract text from original PDF files using Claude Opus Vision OCR on ALL pages.
Forces Vision OCR regardless of embedded text layer (which may be broken).
"""
import asyncio
import base64
import sys
import time
from pathlib import Path
sys.path.insert(0, str(Path(__file__).resolve().parent.parent / "mcp-server" / "src"))
from dotenv import load_dotenv
load_dotenv(Path.home() / ".env")
import anthropic
import fitz
from legal_mcp import config
# Module-level Anthropic client; the API key is taken from the project's config module.
client = anthropic.Anthropic(api_key=config.ANTHROPIC_API_KEY)
# Model id passed to every messages.create call in ocr_page().
MODEL = "claude-opus-4-20250514"
# Source PDFs of case 1130-25.
ORIGINALS_DIR = Path("/home/chaim/legal-ai/data/cases/1130-25/documents/originals")
# Extracted text is written next to (not inside) the originals directory.
OUTPUT_DIR = ORIGINALS_DIR.parent / "extracted"
async def ocr_page(image_bytes: bytes, page_num: int) -> str:
    """Ask MODEL to transcribe the text of one page image.

    The image is sent base64-encoded with media type ``image/png`` together
    with a Hebrew instruction: extract all text from this Hebrew legal
    document, keep the original paragraph structure, mark headings, and
    return only the extracted text.

    Args:
        image_bytes: Raw image bytes of the page.
        page_num: Page number; not used by this function.

    Returns:
        The text of the first content block of the reply.
    """
    instruction = (
        "חלץ את כל הטקסט מהתמונה הזו. זהו מסמך משפטי בעברית. "
        "שמור על מבנה הפסקאות המקורי. "
        "אם יש כותרות, סמן אותן. "
        "החזר רק את הטקסט המחולץ, ללא הערות נוספות."
    )
    image_part = {
        "type": "image",
        "source": {
            "type": "base64",
            "media_type": "image/png",
            "data": base64.b64encode(image_bytes).decode("utf-8"),
        },
    }
    text_part = {"type": "text", "text": instruction}
    reply = client.messages.create(
        model=MODEL,
        max_tokens=4096,
        messages=[{"role": "user", "content": [image_part, text_part]}],
    )
    return reply.content[0].text
async def process_pdf(pdf_path: Path) -> tuple[str, int, float]:
    """Render every page of *pdf_path* at 200 DPI and OCR it with ocr_page().

    Args:
        pdf_path: Path of the PDF to process.

    Returns:
        A ``(text, page_count, seconds)`` tuple: the page texts joined by
        blank lines, the number of pages, and the wall-clock time spent on
        rendering plus OCR. (The annotation used to declare five elements,
        but only these three were ever returned.)
    """
    doc = fitz.open(str(pdf_path))
    pages_text = []
    try:
        page_count = len(doc)
        t0 = time.time()
        for i in range(page_count):
            page = doc[i]
            pix = page.get_pixmap(dpi=200)
            img_bytes = pix.tobytes("png")
            print(f" Page {i+1}/{page_count}...", end=" ", flush=True)
            pt = time.time()
            text = await ocr_page(img_bytes, i + 1)
            elapsed = time.time() - pt
            pages_text.append(text)
            print(f"{len(text):,} chars, {elapsed:.1f}s")
    finally:
        # Release the document even when rendering or the OCR call raises.
        doc.close()
    total_time = time.time() - t0
    full_text = "\n\n".join(pages_text)
    return full_text, page_count, total_time
async def main():
    """OCR every PDF in ORIGINALS_DIR with process_pdf() and save the text.

    The text of each PDF is written to ``OUTPUT_DIR/<pdf stem>.md``. Page
    counts and timings are printed per document, followed by totals and a
    cost estimate of $0.05 per page.
    """
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    pdfs = sorted(ORIGINALS_DIR.glob("*.pdf"))
    print(f"Found {len(pdfs)} PDFs — extracting ALL pages with {MODEL}\n")

    separator = "=" * 60
    total_pages, total_time = 0, 0.0
    for pdf in pdfs:
        print(separator)
        print(f" {pdf.name} ({pdf.stat().st_size:,} bytes)")
        print(separator)

        text, pages, elapsed = await process_pdf(pdf)
        total_pages += pages
        total_time += elapsed

        out_file = OUTPUT_DIR / f"{pdf.stem}.md"
        out_file.write_text(text, encoding="utf-8")

        per_page = elapsed / max(pages, 1)
        print(f" Result: {pages} pages, {len(text):,} chars, {len(text.split()):,} words")
        print(f" Time: {elapsed:.1f}s ({per_page:.1f}s/page)")
        print(f" Saved: {out_file.name}\n")

    print(separator)
    print(f"TOTAL: {len(pdfs)} docs, {total_pages} pages, {total_time:.1f}s")
    est_cost = total_pages * 0.05
    print(f"Estimated cost: ${est_cost:.2f}")


if __name__ == "__main__":
    asyncio.run(main())