Active scripts (5): auto-sync-cases.sh, backup-db.sh, restore-db.sh, notify.py, bidi_table.py. Archived (17): one-time migration/seeding scripts whose functionality is now in the MCP server or web API; moved to scripts/.archive/. Deleted (5): zero-value scripts (duplicates, hardcoded single-case, debug scripts). Added scripts/SCRIPTS.md — a registry of all scripts with purpose, status, and what superseded them. CLAUDE.md updated with a rule: any script change requires a SCRIPTS.md update. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
129 lines
4.1 KiB
Python
129 lines
4.1 KiB
Python
"""Extract ALL PDFs from originals using Google Cloud Vision OCR.
|
||
Forces OCR on all pages (ignoring broken text layers).
|
||
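Writes one .md text file per PDF plus a JSON run summary. (The voyage-law-2 embedding benchmark comparing old vs. new extractions is a separate step; this script does not run it.)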
|
||
"""
|
||
|
||
import asyncio
|
||
import json
|
||
import sys
|
||
import time
|
||
from pathlib import Path
|
||
|
||
sys.path.insert(0, str(Path(__file__).resolve().parent.parent / "mcp-server" / "src"))
|
||
|
||
from dotenv import load_dotenv
|
||
load_dotenv(Path.home() / ".env")
|
||
|
||
import fitz
|
||
from google.cloud import vision
|
||
from legal_mcp import config
|
||
|
||
# Google Cloud Vision client shared by every OCR call in this script; the API
# key is read from the project's config module.
API_KEY = config.GOOGLE_CLOUD_VISION_API_KEY
client = vision.ImageAnnotatorClient(client_options=dict(api_key=API_KEY))
|
||
|
||
# Directory holding the source PDFs, and the sibling directory that receives
# the OCR text output.
ORIGINALS_DIR = Path("/home/chaim/legal-ai/data/cases/1130-25/documents/originals")
OUTPUT_DIR = ORIGINALS_DIR.with_name("extracted")
|
||
|
||
# Hebrew abbreviation quote fixer
|
||
import re
|
||
# Maps an abbreviation whose gershayim (") was read by OCR as a double yod
# ("יי") to its correct spelling.
_ABBREV_FIXES = {
    'עוהייד': 'עוה"ד',
    'עוייד': 'עו"ד',
    'הנייל': 'הנ"ל',
    'מצייב': 'מצ"ב',
    'ביהמייש': 'ביהמ"ש',
    'תייז': 'ת"ז',
    'עייי': 'ע"י',
    'אחייכ': 'אח"כ',
    'סייק': 'ס"ק',
    'דייר': 'ד"ר',
    'כדוייח': 'כדו"ח',
    'חווייד': 'חוו"ד',
    'מייר': 'מ"ר',
    'יחייד': 'יח"ד',
    'בייכ': 'ב"כ',
}

# Longest keys first so a longer abbreviation wins over a shorter one that
# could match at the same position.  The trailing negative lookahead rejects a
# match that is immediately followed by another Hebrew letter, so ordinary
# words that merely begin with a garbled form (e.g. "דיירים") are left intact.
# No leading boundary is required: Hebrew prefixes (ו/ה/ב/ל/...) attach
# directly to abbreviations and must still be fixed.
_ABBREV_PAT = re.compile(
    '(?:'
    + '|'.join(re.escape(k) for k in sorted(_ABBREV_FIXES, key=len, reverse=True))
    + r')(?![\u05D0-\u05EA])'
)


def fix_quotes(text: str) -> str:
    """Restore the quote mark in known Hebrew abbreviations garbled by OCR.

    Args:
        text: OCR output that may contain abbreviations in which the
            gershayim was recognized as a double yod.

    Returns:
        ``text`` with every listed garbled abbreviation replaced by its
        correct form; all other text is returned unchanged.

    NOTE(review): a stand-alone "דייר" (which is also an ordinary word) is
    still rewritten to the abbreviation; telling the two apart would need
    context this function does not have.
    """
    return _ABBREV_PAT.sub(lambda m: _ABBREV_FIXES[m.group()], text)
|
||
|
||
|
||
def ocr_page(image_bytes, page_num):
    """OCR a single page image with Google Cloud Vision, hinting Hebrew.

    Args:
        image_bytes: Encoded image content of the page.
        page_num: Page number, used only in the error message.

    Returns:
        The recognized text after ``fix_quotes`` has been applied, or an
        empty string when the API response carries an error message.
    """
    page_image = vision.Image(content=image_bytes)
    hebrew_context = vision.ImageContext(language_hints=["he"])
    response = client.document_text_detection(
        image=page_image,
        image_context=hebrew_context,
    )

    # The API reports per-request failures in the response body; log the
    # failure and give the caller an empty page.
    if response.error.message:
        print(f" ERROR page {page_num}: {response.error.message}")
        return ""

    if response.full_text_annotation:
        raw_text = response.full_text_annotation.text
    else:
        raw_text = ""
    return fix_quotes(raw_text)
|
||
|
||
|
||
def process_pdf(pdf_path):
    """OCR every page of a PDF and return the combined text.

    Each page is rendered to a PNG at 300 DPI and passed to ``ocr_page``; the
    PDF's own text layer is never read.  Progress is printed per page.

    Args:
        pdf_path: Path of the PDF file to process.

    Returns:
        A ``(text, page_count, total_time)`` tuple: the page texts joined by
        blank lines, the number of pages, and the elapsed seconds for
        rendering plus OCR.
    """
    pages_text = []
    doc = fitz.open(str(pdf_path))
    try:
        page_count = len(doc)
        # perf_counter is monotonic, so the timings cannot go negative or
        # jump if the system clock is adjusted mid-run.
        started = time.perf_counter()

        for page_num, page in enumerate(doc, start=1):
            img_bytes = page.get_pixmap(dpi=300).tobytes("png")

            # Time only the OCR call, not the rasterization.
            page_started = time.perf_counter()
            text = ocr_page(img_bytes, page_num)
            elapsed = time.perf_counter() - page_started

            pages_text.append(text)
            print(f" Page {page_num}/{page_count}: {len(text):,} chars, {elapsed:.1f}s")
    finally:
        # Release the file handle even when rendering or OCR raises.
        doc.close()

    total_time = time.perf_counter() - started
    return "\n\n".join(pages_text), page_count, total_time
|
||
|
||
|
||
def _count_pages(pdf_path):
    """Return the page count of *pdf_path*, closing the document even on error."""
    doc = fitz.open(str(pdf_path))
    try:
        return len(doc)
    finally:
        doc.close()


def main():
    """OCR every PDF in ``ORIGINALS_DIR`` and write the text to ``OUTPUT_DIR``.

    For each ``*.pdf`` (in sorted order) the extracted text is saved as
    ``<stem>.md``.  A PDF whose output file already exists and is larger than
    100 bytes is not re-processed; its existing text is only measured.  A
    per-document summary (name, pages, chars, words, time, skipped) is written
    as JSON, and totals plus a cost estimate are printed.
    """
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    pdfs = sorted(ORIGINALS_DIR.glob("*.pdf"))
    print(f"Found {len(pdfs)} PDFs\n")

    results = []
    total_pages = 0
    total_time = 0.0

    for pdf in pdfs:
        out_file = OUTPUT_DIR / f"{pdf.stem}.md"

        # Skip already extracted
        if out_file.exists() and out_file.stat().st_size > 100:
            text = out_file.read_text(encoding="utf-8")
            pages = _count_pages(pdf)
            print(f"SKIP (exists): {pdf.name} ({pages} pages, {len(text):,} chars)")
            results.append({"name": pdf.stem, "pages": pages, "chars": len(text), "words": len(text.split()), "time": 0, "skipped": True})
            total_pages += pages
            continue

        print(f"{'=' * 60}")
        print(f" {pdf.name} ({pdf.stat().st_size:,} bytes)")

        text, pages, elapsed = process_pdf(pdf)
        total_pages += pages
        total_time += elapsed

        out_file.write_text(text, encoding="utf-8")

        words = len(text.split())
        print(f" Result: {pages} pages, {len(text):,} chars, {words:,} words, {elapsed:.1f}s")
        print(f" Saved: {out_file.name}\n")

        results.append({"name": pdf.stem, "pages": pages, "chars": len(text), "words": words, "time": elapsed, "skipped": False})

    print(f"\n{'=' * 60}")
    print(f"TOTAL: {len(pdfs)} docs, {total_pages} pages, {total_time:.1f}s")
    # NOTE(review): total_pages also counts pages of skipped documents, so this
    # figure covers the whole corpus rather than only the pages OCR'd in this
    # run; 0.0015 is the per-page rate hard-coded here — confirm against
    # current pricing.
    est_cost = total_pages * 0.0015
    print(f"Estimated cost: ${est_cost:.2f}")

    # Save results.  The JSON is dumped with ensure_ascii=False, so the file
    # must be written as UTF-8 explicitly; relying on the locale's default
    # encoding can fail or mis-encode non-ASCII document names.
    Path("/home/chaim/legal-ai/data/google-vision-extraction.json").write_text(
        json.dumps(results, ensure_ascii=False, indent=2),
        encoding="utf-8",
    )
|
||
|
||
|
||
# Run the extraction only when this file is executed as a script, not when it
# is imported.
if __name__ == "__main__":
    main()
|