"""Extract ALL PDFs from originals using Google Cloud Vision OCR. Forces OCR on all pages (ignoring broken text layers). Then runs voyage-law-2 embedding benchmark comparing old vs new. """ import asyncio import json import sys import time from pathlib import Path sys.path.insert(0, str(Path(__file__).resolve().parent.parent / "mcp-server" / "src")) from dotenv import load_dotenv load_dotenv(Path.home() / ".env") import fitz from google.cloud import vision from legal_mcp import config API_KEY = config.GOOGLE_CLOUD_VISION_API_KEY client = vision.ImageAnnotatorClient(client_options={"api_key": API_KEY}) ORIGINALS_DIR = Path("/home/chaim/legal-ai/data/cases/1130-25/documents/originals") OUTPUT_DIR = ORIGINALS_DIR.parent / "extracted" # Hebrew abbreviation quote fixer import re _ABBREV_FIXES = { 'עוהייד': 'עוה"ד', 'עוייד': 'עו"ד', 'הנייל': 'הנ"ל', 'מצייב': 'מצ"ב', 'ביהמייש': 'ביהמ"ש', 'תייז': 'ת"ז', 'עייי': 'ע"י', 'אחייכ': 'אח"כ', 'סייק': 'ס"ק', 'דייר': 'ד"ר', 'כדוייח': 'כדו"ח', 'חווייד': 'חוו"ד', 'מייר': 'מ"ר', 'יחייד': 'יח"ד', 'בייכ': 'ב"כ', } _ABBREV_PAT = re.compile('|'.join(re.escape(k) for k in sorted(_ABBREV_FIXES, key=len, reverse=True))) def fix_quotes(text): return _ABBREV_PAT.sub(lambda m: _ABBREV_FIXES[m.group()], text) def ocr_page(image_bytes, page_num): image = vision.Image(content=image_bytes) response = client.document_text_detection( image=image, image_context=vision.ImageContext(language_hints=["he"]), ) if response.error.message: print(f" ERROR page {page_num}: {response.error.message}") return "" text = response.full_text_annotation.text if response.full_text_annotation else "" return fix_quotes(text) def process_pdf(pdf_path): doc = fitz.open(str(pdf_path)) page_count = len(doc) pages_text = [] t0 = time.time() for i in range(page_count): page = doc[i] pix = page.get_pixmap(dpi=300) img_bytes = pix.tobytes("png") pt = time.time() text = ocr_page(img_bytes, i + 1) elapsed = time.time() - pt pages_text.append(text) print(f" Page {i+1}/{page_count}: {len(text):,} chars, {elapsed:.1f}s") doc.close() total_time = time.time() - t0 return "\n\n".join(pages_text), page_count, total_time def main(): OUTPUT_DIR.mkdir(parents=True, exist_ok=True) pdfs = sorted(ORIGINALS_DIR.glob("*.pdf")) print(f"Found {len(pdfs)} PDFs\n") results = [] total_pages = 0 total_time = 0.0 for pdf in pdfs: out_file = OUTPUT_DIR / f"{pdf.stem}.md" # Skip already extracted if out_file.exists() and out_file.stat().st_size > 100: text = out_file.read_text(encoding="utf-8") doc = fitz.open(str(pdf)) pages = len(doc) doc.close() print(f"SKIP (exists): {pdf.name} ({pages} pages, {len(text):,} chars)") results.append({"name": pdf.stem, "pages": pages, "chars": len(text), "words": len(text.split()), "time": 0, "skipped": True}) total_pages += pages continue print(f"{'=' * 60}") print(f" {pdf.name} ({pdf.stat().st_size:,} bytes)") text, pages, elapsed = process_pdf(pdf) total_pages += pages total_time += elapsed out_file.write_text(text, encoding="utf-8") words = len(text.split()) print(f" Result: {pages} pages, {len(text):,} chars, {words:,} words, {elapsed:.1f}s") print(f" Saved: {out_file.name}\n") results.append({"name": pdf.stem, "pages": pages, "chars": len(text), "words": words, "time": elapsed, "skipped": False}) print(f"\n{'=' * 60}") print(f"TOTAL: {len(pdfs)} docs, {total_pages} pages, {total_time:.1f}s") est_cost = total_pages * 0.0015 print(f"Estimated cost: ${est_cost:.2f}") # Save results Path("/home/chaim/legal-ai/data/google-vision-extraction.json").write_text( json.dumps(results, ensure_ascii=False, indent=2) ) if __name__ == "__main__": main()