# legal-ai/scripts/extract_all_google_vision.py

"""Extract ALL PDFs from originals using Google Cloud Vision OCR.
Forces OCR on all pages (ignoring broken text layers).
Then runs voyage-law-2 embedding benchmark comparing old vs new.
"""

import asyncio
import json
import sys
import time
from pathlib import Path

# Put "<parent of this script's directory>/mcp-server/src" at the front of
# sys.path; presumably this is what makes `legal_mcp` importable below.
sys.path.insert(0, str(Path(__file__).resolve().parent.parent / "mcp-server" / "src"))

# Load variables from ~/.env before `legal_mcp.config` is imported further down
# (assumption: config reads the Vision API key from the environment -- TODO confirm).
from dotenv import load_dotenv
load_dotenv(Path.home() / ".env")

import fitz
from google.cloud import vision
from legal_mcp import config

# Vision client, built once at import time and used by every OCR call in this
# module; it is configured with the API key exposed by legal_mcp.config.
API_KEY = config.GOOGLE_CLOUD_VISION_API_KEY
client = vision.ImageAnnotatorClient(client_options={"api_key": API_KEY})

# Directory holding the input PDFs (hard-coded to case 1130-25) and the
# sibling "extracted" directory that receives one .md file per PDF.
ORIGINALS_DIR = Path("/home/chaim/legal-ai/data/cases/1130-25/documents/originals")
OUTPUT_DIR = ORIGINALS_DIR.parent / "extracted"

# Hebrew abbreviation quote fixer
import re

# Misread spellings of common Hebrew abbreviations, in which the gershayim (")
# came back as a double yod (יי), mapped to the correct spelling.
_ABBREV_FIXES = {
    'עוהייד': 'עוה"ד', 'עוייד': 'עו"ד', 'הנייל': 'הנ"ל',
    'מצייב': 'מצ"ב', 'ביהמייש': 'ביהמ"ש', 'תייז': 'ת"ז',
    'עייי': 'ע"י', 'אחייכ': 'אח"כ', 'סייק': 'ס"ק',
    'דייר': 'ד"ר', 'כדוייח': 'כדו"ח', 'חווייד': 'חוו"ד',
    'מייר': 'מ"ר', 'יחייד': 'יח"ד', 'בייכ': 'ב"כ',
}

# Longest alternatives first, so a shorter key can never pre-empt a longer one.
# The trailing negative lookahead requires the match to END a Hebrew word:
# without it, ordinary words that merely contain a key were corrupted
# (e.g. 'דיירים' "tenants" became 'ד"רים').  No leading boundary is imposed,
# so one-letter prefixes (ל, ב, ה, ו, ...) glued to an abbreviation still work.
_ABBREV_PAT = re.compile(
    '(?:'
    + '|'.join(re.escape(k) for k in sorted(_ABBREV_FIXES, key=len, reverse=True))
    + ')(?![א-ת])'
)


def fix_quotes(text: str) -> str:
    """Restore the gershayim in known Hebrew abbreviations.

    Every occurrence of a key of ``_ABBREV_FIXES`` that is not immediately
    followed by another Hebrew letter is replaced by its corrected form.

    Args:
        text: Text to clean up; may be empty.

    Returns:
        The text with the known misread abbreviations corrected.

    Note:
        A standalone word that is spelled exactly like a key (for example
        'דייר', "tenant") cannot be told apart from the misread abbreviation
        and is still rewritten.
    """
    return _ABBREV_PAT.sub(lambda m: _ABBREV_FIXES[m.group()], text)


def ocr_page(image_bytes, page_num):
    """Run Vision document-text detection on one rendered page.

    Args:
        image_bytes: Encoded image content, handed unchanged to ``vision.Image``.
        page_num: Page number; only used in the error message.

    Returns:
        The detected text after ``fix_quotes``, or ``""`` when the response
        carries an error message (the error is printed, not raised).
    """
    request_image = vision.Image(content=image_bytes)
    hebrew_hint = vision.ImageContext(language_hints=["he"])
    result = client.document_text_detection(
        image=request_image,
        image_context=hebrew_hint,
    )

    # The API reports failures inside the response object.
    failure = result.error.message
    if failure:
        print(f"    ERROR page {page_num}: {failure}")
        return ""

    annotation = result.full_text_annotation
    if annotation:
        raw_text = annotation.text
    else:
        raw_text = ""
    return fix_quotes(raw_text)


def process_pdf(pdf_path):
    """OCR every page of one PDF and return the combined text.

    Each page is rasterised at 300 DPI, encoded as PNG and sent to
    ``ocr_page``; the PDF's own text layer is never read.

    Args:
        pdf_path: Path of the PDF to process.

    Returns:
        A tuple ``(text, page_count, total_time)``: the page texts joined by a
        blank line, the number of pages, and the seconds spent on the document.
    """
    pages_text = []

    # The context manager closes the document even if rendering or OCR raises,
    # which the previous explicit close() at the end did not guarantee.
    with fitz.open(str(pdf_path)) as doc:
        page_count = len(doc)
        # perf_counter is monotonic, so durations cannot be skewed by
        # system clock adjustments the way time.time() differences can.
        started = time.perf_counter()

        for page_num, page in enumerate(doc, start=1):
            img_bytes = page.get_pixmap(dpi=300).tobytes("png")

            # Time only the OCR round trip, not the rasterisation above.
            page_started = time.perf_counter()
            text = ocr_page(img_bytes, page_num)
            elapsed = time.perf_counter() - page_started

            pages_text.append(text)
            print(f"    Page {page_num}/{page_count}: {len(text):,} chars, {elapsed:.1f}s")

    total_time = time.perf_counter() - started
    return "\n\n".join(pages_text), page_count, total_time


def main():
    """OCR every PDF in ORIGINALS_DIR and write one .md file per PDF.

    PDFs whose output file already exists and is larger than 100 bytes are
    not sent to the API again; they are only counted.  Per-document
    statistics are printed and finally saved as JSON.
    """
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    pdfs = sorted(ORIGINALS_DIR.glob("*.pdf"))
    print(f"Found {len(pdfs)} PDFs\n")

    results = []
    total_pages = 0
    total_time = 0.0

    for pdf in pdfs:
        out_file = OUTPUT_DIR / f"{pdf.stem}.md"

        # Skip already extracted (an output over 100 bytes counts as done).
        if out_file.exists() and out_file.stat().st_size > 100:
            text = out_file.read_text(encoding="utf-8")
            # Only the page count is needed; `with` closes the PDF even on error.
            with fitz.open(str(pdf)) as doc:
                pages = len(doc)
            print(f"SKIP (exists): {pdf.name} ({pages} pages, {len(text):,} chars)")
            results.append({"name": pdf.stem, "pages": pages, "chars": len(text), "words": len(text.split()), "time": 0, "skipped": True})
            total_pages += pages
            continue

        print(f"{'=' * 60}")
        print(f"  {pdf.name} ({pdf.stat().st_size:,} bytes)")

        text, pages, elapsed = process_pdf(pdf)
        total_pages += pages
        total_time += elapsed

        out_file.write_text(text, encoding="utf-8")

        words = len(text.split())
        print(f"  Result: {pages} pages, {len(text):,} chars, {words:,} words, {elapsed:.1f}s")
        print(f"  Saved: {out_file.name}\n")

        results.append({"name": pdf.stem, "pages": pages, "chars": len(text), "words": words, "time": elapsed, "skipped": False})

    print(f"\n{'=' * 60}")
    print(f"TOTAL: {len(pdfs)} docs, {total_pages} pages, {total_time:.1f}s")
    # NOTE(review): total_pages includes skipped documents, so this is the cost
    # of extracting everything, not only of the pages OCR'd in this run.
    est_cost = total_pages * 0.0015
    print(f"Estimated cost: ${est_cost:.2f}")

    # Save results.  The JSON keeps non-ASCII characters (ensure_ascii=False),
    # so the encoding must be explicit instead of depending on the locale.
    Path("/home/chaim/legal-ai/data/google-vision-extraction.json").write_text(
        json.dumps(results, ensure_ascii=False, indent=2),
        encoding="utf-8",
    )


# Run the extraction only when executed as a script, not when imported.
if __name__ == "__main__":
    main()