# legal-ai/scripts/extract_google_vision.py

"""Extract text from PDF using Google Cloud Vision API."""

import io
import os
import time
from pathlib import Path

import fitz  # PyMuPDF for rendering pages to images
from google.cloud import vision

# The Vision API key is read from the environment so that no credential is
# committed to source control. Empty string when the variable is unset.
# NOTE: the key that used to be hard-coded on this line remains in the
# repository history and should be revoked/rotated in the Google Cloud console.
API_KEY = os.environ.get("GOOGLE_VISION_API_KEY", "")

# Default input PDF and the directory that receives the extracted text.
PDF_PATH = Path("/home/chaim/legal-ai/data/cases/1130-25/documents/originals/מרק קובר-כתב ערר.pdf")
OUTPUT_DIR = Path("/home/chaim/legal-ai/data/cases/1130-25/documents/extracted")


def main(pdf_path=None, output_dir=None):
    """OCR every page of a PDF with Google Cloud Vision and save the text.

    Each page is rendered to a 300-DPI PNG with PyMuPDF and sent to
    ``document_text_detection`` with a Hebrew language hint. The per-page
    texts are joined with blank lines and written as UTF-8 to
    ``<output_dir>/<pdf stem>.md``. Progress, per-page timing and a final
    summary are printed to stdout.

    A page whose response carries an error message is recorded as empty
    text so the remaining pages are still processed; the affected page
    numbers are listed in the summary.

    Args:
        pdf_path: PDF file to process (str or Path). Defaults to the
            module-level ``PDF_PATH``.
        output_dir: Directory for the output file (str or Path). Defaults
            to the module-level ``OUTPUT_DIR``. Created if missing.

    Raises:
        SystemExit: If ``API_KEY`` is empty, before any work is done.
    """
    pdf_path = Path(PDF_PATH if pdf_path is None else pdf_path)
    output_dir = Path(OUTPUT_DIR if output_dir is None else output_dir)

    # Fail early with a clear message instead of an opaque auth error on
    # the first request.
    if not API_KEY:
        raise SystemExit("API_KEY is not set; cannot call the Vision API.")

    output_dir.mkdir(parents=True, exist_ok=True)

    client = vision.ImageAnnotatorClient(
        client_options={"api_key": API_KEY}
    )

    pages_text = []
    failed_pages = []
    total_time = 0.0

    # The context manager closes the document even if rendering or the
    # API call raises part-way through.
    with fitz.open(str(pdf_path)) as doc:
        page_count = len(doc)
        print(f"Processing: {pdf_path.name} ({page_count} pages)\n")

        for page_no, page in enumerate(doc, start=1):
            pix = page.get_pixmap(dpi=300)
            image = vision.Image(content=pix.tobytes("png"))

            print(f"  Page {page_no}/{page_count}...", end=" ", flush=True)
            # perf_counter is monotonic, so the measured duration cannot be
            # skewed by system clock adjustments.
            t0 = time.perf_counter()
            response = client.document_text_detection(
                image=image,
                image_context={"language_hints": ["he"]}
            )
            elapsed = time.perf_counter() - t0
            total_time += elapsed

            if response.error.message:
                print(f"ERROR: {response.error.message}")
                # Keep a placeholder so page order in the output is stable.
                pages_text.append("")
                failed_pages.append(page_no)
                continue

            text = response.full_text_annotation.text if response.full_text_annotation else ""
            pages_text.append(text)
            print(f"{len(text):,} chars, {elapsed:.1f}s")

    full_text = "\n\n".join(pages_text)
    out_file = output_dir / f"{pdf_path.stem}.md"
    out_file.write_text(full_text, encoding="utf-8")

    print(f"\nTotal: {len(full_text):,} chars, {len(full_text.split()):,} words, {total_time:.1f}s")
    if failed_pages:
        failed = ", ".join(str(n) for n in failed_pages)
        print(f"WARNING: {len(failed_pages)} page(s) failed and are empty in the output: {failed}")
    print(f"Saved: {out_file}")


# Run the extraction only when executed as a script, not when imported.
if __name__ == "__main__":
    # main() returns None, so this exits with status 0 on success.
    raise SystemExit(main())