"""Extract text from PDF using Google Cloud Vision API.

Every page of the PDF is rendered to a PNG image with PyMuPDF, sent to the
Vision ``document_text_detection`` endpoint, and the recognised text of all
pages is written to a single UTF-8 ``.md`` file in ``OUTPUT_DIR``.
"""

import io
import os
import time
from pathlib import Path

import fitz  # PyMuPDF for rendering pages to images
from google.cloud import vision

# SECURITY: the API key must never be committed to source control. It is read
# from the environment instead; any key that was previously hard-coded in this
# file should be considered leaked and revoked.
API_KEY_ENV_VAR = "GOOGLE_VISION_API_KEY"
API_KEY = os.environ.get(API_KEY_ENV_VAR, "")

PDF_PATH = Path("/home/chaim/legal-ai/data/cases/1130-25/documents/originals/מרק קובר-כתב ערר.pdf")
OUTPUT_DIR = Path("/home/chaim/legal-ai/data/cases/1130-25/documents/extracted")

# Resolution used when rasterising each page before OCR.
RENDER_DPI = 300
# Language hints passed to the Vision API with every request.
LANGUAGE_HINTS = ["he"]


def main() -> None:
    """Run OCR over every page of ``PDF_PATH`` and save the combined text.

    Progress and per-page timing are printed to stdout. A page for which the
    API reports an error contributes an empty string, so the page count of the
    output stays aligned with the PDF. The result is written to
    ``OUTPUT_DIR / f"{PDF_PATH.stem}.md"``.

    Raises:
        SystemExit: if the API key environment variable is not set.
    """
    if not API_KEY:
        # Fail before doing any work rather than on the first API call.
        raise SystemExit(
            f"Missing API key: set the {API_KEY_ENV_VAR} environment variable."
        )

    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    client = vision.ImageAnnotatorClient(
        client_options={"api_key": API_KEY}
    )

    pages_text = []
    total_time = 0.0

    doc = fitz.open(str(PDF_PATH))
    try:
        page_count = len(doc)
        print(f"Processing: {PDF_PATH.name} ({page_count} pages)\n")

        for page_number, page in enumerate(doc, start=1):
            pix = page.get_pixmap(dpi=RENDER_DPI)
            img_bytes = pix.tobytes("png")
            image = vision.Image(content=img_bytes)

            print(f" Page {page_number}/{page_count}...", end=" ", flush=True)

            # perf_counter is monotonic, so the measured duration cannot be
            # skewed by system clock adjustments.
            t0 = time.perf_counter()
            response = client.document_text_detection(
                image=image,
                image_context={"language_hints": LANGUAGE_HINTS},
            )
            elapsed = time.perf_counter() - t0
            total_time += elapsed

            if response.error.message:
                print(f"ERROR: {response.error.message}")
                # Keep a placeholder so later pages keep their position.
                pages_text.append("")
                continue

            text = response.full_text_annotation.text if response.full_text_annotation else ""
            pages_text.append(text)
            print(f"{len(text):,} chars, {elapsed:.1f}s")
    finally:
        # Release the document even if rendering or an API call raised.
        doc.close()

    full_text = "\n\n".join(pages_text)
    out_file = OUTPUT_DIR / f"{PDF_PATH.stem}.md"
    out_file.write_text(full_text, encoding="utf-8")

    print(f"\nTotal: {len(full_text):,} chars, {len(full_text.split()):,} words, {total_time:.1f}s")
    print(f"Saved: {out_file}")


if __name__ == "__main__":
    main()