"""Extract text from a single PDF using Google Cloud Vision API.""" import sys import time from pathlib import Path import fitz from google.cloud import vision API_KEY = "AIzaSyDZgUsxsy_FHkkREU7R_oQLJALU3_V26j8" OUTPUT_DIR = Path("/home/chaim/legal-ai/data/cases/1130-25/documents/extracted") def main(): pdf_path = Path(sys.argv[1]) OUTPUT_DIR.mkdir(parents=True, exist_ok=True) client = vision.ImageAnnotatorClient(client_options={"api_key": API_KEY}) doc = fitz.open(str(pdf_path)) page_count = len(doc) print(f"Processing: {pdf_path.name} ({page_count} pages)\n") pages_text = [] total_time = 0.0 for i in range(page_count): page = doc[i] pix = page.get_pixmap(dpi=300) img_bytes = pix.tobytes("png") image = vision.Image(content=img_bytes) print(f" Page {i+1}/{page_count}...", end=" ", flush=True) t0 = time.time() response = client.document_text_detection(image=image, image_context={"language_hints": ["he"]}) elapsed = time.time() - t0 total_time += elapsed if response.error.message: print(f"ERROR: {response.error.message}") pages_text.append("") continue text = response.full_text_annotation.text if response.full_text_annotation else "" pages_text.append(text) print(f"{len(text):,} chars, {elapsed:.1f}s") doc.close() full_text = "\n\n".join(pages_text) out_file = OUTPUT_DIR / f"{pdf_path.stem}.md" out_file.write_text(full_text, encoding="utf-8") print(f"\nTotal: {len(full_text):,} chars, {len(full_text.split()):,} words, {total_time:.1f}s") print(f"Saved: {out_file}") if __name__ == "__main__": main()