"""Extract text from original PDF files using Claude Opus Vision OCR on ALL pages. Forces Vision OCR regardless of embedded text layer (which may be broken). """ import asyncio import base64 import sys import time from pathlib import Path sys.path.insert(0, str(Path(__file__).resolve().parent.parent / "mcp-server" / "src")) from dotenv import load_dotenv load_dotenv(Path.home() / ".env") import anthropic import fitz from legal_mcp import config client = anthropic.Anthropic(api_key=config.ANTHROPIC_API_KEY) MODEL = "claude-opus-4-20250514" ORIGINALS_DIR = Path("/home/chaim/legal-ai/data/cases/1130-25/documents/originals") OUTPUT_DIR = ORIGINALS_DIR.parent / "extracted" async def ocr_page(image_bytes: bytes, page_num: int) -> str: b64_image = base64.b64encode(image_bytes).decode("utf-8") message = client.messages.create( model=MODEL, max_tokens=4096, messages=[{ "role": "user", "content": [ { "type": "image", "source": {"type": "base64", "media_type": "image/png", "data": b64_image}, }, { "type": "text", "text": ( "חלץ את כל הטקסט מהתמונה הזו. זהו מסמך משפטי בעברית. " "שמור על מבנה הפסקאות המקורי. " "אם יש כותרות, סמן אותן. " "החזר רק את הטקסט המחולץ, ללא הערות נוספות." ), }, ], }], ) return message.content[0].text async def process_pdf(pdf_path: Path) -> tuple[str, int, float, int, int]: doc = fitz.open(str(pdf_path)) page_count = len(doc) pages_text = [] total_input = 0 total_output = 0 t0 = time.time() for i in range(page_count): page = doc[i] pix = page.get_pixmap(dpi=200) img_bytes = pix.tobytes("png") print(f" Page {i+1}/{page_count}...", end=" ", flush=True) pt = time.time() text = await ocr_page(img_bytes, i + 1) elapsed = time.time() - pt pages_text.append(text) print(f"{len(text):,} chars, {elapsed:.1f}s") doc.close() total_time = time.time() - t0 full_text = "\n\n".join(pages_text) return full_text, page_count, total_time async def main(): OUTPUT_DIR.mkdir(parents=True, exist_ok=True) pdfs = sorted(ORIGINALS_DIR.glob("*.pdf")) print(f"Found {len(pdfs)} PDFs — extracting ALL pages with {MODEL}\n") total_pages = 0 total_time = 0.0 for pdf in pdfs: print(f"{'=' * 60}") print(f" {pdf.name} ({pdf.stat().st_size:,} bytes)") print(f"{'=' * 60}") text, pages, elapsed = await process_pdf(pdf) total_pages += pages total_time += elapsed out_file = OUTPUT_DIR / f"{pdf.stem}.md" out_file.write_text(text, encoding="utf-8") print(f" Result: {pages} pages, {len(text):,} chars, {len(text.split()):,} words") print(f" Time: {elapsed:.1f}s ({elapsed/max(pages,1):.1f}s/page)") print(f" Saved: {out_file.name}\n") print(f"{'=' * 60}") print(f"TOTAL: {len(pdfs)} docs, {total_pages} pages, {total_time:.1f}s") est_cost = total_pages * 0.05 print(f"Estimated cost: ${est_cost:.2f}") if __name__ == "__main__": asyncio.run(main())