"""Extract text from original PDF files using Claude Opus Vision OCR.""" import asyncio import sys import time from pathlib import Path sys.path.insert(0, str(Path(__file__).resolve().parent.parent / "mcp-server" / "src")) from dotenv import load_dotenv load_dotenv(Path.home() / ".env") from legal_mcp.services import extractor ORIGINALS_DIR = Path("/home/chaim/legal-ai/data/cases/1130-25/documents/originals") OUTPUT_DIR = ORIGINALS_DIR / "extracted" async def main(): OUTPUT_DIR.mkdir(exist_ok=True) pdfs = sorted(ORIGINALS_DIR.glob("*.pdf")) print(f"Found {len(pdfs)} PDFs\n") total_cost = 0.0 total_pages = 0 total_time = 0.0 for pdf in pdfs: print(f"{'=' * 60}") print(f"Processing: {pdf.name}") print(f" Size: {pdf.stat().st_size:,} bytes") t0 = time.time() text, page_count = await extractor.extract_text(str(pdf)) elapsed = time.time() - t0 total_pages += page_count total_time += elapsed # Estimate cost (Opus: $15/M input, $75/M output, ~1000 tokens per image) # Rough: ~$0.05 per page for image input + output est_cost = page_count * 0.05 total_cost += est_cost # Save extracted text out_file = OUTPUT_DIR / f"{pdf.stem}.md" out_file.write_text(text, encoding="utf-8") print(f" Pages: {page_count}") print(f" Extracted: {len(text):,} chars, {len(text.split()):,} words") print(f" Time: {elapsed:.1f}s ({elapsed/max(page_count,1):.1f}s/page)") print(f" Est. cost: ${est_cost:.3f}") print(f" Saved to: {out_file.name}") print() print(f"{'=' * 60}") print(f"TOTAL") print(f" Documents: {len(pdfs)}") print(f" Pages: {total_pages}") print(f" Time: {total_time:.1f}s") print(f" Est. cost: ${total_cost:.3f}") if __name__ == "__main__": asyncio.run(main())