Files
legal-ai/scripts/extract_originals.py
Chaim 22e819363e Flatten cases directory structure and unify paths
- Remove cases/new|in-progress|completed subdivision (status managed in DB)
- Rename documents/original → documents/originals (consistent plural)
- Move exports from global data/exports/ into cases/{num}/exports/
- Add documents/research/ for case law and analysis files
- Update all agents, scripts, config, web API endpoints, and DB paths

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-09 14:33:27 +00:00

67 lines
1.9 KiB
Python

"""Extract text from original PDF files using Claude Opus Vision OCR."""
import asyncio
import sys
import time
from pathlib import Path
sys.path.insert(0, str(Path(__file__).resolve().parent.parent / "mcp-server" / "src"))
from dotenv import load_dotenv
load_dotenv(Path.home() / ".env")
from legal_mcp.services import extractor
# Source PDFs for case 1130-25. Extracted Markdown files are written to an
# "extracted" subfolder inside the same directory.
ORIGINALS_DIR = Path(
    "/home/chaim/legal-ai/data/cases/1130-25/documents/originals"
)
OUTPUT_DIR = ORIGINALS_DIR.joinpath("extracted")
async def main():
    """Extract every PDF in ``ORIGINALS_DIR`` to Markdown and print a summary.

    Each ``*.pdf`` directly inside ``ORIGINALS_DIR`` is processed in sorted
    order. The text returned by ``extractor.extract_text`` is written as UTF-8
    to ``OUTPUT_DIR/<pdf stem>.md``, overwriting any existing file of that
    name. Per-document statistics are printed as each file completes, followed
    by totals for documents, pages, elapsed time and estimated cost.

    The cost figure is a flat per-page estimate computed locally; it is not a
    value reported by the extraction service.
    """
    # Rough estimate (Opus: $15/M input, $75/M output, ~1000 tokens per
    # image): about $0.05 per page for image input plus output.
    cost_per_page = 0.05
    separator = "=" * 60

    # Only the final path component is created; ORIGINALS_DIR must exist.
    OUTPUT_DIR.mkdir(exist_ok=True)
    pdfs = sorted(ORIGINALS_DIR.glob("*.pdf"))
    print(f"Found {len(pdfs)} PDFs\n")

    total_cost = 0.0
    total_pages = 0
    total_time = 0.0

    for pdf in pdfs:
        print(separator)
        print(f"Processing: {pdf.name}")
        print(f" Size: {pdf.stat().st_size:,} bytes")

        # perf_counter is monotonic, so the measured duration cannot be
        # distorted by system clock adjustments during a long extraction.
        t0 = time.perf_counter()
        text, page_count = await extractor.extract_text(str(pdf))
        elapsed = time.perf_counter() - t0

        total_pages += page_count
        total_time += elapsed

        est_cost = page_count * cost_per_page
        total_cost += est_cost

        # Save extracted text
        out_file = OUTPUT_DIR / f"{pdf.stem}.md"
        out_file.write_text(text, encoding="utf-8")

        print(f" Pages: {page_count}")
        print(f" Extracted: {len(text):,} chars, {len(text.split()):,} words")
        # max(..., 1) guards the per-page rate against a zero page count.
        print(f" Time: {elapsed:.1f}s ({elapsed/max(page_count,1):.1f}s/page)")
        print(f" Est. cost: ${est_cost:.3f}")
        print(f" Saved to: {out_file.name}")
        print()

    print(separator)
    print("TOTAL")
    print(f" Documents: {len(pdfs)}")
    print(f" Pages: {total_pages}")
    print(f" Time: {total_time:.1f}s")
    print(f" Est. cost: ${total_cost:.3f}")
if __name__ == "__main__":
    # Script entry point: run the async extraction pipeline to completion.
    asyncio.run(main())