Active scripts (5): auto-sync-cases.sh, backup-db.sh, restore-db.sh, notify.py, bidi_table.py. Archived (17): one-time migration/seeding scripts whose functionality is now in the MCP server or web API; moved to scripts/.archive/. Deleted (5): zero-value scripts (duplicates, hardcoded single-case, debug scripts). Added scripts/SCRIPTS.md — a registry of all scripts with purpose, status, and what superseded them. CLAUDE.md updated with a rule: any script change requires a SCRIPTS.md update. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
67 lines
1.9 KiB
Python
67 lines
1.9 KiB
Python
"""Extract text from original PDF files using Claude Opus Vision OCR."""
|
|
|
|
import asyncio
|
|
import sys
|
|
import time
|
|
from pathlib import Path
|
|
|
|
sys.path.insert(0, str(Path(__file__).resolve().parent.parent / "mcp-server" / "src"))
|
|
|
|
from dotenv import load_dotenv
|
|
load_dotenv(Path.home() / ".env")
|
|
|
|
from legal_mcp.services import extractor
|
|
|
|
# Directory holding the original PDF files to be processed.
ORIGINALS_DIR = Path("/home/chaim/legal-ai/data/cases/1130-25/documents/originals")
# Extracted text is written to an "extracted" subdirectory of the originals.
OUTPUT_DIR = ORIGINALS_DIR.joinpath("extracted")
|
|
|
|
|
|
async def main():
    """Extract text from every PDF in ORIGINALS_DIR and print a run summary.

    For each ``*.pdf`` file directly inside ``ORIGINALS_DIR`` (processed in
    sorted order), awaits ``extractor.extract_text``, writes the returned text
    to ``OUTPUT_DIR/<stem>.md`` as UTF-8, and prints per-file statistics.
    After the loop, prints totals for documents, pages, elapsed time and
    estimated cost.

    The cost figures are rough estimates based on a flat per-page rate; they
    are not derived from actual API usage.
    """
    # Rough flat rate: ~$0.05 per page for image input + output
    # (Opus: $15/M input, $75/M output, ~1000 tokens per image).
    cost_per_page = 0.05

    OUTPUT_DIR.mkdir(exist_ok=True)

    pdfs = sorted(ORIGINALS_DIR.glob("*.pdf"))
    print(f"Found {len(pdfs)} PDFs\n")

    total_cost = 0.0
    total_pages = 0
    total_time = 0.0

    for pdf in pdfs:
        print("=" * 60)
        print(f"Processing: {pdf.name}")
        print(f" Size: {pdf.stat().st_size:,} bytes")

        # perf_counter is monotonic, so the measured interval cannot be
        # skewed by system clock adjustments the way time.time() can.
        t0 = time.perf_counter()
        text, page_count = await extractor.extract_text(str(pdf))
        elapsed = time.perf_counter() - t0

        total_pages += page_count
        total_time += elapsed

        est_cost = page_count * cost_per_page
        total_cost += est_cost

        # Save extracted text
        out_file = OUTPUT_DIR / f"{pdf.stem}.md"
        out_file.write_text(text, encoding="utf-8")

        print(f" Pages: {page_count}")
        print(f" Extracted: {len(text):,} chars, {len(text.split()):,} words")
        # max(..., 1) guards against division by zero when no pages are reported.
        print(f" Time: {elapsed:.1f}s ({elapsed/max(page_count,1):.1f}s/page)")
        print(f" Est. cost: ${est_cost:.3f}")
        print(f" Saved to: {out_file.name}")
        print()

    print("=" * 60)
    print("TOTAL")
    print(f" Documents: {len(pdfs)}")
    print(f" Pages: {total_pages}")
    print(f" Time: {total_time:.1f}s")
    print(f" Est. cost: ${total_cost:.3f}")
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: run the async extraction pipeline to completion.
    asyncio.run(main())
|