Files
legal-ai/scripts/.archive/extract_originals_ocr.py
Chaim 5c9a5d702a Clean up scripts/: archive 17, delete 5, add SCRIPTS.md registry
Active scripts (5): auto-sync-cases.sh, backup-db.sh, restore-db.sh,
notify.py, bidi_table.py

Archived (17): one-time migration/seeding scripts whose functionality
is now in MCP server or web API. Moved to scripts/.archive/

Deleted (5): zero-value scripts (duplicates, hardcoded single-case,
debug scripts)

Added scripts/SCRIPTS.md — registry of all scripts with purpose,
status, and what superseded them. CLAUDE.md updated with rule:
any script change requires SCRIPTS.md update.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-14 16:30:19 +00:00

114 lines
3.4 KiB
Python

"""Extract text from original PDF files using Claude Opus Vision OCR on ALL pages.
Forces Vision OCR regardless of embedded text layer (which may be broken).
"""
import asyncio
import base64
import sys
import time
from pathlib import Path
sys.path.insert(0, str(Path(__file__).resolve().parent.parent / "mcp-server" / "src"))
from dotenv import load_dotenv
load_dotenv(Path.home() / ".env")
import anthropic
import fitz
from legal_mcp import config
# Anthropic API client, authenticated with the key exposed by legal_mcp.config.
client = anthropic.Anthropic(api_key=config.ANTHROPIC_API_KEY)
# Model identifier passed to every OCR request.
MODEL = "claude-opus-4-20250514"
# Hard-coded input directory: the original PDFs of a single case (1130-25).
ORIGINALS_DIR = Path("/home/chaim/legal-ai/data/cases/1130-25/documents/originals")
# Extracted text is written to a sibling "extracted" directory next to "originals".
OUTPUT_DIR = ORIGINALS_DIR.parent / "extracted"
async def ocr_page(image_bytes: bytes, page_num: int) -> str:
    """Extract the text of one rendered page image with the vision model.

    The image is sent base64-encoded together with a Hebrew instruction that
    asks for the page text only, with paragraph structure preserved and
    headings marked.

    Args:
        image_bytes: PNG-encoded image of the page.
        page_num: 1-based page number; used only in the truncation warning.

    Returns:
        The concatenated text of every text block in the model's reply
        (an empty string if the reply holds no text block).
    """
    import asyncio
    import sys

    b64_image = base64.b64encode(image_bytes).decode("utf-8")

    # `client` is the synchronous SDK client; run the request in a worker
    # thread so this coroutine does not block the event loop while waiting.
    message = await asyncio.to_thread(
        client.messages.create,
        model=MODEL,
        max_tokens=4096,
        messages=[{
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "source": {"type": "base64", "media_type": "image/png", "data": b64_image},
                },
                {
                    "type": "text",
                    "text": (
                        "חלץ את כל הטקסט מהתמונה הזו. זהו מסמך משפטי בעברית. "
                        "שמור על מבנה הפסקאות המקורי. "
                        "אם יש כותרות, סמן אותן. "
                        "החזר רק את הטקסט המחולץ, ללא הערות נוספות."
                    ),
                },
            ],
        }],
    )

    # A reply cut off at max_tokens means the page text is incomplete; say so
    # on stderr instead of silently saving a truncated page.
    if getattr(message, "stop_reason", None) == "max_tokens":
        print(
            f"WARNING: OCR output for page {page_num} hit max_tokens and may be truncated",
            file=sys.stderr,
        )

    # Join all text blocks rather than indexing content[0], which fails on an
    # empty reply and drops any text block after the first.
    return "".join(
        block.text
        for block in message.content
        if getattr(block, "type", None) == "text"
    )
async def process_pdf(pdf_path: Path) -> tuple[str, int, float]:
    """OCR every page of one PDF and return the combined text.

    Each page is rendered to a PNG at 200 DPI and passed to ``ocr_page``;
    per-page progress (character count and seconds) is printed as it goes.

    Args:
        pdf_path: Path of the PDF file to process.

    Returns:
        A ``(full_text, page_count, total_time)`` tuple: the page texts joined
        by blank lines, the number of pages, and the elapsed wall-clock
        seconds for the whole document.
    """
    doc = fitz.open(str(pdf_path))
    pages_text: list[str] = []
    try:
        page_count = len(doc)
        t0 = time.time()
        for i, page in enumerate(doc):
            pix = page.get_pixmap(dpi=200)
            img_bytes = pix.tobytes("png")
            print(f" Page {i+1}/{page_count}...", end=" ", flush=True)
            pt = time.time()
            text = await ocr_page(img_bytes, i + 1)
            elapsed = time.time() - pt
            pages_text.append(text)
            print(f"{len(text):,} chars, {elapsed:.1f}s")
    finally:
        # Release the document handle even when rendering or OCR fails.
        doc.close()
    total_time = time.time() - t0
    full_text = "\n\n".join(pages_text)
    return full_text, page_count, total_time
async def main():
    """Batch entry point: OCR every PDF in ORIGINALS_DIR into OUTPUT_DIR.

    Creates the output directory if needed, processes the PDFs in sorted
    order, writes one UTF-8 ``<stem>.md`` file per PDF, and prints
    per-document statistics followed by overall totals and a flat
    per-page cost estimate.
    """
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    pdfs = sorted(ORIGINALS_DIR.glob("*.pdf"))
    print(f"Found {len(pdfs)} PDFs — extracting ALL pages with {MODEL}\n")

    # Running totals across all documents.
    total_pages, total_time = 0, 0.0

    for pdf in pdfs:
        # Banner announcing the document about to be processed.
        print(f"{'=' * 60}")
        print(f" {pdf.name} ({pdf.stat().st_size:,} bytes)")
        print(f"{'=' * 60}")

        result = await process_pdf(pdf)
        text, pages, elapsed = result
        total_pages = total_pages + pages
        total_time = total_time + elapsed

        # One markdown file per PDF, named after the PDF's stem.
        out_file = OUTPUT_DIR / f"{pdf.stem}.md"
        out_file.write_text(text, encoding="utf-8")

        print(f" Result: {pages} pages, {len(text):,} chars, {len(text.split()):,} words")
        print(f" Time: {elapsed:.1f}s ({elapsed/max(pages,1):.1f}s/page)")
        print(f" Saved: {out_file.name}\n")

    # Summary over the whole run.
    print(f"{'=' * 60}")
    print(f"TOTAL: {len(pdfs)} docs, {total_pages} pages, {total_time:.1f}s")
    # Flat 0.05 per page, as hard-coded here; not derived from actual usage.
    est_cost = total_pages * 0.05
    print(f"Estimated cost: ${est_cost:.2f}")


if __name__ == "__main__":
    asyncio.run(main())