- Remove cases/new|in-progress|completed subdivision (status managed in DB)
- Rename documents/original → documents/originals (consistent plural)
- Move exports from global data/exports/ into cases/{num}/exports/
- Add documents/research/ for case law and analysis files
- Update all agents, scripts, config, web API endpoints, and DB paths
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
114 lines
3.4 KiB
Python
114 lines
3.4 KiB
Python
"""Extract text from original PDF files using Claude Opus Vision OCR on ALL pages.
|
|
|
|
Forces Vision OCR regardless of embedded text layer (which may be broken).
|
|
"""
|
|
|
|
import asyncio
|
|
import base64
|
|
import sys
|
|
import time
|
|
from pathlib import Path
|
|
|
|
sys.path.insert(0, str(Path(__file__).resolve().parent.parent / "mcp-server" / "src"))
|
|
|
|
from dotenv import load_dotenv
|
|
load_dotenv(Path.home() / ".env")
|
|
|
|
import anthropic
|
|
import fitz
|
|
from legal_mcp import config
|
|
|
|
# Anthropic SDK client, authenticated with the API key exposed by the project config.
client = anthropic.Anthropic(api_key=config.ANTHROPIC_API_KEY)

# Model identifier sent with every OCR request.
MODEL = "claude-opus-4-20250514"

# Directory holding the source PDFs, and its sibling directory that receives
# the extracted text (same parent, last path component swapped).
ORIGINALS_DIR = Path("/home/chaim/legal-ai/data/cases/1130-25/documents/originals")
OUTPUT_DIR = ORIGINALS_DIR.with_name("extracted")
|
|
|
|
|
|
async def ocr_page(image_bytes: bytes, page_num: int) -> str:
    """Send one rendered page image to the vision model and return its text.

    The Anthropic client used here is synchronous, so the request is run in a
    worker thread; awaiting it no longer blocks the event loop for the whole
    HTTP round-trip.

    Args:
        image_bytes: The page image, PNG-encoded (sent as ``image/png``).
        page_num: Page number supplied by the caller. It is not part of the
            request; it is accepted only to keep the call signature stable.

    Returns:
        The concatenated text of every text block in the model's reply, or
        an empty string when the reply contains no text block.
    """
    b64_image = base64.b64encode(image_bytes).decode("utf-8")

    # Prompt (Hebrew), in English: "Extract all the text from this image. This
    # is a legal document in Hebrew. Preserve the original paragraph
    # structure. If there are headings, mark them. Return only the extracted
    # text, with no additional comments."
    prompt = (
        "חלץ את כל הטקסט מהתמונה הזו. זהו מסמך משפטי בעברית. "
        "שמור על מבנה הפסקאות המקורי. "
        "אם יש כותרות, סמן אותן. "
        "החזר רק את הטקסט המחולץ, ללא הערות נוספות."
    )

    # client.messages.create is a blocking call; run it off the event loop.
    # NOTE(review): a reply longer than max_tokens would be cut off — confirm
    # 4096 is enough for the densest pages.
    message = await asyncio.to_thread(
        client.messages.create,
        model=MODEL,
        max_tokens=4096,
        messages=[{
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "source": {"type": "base64", "media_type": "image/png", "data": b64_image},
                },
                {
                    "type": "text",
                    "text": prompt,
                },
            ],
        }],
    )

    # Join all text blocks instead of indexing content[0], which raises on an
    # empty reply or when the first block is not a text block.
    return "".join(block.text for block in message.content if block.type == "text")
|
|
|
|
|
|
async def process_pdf(pdf_path: Path) -> tuple[str, int, float]:
    """OCR every page of one PDF and return the combined text with statistics.

    Each page is rendered to a PNG at 200 DPI and passed to ``ocr_page``;
    pages are processed one after another and progress is printed per page.

    Args:
        pdf_path: Path to the PDF file to process.

    Returns:
        A 3-tuple ``(full_text, page_count, total_time)``: the page texts
        joined by blank lines, the number of pages in the document, and the
        wall-clock seconds spent on the document.

    Raises:
        Any exception raised while opening, rendering or OCR-ing the document
        propagates to the caller; the document handle is closed first.
    """
    doc = fitz.open(str(pdf_path))
    pages_text: list[str] = []

    t0 = time.time()
    try:
        page_count = len(doc)
        for i in range(page_count):
            page = doc[i]
            pix = page.get_pixmap(dpi=200)
            img_bytes = pix.tobytes("png")

            # Progress line is completed by the second print once OCR returns.
            print(f" Page {i+1}/{page_count}...", end=" ", flush=True)
            pt = time.time()
            text = await ocr_page(img_bytes, i + 1)
            elapsed = time.time() - pt
            pages_text.append(text)
            print(f"{len(text):,} chars, {elapsed:.1f}s")
    finally:
        # Release the document even when rendering or OCR fails part-way.
        doc.close()

    total_time = time.time() - t0
    full_text = "\n\n".join(pages_text)
    return full_text, page_count, total_time
|
|
|
|
|
|
async def main():
    """Run OCR over every PDF in ORIGINALS_DIR and write one .md file per PDF.

    Creates OUTPUT_DIR if needed, processes the PDFs in sorted order, prints a
    per-document report, and finishes with overall totals and a rough cost
    estimate based on a flat per-page rate.
    """
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    pdf_files = sorted(ORIGINALS_DIR.glob("*.pdf"))
    print(f"Found {len(pdf_files)} PDFs — extracting ALL pages with {MODEL}\n")

    separator = "=" * 60
    page_total = 0
    seconds_total = 0.0

    for pdf_path in pdf_files:
        # Header for this document.
        print(separator)
        size_bytes = pdf_path.stat().st_size
        print(f" {pdf_path.name} ({size_bytes:,} bytes)")
        print(separator)

        extracted, n_pages, seconds = await process_pdf(pdf_path)
        page_total = page_total + n_pages
        seconds_total = seconds_total + seconds

        # The extracted text is saved next to the originals, named after the PDF.
        target = OUTPUT_DIR / f"{pdf_path.stem}.md"
        target.write_text(extracted, encoding="utf-8")

        word_count = len(extracted.split())
        # max(..., 1) guards the per-page average against a zero-page document.
        per_page = seconds / max(n_pages, 1)
        print(f" Result: {n_pages} pages, {len(extracted):,} chars, {word_count:,} words")
        print(f" Time: {seconds:.1f}s ({per_page:.1f}s/page)")
        print(f" Saved: {target.name}\n")

    print(separator)
    print(f"TOTAL: {len(pdf_files)} docs, {page_total} pages, {seconds_total:.1f}s")
    # Flat estimate of 0.05 per page, as used by the original script.
    est_cost = page_total * 0.05
    print(f"Estimated cost: ${est_cost:.2f}")
|
|
|
|
|
|
# Script entry point: run the async pipeline to completion when executed directly.
if __name__ == "__main__":
    asyncio.run(main())
|