Clean up scripts/: archive 17, delete 5, add SCRIPTS.md registry
Active scripts (5): auto-sync-cases.sh, backup-db.sh, restore-db.sh, notify.py, bidi_table.py. Archived (17): one-time migration/seeding scripts whose functionality is now in the MCP server or web API; moved to scripts/.archive/. Deleted (5): zero-value scripts (duplicates, hardcoded single-case, debug scripts). Added scripts/SCRIPTS.md — a registry of all scripts with purpose, status, and what superseded them. CLAUDE.md updated with a rule: any script change requires a SCRIPTS.md update. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
113
scripts/.archive/extract_originals_ocr.py
Normal file
113
scripts/.archive/extract_originals_ocr.py
Normal file
@@ -0,0 +1,113 @@
|
||||
"""Extract text from original PDF files using Claude Opus Vision OCR on ALL pages.
|
||||
|
||||
Forces Vision OCR regardless of embedded text layer (which may be broken).
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import base64
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
sys.path.insert(0, str(Path(__file__).resolve().parent.parent / "mcp-server" / "src"))
|
||||
|
||||
from dotenv import load_dotenv
|
||||
load_dotenv(Path.home() / ".env")
|
||||
|
||||
import anthropic
|
||||
import fitz
|
||||
from legal_mcp import config
|
||||
|
||||
# Synchronous Anthropic SDK client; the API key comes from legal_mcp's config.
client = anthropic.Anthropic(api_key=config.ANTHROPIC_API_KEY)

# Model identifier sent with every OCR request.
MODEL = "claude-opus-4-20250514"

# Input directory that is scanned for *.pdf files. The path is absolute and
# hard-coded to one case directory on one machine.
ORIGINALS_DIR = Path("/home/chaim/legal-ai/data/cases/1130-25/documents/originals")

# Extracted text is written to an "extracted" directory next to the originals.
OUTPUT_DIR = ORIGINALS_DIR.with_name("extracted")
|
||||
|
||||
|
||||
async def ocr_page(image_bytes: bytes, page_num: int) -> str:
    """Extract the text of one page image with the Anthropic vision model.

    The image is base64-encoded and sent, together with a Hebrew prompt
    asking for a faithful transcription of a Hebrew legal document, to
    ``MODEL`` through the module-level ``client``.

    Args:
        image_bytes: Raw image data; it is declared to the API as
            ``image/png``.
        page_num: Page number supplied by the caller. It is not used by the
            request and is kept only so existing call sites keep working.

    Returns:
        The text of all text blocks in the model's reply, concatenated. An
        empty string is returned when the reply contains no text block.
    """
    b64_image = base64.b64encode(image_bytes).decode("utf-8")

    # Prompt (Hebrew): extract all text from the image, keep the original
    # paragraph structure, mark headings, return only the extracted text.
    prompt = (
        "חלץ את כל הטקסט מהתמונה הזו. זהו מסמך משפטי בעברית. "
        "שמור על מבנה הפסקאות המקורי. "
        "אם יש כותרות, סמן אותן. "
        "החזר רק את הטקסט המחולץ, ללא הערות נוספות."
    )

    # `client` is the synchronous SDK client. Calling it directly inside a
    # coroutine would block the event loop for the whole HTTP round trip, so
    # the request is handed to a worker thread instead.
    message = await asyncio.to_thread(
        client.messages.create,
        model=MODEL,
        max_tokens=4096,
        messages=[{
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "source": {"type": "base64", "media_type": "image/png", "data": b64_image},
                },
                {
                    "type": "text",
                    "text": prompt,
                },
            ],
        }],
    )

    # Join every text block rather than indexing content[0]: an empty reply
    # no longer raises IndexError and multi-block replies are not truncated.
    return "".join(block.text for block in message.content if block.type == "text")
|
||||
|
||||
|
||||
async def process_pdf(pdf_path: Path) -> tuple[str, int, float]:
    """OCR every page of a PDF and return the combined text with statistics.

    Each page is rendered to a PNG at 200 DPI and passed to ``ocr_page`` one
    page at a time. A progress line (page index, character count, seconds)
    is printed for every page.

    Args:
        pdf_path: Path of the PDF file to process.

    Returns:
        A ``(full_text, page_count, total_time)`` tuple: the per-page texts
        joined with blank lines, the number of pages in the document, and
        the wall-clock seconds spent on it.
    """
    pages_text: list[str] = []

    doc = fitz.open(str(pdf_path))
    t0 = time.time()
    try:
        page_count = len(doc)
        for i in range(page_count):
            page = doc[i]
            pix = page.get_pixmap(dpi=200)
            img_bytes = pix.tobytes("png")

            print(f" Page {i+1}/{page_count}...", end=" ", flush=True)
            pt = time.time()
            text = await ocr_page(img_bytes, i + 1)
            elapsed = time.time() - pt
            pages_text.append(text)
            print(f"{len(text):,} chars, {elapsed:.1f}s")
    finally:
        # Release the document even when rendering or OCR raises.
        doc.close()

    total_time = time.time() - t0
    full_text = "\n\n".join(pages_text)
    return full_text, page_count, total_time
|
||||
|
||||
|
||||
async def main():
    """OCR every PDF in ``ORIGINALS_DIR`` and save the text under ``OUTPUT_DIR``.

    Creates the output directory if needed, processes the PDFs in sorted
    order, writes one ``<stem>.md`` file per PDF, and prints per-document
    and overall statistics plus a cost estimate.
    """
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    pdf_paths = sorted(ORIGINALS_DIR.glob("*.pdf"))
    print(f"Found {len(pdf_paths)} PDFs — extracting ALL pages with {MODEL}\n")

    banner = "=" * 60
    page_total = 0
    seconds_total = 0.0

    for pdf_path in pdf_paths:
        size_bytes = pdf_path.stat().st_size
        print(banner)
        print(f" {pdf_path.name} ({size_bytes:,} bytes)")
        print(banner)

        text, pages, elapsed = await process_pdf(pdf_path)
        page_total += pages
        seconds_total += elapsed

        target = OUTPUT_DIR / f"{pdf_path.stem}.md"
        target.write_text(text, encoding="utf-8")

        word_count = len(text.split())
        # max(pages, 1) guards the per-page average against a zero-page PDF.
        per_page = elapsed / max(pages, 1)
        print(f" Result: {pages} pages, {len(text):,} chars, {word_count:,} words")
        print(f" Time: {elapsed:.1f}s ({per_page:.1f}s/page)")
        print(f" Saved: {target.name}\n")

    print(banner)
    print(f"TOTAL: {len(pdf_paths)} docs, {page_total} pages, {seconds_total:.1f}s")
    # Flat 0.05 per page; NOTE(review): the basis for this rate is not
    # visible in this file — confirm before relying on the estimate.
    print(f"Estimated cost: ${page_total * 0.05:.2f}")
|
||||
|
||||
|
||||
# Script entry point: run the async OCR pipeline to completion when the file
# is executed directly (not when it is imported).
if __name__ == "__main__":
    asyncio.run(main())
|
||||
Reference in New Issue
Block a user