Clean up scripts/: archive 17, delete 5, add SCRIPTS.md registry

Active scripts (5): auto-sync-cases.sh, backup-db.sh, restore-db.sh, notify.py, bidi_table.py.

Archived (17): one-time migration/seeding scripts whose functionality is now in the MCP server or web API. Moved to scripts/.archive/.

Deleted (5): zero-value scripts (duplicates, hardcoded single-case scripts, debug scripts).

Added scripts/SCRIPTS.md — a registry of all scripts with purpose, status, and what superseded them. CLAUDE.md updated with a rule: any script change requires a SCRIPTS.md update.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-14 16:30:19 +00:00
parent 38e79bbf92
commit 5c9a5d702a
24 changed files with 62 additions and 578 deletions
--- a/scripts/.archive/extract_originals_ocr.py
+++ b/scripts/.archive/extract_originals_ocr.py
@@ -0,0 +1,113 @@
+"""Extract text from original PDF files using Claude Opus Vision OCR on ALL pages.
+
+Forces Vision OCR regardless of embedded text layer (which may be broken).
+"""
+
+import asyncio
+import base64
+import sys
+import time
+from pathlib import Path
+
+# Put the MCP server's ``src`` directory first on sys.path so the
+# ``legal_mcp`` import below resolves.
+# NOTE(review): two ``.parent`` hops from this file's location in
+# scripts/.archive/ resolve to scripts/, not to the directory above it —
+# presumably this was written when the script lived directly in scripts/;
+# confirm the path still reaches mcp-server/src before running.
+sys.path.insert(0, str(Path(__file__).resolve().parent.parent / "mcp-server" / "src"))
+
+from dotenv import load_dotenv
+load_dotenv(Path.home() / ".env")
+
+import anthropic
+import fitz
+from legal_mcp import config
+
+# Synchronous Anthropic client, created at import time with the API key
+# taken from legal_mcp.config.
+client = anthropic.Anthropic(api_key=config.ANTHROPIC_API_KEY)
+# Model identifier passed to every messages.create call.
+MODEL = "claude-opus-4-20250514"
+
+# Hard-coded input directory for a single case (1130-25); every *.pdf in it
+# is processed.
+ORIGINALS_DIR = Path("/home/chaim/legal-ai/data/cases/1130-25/documents/originals")
+# Extracted text is written to a sibling "extracted" directory.
+OUTPUT_DIR = ORIGINALS_DIR.parent / "extracted"
+
+
+async def ocr_page(image_bytes: bytes, page_num: int) -> str:
+    """Transcribe one rendered page image with the Claude vision API.
+
+    Args:
+        image_bytes: PNG-encoded image of the page.
+        page_num: 1-based page number; used only in the truncation warning.
+
+    Returns:
+        The text of the first content block of the model's reply.
+    """
+    b64_image = base64.b64encode(image_bytes).decode("utf-8")
+    # The module-level client is synchronous; run the request in a worker
+    # thread so this coroutine does not block the event loop while waiting.
+    message = await asyncio.to_thread(
+        client.messages.create,
+        model=MODEL,
+        max_tokens=4096,
+        messages=[{
+            "role": "user",
+            "content": [
+                {
+                    "type": "image",
+                    "source": {"type": "base64", "media_type": "image/png", "data": b64_image},
+                },
+                {
+                    "type": "text",
+                    "text": (
+                        "חלץ את כל הטקסט מהתמונה הזו. זהו מסמך משפטי בעברית. "
+                        "שמור על מבנה הפסקאות המקורי. "
+                        "אם יש כותרות, סמן אותן. "
+                        "החזר רק את הטקסט המחולץ, ללא הערות נוספות."
+                    ),
+                },
+            ],
+        }],
+    )
+    if message.stop_reason == "max_tokens":
+        # The reply was cut off at the max_tokens cap, so the returned text is
+        # incomplete. Report on stderr to keep stdout progress lines intact.
+        print(
+            f"WARNING: page {page_num} hit the max_tokens limit; text may be truncated",
+            file=sys.stderr,
+        )
+    return message.content[0].text
+
+
+async def process_pdf(pdf_path: Path) -> tuple[str, int, float]:
+    """OCR every page of a PDF and return the combined text.
+
+    Each page is rendered to a 200-DPI PNG and passed to ``ocr_page``, one
+    page at a time. A progress line is printed for every page.
+
+    Args:
+        pdf_path: Path of the PDF file to process.
+
+    Returns:
+        A ``(full_text, page_count, total_time)`` tuple: the page texts joined
+        by blank lines, the number of pages, and the elapsed seconds.
+    """
+    pages_text = []
+    doc = fitz.open(str(pdf_path))
+    # perf_counter is monotonic, so durations are not skewed by clock changes.
+    t0 = time.perf_counter()
+    try:
+        page_count = len(doc)
+        for i in range(page_count):
+            page = doc[i]
+            pix = page.get_pixmap(dpi=200)
+            img_bytes = pix.tobytes("png")
+
+            print(f"    Page {i+1}/{page_count}...", end=" ", flush=True)
+            pt = time.perf_counter()
+            text = await ocr_page(img_bytes, i + 1)
+            elapsed = time.perf_counter() - pt
+            pages_text.append(text)
+            print(f"{len(text):,} chars, {elapsed:.1f}s")
+    finally:
+        # Release the document even when rendering or OCR raises.
+        doc.close()
+
+    total_time = time.perf_counter() - t0
+    full_text = "\n\n".join(pages_text)
+    return full_text, page_count, total_time
+
+
+async def main():
+    """OCR every PDF in ORIGINALS_DIR and save one .md file per PDF in OUTPUT_DIR.
+
+    Prints statistics for each document, then a grand total with a cost
+    estimate computed at $0.05 per page.
+    """
+    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
+
+    pdf_files = sorted(ORIGINALS_DIR.glob("*.pdf"))
+    print(f"Found {len(pdf_files)} PDFs — extracting ALL pages with {MODEL}\n")
+
+    rule = "=" * 60
+    page_total = 0
+    seconds_total = 0.0
+
+    for pdf_file in pdf_files:
+        print(rule)
+        print(f"  {pdf_file.name} ({pdf_file.stat().st_size:,} bytes)")
+        print(rule)
+
+        extracted, n_pages, took = await process_pdf(pdf_file)
+        page_total += n_pages
+        seconds_total += took
+
+        target = OUTPUT_DIR / f"{pdf_file.stem}.md"
+        target.write_text(extracted, encoding="utf-8")
+
+        word_count = len(extracted.split())
+        # max(..., 1) avoids dividing by zero for a PDF with no pages.
+        per_page = took / max(n_pages, 1)
+        print(f"  Result: {n_pages} pages, {len(extracted):,} chars, {word_count:,} words")
+        print(f"  Time: {took:.1f}s ({per_page:.1f}s/page)")
+        print(f"  Saved: {target.name}\n")
+
+    print(rule)
+    print(f"TOTAL: {len(pdf_files)} docs, {page_total} pages, {seconds_total:.1f}s")
+    print(f"Estimated cost: ${page_total * 0.05:.2f}")
+
+
+# Run the batch OCR only when executed as a script, not when imported.
+if __name__ == "__main__":
+    asyncio.run(main())