Flatten cases directory structure and unify paths

- Remove cases/new|in-progress|completed subdivision (status managed in DB)
- Rename documents/original → documents/originals (consistent plural)
- Move exports from global data/exports/ into cases/{num}/exports/
- Add documents/research/ for case law and analysis files
- Update all agents, scripts, config, web API endpoints, and DB paths

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-09 14:33:27 +00:00
parent 4d674bf475
commit 22e819363e
17 changed files with 1203 additions and 62 deletions
--- a/scripts/extract_all_google_vision.py
+++ b/scripts/extract_all_google_vision.py
@@ -0,0 +1,128 @@
+"""Extract ALL PDFs from originals using Google Cloud Vision OCR.
+Forces OCR on all pages (ignoring broken text layers).
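+Writes one .md text file per PDF plus a JSON summary of the run.
+NOTE: the voyage-law-2 embedding benchmark (comparing old vs new) is not
+performed by this script; only the extraction step is implemented here.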
+"""
+
+import asyncio
+import json
+import sys
+import time
+from pathlib import Path
+
+sys.path.insert(0, str(Path(__file__).resolve().parent.parent / "mcp-server" / "src"))
+
+from dotenv import load_dotenv
+load_dotenv(Path.home() / ".env")
+
+import fitz
+from google.cloud import vision
+from legal_mcp import config
+
+API_KEY = config.GOOGLE_CLOUD_VISION_API_KEY  # key is read from the project's legal_mcp.config module
+client = vision.ImageAnnotatorClient(client_options={"api_key": API_KEY})  # built once at import time; used by ocr_page()
+
+ORIGINALS_DIR = Path("/home/chaim/legal-ai/data/cases/1130-25/documents/originals")  # hard-coded input folder of a single case; edit to target another one
+OUTPUT_DIR = ORIGINALS_DIR.parent / "extracted"  # sibling of originals/, i.e. .../documents/extracted
+
+# Hebrew abbreviation quote fixer: each key spells an abbreviation with a doubled yod where its value has the '"' mark (presumably an OCR misreading - confirm).
+import re
+_ABBREV_FIXES = {
+    'עוהייד': 'עוה"ד', 'עוייד': 'עו"ד', 'הנייל': 'הנ"ל',
+    'מצייב': 'מצ"ב', 'ביהמייש': 'ביהמ"ש', 'תייז': 'ת"ז',
+    'עייי': 'ע"י', 'אחייכ': 'אח"כ', 'סייק': 'ס"ק',
+    'דייר': 'ד"ר', 'כדוייח': 'כדו"ח', 'חווייד': 'חוו"ד',
+    'מייר': 'מ"ר', 'יחייד': 'יח"ד', 'בייכ': 'ב"כ',
+}
+_ABBREV_PAT = re.compile('|'.join(re.escape(k) for k in sorted(_ABBREV_FIXES, key=len, reverse=True)))  # one alternation of all escaped keys, longest keys listed first
+
+def fix_quotes(text):
+    """Restore the '"' mark in known Hebrew abbreviations.
+
+    Every occurrence of a key of ``_ABBREV_FIXES`` in *text* is replaced by
+    its mapped value, except when the match is immediately followed by
+    another letter. Every abbreviation in the table ends right after the
+    single letter that follows the quote mark, so a match that runs straight
+    into more letters is only the beginning of a longer ordinary word and is
+    left untouched instead of being corrupted.
+
+    Args:
+        text: Text to clean up (the OCR output in this script).
+
+    Returns:
+        A new string with the standalone abbreviations rewritten.
+    """
+    def _restore(match):
+        end = match.end()
+        # A letter right after the match means we only hit the prefix of a
+        # longer word, not an abbreviation - keep the original spelling.
+        if end < len(text) and text[end].isalpha():
+            return match.group()
+        return _ABBREV_FIXES[match.group()]
+
+    return _ABBREV_PAT.sub(_restore, text)
+
+
+def ocr_page(image_bytes, page_num):
+    """Run Google Vision document OCR on one rendered page image.
+
+    Args:
+        image_bytes: Encoded image content handed to ``vision.Image``.
+        page_num: Page number, used only in the error message.
+
+    Returns:
+        The recognised text passed through ``fix_quotes``, or an empty string
+        when the API response carries an error message or no annotation.
+    """
+    result = client.document_text_detection(
+        image=vision.Image(content=image_bytes),
+        # Hint the recogniser that the page is Hebrew.
+        image_context=vision.ImageContext(language_hints=["he"]),
+    )
+
+    failure = result.error.message
+    if failure:
+        # Best effort: report the failed page and carry on with empty text.
+        print(f"    ERROR page {page_num}: {failure}")
+        return ""
+
+    annotation = result.full_text_annotation
+    raw_text = annotation.text if annotation else ""
+    return fix_quotes(raw_text)
+
+
+def process_pdf(pdf_path):
+    """OCR every page of a PDF and return the combined text.
+
+    Each page is rendered to a 300 dpi PNG and sent to ``ocr_page``; any text
+    layer embedded in the PDF is not consulted. The document is closed even
+    when rendering or OCR raises part-way through.
+
+    Args:
+        pdf_path: Path of the PDF file to process.
+
+    Returns:
+        Tuple ``(text, page_count, seconds)``: the page texts joined by a
+        blank line, the number of pages, and the wall-clock time spent.
+    """
+    doc = fitz.open(str(pdf_path))
+    try:
+        page_count = len(doc)
+        pages_text = []
+        started = time.time()
+
+        for page_num, page in enumerate(doc, start=1):
+            pix = page.get_pixmap(dpi=300)
+            img_bytes = pix.tobytes("png")
+
+            # Time only the OCR call, not the rendering.
+            page_started = time.time()
+            text = ocr_page(img_bytes, page_num)
+            elapsed = time.time() - page_started
+            pages_text.append(text)
+            print(f"    Page {page_num}/{page_count}: {len(text):,} chars, {elapsed:.1f}s")
+    finally:
+        # Release the file handle even if a page failed to render or OCR.
+        doc.close()
+
+    total_time = time.time() - started
+    return "\n\n".join(pages_text), page_count, total_time
+
+
+def main():
+    """Extract every PDF in ORIGINALS_DIR to text and summarise the run.
+
+    For each ``*.pdf`` (sorted by path) the OCR text is written to
+    ``OUTPUT_DIR/<stem>.md``. A PDF whose output file already exists and is
+    larger than 100 bytes is not processed again; only its page count is
+    read. Per-document statistics are printed and finally saved as JSON.
+
+    NOTE(review): an output file is reused purely on its size, so a document
+    saved with some pages empty because OCR reported an error is still
+    skipped on later runs.
+    """
+    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
+
+    pdfs = sorted(ORIGINALS_DIR.glob("*.pdf"))
+    print(f"Found {len(pdfs)} PDFs\n")
+
+    results = []
+    total_pages = 0
+    total_time = 0.0
+
+    for pdf in pdfs:
+        out_file = OUTPUT_DIR / f"{pdf.stem}.md"
+
+        # Skip already extracted
+        if out_file.exists() and out_file.stat().st_size > 100:
+            text = out_file.read_text(encoding="utf-8")
+            doc = fitz.open(str(pdf))
+            try:
+                pages = len(doc)
+            finally:
+                doc.close()
+            print(f"SKIP (exists): {pdf.name} ({pages} pages, {len(text):,} chars)")
+            results.append({"name": pdf.stem, "pages": pages, "chars": len(text), "words": len(text.split()), "time": 0, "skipped": True})
+            total_pages += pages
+            continue
+
+        print(f"{'=' * 60}")
+        print(f"  {pdf.name} ({pdf.stat().st_size:,} bytes)")
+
+        text, pages, elapsed = process_pdf(pdf)
+        total_pages += pages
+        total_time += elapsed
+
+        out_file.write_text(text, encoding="utf-8")
+
+        words = len(text.split())
+        print(f"  Result: {pages} pages, {len(text):,} chars, {words:,} words, {elapsed:.1f}s")
+        print(f"  Saved: {out_file.name}\n")
+
+        results.append({"name": pdf.stem, "pages": pages, "chars": len(text), "words": words, "time": elapsed, "skipped": False})
+
+    print(f"\n{'=' * 60}")
+    print(f"TOTAL: {len(pdfs)} docs, {total_pages} pages, {total_time:.1f}s")
+    # total_pages also counts skipped documents, so the estimate covers them
+    # too. 0.0015 is presumably the per-page price in USD - confirm.
+    est_cost = total_pages * 0.0015
+    print(f"Estimated cost: ${est_cost:.2f}")
+
+    # Save results. ensure_ascii=False keeps non-ASCII document names
+    # readable, so the file must be written as UTF-8 explicitly rather than
+    # in the locale's default encoding.
+    Path("/home/chaim/legal-ai/data/google-vision-extraction.json").write_text(
+        json.dumps(results, ensure_ascii=False, indent=2),
+        encoding="utf-8",
+    )
+
+
+if __name__ == "__main__":
+    main()