Flatten cases directory structure and unify paths

- Remove cases/new|in-progress|completed subdivision (status managed in DB)
- Rename documents/original → documents/originals (consistent plural)
- Move exports from global data/exports/ into cases/{num}/exports/
- Add documents/research/ for case law and analysis files
- Update all agents, scripts, config, web API endpoints, and DB paths

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-04-09 14:33:27 +00:00
parent 4d674bf475
commit 22e819363e
17 changed files with 1203 additions and 62 deletions

View File

@@ -0,0 +1,128 @@
"""Extract ALL PDFs from originals using Google Cloud Vision OCR.
Forces OCR on all pages (ignoring broken text layers).
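NOTE(review): this description previously said the script "then runs
voyage-law-2 embedding benchmark comparing old vs new". No benchmark code is
present in this file -- presumably that step lives elsewhere; confirm.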
"""
import asyncio
import json
import sys
import time
from pathlib import Path
sys.path.insert(0, str(Path(__file__).resolve().parent.parent / "mcp-server" / "src"))
from dotenv import load_dotenv
load_dotenv(Path.home() / ".env")
import fitz
from google.cloud import vision
from legal_mcp import config
# Module-level Vision client, built at import time with the API key taken
# from legal_mcp.config (presumably populated from the ~/.env loaded above).
API_KEY = config.GOOGLE_CLOUD_VISION_API_KEY
client = vision.ImageAnnotatorClient(client_options={"api_key": API_KEY})
# Input PDFs of one hard-coded case; extracted text is written to the sibling
# "extracted" directory next to "originals".
ORIGINALS_DIR = Path("/home/chaim/legal-ai/data/cases/1130-25/documents/originals")
OUTPUT_DIR = ORIGINALS_DIR.parent / "extracted"
# Hebrew abbreviation quote fixer.
#
# Maps abbreviations whose gershayim (") came out as a double yod back to the
# properly quoted spelling.
import re

_ABBREV_FIXES = {
    'עוהייד': 'עוה"ד', 'עוייד': 'עו"ד', 'הנייל': 'הנ"ל',
    'מצייב': 'מצ"ב', 'ביהמייש': 'ביהמ"ש', 'תייז': 'ת"ז',
    'עייי': 'ע"י', 'אחייכ': 'אח"כ', 'סייק': 'ס"ק',
    'דייר': 'ד"ר', 'כדוייח': 'כדו"ח', 'חווייד': 'חוו"ד',
    'מייר': 'מ"ר', 'יחייד': 'יח"ד', 'בייכ': 'ב"כ',
}

# Longest keys first so the longer spelling wins when two keys could match at
# the same position.  The trailing negative lookahead rejects a match that is
# immediately followed by another Hebrew letter (U+05D0..U+05EA): without it
# the keys also fire inside longer ordinary words and corrupt them.  No
# leading boundary is required, so one-letter prefixes glued to the
# abbreviation are still handled.
_ABBREV_PAT = re.compile(
    "(?:"
    + "|".join(re.escape(k) for k in sorted(_ABBREV_FIXES, key=len, reverse=True))
    + r")(?![\u05D0-\u05EA])"
)


def fix_quotes(text):
    """Restore the quote mark in known Hebrew abbreviations.

    Args:
        text: Text to clean; may be empty.

    Returns:
        ``text`` with every occurrence of an ``_ABBREV_FIXES`` key that is not
        followed by a Hebrew letter replaced by its quoted form.  A key that
        is also a complete ordinary word on its own is still replaced; that
        ambiguity cannot be resolved without context.
    """
    return _ABBREV_PAT.sub(lambda m: _ABBREV_FIXES[m.group()], text)
def ocr_page(image_bytes, page_num):
    """Run Vision document-text detection on one page image.

    Args:
        image_bytes: Encoded image content handed to ``vision.Image``.
        page_num: Page number, used only in the printed error message.

    Returns:
        The detected text passed through ``fix_quotes``, or ``""`` when the
        response carries an error message (the error is printed, not raised).
    """
    request_image = vision.Image(content=image_bytes)
    hebrew_hint = vision.ImageContext(language_hints=["he"])
    response = client.document_text_detection(
        image=request_image,
        image_context=hebrew_hint,
    )

    error_message = response.error.message
    if error_message:
        print(f" ERROR page {page_num}: {error_message}")
        return ""

    annotation = response.full_text_annotation
    if annotation:
        raw_text = annotation.text
    else:
        raw_text = ""
    return fix_quotes(raw_text)
def process_pdf(pdf_path):
    """OCR every page of a PDF and return the combined text.

    Each page is rendered at 300 dpi, encoded as PNG and passed to
    ``ocr_page``; a progress line is printed per page.

    Args:
        pdf_path: Path of the PDF (converted with ``str`` before opening).

    Returns:
        A tuple ``(text, page_count, total_time)``: the page texts joined by
        blank lines, the number of pages, and the elapsed seconds for the
        whole document.
    """
    pages_text = []
    # The context manager closes the document even when rendering or OCR
    # raises part-way through, which a trailing doc.close() would not.
    with fitz.open(str(pdf_path)) as doc:
        page_count = len(doc)
        # perf_counter is monotonic, so intervals are immune to clock changes.
        t0 = time.perf_counter()
        for page_num, page in enumerate(doc, start=1):
            img_bytes = page.get_pixmap(dpi=300).tobytes("png")
            # Time only the OCR call, not the rasterization.
            page_start = time.perf_counter()
            text = ocr_page(img_bytes, page_num)
            elapsed = time.perf_counter() - page_start
            pages_text.append(text)
            print(f" Page {page_num}/{page_count}: {len(text):,} chars, {elapsed:.1f}s")
    total_time = time.perf_counter() - t0
    return "\n\n".join(pages_text), page_count, total_time
def main():
    """OCR every PDF in ``ORIGINALS_DIR`` and write one ``.md`` file per PDF.

    A PDF whose output file already exists and is larger than 100 bytes is
    not sent to OCR again; its page count is still read so the totals cover
    all documents.  Prints a per-document and overall summary, then saves the
    per-document statistics as JSON.
    """
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    pdfs = sorted(ORIGINALS_DIR.glob("*.pdf"))
    print(f"Found {len(pdfs)} PDFs\n")
    results = []
    total_pages = 0
    total_time = 0.0
    for pdf in pdfs:
        out_file = OUTPUT_DIR / f"{pdf.stem}.md"
        # Skip already extracted
        if out_file.exists() and out_file.stat().st_size > 100:
            text = out_file.read_text(encoding="utf-8")
            # Open only to count pages; the context manager guarantees close.
            with fitz.open(str(pdf)) as doc:
                pages = len(doc)
            print(f"SKIP (exists): {pdf.name} ({pages} pages, {len(text):,} chars)")
            results.append({"name": pdf.stem, "pages": pages, "chars": len(text), "words": len(text.split()), "time": 0, "skipped": True})
            total_pages += pages
            continue
        print(f"{'=' * 60}")
        print(f" {pdf.name} ({pdf.stat().st_size:,} bytes)")
        text, pages, elapsed = process_pdf(pdf)
        total_pages += pages
        total_time += elapsed
        out_file.write_text(text, encoding="utf-8")
        words = len(text.split())
        print(f" Result: {pages} pages, {len(text):,} chars, {words:,} words, {elapsed:.1f}s")
        print(f" Saved: {out_file.name}\n")
        results.append({"name": pdf.stem, "pages": pages, "chars": len(text), "words": words, "time": elapsed, "skipped": False})
    print(f"\n{'=' * 60}")
    print(f"TOTAL: {len(pdfs)} docs, {total_pages} pages, {total_time:.1f}s")
    # 0.0015 dollars per page, applied to ALL pages including skipped ones
    # (presumably the Vision per-page price -- confirm against current pricing).
    est_cost = total_pages * 0.0015
    print(f"Estimated cost: ${est_cost:.2f}")
    # Save results.  ensure_ascii=False emits non-ASCII names verbatim, so the
    # file must be written as UTF-8 explicitly instead of relying on the
    # locale's default encoding.
    Path("/home/chaim/legal-ai/data/google-vision-extraction.json").write_text(
        json.dumps(results, ensure_ascii=False, indent=2),
        encoding="utf-8",
    )
# Run the extraction only when executed as a script, not when imported.
if __name__ == "__main__":
    main()