Flatten cases directory structure and unify paths
- Remove cases/new|in-progress|completed subdivision (status managed in DB)
- Rename documents/original → documents/originals (consistent plural)
- Move exports from global data/exports/ into cases/{num}/exports/
- Add documents/research/ for case law and analysis files
- Update all agents, scripts, config, web API endpoints, and DB paths
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
128
scripts/extract_all_google_vision.py
Normal file
128
scripts/extract_all_google_vision.py
Normal file
@@ -0,0 +1,128 @@
|
||||
"""Extract ALL PDFs from originals using Google Cloud Vision OCR.
|
||||
Forces OCR on all pages (ignoring broken text layers).
|
||||
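Writes one Markdown file per PDF plus a JSON run summary; the output is meant
for the voyage-law-2 embedding benchmark comparing old vs new extractions
(the benchmark itself is not run by this script).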
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
sys.path.insert(0, str(Path(__file__).resolve().parent.parent / "mcp-server" / "src"))
|
||||
|
||||
from dotenv import load_dotenv
|
||||
load_dotenv(Path.home() / ".env")
|
||||
|
||||
import fitz
|
||||
from google.cloud import vision
|
||||
from legal_mcp import config
|
||||
|
||||
# Google Cloud Vision client, authenticated with the API key held in the
# project config module.
API_KEY = config.GOOGLE_CLOUD_VISION_API_KEY
client = vision.ImageAnnotatorClient(
    client_options={"api_key": API_KEY},
)

# Directory holding the source PDFs, and its sibling directory that receives
# the OCR output.
ORIGINALS_DIR = Path(
    "/home/chaim/legal-ai/data/cases/1130-25/documents/originals"
)
OUTPUT_DIR = ORIGINALS_DIR.with_name("extracted")
|
||||
|
||||
# Hebrew abbreviation quote fixer
|
||||
import re
|
||||
# Known garbled spellings of Hebrew abbreviations, where the gershayim (")
# came out as a doubled yod, mapped to their correct form.
_ABBREV_FIXES = {
    'עוהייד': 'עוה"ד', 'עוייד': 'עו"ד', 'הנייל': 'הנ"ל',
    'מצייב': 'מצ"ב', 'ביהמייש': 'ביהמ"ש', 'תייז': 'ת"ז',
    'עייי': 'ע"י', 'אחייכ': 'אח"כ', 'סייק': 'ס"ק',
    'דייר': 'ד"ר', 'כדוייח': 'כדו"ח', 'חווייד': 'חוו"ד',
    'מייר': 'מ"ר', 'יחייד': 'יח"ד', 'בייכ': 'ב"כ',
}

# Alternatives are ordered longest-first so a longer key is preferred should
# one key ever be a prefix of another.  The trailing negative lookahead
# refuses a match that is immediately followed by another Hebrew letter
# (U+05D0..U+05EA): without it, ordinary words that merely contain a key
# (e.g. "דיירים", tenants) would be rewritten.  No lookbehind is used, so
# abbreviations carrying an attached prefix letter are still fixed.
_ABBREV_PAT = re.compile(
    '(?:'
    + '|'.join(
        re.escape(k) for k in sorted(_ABBREV_FIXES, key=len, reverse=True)
    )
    + ')(?![\u05d0-\u05ea])'
)


def fix_quotes(text: str) -> str:
    """Restore the gershayim in known Hebrew abbreviations.

    Args:
        text: Text in which abbreviations may appear with a doubled yod
            in place of the quote mark.

    Returns:
        ``text`` with every listed garbled abbreviation replaced by its
        correct spelling.  Occurrences that are directly followed by another
        Hebrew letter are left untouched, since they are part of a longer
        word rather than an abbreviation.
    """
    return _ABBREV_PAT.sub(lambda m: _ABBREV_FIXES[m.group()], text)
|
||||
|
||||
|
||||
def ocr_page(image_bytes, page_num):
    """Run Google Cloud Vision document OCR on one rendered page image.

    The image is submitted to ``document_text_detection`` with a Hebrew
    language hint.  If the response carries an error message, that message
    is printed together with ``page_num`` and an empty string is returned.
    Otherwise the recognized text (or an empty string when there is no
    full-text annotation) is returned after passing through ``fix_quotes``.
    """
    request_image = vision.Image(content=image_bytes)
    hebrew_context = vision.ImageContext(language_hints=["he"])
    response = client.document_text_detection(
        image=request_image,
        image_context=hebrew_context,
    )

    error_message = response.error.message
    if error_message:
        print(f" ERROR page {page_num}: {error_message}")
        return ""

    annotation = response.full_text_annotation
    raw_text = annotation.text if annotation else ""
    return fix_quotes(raw_text)
|
||||
|
||||
|
||||
def process_pdf(pdf_path):
    """OCR every page of a PDF and return the combined text.

    Each page is rasterized at 300 DPI, encoded as PNG and passed to
    ``ocr_page``; the PDF's own text layer is never read.  A progress line
    is printed per page.

    Args:
        pdf_path: Location of the PDF to open with PyMuPDF.

    Returns:
        A ``(text, page_count, total_time)`` tuple: the page texts joined by
        blank lines, the number of pages, and the elapsed seconds for the
        whole document.
    """
    doc = fitz.open(str(pdf_path))
    # try/finally so the document handle is released even when rendering or
    # the remote OCR call raises part-way through.
    try:
        page_count = len(doc)
        pages_text = []
        # perf_counter is monotonic, so the timings cannot be skewed by
        # system clock adjustments during a long run.
        t0 = time.perf_counter()

        for index in range(page_count):
            page = doc[index]
            pix = page.get_pixmap(dpi=300)
            img_bytes = pix.tobytes("png")

            page_start = time.perf_counter()
            text = ocr_page(img_bytes, index + 1)
            elapsed = time.perf_counter() - page_start
            pages_text.append(text)
            print(f" Page {index+1}/{page_count}: {len(text):,} chars, {elapsed:.1f}s")
    finally:
        doc.close()

    total_time = time.perf_counter() - t0
    return "\n\n".join(pages_text), page_count, total_time
|
||||
|
||||
|
||||
def main():
    """OCR every PDF in ``ORIGINALS_DIR`` and write the results.

    For each ``*.pdf`` (in sorted order) the extracted text is saved as
    ``<stem>.md`` in ``OUTPUT_DIR``.  A PDF whose output file already exists
    and is larger than 100 bytes is skipped; its page count is still read so
    it appears in the totals.  After the loop a summary is printed and the
    per-document results are saved as JSON.

    The printed cost estimate multiplies ``total_pages`` by 0.0015 and
    therefore also counts pages of skipped documents, whereas the printed
    total time covers only documents processed in this run.
    """
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    pdfs = sorted(ORIGINALS_DIR.glob("*.pdf"))
    print(f"Found {len(pdfs)} PDFs\n")

    results = []
    total_pages = 0
    total_time = 0.0

    for pdf in pdfs:
        out_file = OUTPUT_DIR / f"{pdf.stem}.md"

        # Skip already extracted
        if out_file.exists() and out_file.stat().st_size > 100:
            text = out_file.read_text(encoding="utf-8")
            doc = fitz.open(str(pdf))
            # Release the handle even if reading the page count fails.
            try:
                pages = len(doc)
            finally:
                doc.close()
            print(f"SKIP (exists): {pdf.name} ({pages} pages, {len(text):,} chars)")
            results.append({"name": pdf.stem, "pages": pages, "chars": len(text), "words": len(text.split()), "time": 0, "skipped": True})
            total_pages += pages
            continue

        print(f"{'=' * 60}")
        print(f" {pdf.name} ({pdf.stat().st_size:,} bytes)")

        text, pages, elapsed = process_pdf(pdf)
        total_pages += pages
        total_time += elapsed

        out_file.write_text(text, encoding="utf-8")

        words = len(text.split())
        print(f" Result: {pages} pages, {len(text):,} chars, {words:,} words, {elapsed:.1f}s")
        print(f" Saved: {out_file.name}\n")

        results.append({"name": pdf.stem, "pages": pages, "chars": len(text), "words": words, "time": elapsed, "skipped": False})

    print(f"\n{'=' * 60}")
    print(f"TOTAL: {len(pdfs)} docs, {total_pages} pages, {total_time:.1f}s")
    est_cost = total_pages * 0.0015
    print(f"Estimated cost: ${est_cost:.2f}")

    # Save results.  ensure_ascii=False writes non-ASCII file stems verbatim,
    # so the encoding is pinned to UTF-8 instead of relying on the locale
    # default (which can fail or produce a non-UTF-8 file).
    Path("/home/chaim/legal-ai/data/google-vision-extraction.json").write_text(
        json.dumps(results, ensure_ascii=False, indent=2),
        encoding="utf-8",
    )


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user