Files
legal-ai/scripts/extract_google_vision_single.py
Chaim 22e819363e Flatten cases directory structure and unify paths
- Remove cases/new|in-progress|completed subdivision (status managed in DB)
- Rename documents/original → documents/originals (consistent plural)
- Move exports from global data/exports/ into cases/{num}/exports/
- Add documents/research/ for case law and analysis files
- Update all agents, scripts, config, web API endpoints, and DB paths

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-09 14:33:27 +00:00

55 lines
1.7 KiB
Python

"""Extract text from a single PDF using Google Cloud Vision API."""
import os
import sys
import time
from pathlib import Path

import fitz
from google.cloud import vision
# Google Cloud Vision API key, taken from the GOOGLE_VISION_API_KEY environment
# variable so that the credential is never stored in the repository. The value
# is an empty string when the variable is not set.
# NOTE(review): a literal key used to be committed on this line and is still
# present in the git history -- revoke/rotate it in the Google Cloud console.
API_KEY = os.environ.get("GOOGLE_VISION_API_KEY", "")

# Destination directory for the extracted text files.
OUTPUT_DIR = Path("/home/chaim/legal-ai/data/cases/1130-25/documents/extracted")
def main():
    """Extract the text of one PDF with Google Cloud Vision and save it.

    The PDF path is taken from ``sys.argv[1]``. Every page is rendered to a
    300 DPI PNG, sent to ``document_text_detection`` with a Hebrew language
    hint, and the per-page results are joined with blank lines and written
    as UTF-8 to ``OUTPUT_DIR / "<pdf stem>.md"``.

    A page for which the API reports an error is printed as ``ERROR`` and
    contributes an empty string, so the remaining pages are still processed.

    Raises:
        SystemExit: if no PDF path was supplied, the path is not an existing
            file, or ``API_KEY`` is empty.
    """
    # Validate inputs up front instead of failing later with an IndexError
    # or an opaque error from the PDF library / the API client.
    if len(sys.argv) < 2:
        sys.exit(f"Usage: {Path(sys.argv[0]).name} <pdf-file>")
    pdf_path = Path(sys.argv[1])
    if not pdf_path.is_file():
        sys.exit(f"PDF not found: {pdf_path}")
    if not API_KEY:
        sys.exit("No Google Cloud Vision API key configured")

    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    client = vision.ImageAnnotatorClient(client_options={"api_key": API_KEY})

    pages_text = []
    total_time = 0.0

    # The context manager closes the document even when rendering or the
    # API call raises part-way through the loop.
    with fitz.open(str(pdf_path)) as doc:
        page_count = len(doc)
        print(f"Processing: {pdf_path.name} ({page_count} pages)\n")

        for page_number, page in enumerate(doc, start=1):
            pix = page.get_pixmap(dpi=300)
            img_bytes = pix.tobytes("png")
            image = vision.Image(content=img_bytes)

            # flush=True so the progress prefix is visible while the
            # request is still in flight.
            print(f"  Page {page_number}/{page_count}...", end=" ", flush=True)

            # perf_counter is monotonic, so the measured duration cannot be
            # skewed by system clock adjustments.
            t0 = time.perf_counter()
            response = client.document_text_detection(
                image=image,
                image_context={"language_hints": ["he"]},
            )
            elapsed = time.perf_counter() - t0
            total_time += elapsed

            if response.error.message:
                print(f"ERROR: {response.error.message}")
                # Keep a placeholder so the page still has an entry in the
                # joined output.
                pages_text.append("")
                continue

            annotation = response.full_text_annotation
            text = annotation.text if annotation else ""
            pages_text.append(text)
            print(f"{len(text):,} chars, {elapsed:.1f}s")

    full_text = "\n\n".join(pages_text)
    out_file = OUTPUT_DIR / f"{pdf_path.stem}.md"
    out_file.write_text(full_text, encoding="utf-8")

    print(f"\nTotal: {len(full_text):,} chars, {len(full_text.split()):,} words, {total_time:.1f}s")
    print(f"Saved: {out_file}")
# Run the extraction only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == "__main__":
    main()