- Remove cases/new|in-progress|completed subdivision (status managed in DB)
- Rename documents/original → documents/originals (consistent plural)
- Move exports from global data/exports/ into cases/{num}/exports/
- Add documents/research/ for case law and analysis files
- Update all agents, scripts, config, web API endpoints, and DB paths
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
67 lines
1.9 KiB
Python
67 lines
1.9 KiB
Python
"""Extract text from PDF using Google Cloud Vision API."""
|
|
|
|
import io
import os
import time
from pathlib import Path

import fitz  # PyMuPDF for rendering pages to images
from google.cloud import vision
|
|
|
|
# SECURITY(review): a Google API key is committed in this file. It should be
# rotated and supplied through the GOOGLE_VISION_API_KEY environment variable.
# The literal is kept only as a backward-compatible fallback so existing
# invocations keep working; remove it once the environment is configured.
API_KEY = (
    os.environ.get("GOOGLE_VISION_API_KEY")
    or "AIzaSyDZgUsxsy_FHkkREU7R_oQLJALU3_V26j8"
)

# Default input PDF to OCR.
PDF_PATH = Path("/home/chaim/legal-ai/data/cases/1130-25/documents/originals/מרק קובר-כתב ערר.pdf")

# Default directory that receives the extracted text file.
OUTPUT_DIR = Path("/home/chaim/legal-ai/data/cases/1130-25/documents/extracted")
|
|
|
|
|
|
def main(
    pdf_path: "Path | str | None" = None,
    output_dir: "Path | str | None" = None,
) -> None:
    """OCR every page of a PDF with Google Cloud Vision and save the text.

    Each page is rendered to a 300-dpi PNG with PyMuPDF and sent to
    ``document_text_detection`` with a Hebrew language hint. The per-page
    texts are joined with blank lines and written as UTF-8 to
    ``<output_dir>/<pdf stem>.md``. A page for which the API response carries
    an error message is reported on stdout and contributes an empty string,
    so the remaining pages are still processed.

    Args:
        pdf_path: PDF to process. Defaults to the module-level ``PDF_PATH``.
        output_dir: Directory for the output file (created if missing).
            Defaults to the module-level ``OUTPUT_DIR``.
    """
    pdf_path = PDF_PATH if pdf_path is None else Path(pdf_path)
    output_dir = OUTPUT_DIR if output_dir is None else Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    client = vision.ImageAnnotatorClient(client_options={"api_key": API_KEY})

    pages_text = []
    total_time = 0.0

    # The context manager closes the document even when rendering or the API
    # call raises; a trailing doc.close() would be skipped on that path.
    with fitz.open(str(pdf_path)) as doc:
        page_count = len(doc)
        print(f"Processing: {pdf_path.name} ({page_count} pages)\n")

        for page_no, page in enumerate(doc, start=1):
            pix = page.get_pixmap(dpi=300)
            image = vision.Image(content=pix.tobytes("png"))

            print(f" Page {page_no}/{page_count}...", end=" ", flush=True)
            # perf_counter is monotonic, so the measured interval is not
            # affected by system clock adjustments.
            started = time.perf_counter()
            response = client.document_text_detection(
                image=image,
                image_context={"language_hints": ["he"]},
            )
            elapsed = time.perf_counter() - started
            total_time += elapsed

            if response.error.message:
                print(f"ERROR: {response.error.message}")
                # Keep a placeholder so the page count of the output is stable.
                pages_text.append("")
                continue

            text = response.full_text_annotation.text if response.full_text_annotation else ""
            pages_text.append(text)
            print(f"{len(text):,} chars, {elapsed:.1f}s")

    full_text = "\n\n".join(pages_text)
    out_file = output_dir / f"{pdf_path.stem}.md"
    out_file.write_text(full_text, encoding="utf-8")

    print(f"\nTotal: {len(full_text):,} chars, {len(full_text.split()):,} words, {total_time:.1f}s")
    print(f"Saved: {out_file}")
|
|
|
|
|
|
# Run the extraction only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|