Replace Claude Vision OCR with Google Cloud Vision

Benchmark results on Hebrew legal docs (case 1130-25):
- Google Vision: 1s/page, $0.001/page, high accuracy
- Claude Opus Vision: 90s/page, $0.05/page, poor accuracy
- PyMuPDF: broken embedded OCR text layers are now detected by a text-quality check and re-OCRed with Google Vision

Changes:
- extractor.py: Google Vision OCR with Hebrew language hint (300 DPI)
- extractor.py: text quality detection (word length, words-per-line, Hebrew ratio)
- extractor.py: Hebrew abbreviation quote fixer (15 known patterns)
- config.py: add GOOGLE_CLOUD_VISION_API_KEY, remove ANTHROPIC_API_KEY
- pyproject.toml: add google-cloud-vision, remove anthropic

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-04-08 20:17:58 +00:00
parent bc72a83a71
commit 6aaca14e31
3 changed files with 128 additions and 54 deletions

View File

@@ -8,7 +8,6 @@ dependencies = [
"asyncpg>=0.29.0", "asyncpg>=0.29.0",
"pgvector>=0.3.0", "pgvector>=0.3.0",
"voyageai>=0.3.0", "voyageai>=0.3.0",
"anthropic>=0.40.0",
"python-dotenv>=1.0.0", "python-dotenv>=1.0.0",
"pydantic>=2.0.0", "pydantic>=2.0.0",
"pymupdf>=1.25.0", "pymupdf>=1.25.0",
@@ -17,6 +16,7 @@ dependencies = [
"redis>=5.0.0", "redis>=5.0.0",
"rq>=1.16.0", "rq>=1.16.0",
"pillow>=10.0.0", "pillow>=10.0.0",
"google-cloud-vision>=3.7.0",
] ]
[build-system] [build-system]

View File

@@ -47,8 +47,8 @@ VOYAGE_API_KEY = os.environ.get("VOYAGE_API_KEY", "")
VOYAGE_MODEL = os.environ.get("VOYAGE_MODEL", "voyage-law-2") VOYAGE_MODEL = os.environ.get("VOYAGE_MODEL", "voyage-law-2")
VOYAGE_DIMENSIONS = 1024 VOYAGE_DIMENSIONS = 1024
# Anthropic (for Claude Vision OCR) # Google Cloud Vision (OCR for scanned PDFs)
ANTHROPIC_API_KEY = os.environ.get("ANTHROPIC_API_KEY", "") GOOGLE_CLOUD_VISION_API_KEY = os.environ.get("GOOGLE_CLOUD_VISION_API_KEY", "")
# Data directory # Data directory
DATA_DIR = Path(os.environ.get("DATA_DIR", str(Path.home() / "legal-ai" / "data"))) DATA_DIR = Path(os.environ.get("DATA_DIR", str(Path.home() / "legal-ai" / "data")))
@@ -82,8 +82,8 @@ CHUNK_OVERLAP_TOKENS = 100
# External service allowlist — case materials may ONLY be sent to these domains # External service allowlist — case materials may ONLY be sent to these domains
ALLOWED_EXTERNAL_SERVICES = { ALLOWED_EXTERNAL_SERVICES = {
"api.anthropic.com", # Claude API (text generation, OCR)
"api.voyageai.com", # Voyage AI (embeddings) "api.voyageai.com", # Voyage AI (embeddings)
"vision.googleapis.com", # Google Cloud Vision (OCR)
} }
# Audit # Audit

View File

@@ -1,32 +1,118 @@
"""Text extraction from PDF, DOCX, and RTF files. """Text extraction from PDF, DOCX, and RTF files.
Primary PDF extraction: Claude Vision API (for scanned documents). Primary PDF extraction: PyMuPDF direct text (for born-digital PDFs).
Fallback: PyMuPDF direct text extraction (for born-digital PDFs). Fallback: Google Cloud Vision OCR (for scanned documents).
Post-processing: Hebrew abbreviation quote fixer.
""" """
from __future__ import annotations from __future__ import annotations
import base64 import asyncio
import logging import logging
import re
from pathlib import Path from pathlib import Path
import anthropic
import fitz # PyMuPDF import fitz # PyMuPDF
from docx import Document as DocxDocument from docx import Document as DocxDocument
from google.cloud import vision
from striprtf.striprtf import rtf_to_text from striprtf.striprtf import rtf_to_text
from legal_mcp import config from legal_mcp import config
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
_anthropic_client: anthropic.Anthropic | None = None # ── Google Cloud Vision client ───────────────────────────────────
# Module-wide Vision client; stays None until the first OCR request.
_vision_client: vision.ImageAnnotatorClient | None = None


def _get_vision_client() -> vision.ImageAnnotatorClient:
    """Return the shared Google Cloud Vision client, creating it on first use.

    The client is built with the API key read from
    ``config.GOOGLE_CLOUD_VISION_API_KEY`` and cached in the module-level
    ``_vision_client`` so later calls reuse the same instance.
    """
    global _vision_client
    if _vision_client is not None:
        return _vision_client

    options = {"api_key": config.GOOGLE_CLOUD_VISION_API_KEY}
    _vision_client = vision.ImageAnnotatorClient(client_options=options)
    return _vision_client
# ── Hebrew text quality detection ────────────────────────────────
_HEBREW_RE = re.compile(r'[\u0590-\u05FF]')
_LATIN_RE = re.compile(r'[a-zA-Z]')
_WORD_RE = re.compile(r'\S+')


def _text_quality_ok(text: str) -> bool:
    """Check if extracted text is real content vs broken OCR layer.

    Returns True if text appears to be genuine Hebrew legal content.
    Broken OCR layers from scanned PDFs often have:
    - Very short words / single-character fragments
    - Each word on its own line (low words-per-line ratio)
    - Non-Hebrew characters mixed in

    Args:
        text: Text taken from a single page's embedded text layer.

    Returns:
        False when any heuristic below trips, True otherwise. Text that
        contains no Latin or Hebrew characters at all skips the Hebrew-ratio
        check and is judged by the other heuristics only.
    """
    words = _WORD_RE.findall(text)
    word_count = len(words)
    # Too little material to judge; treat as unusable.
    if word_count < 10:
        return False

    lengths = [len(word) for word in words]

    # Average word length — real Hebrew words avg 4-6 chars.
    if sum(lengths) / word_count < 2.5:
        return False

    # Share of single-character "words".
    if lengths.count(1) / word_count > 0.4:
        return False

    # Words per line — broken OCR puts each word on its own line.
    # Real text has 5-15 words per line; broken OCR has ~1-2.
    line_count = sum(1 for line in text.split("\n") if line.strip())
    if line_count and word_count / line_count < 3.0:
        return False

    # Hebrew share among Latin + Hebrew-block characters, counted with two
    # precompiled patterns instead of one regex match per character.
    hebrew_count = len(_HEBREW_RE.findall(text))
    letter_count = hebrew_count + len(_LATIN_RE.findall(text))
    if letter_count and hebrew_count / letter_count < 0.5:
        return False

    return True
# ── Hebrew abbreviation quote fixer ──────────────────────────────
# OCR misreading -> correct abbreviation. In every entry the double quote
# (gershayim) of the abbreviation came back from OCR as two yods.
_HEBREW_ABBREV_FIXES: dict[str, str] = {
    'עוהייד': 'עוה"ד',
    'עוייד': 'עו"ד',
    'הנייל': 'הנ"ל',
    'מצייב': 'מצ"ב',
    'ביהמייש': 'ביהמ"ש',
    'תייז': 'ת"ז',
    'עייי': 'ע"י',
    'אחייכ': 'אח"כ',
    'סייק': 'ס"ק',
    'דייר': 'ד"ר',
    'כדוייח': 'כדו"ח',
    'חווייד': 'חוו"ד',
    'מייר': 'מ"ר',
    'יחייד': 'יח"ד',
    'בייכ': 'ב"כ',
}

# Longest keys first so a longer misreading is never shadowed by a shorter one.
# The trailing lookahead rejects a match that is followed by another Hebrew
# letter: without it, ordinary words that merely contain a key (for example
# the plural of "tenant", which contains the misread form of the "Dr."
# abbreviation) were rewritten mid-word. No leading boundary is required, so
# abbreviations carrying an attached one-letter prefix are still repaired.
_ABBREV_PATTERN = re.compile(
    '(?:'
    + '|'.join(
        re.escape(k)
        for k in sorted(_HEBREW_ABBREV_FIXES, key=len, reverse=True)
    )
    + r')(?![\u05D0-\u05EA])'
)


def _fix_hebrew_quotes(text: str) -> str:
    """Fix known Hebrew abbreviation quote replacements from Google Vision OCR.

    Args:
        text: OCR output that may contain misread abbreviations.

    Returns:
        ``text`` with every listed misreading that ends a word replaced by
        the abbreviation containing the double quote. Occurrences followed by
        a further Hebrew letter are left untouched.
    """
    return _ABBREV_PATTERN.sub(lambda m: _HEBREW_ABBREV_FIXES[m.group()], text)
# ── Extraction ───────────────────────────────────────────────────
async def extract_text(file_path: str) -> tuple[str, int]: async def extract_text(file_path: str) -> tuple[str, int]:
@@ -52,65 +138,53 @@ async def extract_text(file_path: str) -> tuple[str, int]:
async def _extract_pdf(path: Path) -> tuple[str, int]:
    """Extract text from PDF.

    Try direct text first, fall back to Google Cloud Vision for scanned
    or broken-OCR pages. A page's embedded text is kept only when it is
    longer than 50 characters and passes ``_text_quality_ok``; otherwise the
    page is rendered to a 300 DPI PNG and sent to OCR.

    Args:
        path: Location of the PDF file.

    Returns:
        A ``(text, page_count)`` tuple, where ``text`` is the per-page text
        joined with blank lines.

    Raises:
        RuntimeError: Propagated from ``_ocr_with_google_vision`` when the
            Vision API reports an error for a page.
    """
    pages_text: list[str] = []

    # The context manager closes the document even if OCR raises part-way
    # through; previously close() was skipped on error and the file leaked.
    with fitz.open(str(path)) as doc:
        page_count = len(doc)

        for page_num, page in enumerate(doc, start=1):
            text = page.get_text().strip()

            if len(text) > 50 and _text_quality_ok(text):
                pages_text.append(text)
                logger.debug(
                    "Page %d: direct extraction (%d chars, quality OK)",
                    page_num,
                    len(text),
                )
                continue

            reason = "insufficient text" if len(text) <= 50 else "low quality OCR layer"
            logger.info("Page %d: Google Vision OCR (%s)", page_num, reason)
            pix = page.get_pixmap(dpi=300)
            img_bytes = pix.tobytes("png")
            # The OCR helper is synchronous, so it is run in a worker thread.
            ocr_text = await asyncio.to_thread(
                _ocr_with_google_vision, img_bytes, page_num
            )
            pages_text.append(ocr_text)

    return "\n\n".join(pages_text), page_count
def _ocr_with_google_vision(image_bytes: bytes, page_num: int) -> str:
    """OCR one rendered page image with the Google Cloud Vision API.

    Args:
        image_bytes: Encoded image of the page (the caller passes PNG bytes).
        page_num: 1-based page number, used only in the error message.

    Returns:
        The recognised text after ``_fix_hebrew_quotes``; an empty string
        when the response carries no full-text annotation.

    Raises:
        RuntimeError: If the response contains an error message.
    """
    client = _get_vision_client()
    page_image = vision.Image(content=image_bytes)
    # Hint the recogniser that the page is Hebrew.
    hebrew_context = vision.ImageContext(language_hints=["he"])

    response = client.document_text_detection(
        image=page_image,
        image_context=hebrew_context,
    )

    if response.error.message:
        raise RuntimeError(
            f"Google Vision error on page {page_num}: {response.error.message}"
        )

    annotation = response.full_text_annotation
    if annotation:
        recognised = annotation.text
    else:
        recognised = ""
    return _fix_hebrew_quotes(recognised)
def _extract_docx(path: Path) -> str: def _extract_docx(path: Path) -> str: