Replace Claude Vision OCR with Google Cloud Vision

Benchmark results on Hebrew legal docs (case 1130-25): - Google Vision: 1s/page, $0.001/page, high accuracy - Claude Opus Vision: 90s/page, $0.05/page, poor accuracy - PyMuPDF broken OCR layers now detected via quality check Changes: - extractor.py: Google Vision OCR with Hebrew language hint (300 DPI) - extractor.py: text quality detection (word length, words-per-line, Hebrew ratio) - extractor.py: Hebrew abbreviation quote fixer (15 known patterns) - config.py: add GOOGLE_CLOUD_VISION_API_KEY, remove ANTHROPIC_API_KEY - pyproject.toml: add google-cloud-vision, remove anthropic Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-08 20:17:58 +00:00
parent bc72a83a71
commit 6aaca14e31
3 changed files with 128 additions and 54 deletions
--- a/mcp-server/pyproject.toml
+++ b/mcp-server/pyproject.toml
@@ -8,7 +8,6 @@ dependencies = [
    "asyncpg>=0.29.0",
    "pgvector>=0.3.0",
    "voyageai>=0.3.0",
    "anthropic>=0.40.0",
    "python-dotenv>=1.0.0",
    "pydantic>=2.0.0",
    "pymupdf>=1.25.0",
@@ -17,6 +16,7 @@ dependencies = [
    "redis>=5.0.0",
    "rq>=1.16.0",
    "pillow>=10.0.0",
    "google-cloud-vision>=3.7.0",
 ]
 [build-system]
--- a/mcp-server/src/legal_mcp/config.py
+++ b/mcp-server/src/legal_mcp/config.py
@@ -47,8 +47,8 @@ VOYAGE_API_KEY = os.environ.get("VOYAGE_API_KEY", "")
 VOYAGE_MODEL = os.environ.get("VOYAGE_MODEL", "voyage-law-2")
 VOYAGE_DIMENSIONS = 1024
-# Anthropic (for Claude Vision OCR)
+# Google Cloud Vision (OCR for scanned PDFs)
-ANTHROPIC_API_KEY = os.environ.get("ANTHROPIC_API_KEY", "")
+GOOGLE_CLOUD_VISION_API_KEY = os.environ.get("GOOGLE_CLOUD_VISION_API_KEY", "")
 # Data directory
 DATA_DIR = Path(os.environ.get("DATA_DIR", str(Path.home() / "legal-ai" / "data")))
@@ -82,8 +82,8 @@ CHUNK_OVERLAP_TOKENS = 100
 # External service allowlist — case materials may ONLY be sent to these domains
 ALLOWED_EXTERNAL_SERVICES = {
    "api.anthropic.com",        # Claude API (text generation, OCR)
    "api.voyageai.com",         # Voyage AI (embeddings)
    "vision.googleapis.com",    # Google Cloud Vision (OCR)
 }
 # Audit
--- a/mcp-server/src/legal_mcp/services/extractor.py
+++ b/mcp-server/src/legal_mcp/services/extractor.py
@@ -1,32 +1,118 @@
 """Text extraction from PDF, DOCX, and RTF files.
-Primary PDF extraction: Claude Vision API (for scanned documents).
+Primary PDF extraction: PyMuPDF direct text (for born-digital PDFs).
-Fallback: PyMuPDF direct text extraction (for born-digital PDFs).
+Fallback: Google Cloud Vision OCR (for scanned documents).
 Post-processing: Hebrew abbreviation quote fixer.
 """
 from __future__ import annotations
-import base64
+import asyncio
 import logging
 import re
 from pathlib import Path
 import anthropic
 import fitz  # PyMuPDF
 from docx import Document as DocxDocument
 from google.cloud import vision
 from striprtf.striprtf import rtf_to_text
 from legal_mcp import config
 logger = logging.getLogger(__name__)
-_anthropic_client: anthropic.Anthropic | None = None
+# ── Google Cloud Vision client ───────────────────────────────────
 _vision_client: vision.ImageAnnotatorClient | None = None
-def _get_anthropic() -> anthropic.Anthropic:
+def _get_vision_client() -> vision.ImageAnnotatorClient:
-    global _anthropic_client
+    global _vision_client
-    if _anthropic_client is None:
+    if _vision_client is None:
-        _anthropic_client = anthropic.Anthropic(api_key=config.ANTHROPIC_API_KEY)
+        _vision_client = vision.ImageAnnotatorClient(
-    return _anthropic_client
+            client_options={"api_key": config.GOOGLE_CLOUD_VISION_API_KEY}
        )
    return _vision_client
 # ── Hebrew text quality detection ────────────────────────────────
 _HEBREW_RE = re.compile(r'[\u0590-\u05FF]')
 _WORD_RE = re.compile(r'\S+')
 def _text_quality_ok(text: str) -> bool:
    """Check if extracted text is real content vs broken OCR layer.
    Returns True if text appears to be genuine Hebrew legal content.
    Broken OCR layers from scanned PDFs often have:
    - Very short words / single-character fragments
    - Each word on its own line (high words-per-line ratio)
    - Non-Hebrew characters mixed in
    """
    words = _WORD_RE.findall(text)
    if len(words) < 10:
        return False
    # Average word length — real Hebrew words avg 4-6 chars.
    avg_len = sum(len(w) for w in words) / len(words)
    if avg_len < 2.5:
        return False
    # Percentage of single-character "words"
    single_char_pct = sum(1 for w in words if len(w) == 1) / len(words)
    if single_char_pct > 0.4:
        return False
    # Words per line — broken OCR puts each word on its own line.
    # Real text has 5-15 words per line; broken OCR has ~1-2.
    lines = [l for l in text.split("\n") if l.strip()]
    if lines:
        words_per_line = len(words) / len(lines)
        if words_per_line < 3.0:
            return False
    # Hebrew character ratio among letter characters
    letters = re.findall(r'[a-zA-Z\u0590-\u05FF]', text)
    if letters:
        hebrew_pct = sum(1 for c in letters if _HEBREW_RE.match(c)) / len(letters)
        if hebrew_pct < 0.5:
            return False
    return True
 # ── Hebrew abbreviation quote fixer ──────────────────────────────
 _HEBREW_ABBREV_FIXES: dict[str, str] = {
    'עוהייד': 'עוה"ד',
    'עוייד': 'עו"ד',
    'הנייל': 'הנ"ל',
    'מצייב': 'מצ"ב',
    'ביהמייש': 'ביהמ"ש',
    'תייז': 'ת"ז',
    'עייי': 'ע"י',
    'אחייכ': 'אח"כ',
    'סייק': 'ס"ק',
    'דייר': 'ד"ר',
    'כדוייח': 'כדו"ח',
    'חווייד': 'חוו"ד',
    'מייר': 'מ"ר',
    'יחייד': 'יח"ד',
    'בייכ': 'ב"כ',
 }
 _ABBREV_PATTERN = re.compile(
    '|'.join(re.escape(k) for k in sorted(_HEBREW_ABBREV_FIXES, key=len, reverse=True))
 )
 def _fix_hebrew_quotes(text: str) -> str:
    """Fix known Hebrew abbreviation quote replacements from Google Vision OCR."""
    return _ABBREV_PATTERN.sub(lambda m: _HEBREW_ABBREV_FIXES[m.group()], text)
 # ── Extraction ───────────────────────────────────────────────────
 async def extract_text(file_path: str) -> tuple[str, int]:
@@ -52,65 +138,53 @@ async def extract_text(file_path: str) -> tuple[str, int]:
 async def _extract_pdf(path: Path) -> tuple[str, int]:
-    """Extract text from PDF. Try direct text first, fall back to Claude Vision for scanned pages."""
+    """Extract text from PDF.
    Try direct text first, fall back to Google Cloud Vision for scanned
    or broken-OCR pages.
    """
    doc = fitz.open(str(path))
    page_count = len(doc)
    pages_text: list[str] = []
    for page_num in range(page_count):
        page = doc[page_num]
        # Try direct text extraction first
        text = page.get_text().strip()
-        if len(text) > 50:
+        if len(text) > 50 and _text_quality_ok(text):
            # Sufficient text found - born-digital page
            pages_text.append(text)
-            logger.debug("Page %d: direct text extraction (%d chars)", page_num + 1, len(text))
+            logger.debug("Page %d: direct extraction (%d chars, quality OK)", page_num + 1, len(text))
        else:
-            # Likely scanned - use Claude Vision
+            reason = "insufficient text" if len(text) <= 50 else "low quality OCR layer"
-            logger.info("Page %d: using Claude Vision OCR", page_num + 1)
+            logger.info("Page %d: Google Vision OCR (%s)", page_num + 1, reason)
-            pix = page.get_pixmap(dpi=200)
+            pix = page.get_pixmap(dpi=300)
            img_bytes = pix.tobytes("png")
-            ocr_text = await _ocr_with_claude(img_bytes, page_num + 1)
+            ocr_text = await asyncio.to_thread(
                _ocr_with_google_vision, img_bytes, page_num + 1
            )
            pages_text.append(ocr_text)
    doc.close()
    return "\n\n".join(pages_text), page_count
-async def _ocr_with_claude(image_bytes: bytes, page_num: int) -> str:
+def _ocr_with_google_vision(image_bytes: bytes, page_num: int) -> str:
-    """OCR a single page image using Claude Vision API."""
+    """OCR a single page image using Google Cloud Vision API."""
-    client = _get_anthropic()
+    client = _get_vision_client()
-    b64_image = base64.b64encode(image_bytes).decode("utf-8")
+    image = vision.Image(content=image_bytes)
-    message = client.messages.create(
+    response = client.document_text_detection(
-        model="claude-sonnet-4-20250514",
+        image=image,
-        max_tokens=4096,
+        image_context=vision.ImageContext(language_hints=["he"]),
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "image",
                        "source": {
                            "type": "base64",
                            "media_type": "image/png",
                            "data": b64_image,
                        },
                    },
                    {
                        "type": "text",
                        "text": (
                            "חלץ את כל הטקסט מהתמונה הזו. זהו מסמך משפטי בעברית. "
                            "שמור על מבנה הפסקאות המקורי. "
                            "החזר רק את הטקסט המחולץ, ללא הערות נוספות."
                        ),
                    },
                ],
            }
        ],
    )
-    return message.content[0].text
+
    if response.error.message:
        raise RuntimeError(
            f"Google Vision error on page {page_num}: {response.error.message}"
        )
    text = response.full_text_annotation.text if response.full_text_annotation else ""
    return _fix_hebrew_quotes(text)
 def _extract_docx(path: Path) -> str: