From 6aaca14e31f262e4425b5edc11c4575c21b11d59 Mon Sep 17 00:00:00 2001
From: Chaim <chaim@marcus-law.co.il>
Date: Wed, 8 Apr 2026 20:17:58 +0000
Subject: [PATCH] Replace Claude Vision OCR with Google Cloud Vision

Benchmark results on Hebrew legal docs (case 1130-25):
- Google Vision: 1s/page, $0.001/page, high accuracy
- Claude Opus Vision: 90s/page, $0.05/page, poor accuracy
- PyMuPDF broken OCR layers now detected via quality check

Changes:
- extractor.py: Google Vision OCR with Hebrew language hint (300 DPI)
- extractor.py: text quality detection (word length, words-per-line, Hebrew ratio)
- extractor.py: Hebrew abbreviation quote fixer (15 known patterns)
- config.py: add GOOGLE_CLOUD_VISION_API_KEY, remove ANTHROPIC_API_KEY
- pyproject.toml: add google-cloud-vision, remove anthropic

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 mcp-server/pyproject.toml                     |   2 +-
 mcp-server/src/legal_mcp/config.py            |   6 +-
 .../src/legal_mcp/services/extractor.py       | 174 +++++++++++++-----
 3 files changed, 128 insertions(+), 54 deletions(-)

diff --git a/mcp-server/pyproject.toml b/mcp-server/pyproject.toml
index 00fa7fd..88a7a69 100644
--- a/mcp-server/pyproject.toml
+++ b/mcp-server/pyproject.toml
@@ -8,7 +8,6 @@ dependencies = [
     "asyncpg>=0.29.0",
     "pgvector>=0.3.0",
     "voyageai>=0.3.0",
-    "anthropic>=0.40.0",
     "python-dotenv>=1.0.0",
     "pydantic>=2.0.0",
     "pymupdf>=1.25.0",
@@ -17,6 +16,7 @@ dependencies = [
     "redis>=5.0.0",
     "rq>=1.16.0",
     "pillow>=10.0.0",
+    "google-cloud-vision>=3.7.0",
 ]
 
 [build-system]
diff --git a/mcp-server/src/legal_mcp/config.py b/mcp-server/src/legal_mcp/config.py
index e8d53ca..a224388 100644
--- a/mcp-server/src/legal_mcp/config.py
+++ b/mcp-server/src/legal_mcp/config.py
@@ -47,8 +47,8 @@ VOYAGE_API_KEY = os.environ.get("VOYAGE_API_KEY", "")
 VOYAGE_MODEL = os.environ.get("VOYAGE_MODEL", "voyage-law-2")
 VOYAGE_DIMENSIONS = 1024
 
-# Anthropic (for Claude Vision OCR)
-ANTHROPIC_API_KEY = os.environ.get("ANTHROPIC_API_KEY", "")
+# Google Cloud Vision (OCR for scanned PDFs)
+GOOGLE_CLOUD_VISION_API_KEY = os.environ.get("GOOGLE_CLOUD_VISION_API_KEY", "")
 
 # Data directory
 DATA_DIR = Path(os.environ.get("DATA_DIR", str(Path.home() / "legal-ai" / "data")))
@@ -82,8 +82,8 @@ CHUNK_OVERLAP_TOKENS = 100
 
 # External service allowlist — case materials may ONLY be sent to these domains
 ALLOWED_EXTERNAL_SERVICES = {
-    "api.anthropic.com",        # Claude API (text generation, OCR)
     "api.voyageai.com",         # Voyage AI (embeddings)
+    "vision.googleapis.com",    # Google Cloud Vision (OCR)
 }
 
 # Audit
diff --git a/mcp-server/src/legal_mcp/services/extractor.py b/mcp-server/src/legal_mcp/services/extractor.py
index 09df0f3..91bbed2 100644
--- a/mcp-server/src/legal_mcp/services/extractor.py
+++ b/mcp-server/src/legal_mcp/services/extractor.py
@@ -1,32 +1,118 @@
 """Text extraction from PDF, DOCX, and RTF files.
 
-Primary PDF extraction: Claude Vision API (for scanned documents).
-Fallback: PyMuPDF direct text extraction (for born-digital PDFs).
+Primary PDF extraction: PyMuPDF direct text (for born-digital PDFs).
+Fallback: Google Cloud Vision OCR (for scanned documents).
+Post-processing: Hebrew abbreviation quote fixer.
 """
 
 from __future__ import annotations
 
-import base64
+import asyncio
 import logging
+import re
 from pathlib import Path
 
-import anthropic
 import fitz  # PyMuPDF
 from docx import Document as DocxDocument
+from google.cloud import vision
 from striprtf.striprtf import rtf_to_text
 
 from legal_mcp import config
 
 logger = logging.getLogger(__name__)
 
-_anthropic_client: anthropic.Anthropic | None = None
+# ── Google Cloud Vision client ───────────────────────────────────
+
+_vision_client: vision.ImageAnnotatorClient | None = None
 
 
-def _get_anthropic() -> anthropic.Anthropic:
-    global _anthropic_client
-    if _anthropic_client is None:
-        _anthropic_client = anthropic.Anthropic(api_key=config.ANTHROPIC_API_KEY)
-    return _anthropic_client
+def _get_vision_client() -> vision.ImageAnnotatorClient:
+    global _vision_client
+    if _vision_client is None:
+        _vision_client = vision.ImageAnnotatorClient(
+            client_options={"api_key": config.GOOGLE_CLOUD_VISION_API_KEY}
+        )
+    return _vision_client
+
+
+# ── Hebrew text quality detection ────────────────────────────────
+
+_HEBREW_RE = re.compile(r'[\u0590-\u05FF]')
+_WORD_RE = re.compile(r'\S+')
+
+
+def _text_quality_ok(text: str) -> bool:
+    """Check if extracted text is real content vs broken OCR layer.
+
+    Returns True if text appears to be genuine Hebrew legal content.
+    Broken OCR layers from scanned PDFs often have:
+    - Very short words / single-character fragments
+    - Each word on its own line (high words-per-line ratio)
+    - Non-Hebrew characters mixed in
+    """
+    words = _WORD_RE.findall(text)
+    if len(words) < 10:
+        return False
+
+    # Average word length — real Hebrew words avg 4-6 chars.
+    avg_len = sum(len(w) for w in words) / len(words)
+    if avg_len < 2.5:
+        return False
+
+    # Percentage of single-character "words"
+    single_char_pct = sum(1 for w in words if len(w) == 1) / len(words)
+    if single_char_pct > 0.4:
+        return False
+
+    # Words per line — broken OCR puts each word on its own line.
+    # Real text has 5-15 words per line; broken OCR has ~1-2.
+    lines = [l for l in text.split("\n") if l.strip()]
+    if lines:
+        words_per_line = len(words) / len(lines)
+        if words_per_line < 3.0:
+            return False
+
+    # Hebrew character ratio among letter characters
+    letters = re.findall(r'[a-zA-Z\u0590-\u05FF]', text)
+    if letters:
+        hebrew_pct = sum(1 for c in letters if _HEBREW_RE.match(c)) / len(letters)
+        if hebrew_pct < 0.5:
+            return False
+
+    return True
+
+
+# ── Hebrew abbreviation quote fixer ──────────────────────────────
+
+_HEBREW_ABBREV_FIXES: dict[str, str] = {
+    'עוהייד': 'עוה"ד',
+    'עוייד': 'עו"ד',
+    'הנייל': 'הנ"ל',
+    'מצייב': 'מצ"ב',
+    'ביהמייש': 'ביהמ"ש',
+    'תייז': 'ת"ז',
+    'עייי': 'ע"י',
+    'אחייכ': 'אח"כ',
+    'סייק': 'ס"ק',
+    'דייר': 'ד"ר',
+    'כדוייח': 'כדו"ח',
+    'חווייד': 'חוו"ד',
+    'מייר': 'מ"ר',
+    'יחייד': 'יח"ד',
+    'בייכ': 'ב"כ',
+}
+
+_ABBREV_PATTERN = re.compile(
+    '|'.join(re.escape(k) for k in sorted(_HEBREW_ABBREV_FIXES, key=len, reverse=True))
+)
+
+
+def _fix_hebrew_quotes(text: str) -> str:
+    """Fix known Hebrew abbreviation quote replacements from Google Vision OCR."""
+    return _ABBREV_PATTERN.sub(lambda m: _HEBREW_ABBREV_FIXES[m.group()], text)
+
+
+# ── Extraction ───────────────────────────────────────────────────
 
 
 async def extract_text(file_path: str) -> tuple[str, int]:
@@ -52,65 +138,53 @@ async def extract_text(file_path: str) -> tuple[str, int]:
 
 
 async def _extract_pdf(path: Path) -> tuple[str, int]:
-    """Extract text from PDF. Try direct text first, fall back to Claude Vision for scanned pages."""
+    """Extract text from PDF.
+
+    Try direct text first, fall back to Google Cloud Vision for scanned
+    or broken-OCR pages.
+    """
     doc = fitz.open(str(path))
     page_count = len(doc)
     pages_text: list[str] = []
 
     for page_num in range(page_count):
         page = doc[page_num]
-        # Try direct text extraction first
         text = page.get_text().strip()
 
-        if len(text) > 50:
-            # Sufficient text found - born-digital page
+        if len(text) > 50 and _text_quality_ok(text):
             pages_text.append(text)
-            logger.debug("Page %d: direct text extraction (%d chars)", page_num + 1, len(text))
+            logger.debug("Page %d: direct extraction (%d chars, quality OK)", page_num + 1, len(text))
         else:
-            # Likely scanned - use Claude Vision
-            logger.info("Page %d: using Claude Vision OCR", page_num + 1)
-            pix = page.get_pixmap(dpi=200)
+            reason = "insufficient text" if len(text) <= 50 else "low quality OCR layer"
+            logger.info("Page %d: Google Vision OCR (%s)", page_num + 1, reason)
+            pix = page.get_pixmap(dpi=300)
             img_bytes = pix.tobytes("png")
-            ocr_text = await _ocr_with_claude(img_bytes, page_num + 1)
+            ocr_text = await asyncio.to_thread(
+                _ocr_with_google_vision, img_bytes, page_num + 1
+            )
             pages_text.append(ocr_text)
 
     doc.close()
     return "\n\n".join(pages_text), page_count
 
 
-async def _ocr_with_claude(image_bytes: bytes, page_num: int) -> str:
-    """OCR a single page image using Claude Vision API."""
-    client = _get_anthropic()
-    b64_image = base64.b64encode(image_bytes).decode("utf-8")
+def _ocr_with_google_vision(image_bytes: bytes, page_num: int) -> str:
+    """OCR a single page image using Google Cloud Vision API."""
+    client = _get_vision_client()
+    image = vision.Image(content=image_bytes)
 
-    message = client.messages.create(
-        model="claude-sonnet-4-20250514",
-        max_tokens=4096,
-        messages=[
-            {
-                "role": "user",
-                "content": [
-                    {
-                        "type": "image",
-                        "source": {
-                            "type": "base64",
-                            "media_type": "image/png",
-                            "data": b64_image,
-                        },
-                    },
-                    {
-                        "type": "text",
-                        "text": (
-                            "חלץ את כל הטקסט מהתמונה הזו. זהו מסמך משפטי בעברית. "
-                            "שמור על מבנה הפסקאות המקורי. "
-                            "החזר רק את הטקסט המחולץ, ללא הערות נוספות."
-                        ),
-                    },
-                ],
-            }
-        ],
+    response = client.document_text_detection(
+        image=image,
+        image_context=vision.ImageContext(language_hints=["he"]),
     )
-    return message.content[0].text
+
+    if response.error.message:
+        raise RuntimeError(
+            f"Google Vision error on page {page_num}: {response.error.message}"
+        )
+
+    text = response.full_text_annotation.text if response.full_text_annotation else ""
+    return _fix_hebrew_quotes(text)
 
 
 def _extract_docx(path: Path) -> str: