From 6aaca14e31f262e4425b5edc11c4575c21b11d59 Mon Sep 17 00:00:00 2001 From: Chaim Date: Wed, 8 Apr 2026 20:17:58 +0000 Subject: [PATCH] Replace Claude Vision OCR with Google Cloud Vision Benchmark results on Hebrew legal docs (case 1130-25): - Google Vision: 1s/page, $0.001/page, high accuracy - Claude Opus Vision: 90s/page, $0.05/page, poor accuracy - PyMuPDF broken OCR layers now detected via quality check Changes: - extractor.py: Google Vision OCR with Hebrew language hint (300 DPI) - extractor.py: text quality detection (word length, words-per-line, Hebrew ratio) - extractor.py: Hebrew abbreviation quote fixer (15 known patterns) - config.py: add GOOGLE_CLOUD_VISION_API_KEY, remove ANTHROPIC_API_KEY - pyproject.toml: add google-cloud-vision, remove anthropic Co-Authored-By: Claude Opus 4.6 (1M context) --- mcp-server/pyproject.toml | 2 +- mcp-server/src/legal_mcp/config.py | 6 +- .../src/legal_mcp/services/extractor.py | 174 +++++++++++++----- 3 files changed, 128 insertions(+), 54 deletions(-) diff --git a/mcp-server/pyproject.toml b/mcp-server/pyproject.toml index 00fa7fd..88a7a69 100644 --- a/mcp-server/pyproject.toml +++ b/mcp-server/pyproject.toml @@ -8,7 +8,6 @@ dependencies = [ "asyncpg>=0.29.0", "pgvector>=0.3.0", "voyageai>=0.3.0", - "anthropic>=0.40.0", "python-dotenv>=1.0.0", "pydantic>=2.0.0", "pymupdf>=1.25.0", @@ -17,6 +16,7 @@ dependencies = [ "redis>=5.0.0", "rq>=1.16.0", "pillow>=10.0.0", + "google-cloud-vision>=3.7.0", ] [build-system] diff --git a/mcp-server/src/legal_mcp/config.py b/mcp-server/src/legal_mcp/config.py index e8d53ca..a224388 100644 --- a/mcp-server/src/legal_mcp/config.py +++ b/mcp-server/src/legal_mcp/config.py @@ -47,8 +47,8 @@ VOYAGE_API_KEY = os.environ.get("VOYAGE_API_KEY", "") VOYAGE_MODEL = os.environ.get("VOYAGE_MODEL", "voyage-law-2") VOYAGE_DIMENSIONS = 1024 -# Anthropic (for Claude Vision OCR) -ANTHROPIC_API_KEY = os.environ.get("ANTHROPIC_API_KEY", "") +# Google Cloud Vision (OCR for scanned PDFs) +GOOGLE_CLOUD_VISION_API_KEY = os.environ.get("GOOGLE_CLOUD_VISION_API_KEY", "") # Data directory DATA_DIR = Path(os.environ.get("DATA_DIR", str(Path.home() / "legal-ai" / "data"))) @@ -82,8 +82,8 @@ CHUNK_OVERLAP_TOKENS = 100 # External service allowlist — case materials may ONLY be sent to these domains ALLOWED_EXTERNAL_SERVICES = { - "api.anthropic.com", # Claude API (text generation, OCR) "api.voyageai.com", # Voyage AI (embeddings) + "vision.googleapis.com", # Google Cloud Vision (OCR) } # Audit diff --git a/mcp-server/src/legal_mcp/services/extractor.py b/mcp-server/src/legal_mcp/services/extractor.py index 09df0f3..91bbed2 100644 --- a/mcp-server/src/legal_mcp/services/extractor.py +++ b/mcp-server/src/legal_mcp/services/extractor.py @@ -1,32 +1,118 @@ """Text extraction from PDF, DOCX, and RTF files. -Primary PDF extraction: Claude Vision API (for scanned documents). -Fallback: PyMuPDF direct text extraction (for born-digital PDFs). +Primary PDF extraction: PyMuPDF direct text (for born-digital PDFs). +Fallback: Google Cloud Vision OCR (for scanned documents). +Post-processing: Hebrew abbreviation quote fixer. """ from __future__ import annotations -import base64 +import asyncio import logging +import re from pathlib import Path -import anthropic import fitz # PyMuPDF from docx import Document as DocxDocument +from google.cloud import vision from striprtf.striprtf import rtf_to_text from legal_mcp import config logger = logging.getLogger(__name__) -_anthropic_client: anthropic.Anthropic | None = None +# ── Google Cloud Vision client ─────────────────────────────────── + +_vision_client: vision.ImageAnnotatorClient | None = None -def _get_anthropic() -> anthropic.Anthropic: - global _anthropic_client - if _anthropic_client is None: - _anthropic_client = anthropic.Anthropic(api_key=config.ANTHROPIC_API_KEY) - return _anthropic_client +def _get_vision_client() -> vision.ImageAnnotatorClient: + global _vision_client + if _vision_client is None: + _vision_client = vision.ImageAnnotatorClient( + client_options={"api_key": config.GOOGLE_CLOUD_VISION_API_KEY} + ) + return _vision_client + + +# ── Hebrew text quality detection ──────────────────────────────── + +_HEBREW_RE = re.compile(r'[\u0590-\u05FF]') +_WORD_RE = re.compile(r'\S+') + + +def _text_quality_ok(text: str) -> bool: + """Check if extracted text is real content vs broken OCR layer. + + Returns True if text appears to be genuine Hebrew legal content. + Broken OCR layers from scanned PDFs often have: + - Very short words / single-character fragments + - Each word on its own line (high words-per-line ratio) + - Non-Hebrew characters mixed in + """ + words = _WORD_RE.findall(text) + if len(words) < 10: + return False + + # Average word length — real Hebrew words avg 4-6 chars. + avg_len = sum(len(w) for w in words) / len(words) + if avg_len < 2.5: + return False + + # Percentage of single-character "words" + single_char_pct = sum(1 for w in words if len(w) == 1) / len(words) + if single_char_pct > 0.4: + return False + + # Words per line — broken OCR puts each word on its own line. + # Real text has 5-15 words per line; broken OCR has ~1-2. + lines = [l for l in text.split("\n") if l.strip()] + if lines: + words_per_line = len(words) / len(lines) + if words_per_line < 3.0: + return False + + # Hebrew character ratio among letter characters + letters = re.findall(r'[a-zA-Z\u0590-\u05FF]', text) + if letters: + hebrew_pct = sum(1 for c in letters if _HEBREW_RE.match(c)) / len(letters) + if hebrew_pct < 0.5: + return False + + return True + + +# ── Hebrew abbreviation quote fixer ────────────────────────────── + +_HEBREW_ABBREV_FIXES: dict[str, str] = { + 'עוהייד': 'עוה"ד', + 'עוייד': 'עו"ד', + 'הנייל': 'הנ"ל', + 'מצייב': 'מצ"ב', + 'ביהמייש': 'ביהמ"ש', + 'תייז': 'ת"ז', + 'עייי': 'ע"י', + 'אחייכ': 'אח"כ', + 'סייק': 'ס"ק', + 'דייר': 'ד"ר', + 'כדוייח': 'כדו"ח', + 'חווייד': 'חוו"ד', + 'מייר': 'מ"ר', + 'יחייד': 'יח"ד', + 'בייכ': 'ב"כ', +} + +_ABBREV_PATTERN = re.compile( + '|'.join(re.escape(k) for k in sorted(_HEBREW_ABBREV_FIXES, key=len, reverse=True)) +) + + +def _fix_hebrew_quotes(text: str) -> str: + """Fix known Hebrew abbreviation quote replacements from Google Vision OCR.""" + return _ABBREV_PATTERN.sub(lambda m: _HEBREW_ABBREV_FIXES[m.group()], text) + + +# ── Extraction ─────────────────────────────────────────────────── async def extract_text(file_path: str) -> tuple[str, int]: @@ -52,65 +138,53 @@ async def extract_text(file_path: str) -> tuple[str, int]: async def _extract_pdf(path: Path) -> tuple[str, int]: - """Extract text from PDF. Try direct text first, fall back to Claude Vision for scanned pages.""" + """Extract text from PDF. + + Try direct text first, fall back to Google Cloud Vision for scanned + or broken-OCR pages. + """ doc = fitz.open(str(path)) page_count = len(doc) pages_text: list[str] = [] for page_num in range(page_count): page = doc[page_num] - # Try direct text extraction first text = page.get_text().strip() - if len(text) > 50: - # Sufficient text found - born-digital page + if len(text) > 50 and _text_quality_ok(text): pages_text.append(text) - logger.debug("Page %d: direct text extraction (%d chars)", page_num + 1, len(text)) + logger.debug("Page %d: direct extraction (%d chars, quality OK)", page_num + 1, len(text)) else: - # Likely scanned - use Claude Vision - logger.info("Page %d: using Claude Vision OCR", page_num + 1) - pix = page.get_pixmap(dpi=200) + reason = "insufficient text" if len(text) <= 50 else "low quality OCR layer" + logger.info("Page %d: Google Vision OCR (%s)", page_num + 1, reason) + pix = page.get_pixmap(dpi=300) img_bytes = pix.tobytes("png") - ocr_text = await _ocr_with_claude(img_bytes, page_num + 1) + ocr_text = await asyncio.to_thread( + _ocr_with_google_vision, img_bytes, page_num + 1 + ) pages_text.append(ocr_text) doc.close() return "\n\n".join(pages_text), page_count -async def _ocr_with_claude(image_bytes: bytes, page_num: int) -> str: - """OCR a single page image using Claude Vision API.""" - client = _get_anthropic() - b64_image = base64.b64encode(image_bytes).decode("utf-8") +def _ocr_with_google_vision(image_bytes: bytes, page_num: int) -> str: + """OCR a single page image using Google Cloud Vision API.""" + client = _get_vision_client() + image = vision.Image(content=image_bytes) - message = client.messages.create( - model="claude-sonnet-4-20250514", - max_tokens=4096, - messages=[ - { - "role": "user", - "content": [ - { - "type": "image", - "source": { - "type": "base64", - "media_type": "image/png", - "data": b64_image, - }, - }, - { - "type": "text", - "text": ( - "חלץ את כל הטקסט מהתמונה הזו. זהו מסמך משפטי בעברית. " - "שמור על מבנה הפסקאות המקורי. " - "החזר רק את הטקסט המחולץ, ללא הערות נוספות." - ), - }, - ], - } - ], + response = client.document_text_detection( + image=image, + image_context=vision.ImageContext(language_hints=["he"]), ) - return message.content[0].text + + if response.error.message: + raise RuntimeError( + f"Google Vision error on page {page_num}: {response.error.message}" + ) + + text = response.full_text_annotation.text if response.full_text_annotation else "" + return _fix_hebrew_quotes(text) def _extract_docx(path: Path) -> str: