Replace Claude Vision OCR with Google Cloud Vision
Benchmark results on Hebrew legal docs (case 1130-25):
- Google Vision: 1s/page, $0.001/page, high accuracy
- Claude Opus Vision: 90s/page, $0.05/page, poor accuracy
- PyMuPDF broken OCR layers now detected via quality check

Changes:
- extractor.py: Google Vision OCR with Hebrew language hint (300 DPI)
- extractor.py: text quality detection (word length, words-per-line, Hebrew ratio)
- extractor.py: Hebrew abbreviation quote fixer (15 known patterns)
- config.py: add GOOGLE_CLOUD_VISION_API_KEY, remove ANTHROPIC_API_KEY
- pyproject.toml: add google-cloud-vision, remove anthropic

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -8,7 +8,6 @@ dependencies = [
|
|||||||
"asyncpg>=0.29.0",
|
"asyncpg>=0.29.0",
|
||||||
"pgvector>=0.3.0",
|
"pgvector>=0.3.0",
|
||||||
"voyageai>=0.3.0",
|
"voyageai>=0.3.0",
|
||||||
"anthropic>=0.40.0",
|
|
||||||
"python-dotenv>=1.0.0",
|
"python-dotenv>=1.0.0",
|
||||||
"pydantic>=2.0.0",
|
"pydantic>=2.0.0",
|
||||||
"pymupdf>=1.25.0",
|
"pymupdf>=1.25.0",
|
||||||
@@ -17,6 +16,7 @@ dependencies = [
|
|||||||
"redis>=5.0.0",
|
"redis>=5.0.0",
|
||||||
"rq>=1.16.0",
|
"rq>=1.16.0",
|
||||||
"pillow>=10.0.0",
|
"pillow>=10.0.0",
|
||||||
|
"google-cloud-vision>=3.7.0",
|
||||||
]
|
]
|
||||||
|
|
||||||
[build-system]
|
[build-system]
|
||||||
|
|||||||
@@ -47,8 +47,8 @@ VOYAGE_API_KEY = os.environ.get("VOYAGE_API_KEY", "")
|
|||||||
VOYAGE_MODEL = os.environ.get("VOYAGE_MODEL", "voyage-law-2")
|
VOYAGE_MODEL = os.environ.get("VOYAGE_MODEL", "voyage-law-2")
|
||||||
VOYAGE_DIMENSIONS = 1024
|
VOYAGE_DIMENSIONS = 1024
|
||||||
|
|
||||||
# Anthropic (for Claude Vision OCR)
|
# Google Cloud Vision (OCR for scanned PDFs)
|
||||||
ANTHROPIC_API_KEY = os.environ.get("ANTHROPIC_API_KEY", "")
|
GOOGLE_CLOUD_VISION_API_KEY = os.environ.get("GOOGLE_CLOUD_VISION_API_KEY", "")
|
||||||
|
|
||||||
# Data directory
|
# Data directory
|
||||||
DATA_DIR = Path(os.environ.get("DATA_DIR", str(Path.home() / "legal-ai" / "data")))
|
DATA_DIR = Path(os.environ.get("DATA_DIR", str(Path.home() / "legal-ai" / "data")))
|
||||||
@@ -82,8 +82,8 @@ CHUNK_OVERLAP_TOKENS = 100
|
|||||||
|
|
||||||
# External service allowlist — case materials may ONLY be sent to these domains
|
# External service allowlist — case materials may ONLY be sent to these domains
|
||||||
ALLOWED_EXTERNAL_SERVICES = {
|
ALLOWED_EXTERNAL_SERVICES = {
|
||||||
"api.anthropic.com", # Claude API (text generation, OCR)
|
|
||||||
"api.voyageai.com", # Voyage AI (embeddings)
|
"api.voyageai.com", # Voyage AI (embeddings)
|
||||||
|
"vision.googleapis.com", # Google Cloud Vision (OCR)
|
||||||
}
|
}
|
||||||
|
|
||||||
# Audit
|
# Audit
|
||||||
|
|||||||
@@ -1,32 +1,118 @@
|
|||||||
"""Text extraction from PDF, DOCX, and RTF files.
|
"""Text extraction from PDF, DOCX, and RTF files.
|
||||||
|
|
||||||
Primary PDF extraction: Claude Vision API (for scanned documents).
|
Primary PDF extraction: PyMuPDF direct text (for born-digital PDFs).
|
||||||
Fallback: PyMuPDF direct text extraction (for born-digital PDFs).
|
Fallback: Google Cloud Vision OCR (for scanned documents).
|
||||||
|
Post-processing: Hebrew abbreviation quote fixer.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
import base64
|
import asyncio
|
||||||
import logging
|
import logging
|
||||||
|
import re
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
import anthropic
|
|
||||||
import fitz # PyMuPDF
|
import fitz # PyMuPDF
|
||||||
from docx import Document as DocxDocument
|
from docx import Document as DocxDocument
|
||||||
|
from google.cloud import vision
|
||||||
from striprtf.striprtf import rtf_to_text
|
from striprtf.striprtf import rtf_to_text
|
||||||
|
|
||||||
from legal_mcp import config
|
from legal_mcp import config
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
_anthropic_client: anthropic.Anthropic | None = None
|
# ── Google Cloud Vision client ───────────────────────────────────
|
||||||
|
|
||||||
|
_vision_client: vision.ImageAnnotatorClient | None = None
|
||||||
|
|
||||||
|
|
||||||
def _get_vision_client() -> vision.ImageAnnotatorClient:
    """Return the shared Google Cloud Vision client, creating it on first use.

    The client is built with the API key read from ``config`` and cached in
    the module-level ``_vision_client`` so later calls reuse the same object.
    """
    global _vision_client
    if _vision_client is not None:
        return _vision_client

    options = {"api_key": config.GOOGLE_CLOUD_VISION_API_KEY}
    _vision_client = vision.ImageAnnotatorClient(client_options=options)
    return _vision_client
|
||||||
|
|
||||||
|
|
||||||
|
# ── Hebrew text quality detection ────────────────────────────────
|
||||||
|
|
||||||
|
_HEBREW_RE = re.compile(r'[\u0590-\u05FF]')
|
||||||
|
_WORD_RE = re.compile(r'\S+')
|
||||||
|
|
||||||
|
|
||||||
|
def _text_quality_ok(text: str) -> bool:
|
||||||
|
"""Check if extracted text is real content vs broken OCR layer.
|
||||||
|
|
||||||
|
Returns True if text appears to be genuine Hebrew legal content.
|
||||||
|
Broken OCR layers from scanned PDFs often have:
|
||||||
|
- Very short words / single-character fragments
|
||||||
|
- Each word on its own line (high words-per-line ratio)
|
||||||
|
- Non-Hebrew characters mixed in
|
||||||
|
"""
|
||||||
|
words = _WORD_RE.findall(text)
|
||||||
|
if len(words) < 10:
|
||||||
|
return False
|
||||||
|
|
||||||
|
# Average word length — real Hebrew words avg 4-6 chars.
|
||||||
|
avg_len = sum(len(w) for w in words) / len(words)
|
||||||
|
if avg_len < 2.5:
|
||||||
|
return False
|
||||||
|
|
||||||
|
# Percentage of single-character "words"
|
||||||
|
single_char_pct = sum(1 for w in words if len(w) == 1) / len(words)
|
||||||
|
if single_char_pct > 0.4:
|
||||||
|
return False
|
||||||
|
|
||||||
|
# Words per line — broken OCR puts each word on its own line.
|
||||||
|
# Real text has 5-15 words per line; broken OCR has ~1-2.
|
||||||
|
lines = [l for l in text.split("\n") if l.strip()]
|
||||||
|
if lines:
|
||||||
|
words_per_line = len(words) / len(lines)
|
||||||
|
if words_per_line < 3.0:
|
||||||
|
return False
|
||||||
|
|
||||||
|
# Hebrew character ratio among letter characters
|
||||||
|
letters = re.findall(r'[a-zA-Z\u0590-\u05FF]', text)
|
||||||
|
if letters:
|
||||||
|
hebrew_pct = sum(1 for c in letters if _HEBREW_RE.match(c)) / len(letters)
|
||||||
|
if hebrew_pct < 0.5:
|
||||||
|
return False
|
||||||
|
|
||||||
|
return True
|
||||||
|
|
||||||
|
|
||||||
|
# ── Hebrew abbreviation quote fixer ──────────────────────────────
|
||||||
|
|
||||||
|
_HEBREW_ABBREV_FIXES: dict[str, str] = {
|
||||||
|
'עוהייד': 'עוה"ד',
|
||||||
|
'עוייד': 'עו"ד',
|
||||||
|
'הנייל': 'הנ"ל',
|
||||||
|
'מצייב': 'מצ"ב',
|
||||||
|
'ביהמייש': 'ביהמ"ש',
|
||||||
|
'תייז': 'ת"ז',
|
||||||
|
'עייי': 'ע"י',
|
||||||
|
'אחייכ': 'אח"כ',
|
||||||
|
'סייק': 'ס"ק',
|
||||||
|
'דייר': 'ד"ר',
|
||||||
|
'כדוייח': 'כדו"ח',
|
||||||
|
'חווייד': 'חוו"ד',
|
||||||
|
'מייר': 'מ"ר',
|
||||||
|
'יחייד': 'יח"ד',
|
||||||
|
'בייכ': 'ב"כ',
|
||||||
|
}
|
||||||
|
|
||||||
|
_ABBREV_PATTERN = re.compile(
|
||||||
|
'|'.join(re.escape(k) for k in sorted(_HEBREW_ABBREV_FIXES, key=len, reverse=True))
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _fix_hebrew_quotes(text: str) -> str:
|
||||||
|
"""Fix known Hebrew abbreviation quote replacements from Google Vision OCR."""
|
||||||
|
return _ABBREV_PATTERN.sub(lambda m: _HEBREW_ABBREV_FIXES[m.group()], text)
|
||||||
|
|
||||||
|
|
||||||
|
# ── Extraction ───────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
async def extract_text(file_path: str) -> tuple[str, int]:
|
async def extract_text(file_path: str) -> tuple[str, int]:
|
||||||
@@ -52,65 +138,53 @@ async def extract_text(file_path: str) -> tuple[str, int]:
|
|||||||
|
|
||||||
|
|
||||||
async def _extract_pdf(path: Path) -> tuple[str, int]:
    """Extract text from a PDF, page by page.

    A page's embedded text layer is used when it is longer than 50 characters
    and passes ``_text_quality_ok``. Otherwise the page is rendered at 300 DPI
    and sent to Google Cloud Vision OCR.

    Args:
        path: Location of the PDF file.

    Returns:
        A ``(text, page_count)`` tuple; page texts are joined by a blank line.

    Raises:
        RuntimeError: Propagated from ``_ocr_with_google_vision`` when the
            Vision API reports an error for a page.
    """
    doc = fitz.open(str(path))
    # try/finally so the document handle is released even when OCR raises.
    try:
        page_count = len(doc)
        pages_text: list[str] = []

        for page_num in range(page_count):
            page = doc[page_num]
            text = page.get_text().strip()

            if len(text) > 50 and _text_quality_ok(text):
                pages_text.append(text)
                logger.debug("Page %d: direct extraction (%d chars, quality OK)", page_num + 1, len(text))
                continue

            reason = "insufficient text" if len(text) <= 50 else "low quality OCR layer"
            logger.info("Page %d: Google Vision OCR (%s)", page_num + 1, reason)
            pix = page.get_pixmap(dpi=300)
            img_bytes = pix.tobytes("png")
            # The Vision call is synchronous; run it in a worker thread so the
            # event loop is not blocked while waiting for the response.
            ocr_text = await asyncio.to_thread(
                _ocr_with_google_vision, img_bytes, page_num + 1
            )
            pages_text.append(ocr_text)
    finally:
        doc.close()

    return "\n\n".join(pages_text), page_count
|
||||||
|
|
||||||
|
|
||||||
def _ocr_with_google_vision(image_bytes: bytes, page_num: int) -> str:
    """Run Google Cloud Vision document OCR on one rendered page image.

    Args:
        image_bytes: Encoded image of the page (the caller passes PNG bytes).
        page_num: 1-based page number, used only in the error message.

    Returns:
        The recognized text after ``_fix_hebrew_quotes``; an empty string
        when the response carries no full-text annotation.

    Raises:
        RuntimeError: If the response contains an error message.
    """
    vision_client = _get_vision_client()
    page_image = vision.Image(content=image_bytes)
    hebrew_hint = vision.ImageContext(language_hints=["he"])

    result = vision_client.document_text_detection(
        image=page_image,
        image_context=hebrew_hint,
    )

    if result.error.message:
        raise RuntimeError(
            f"Google Vision error on page {page_num}: {result.error.message}"
        )

    if result.full_text_annotation:
        raw_text = result.full_text_annotation.text
    else:
        raw_text = ""
    return _fix_hebrew_quotes(raw_text)
|
||||||
|
|
||||||
|
|
||||||
def _extract_docx(path: Path) -> str:
|
def _extract_docx(path: Path) -> str:
|
||||||
|
|||||||
Reference in New Issue
Block a user