"""Text extraction from PDF, DOCX, and RTF files. Primary PDF extraction: Claude Vision API (for scanned documents). Fallback: PyMuPDF direct text extraction (for born-digital PDFs). """ from __future__ import annotations import base64 import logging from pathlib import Path import anthropic import fitz # PyMuPDF from docx import Document as DocxDocument from striprtf.striprtf import rtf_to_text from legal_mcp import config logger = logging.getLogger(__name__) _anthropic_client: anthropic.Anthropic | None = None def _get_anthropic() -> anthropic.Anthropic: global _anthropic_client if _anthropic_client is None: _anthropic_client = anthropic.Anthropic(api_key=config.ANTHROPIC_API_KEY) return _anthropic_client async def extract_text(file_path: str) -> tuple[str, int]: """Extract text from a document file. Returns: Tuple of (extracted_text, page_count). page_count is 0 for non-PDF files. """ path = Path(file_path) suffix = path.suffix.lower() if suffix == ".pdf": return await _extract_pdf(path) elif suffix == ".docx": return _extract_docx(path), 0 elif suffix == ".rtf": return _extract_rtf(path), 0 elif suffix in (".txt", ".md"): return path.read_text(encoding="utf-8"), 0 else: raise ValueError(f"Unsupported file type: {suffix}") async def _extract_pdf(path: Path) -> tuple[str, int]: """Extract text from PDF. Try direct text first, fall back to Claude Vision for scanned pages.""" doc = fitz.open(str(path)) page_count = len(doc) pages_text: list[str] = [] for page_num in range(page_count): page = doc[page_num] # Try direct text extraction first text = page.get_text().strip() if len(text) > 50: # Sufficient text found - born-digital page pages_text.append(text) logger.debug("Page %d: direct text extraction (%d chars)", page_num + 1, len(text)) else: # Likely scanned - use Claude Vision logger.info("Page %d: using Claude Vision OCR", page_num + 1) pix = page.get_pixmap(dpi=200) img_bytes = pix.tobytes("png") ocr_text = await _ocr_with_claude(img_bytes, page_num + 1) pages_text.append(ocr_text) doc.close() return "\n\n".join(pages_text), page_count async def _ocr_with_claude(image_bytes: bytes, page_num: int) -> str: """OCR a single page image using Claude Vision API.""" client = _get_anthropic() b64_image = base64.b64encode(image_bytes).decode("utf-8") message = client.messages.create( model="claude-sonnet-4-20250514", max_tokens=4096, messages=[ { "role": "user", "content": [ { "type": "image", "source": { "type": "base64", "media_type": "image/png", "data": b64_image, }, }, { "type": "text", "text": ( "חלץ את כל הטקסט מהתמונה הזו. זהו מסמך משפטי בעברית. " "שמור על מבנה הפסקאות המקורי. " "החזר רק את הטקסט המחולץ, ללא הערות נוספות." ), }, ], } ], ) return message.content[0].text def _extract_docx(path: Path) -> str: """Extract text from DOCX file.""" doc = DocxDocument(str(path)) paragraphs = [p.text for p in doc.paragraphs if p.text.strip()] return "\n\n".join(paragraphs) def _extract_rtf(path: Path) -> str: """Extract text from RTF file.""" rtf_content = path.read_text(encoding="utf-8", errors="replace") return rtf_to_text(rtf_content)