Initial commit: MCP server + web upload interface

Ezer Mishpati - AI legal decision drafting system with:
- MCP server (FastMCP) with document processing pipeline
- Web upload interface (FastAPI) for file upload and classification
- pgvector-based semantic search
- Hebrew legal document chunking and embedding
2026-03-23 12:33:07 +00:00
commit 6f515dc2cb
33 changed files with 3297 additions and 0 deletions
--- a/mcp-server/src/legal_mcp/services/extractor.py
+++ b/mcp-server/src/legal_mcp/services/extractor.py
@@ -0,0 +1,126 @@
+"""Text extraction from PDF, DOCX, RTF, and plain-text files.
+
+PDF pages are first read with PyMuPDF direct text extraction (born-digital
+pages). Pages that yield almost no text are treated as scanned and are
+OCR'd with the Claude Vision API instead.
+"""
+
+from __future__ import annotations
+
+import base64
+import logging
+from pathlib import Path
+
+import anthropic
+import fitz  # PyMuPDF
+from docx import Document as DocxDocument
+from striprtf.striprtf import rtf_to_text
+
+from legal_mcp import config
+
+# Module-level logger, named after this module.
+logger = logging.getLogger(__name__)
+
+# Cached Anthropic client; stays None until _get_anthropic() creates it.
+_anthropic_client: anthropic.Anthropic | None = None
+
+
+def _get_anthropic() -> anthropic.Anthropic:
+    """Return the shared Anthropic client, creating it on first use.
+
+    The client is built with ``config.ANTHROPIC_API_KEY`` and cached in the
+    module-level ``_anthropic_client`` so later calls reuse the same object.
+    """
+    global _anthropic_client
+    cached = _anthropic_client
+    if cached is not None:
+        return cached
+    cached = anthropic.Anthropic(api_key=config.ANTHROPIC_API_KEY)
+    _anthropic_client = cached
+    return cached
+
+
+async def extract_text(file_path: str) -> tuple[str, int]:
+    """Extract the text of a document, picking the extractor by extension.
+
+    The extension is compared case-insensitively. Supported extensions are
+    ``.pdf``, ``.docx``, ``.rtf`` and ``.txt`` (read as UTF-8).
+
+    Args:
+        file_path: Path of the file to read.
+
+    Returns:
+        Tuple of (extracted_text, page_count).
+        page_count is 0 for non-PDF files.
+
+    Raises:
+        ValueError: If the extension is not one of the supported ones.
+    """
+    source = Path(file_path)
+    extension = source.suffix.lower()
+
+    # PDFs are the only format that reports a real page count.
+    if extension == ".pdf":
+        return await _extract_pdf(source)
+
+    if extension == ".docx":
+        text = _extract_docx(source)
+    elif extension == ".rtf":
+        text = _extract_rtf(source)
+    elif extension == ".txt":
+        text = source.read_text(encoding="utf-8")
+    else:
+        raise ValueError(f"Unsupported file type: {extension}")
+    return text, 0
+
+
+async def _extract_pdf(path: Path) -> tuple[str, int]:
+    """Extract text from a PDF, page by page.
+
+    Each page is first read with PyMuPDF's direct text extraction. A page
+    whose stripped text is 50 characters or shorter is treated as scanned:
+    it is rendered to a PNG at 200 DPI and passed to Claude Vision for OCR.
+
+    Args:
+        path: Location of the PDF file.
+
+    Returns:
+        Tuple of (text, page_count), where the per-page texts are joined
+        with a blank line between pages.
+    """
+    # Pages with more stripped text than this are considered born-digital.
+    min_direct_chars = 50
+    pages_text: list[str] = []
+
+    # The context manager closes the document even when rendering or OCR
+    # raises part-way through, so the file handle is never leaked.
+    with fitz.open(str(path)) as doc:
+        page_count = len(doc)
+
+        for page_num, page in enumerate(doc, start=1):
+            # Try direct text extraction first
+            text = page.get_text().strip()
+
+            if len(text) > min_direct_chars:
+                # Sufficient text found - born-digital page
+                pages_text.append(text)
+                logger.debug("Page %d: direct text extraction (%d chars)", page_num, len(text))
+                continue
+
+            # Likely scanned - use Claude Vision
+            logger.info("Page %d: using Claude Vision OCR", page_num)
+            pix = page.get_pixmap(dpi=200)
+            img_bytes = pix.tobytes("png")
+            pages_text.append(await _ocr_with_claude(img_bytes, page_num))
+
+    return "\n\n".join(pages_text), page_count
+
+
+async def _ocr_with_claude(image_bytes: bytes, page_num: int) -> str:
+    """OCR a single page image using Claude Vision API.
+
+    Args:
+        image_bytes: Image of the page; it is sent to the API declared as
+            ``image/png``.
+        page_num: Page number used in log messages (the caller passes a
+            1-based number).
+
+    Returns:
+        The text of the model's reply, with all text blocks concatenated.
+        An empty string is returned if the reply has no text blocks.
+    """
+    import asyncio  # local import so this block does not depend on file-level edits
+
+    client = _get_anthropic()
+    b64_image = base64.b64encode(image_bytes).decode("utf-8")
+
+    # The client is synchronous: run the request in a worker thread so the
+    # event loop stays responsive while waiting for the API.
+    message = await asyncio.to_thread(
+        client.messages.create,
+        model="claude-sonnet-4-20250514",
+        max_tokens=4096,
+        messages=[
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "image",
+                        "source": {
+                            "type": "base64",
+                            "media_type": "image/png",
+                            "data": b64_image,
+                        },
+                    },
+                    {
+                        "type": "text",
+                        "text": (
+                            "חלץ את כל הטקסט מהתמונה הזו. זהו מסמך משפטי בעברית. "
+                            "שמור על מבנה הפסקאות המקורי. "
+                            "החזר רק את הטקסט המחולץ, ללא הערות נוספות."
+                        ),
+                    },
+                ],
+            }
+        ],
+    )
+
+    # A reply cut off by the token limit silently loses the end of the page.
+    if message.stop_reason == "max_tokens":
+        logger.warning(
+            "Page %d: OCR output reached max_tokens and may be truncated", page_num
+        )
+
+    # Join every text block instead of indexing content[0], which raises on
+    # an empty reply and drops any additional blocks.
+    return "".join(block.text for block in message.content if block.type == "text")
+
+
+def _extract_docx(path: Path) -> str:
+    """Return the non-blank paragraphs of a DOCX file, joined by blank lines.
+
+    Paragraphs whose text is empty or whitespace-only are skipped.
+    NOTE(review): only ``doc.paragraphs`` is read - presumably text inside
+    tables is not included; confirm against python-docx.
+    """
+    document = DocxDocument(str(path))
+    kept: list[str] = []
+    for paragraph in document.paragraphs:
+        content = paragraph.text
+        if content.strip():
+            kept.append(content)
+    return "\n\n".join(kept)
+
+
+def _extract_rtf(path: Path) -> str:
+    """Return the plain text of an RTF file.
+
+    The file is decoded as UTF-8 with undecodable bytes replaced, then the
+    RTF markup is stripped with ``rtf_to_text``.
+    """
+    with path.open(encoding="utf-8", errors="replace") as handle:
+        raw_rtf = handle.read()
+    plain_text = rtf_to_text(raw_rtf)
+    return plain_text