Initial commit: MCP server + web upload interface

Ezer Mishpati - AI legal decision drafting system with:
- MCP server (FastMCP) with document processing pipeline
- Web upload interface (FastAPI) for file upload and classification
- pgvector-based semantic search
- Hebrew legal document chunking and embedding
This commit is contained in:
2026-03-23 12:33:07 +00:00
commit 6f515dc2cb
33 changed files with 3297 additions and 0 deletions

View File

@@ -0,0 +1,126 @@
"""Text extraction from PDF, DOCX, and RTF files.
PDF extraction tries PyMuPDF direct text extraction first (for born-digital pages).
Pages with too little embedded text fall back to Claude Vision API OCR (for scanned pages).
"""
from __future__ import annotations
import base64
import logging
from pathlib import Path
import anthropic
import fitz # PyMuPDF
from docx import Document as DocxDocument
from striprtf.striprtf import rtf_to_text
from legal_mcp import config
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Cached Anthropic client; stays None until _get_anthropic() creates it on first use.
_anthropic_client: anthropic.Anthropic | None = None
def _get_anthropic() -> anthropic.Anthropic:
    """Return the shared Anthropic client, creating it on first use.

    The client is built from ``config.ANTHROPIC_API_KEY`` and stored in the
    module-level ``_anthropic_client`` so later calls reuse the same instance.
    """
    global _anthropic_client
    if _anthropic_client is not None:
        # Already initialised by an earlier call.
        return _anthropic_client
    _anthropic_client = anthropic.Anthropic(api_key=config.ANTHROPIC_API_KEY)
    return _anthropic_client
async def extract_text(file_path: str) -> tuple[str, int]:
    """Extract text from a document file, dispatching on its extension.

    Supported extensions (matched case-insensitively): .pdf, .docx, .rtf, .txt.

    Args:
        file_path: Path to the document on disk.

    Returns:
        Tuple of (extracted_text, page_count).
        page_count is 0 for non-PDF files.

    Raises:
        ValueError: If the file extension is not one of the supported types.
    """
    path = Path(file_path)
    suffix = path.suffix.lower()
    if suffix == ".pdf":
        return await _extract_pdf(path)
    if suffix == ".docx":
        return _extract_docx(path), 0
    if suffix == ".rtf":
        return _extract_rtf(path), 0
    if suffix == ".txt":
        # "utf-8-sig" decodes plain UTF-8 unchanged but also drops a leading
        # byte-order mark, which would otherwise leak into the extracted text
        # as a stray U+FEFF character.
        return path.read_text(encoding="utf-8-sig"), 0
    raise ValueError(f"Unsupported file type: {suffix}")
async def _extract_pdf(
    path: Path,
    *,
    min_text_chars: int = 50,
    ocr_dpi: int = 200,
) -> tuple[str, int]:
    """Extract text from a PDF, page by page.

    Each page's embedded text is tried first. A page whose stripped text is
    longer than ``min_text_chars`` is used as-is (born-digital page); any
    other page is treated as scanned, rendered to PNG at ``ocr_dpi`` and sent
    to Claude Vision for OCR.

    Args:
        path: Location of the PDF file.
        min_text_chars: A page needs more than this many characters of
            embedded text to skip OCR.
        ocr_dpi: Resolution used when rendering a page image for OCR.

    Returns:
        Tuple of (text, page_count), where text is the per-page texts joined
        by blank lines.
    """
    pages_text: list[str] = []
    # The context manager closes the document even when rendering or OCR
    # raises part-way through, so the underlying file is never leaked.
    with fitz.open(str(path)) as doc:
        page_count = len(doc)
        for page_num, page in enumerate(doc, start=1):
            # Try direct text extraction first.
            text = page.get_text().strip()
            if len(text) > min_text_chars:
                # Sufficient text found - born-digital page.
                pages_text.append(text)
                logger.debug(
                    "Page %d: direct text extraction (%d chars)", page_num, len(text)
                )
                continue
            # Likely scanned - render the page and OCR it with Claude Vision.
            logger.info("Page %d: using Claude Vision OCR", page_num)
            img_bytes = page.get_pixmap(dpi=ocr_dpi).tobytes("png")
            pages_text.append(await _ocr_with_claude(img_bytes, page_num))
    return "\n\n".join(pages_text), page_count
async def _ocr_with_claude(image_bytes: bytes, page_num: int) -> str:
    """OCR a single page image using Claude Vision API.

    Args:
        image_bytes: PNG-encoded image of the page.
        page_num: 1-based page number, used only in log messages.

    Returns:
        The text returned by the model, or an empty string if the response
        contains no text blocks.
    """
    # Local import keeps this block self-contained without touching the
    # module's import section.
    import asyncio

    client = _get_anthropic()
    b64_image = base64.b64encode(image_bytes).decode("utf-8")
    # Hebrew prompt, in English: "Extract all the text from this image. This
    # is a legal document in Hebrew. Preserve the original paragraph
    # structure. Return only the extracted text, with no additional comments."
    prompt = (
        "חלץ את כל הטקסט מהתמונה הזו. זהו מסמך משפטי בעברית. "
        "שמור על מבנה הפסקאות המקורי. "
        "החזר רק את הטקסט המחולץ, ללא הערות נוספות."
    )
    # The client is synchronous; run the request in a worker thread so the
    # event loop is not blocked for the duration of the HTTP call.
    message = await asyncio.to_thread(
        client.messages.create,
        model="claude-sonnet-4-20250514",
        max_tokens=4096,
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "image",
                        "source": {
                            "type": "base64",
                            "media_type": "image/png",
                            "data": b64_image,
                        },
                    },
                    {"type": "text", "text": prompt},
                ],
            }
        ],
    )
    # Collect every text block rather than indexing content[0], which raises
    # on an empty response or a non-text first block.
    text_parts = [
        block.text
        for block in message.content
        if getattr(block, "type", None) == "text"
    ]
    if not text_parts:
        logger.warning("Page %d: Claude Vision returned no text", page_num)
    elif getattr(message, "stop_reason", None) == "max_tokens":
        logger.warning(
            "Page %d: OCR output hit max_tokens and may be truncated", page_num
        )
    return "".join(text_parts)
def _extract_docx(path: Path) -> str:
    """Return the text of a DOCX file's non-blank paragraphs.

    Paragraphs whose text is empty or whitespace-only are skipped; the
    remaining paragraph texts are joined with a blank line between them.
    """
    document = DocxDocument(str(path))
    kept: list[str] = []
    for paragraph in document.paragraphs:
        # NOTE(review): only `paragraphs` is read here; text inside tables is
        # presumably not included - confirm against python-docx.
        if paragraph.text.strip():
            kept.append(paragraph.text)
    return "\n\n".join(kept)
def _extract_rtf(path: Path) -> str:
    """Return the plain text of an RTF file.

    The file is read as UTF-8 with undecodable bytes replaced, then the RTF
    markup is converted to text by ``rtf_to_text``.
    """
    raw_rtf = path.read_text(encoding="utf-8", errors="replace")
    plain_text = rtf_to_text(raw_rtf)
    return plain_text