Initial commit: MCP server + web upload interface
Ezer Mishpati - AI legal decision drafting system with: - MCP server (FastMCP) with document processing pipeline - Web upload interface (FastAPI) for file upload and classification - pgvector-based semantic search - Hebrew legal document chunking and embedding
This commit is contained in:
126
mcp-server/src/legal_mcp/services/extractor.py
Normal file
126
mcp-server/src/legal_mcp/services/extractor.py
Normal file
@@ -0,0 +1,126 @@
|
||||
"""Text extraction from PDF, DOCX, and RTF files.
|
||||
|
||||
Primary PDF extraction: Claude Vision API (for scanned documents).
|
||||
Fallback: PyMuPDF direct text extraction (for born-digital PDFs).
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import base64
|
||||
import logging
|
||||
from pathlib import Path
|
||||
|
||||
import anthropic
|
||||
import fitz # PyMuPDF
|
||||
from docx import Document as DocxDocument
|
||||
from striprtf.striprtf import rtf_to_text
|
||||
|
||||
from legal_mcp import config
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
_anthropic_client: anthropic.Anthropic | None = None
|
||||
|
||||
|
||||
def _get_anthropic() -> anthropic.Anthropic:
|
||||
global _anthropic_client
|
||||
if _anthropic_client is None:
|
||||
_anthropic_client = anthropic.Anthropic(api_key=config.ANTHROPIC_API_KEY)
|
||||
return _anthropic_client
|
||||
|
||||
|
||||
async def extract_text(file_path: str) -> tuple[str, int]:
|
||||
"""Extract text from a document file.
|
||||
|
||||
Returns:
|
||||
Tuple of (extracted_text, page_count).
|
||||
page_count is 0 for non-PDF files.
|
||||
"""
|
||||
path = Path(file_path)
|
||||
suffix = path.suffix.lower()
|
||||
|
||||
if suffix == ".pdf":
|
||||
return await _extract_pdf(path)
|
||||
elif suffix == ".docx":
|
||||
return _extract_docx(path), 0
|
||||
elif suffix == ".rtf":
|
||||
return _extract_rtf(path), 0
|
||||
elif suffix == ".txt":
|
||||
return path.read_text(encoding="utf-8"), 0
|
||||
else:
|
||||
raise ValueError(f"Unsupported file type: {suffix}")
|
||||
|
||||
|
||||
async def _extract_pdf(path: Path) -> tuple[str, int]:
|
||||
"""Extract text from PDF. Try direct text first, fall back to Claude Vision for scanned pages."""
|
||||
doc = fitz.open(str(path))
|
||||
page_count = len(doc)
|
||||
pages_text: list[str] = []
|
||||
|
||||
for page_num in range(page_count):
|
||||
page = doc[page_num]
|
||||
# Try direct text extraction first
|
||||
text = page.get_text().strip()
|
||||
|
||||
if len(text) > 50:
|
||||
# Sufficient text found - born-digital page
|
||||
pages_text.append(text)
|
||||
logger.debug("Page %d: direct text extraction (%d chars)", page_num + 1, len(text))
|
||||
else:
|
||||
# Likely scanned - use Claude Vision
|
||||
logger.info("Page %d: using Claude Vision OCR", page_num + 1)
|
||||
pix = page.get_pixmap(dpi=200)
|
||||
img_bytes = pix.tobytes("png")
|
||||
ocr_text = await _ocr_with_claude(img_bytes, page_num + 1)
|
||||
pages_text.append(ocr_text)
|
||||
|
||||
doc.close()
|
||||
return "\n\n".join(pages_text), page_count
|
||||
|
||||
|
||||
async def _ocr_with_claude(image_bytes: bytes, page_num: int) -> str:
|
||||
"""OCR a single page image using Claude Vision API."""
|
||||
client = _get_anthropic()
|
||||
b64_image = base64.b64encode(image_bytes).decode("utf-8")
|
||||
|
||||
message = client.messages.create(
|
||||
model="claude-sonnet-4-20250514",
|
||||
max_tokens=4096,
|
||||
messages=[
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{
|
||||
"type": "image",
|
||||
"source": {
|
||||
"type": "base64",
|
||||
"media_type": "image/png",
|
||||
"data": b64_image,
|
||||
},
|
||||
},
|
||||
{
|
||||
"type": "text",
|
||||
"text": (
|
||||
"חלץ את כל הטקסט מהתמונה הזו. זהו מסמך משפטי בעברית. "
|
||||
"שמור על מבנה הפסקאות המקורי. "
|
||||
"החזר רק את הטקסט המחולץ, ללא הערות נוספות."
|
||||
),
|
||||
},
|
||||
],
|
||||
}
|
||||
],
|
||||
)
|
||||
return message.content[0].text
|
||||
|
||||
|
||||
def _extract_docx(path: Path) -> str:
|
||||
"""Extract text from DOCX file."""
|
||||
doc = DocxDocument(str(path))
|
||||
paragraphs = [p.text for p in doc.paragraphs if p.text.strip()]
|
||||
return "\n\n".join(paragraphs)
|
||||
|
||||
|
||||
def _extract_rtf(path: Path) -> str:
|
||||
"""Extract text from RTF file."""
|
||||
rtf_content = path.read_text(encoding="utf-8", errors="replace")
|
||||
return rtf_to_text(rtf_content)
|
||||
Reference in New Issue
Block a user