Files
ezer-mishpati/web/app.py
Chaim 6f515dc2cb Initial commit: MCP server + web upload interface
Ezer Mishpati - AI legal decision drafting system with:
- MCP server (FastMCP) with document processing pipeline
- Web upload interface (FastAPI) for file upload and classification
- pgvector-based semantic search
- Hebrew legal document chunking and embedding
2026-03-23 12:33:07 +00:00

343 lines
11 KiB
Python

"""Ezer Mishpati — Web upload interface for legal documents."""
from __future__ import annotations
import asyncio
import json
import logging
import re
import shutil
import subprocess
import sys
import time
from contextlib import asynccontextmanager
from pathlib import Path
from uuid import UUID, uuid4
# Allow importing legal_mcp from the MCP server source
sys.path.insert(0, str(Path(__file__).resolve().parent.parent / "mcp-server" / "src"))
from fastapi import FastAPI, File, HTTPException, UploadFile
from fastapi.responses import FileResponse, StreamingResponse
from fastapi.staticfiles import StaticFiles
from pydantic import BaseModel
from legal_mcp import config
from legal_mcp.services import chunker, db, embeddings, extractor, processor
logger = logging.getLogger(__name__)
# Pending files live here until classified via /api/classify.
UPLOAD_DIR = config.DATA_DIR / "uploads"
# Extensions the extraction pipeline accepts.
ALLOWED_EXTENSIONS = {".pdf", ".docx", ".rtf", ".txt"}
MAX_FILE_SIZE = 50 * 1024 * 1024  # 50MB
# In-memory progress tracking: task_id -> progress dict emitted over SSE.
# NOTE(review): per-process only — presumably fine for a single worker;
# confirm this app is not deployed with multiple workers.
_progress: dict[str, dict] = {}
@asynccontextmanager
async def lifespan(app: FastAPI):
    """App lifespan: ensure the upload directory and DB schema exist on
    startup; close the DB connection pool on shutdown (after ``yield``)."""
    UPLOAD_DIR.mkdir(parents=True, exist_ok=True)
    await db.init_schema()
    yield
    # Shutdown path: release pooled DB connections.
    await db.close_pool()
# Application instance; `lifespan` wires startup/shutdown (schema init, pool close).
app = FastAPI(title="Ezer Mishpati — Upload", lifespan=lifespan)
# Directory holding the static single-page UI served at "/".
STATIC_DIR = Path(__file__).parent / "static"
# ── API Endpoints ──────────────────────────────────────────────────
@app.get("/")
async def index():
    """Serve the single-page upload UI."""
    page = STATIC_DIR / "index.html"
    return FileResponse(page)
@app.post("/api/upload")
async def upload_file(file: UploadFile = File(...)):
    """Upload a file to the temporary uploads directory."""
    if not file.filename:
        raise HTTPException(400, "No filename provided")

    # Reject anything the processing pipeline cannot handle.
    ext = Path(file.filename).suffix.lower()
    if ext not in ALLOWED_EXTENSIONS:
        raise HTTPException(400, f"Unsupported file type: {ext}. Allowed: {', '.join(ALLOWED_EXTENSIONS)}")

    # Keep word chars, Hebrew letters, whitespace, dots, dashes, parens.
    stem = re.sub(r"[^\w\u0590-\u05FF\s.\-()]", "", Path(file.filename).stem) or "document"

    # Timestamp prefix keeps repeated uploads of the same name from colliding.
    stored_name = f"{int(time.time())}_{stem}{ext}"

    content = await file.read()
    if len(content) > MAX_FILE_SIZE:
        raise HTTPException(400, f"File too large. Max: {MAX_FILE_SIZE // (1024*1024)}MB")

    (UPLOAD_DIR / stored_name).write_bytes(content)
    return {
        "filename": stored_name,
        "original_name": file.filename,
        "size": len(content),
    }
@app.get("/api/uploads")
async def list_uploads():
    """List files in the uploads (pending) directory, newest first."""
    if not UPLOAD_DIR.exists():
        return []
    pending = [
        p for p in UPLOAD_DIR.iterdir()
        if p.is_file() and p.suffix.lower() in ALLOWED_EXTENSIONS
    ]
    pending.sort(key=lambda p: p.stat().st_mtime, reverse=True)
    rows = []
    for p in pending:
        st = p.stat()
        rows.append({
            "filename": p.name,
            "size": st.st_size,
            "uploaded_at": st.st_mtime,
        })
    return rows
@app.delete("/api/uploads/{filename}")
async def delete_upload(filename: str):
    """Remove a file from the uploads directory.

    The route must declare ``{filename}`` so FastAPI binds the path
    parameter; without it the parameter is never populated from the URL.

    Raises:
        HTTPException: 404 if the file does not exist or does not live
            directly inside UPLOAD_DIR (path-traversal guard).
    """
    path = UPLOAD_DIR / filename
    # samefile() rejects names that resolve outside the uploads directory.
    if not path.exists() or not path.parent.samefile(UPLOAD_DIR):
        raise HTTPException(404, "File not found")
    path.unlink()
    return {"deleted": filename}
class ClassifyRequest(BaseModel):
    """Payload for /api/classify: routes a pending upload either into a
    case's document pipeline or into the training/style corpus."""
    filename: str  # name of a file already sitting in UPLOAD_DIR
    category: str  # "training" or "case"
    # For case documents
    case_number: str = ""  # required when category == "case"
    doc_type: str = "appeal"
    title: str = ""  # optional; derived from the filename when empty
    # For training documents
    decision_number: str = ""
    decision_date: str = ""  # ISO date string; parsed with date.fromisoformat
    subject_categories: list[str] = []  # pydantic copies mutable defaults per instance
# Strong references to in-flight background tasks: the event loop keeps
# only weak references, so an otherwise-unreferenced task created with
# asyncio.create_task can be garbage-collected before it finishes.
_background_tasks: set[asyncio.Task] = set()


@app.post("/api/classify")
async def classify_file(req: ClassifyRequest):
    """Classify a pending file and start processing.

    Returns:
        dict with a ``task_id`` usable with ``/api/progress/{task_id}``.

    Raises:
        HTTPException: 404 if the file is missing from uploads; 400 for an
            unknown category or a case document without a case_number.
    """
    source = UPLOAD_DIR / req.filename
    # samefile() rejects names that resolve outside the uploads directory.
    if not source.exists() or not source.parent.samefile(UPLOAD_DIR):
        raise HTTPException(404, "File not found in uploads")
    if req.category not in ("training", "case"):
        raise HTTPException(400, "Category must be 'training' or 'case'")
    if req.category == "case" and not req.case_number:
        raise HTTPException(400, "case_number required for case documents")
    task_id = str(uuid4())
    _progress[task_id] = {"status": "queued", "filename": req.filename}
    # Keep a strong reference until the task completes, then drop it.
    task = asyncio.create_task(_process_file(task_id, source, req))
    _background_tasks.add(task)
    task.add_done_callback(_background_tasks.discard)
    return {"task_id": task_id}
@app.get("/api/progress/{task_id}")
async def progress_stream(task_id: str):
    """SSE stream of processing progress.

    Emits the task's progress dict once per second and ends the stream
    once the task reaches a terminal status ("completed" or "failed").
    The progress entry is dropped 30 seconds after completion.

    Raises:
        HTTPException: 404 if the task_id is unknown.
    """
    if task_id not in _progress:
        raise HTTPException(404, "Task not found")

    async def event_stream():
        while True:
            data = _progress.get(task_id, {})
            yield f"data: {json.dumps(data, ensure_ascii=False)}\n\n"
            if data.get("status") in ("completed", "failed"):
                # Schedule cleanup out-of-band. Awaiting a sleep here would
                # be cancelled if the SSE client disconnects during the
                # grace period, leaking the _progress entry forever.
                asyncio.get_running_loop().call_later(
                    30, _progress.pop, task_id, None
                )
                break
            await asyncio.sleep(1)

    return StreamingResponse(event_stream(), media_type="text/event-stream")
@app.get("/api/cases")
async def list_cases():
    """Return existing cases (number, title, status) for the UI dropdown."""
    rows = await db.list_cases()
    result = []
    for row in rows:
        result.append({
            "case_number": row["case_number"],
            "title": row["title"],
            "status": row["status"],
        })
    return result
# ── Background Processing ─────────────────────────────────────────
async def _process_file(task_id: str, source: Path, req: ClassifyRequest):
    """Dispatch a classified file to the matching background pipeline.

    Any unhandled exception is logged and recorded as a 'failed' entry in
    the _progress dict so the SSE stream can report it.
    """
    handler = (
        _process_case_document
        if req.category == "case"
        else _process_training_document
    )
    try:
        await handler(task_id, source, req)
    except Exception as exc:
        logger.exception("Processing failed for %s", req.filename)
        _progress[task_id] = {"status": "failed", "error": str(exc), "filename": req.filename}
async def _process_case_document(task_id: str, source: Path, req: ClassifyRequest):
    """Process a case document (mirrors documents.document_upload logic).

    Pipeline: validate the case exists → copy the file into the case's
    documents directory → create a DB document record → run the
    extract/chunk/embed/store processor → git-commit the case directory →
    remove the pending upload. Progress is reported via _progress.
    """
    _progress[task_id] = {"status": "validating", "filename": req.filename}
    case = await db.get_case_by_number(req.case_number)
    if not case:
        _progress[task_id] = {"status": "failed", "error": f"Case {req.case_number} not found"}
        return
    case_id = UUID(case["id"])
    title = req.title or source.stem.split("_", 1)[-1]  # Remove timestamp prefix
    # Copy to case directory
    _progress[task_id] = {"status": "copying", "filename": req.filename}
    case_dir = config.CASES_DIR / req.case_number / "documents"
    case_dir.mkdir(parents=True, exist_ok=True)
    # Use original name without timestamp prefix
    original_name = re.sub(r"^\d+_", "", source.name)
    dest = case_dir / original_name
    shutil.copy2(str(source), str(dest))
    # Create document record
    _progress[task_id] = {"status": "registering", "filename": req.filename}
    doc = await db.create_document(
        case_id=case_id,
        doc_type=req.doc_type,
        title=title,
        file_path=str(dest),
    )
    # Process (extract → chunk → embed → store)
    _progress[task_id] = {"status": "processing", "filename": req.filename, "step": "extracting"}
    result = await processor.process_document(UUID(doc["id"]), case_id)
    # Git commit (best-effort: output captured, exit codes not checked)
    repo_dir = config.CASES_DIR / req.case_number
    if repo_dir.exists():
        subprocess.run(["git", "add", "."], cwd=repo_dir, capture_output=True)
        # Hebrew label for the commit message; falls back to the raw doc_type.
        doc_type_hebrew = {
            "appeal": "כתב ערר", "response": "תשובה", "decision": "החלטה",
            "reference": "מסמך עזר", "exhibit": "נספח",
        }.get(req.doc_type, req.doc_type)
        # Minimal env pins author/committer identity so commits are uniform
        # regardless of the host's git configuration.
        subprocess.run(
            ["git", "commit", "-m", f"הוספת {doc_type_hebrew}: {title}"],
            cwd=repo_dir, capture_output=True,
            env={"GIT_AUTHOR_NAME": "Ezer Mishpati", "GIT_AUTHOR_EMAIL": "legal@local",
                 "GIT_COMMITTER_NAME": "Ezer Mishpati", "GIT_COMMITTER_EMAIL": "legal@local",
                 "PATH": "/usr/bin:/bin"},
        )
    # Remove from uploads
    source.unlink(missing_ok=True)
    _progress[task_id] = {
        "status": "completed",
        "filename": req.filename,
        "result": result,
        "case_number": req.case_number,
        "doc_type": req.doc_type,
    }
async def _process_training_document(task_id: str, source: Path, req: ClassifyRequest):
    """Process a training document (mirrors documents.document_upload_training logic).

    Pipeline: copy the file into the training directory → extract text →
    add the full text to the style corpus → chunk, embed, and store the
    chunks under a corpus document record → remove the pending upload.
    Progress is reported via _progress.
    """
    from datetime import date as date_type
    title = req.title or source.stem.split("_", 1)[-1]
    # Copy to training directory
    _progress[task_id] = {"status": "copying", "filename": req.filename}
    config.TRAINING_DIR.mkdir(parents=True, exist_ok=True)
    # Strip the upload timestamp prefix to restore the original name.
    original_name = re.sub(r"^\d+_", "", source.name)
    dest = config.TRAINING_DIR / original_name
    shutil.copy2(str(source), str(dest))
    # Extract text
    _progress[task_id] = {"status": "processing", "filename": req.filename, "step": "extracting"}
    text, page_count = await extractor.extract_text(str(dest))
    # Parse date (raises ValueError on a non-ISO string; caught by _process_file)
    d_date = None
    if req.decision_date:
        d_date = date_type.fromisoformat(req.decision_date)
    # Add to style corpus
    _progress[task_id] = {"status": "processing", "filename": req.filename, "step": "corpus"}
    corpus_id = await db.add_to_style_corpus(
        document_id=None,
        decision_number=req.decision_number,
        decision_date=d_date,
        subject_categories=req.subject_categories,
        full_text=text,
    )
    # Chunk and embed
    _progress[task_id] = {"status": "processing", "filename": req.filename, "step": "chunking"}
    chunks = chunker.chunk_document(text)
    chunk_count = 0
    if chunks:
        # Corpus documents carry no case_id; title is tagged as corpus material.
        doc = await db.create_document(
            case_id=None,
            doc_type="decision",
            title=f"[קורפוס] {title}",
            file_path=str(dest),
            page_count=page_count,
        )
        doc_id = UUID(doc["id"])
        await db.update_document(doc_id, extracted_text=text, extraction_status="completed")
        _progress[task_id] = {"status": "processing", "filename": req.filename, "step": "embedding"}
        texts = [c.content for c in chunks]
        embs = await embeddings.embed_texts(texts, input_type="document")
        chunk_dicts = [
            {
                "content": c.content,
                "section_type": c.section_type,
                "embedding": emb,
                "page_number": c.page_number,
                "chunk_index": c.chunk_index,
            }
            for c, emb in zip(chunks, embs)
        ]
        await db.store_chunks(doc_id, None, chunk_dicts)
        chunk_count = len(chunks)
    # Remove from uploads
    source.unlink(missing_ok=True)
    _progress[task_id] = {
        "status": "completed",
        "filename": req.filename,
        "result": {
            "corpus_id": str(corpus_id),
            "title": title,
            "pages": page_count,
            "text_length": len(text),
            "chunks": chunk_count,
        },
    }