Initial commit: MCP server + web upload interface

Ezer Mishpati - AI legal decision drafting system with:
- MCP server (FastMCP) with document processing pipeline
- Web upload interface (FastAPI) for file upload and classification
- pgvector-based semantic search
- Hebrew legal document chunking and embedding
This commit is contained in:
2026-03-23 12:33:07 +00:00
commit 6f515dc2cb
33 changed files with 3297 additions and 0 deletions

View File

@@ -0,0 +1,79 @@
"""Document processing pipeline: extract → chunk → embed → store."""
from __future__ import annotations
import logging
from uuid import UUID
from legal_mcp.services import chunker, db, embeddings, extractor
logger = logging.getLogger(__name__)
async def process_document(document_id: UUID, case_id: UUID) -> dict:
"""Full processing pipeline for a document.
1. Extract text from file
2. Split into chunks
3. Generate embeddings
4. Store chunks + embeddings in DB
Returns processing summary.
"""
doc = await db.get_document(document_id)
if not doc:
raise ValueError(f"Document {document_id} not found")
await db.update_document(document_id, extraction_status="processing")
try:
# Step 1: Extract text
logger.info("Extracting text from %s", doc["file_path"])
text, page_count = await extractor.extract_text(doc["file_path"])
await db.update_document(
document_id,
extracted_text=text,
page_count=page_count,
)
# Step 2: Chunk
logger.info("Chunking document (%d chars)", len(text))
chunks = chunker.chunk_document(text)
if not chunks:
await db.update_document(document_id, extraction_status="completed")
return {"status": "completed", "chunks": 0, "message": "No text to chunk"}
# Step 3: Embed
logger.info("Generating embeddings for %d chunks", len(chunks))
texts = [c.content for c in chunks]
embs = await embeddings.embed_texts(texts, input_type="document")
# Step 4: Store
chunk_dicts = [
{
"content": c.content,
"section_type": c.section_type,
"embedding": emb,
"page_number": c.page_number,
"chunk_index": c.chunk_index,
}
for c, emb in zip(chunks, embs)
]
stored = await db.store_chunks(document_id, case_id, chunk_dicts)
await db.update_document(document_id, extraction_status="completed")
logger.info("Document processed: %d chunks stored", stored)
return {
"status": "completed",
"chunks": stored,
"pages": page_count,
"text_length": len(text),
}
except Exception as e:
logger.exception("Document processing failed: %s", e)
await db.update_document(document_id, extraction_status="failed")
return {"status": "failed", "error": str(e)}