Initial commit: MCP server + web upload interface

Ezer Mishpati - AI legal decision drafting system with: - MCP server (FastMCP) with document processing pipeline - Web upload interface (FastAPI) for file upload and classification - pgvector-based semantic search - Hebrew legal document chunking and embedding
2026-03-23 12:33:07 +00:00
commit 6f515dc2cb
33 changed files with 3297 additions and 0 deletions
--- a/mcp-server/src/legal_mcp/services/processor.py
+++ b/mcp-server/src/legal_mcp/services/processor.py
@@ -0,0 +1,79 @@
+"""Document processing pipeline: extract → chunk → embed → store."""
+
+from __future__ import annotations
+
+import logging
+from uuid import UUID
+
+from legal_mcp.services import chunker, db, embeddings, extractor
+
+logger = logging.getLogger(__name__)  # module-level logger, named after this module
+
+
+async def process_document(document_id: UUID, case_id: UUID) -> dict:
+    """Run the full processing pipeline for one document.
+
+    1. Extract text from the file at the document's ``file_path``
+    2. Split the text into chunks
+    3. Generate one embedding per chunk
+    4. Store chunks + embeddings in DB
+
+    The document's ``extraction_status`` is set to ``"processing"`` before the
+    pipeline starts and to ``"completed"`` or ``"failed"`` when it ends.
+
+    Args:
+        document_id: ID of the document record to process.
+        case_id: ID handed to ``db.store_chunks`` together with the chunks.
+
+    Returns:
+        A processing summary. On success: ``status="completed"`` plus
+        ``chunks``, ``pages`` and ``text_length`` (and a ``message`` when the
+        chunker produced nothing). If a pipeline step raises:
+        ``status="failed"`` plus ``error`` holding the exception text.
+
+    Raises:
+        ValueError: If no document with ``document_id`` exists.
+
+    Note:
+        Errors raised by the status updates themselves (marking the document
+        as "processing" or "failed") are not caught here and propagate.
+    """
+    doc = await db.get_document(document_id)
+    if not doc:
+        raise ValueError(f"Document {document_id} not found")
+
+    await db.update_document(document_id, extraction_status="processing")
+
+    try:
+        # Step 1: Extract text
+        logger.info("Extracting text from %s", doc["file_path"])
+        text, page_count = await extractor.extract_text(doc["file_path"])
+
+        # Persist the raw text right away so it survives a failure in a later step.
+        await db.update_document(
+            document_id,
+            extracted_text=text,
+            page_count=page_count,
+        )
+
+        # Step 2: Chunk
+        logger.info("Chunking document (%d chars)", len(text))
+        chunks = chunker.chunk_document(text)
+
+        if not chunks:
+            await db.update_document(document_id, extraction_status="completed")
+            # Same keys as the normal success summary, plus an explanatory message.
+            return {
+                "status": "completed",
+                "chunks": 0,
+                "pages": page_count,
+                "text_length": len(text),
+                "message": "No text to chunk",
+            }
+
+        # Step 3: Embed
+        logger.info("Generating embeddings for %d chunks", len(chunks))
+        texts = [c.content for c in chunks]
+        embs = list(await embeddings.embed_texts(texts, input_type="document"))
+
+        # zip() stops at the shorter input, so a short (or long) embedding
+        # result would silently drop or misalign chunks. Fail loudly instead;
+        # the handler below records the document as failed.
+        if len(embs) != len(chunks):
+            raise ValueError(
+                f"Embedding count mismatch: {len(chunks)} chunks "
+                f"but {len(embs)} embeddings"
+            )
+
+        # Step 4: Store
+        chunk_dicts = [
+            {
+                "content": c.content,
+                "section_type": c.section_type,
+                "embedding": emb,
+                "page_number": c.page_number,
+                "chunk_index": c.chunk_index,
+            }
+            for c, emb in zip(chunks, embs)
+        ]
+
+        # NOTE(review): `stored` is logged with %d, so it is presumably the
+        # number of rows written — confirm against db.store_chunks.
+        stored = await db.store_chunks(document_id, case_id, chunk_dicts)
+        await db.update_document(document_id, extraction_status="completed")
+
+        logger.info("Document processed: %d chunks stored", stored)
+        return {
+            "status": "completed",
+            "chunks": stored,
+            "pages": page_count,
+            "text_length": len(text),
+        }
+
+    except Exception as e:
+        # Pipeline boundary: log the traceback, mark the document as failed and
+        # report the error to the caller instead of raising.
+        logger.exception("Document processing failed: %s", e)
+        await db.update_document(document_id, extraction_status="failed")
+        return {"status": "failed", "error": str(e)}