Initial commit: MCP server + web upload interface
Ezer Mishpati - AI legal decision drafting system with:
- MCP server (FastMCP) with document processing pipeline
- Web upload interface (FastAPI) for file upload and classification
- pgvector-based semantic search
- Hebrew legal document chunking and embedding
This commit is contained in:
79
mcp-server/src/legal_mcp/services/processor.py
Normal file
79
mcp-server/src/legal_mcp/services/processor.py
Normal file
@@ -0,0 +1,79 @@
|
||||
"""Document processing pipeline: extract → chunk → embed → store."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from uuid import UUID
|
||||
|
||||
from legal_mcp.services import chunker, db, embeddings, extractor
|
||||
|
||||
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
async def process_document(document_id: UUID, case_id: UUID) -> dict:
    """Run the full processing pipeline for a document.

    Steps:
        1. Extract text from the document's file.
        2. Split the text into chunks.
        3. Generate one embedding per chunk.
        4. Store chunks and embeddings in the database.

    The document's ``extraction_status`` is set to ``"processing"`` before
    the pipeline starts and to ``"completed"`` or ``"failed"`` when it ends.

    Args:
        document_id: ID of the document to process.
        case_id: ID of the case; passed through to ``db.store_chunks``.

    Returns:
        A processing summary dict:

        * success: ``{"status": "completed", "chunks", "pages",
          "text_length"}``
        * nothing to chunk: ``{"status": "completed", "chunks": 0,
          "message"}``
        * failure inside the pipeline: ``{"status": "failed", "error"}``

    Raises:
        ValueError: If ``db.get_document`` finds no document for
            ``document_id``. Errors raised after that point are caught,
            logged, and reported through the returned dict instead.
    """
    doc = await db.get_document(document_id)
    if not doc:
        raise ValueError(f"Document {document_id} not found")

    await db.update_document(document_id, extraction_status="processing")

    try:
        # Step 1: Extract text
        logger.info("Extracting text from %s", doc["file_path"])
        text, page_count = await extractor.extract_text(doc["file_path"])

        await db.update_document(
            document_id,
            extracted_text=text,
            page_count=page_count,
        )

        # Step 2: Chunk
        logger.info("Chunking document (%d chars)", len(text))
        chunks = chunker.chunk_document(text)

        if not chunks:
            await db.update_document(document_id, extraction_status="completed")
            return {"status": "completed", "chunks": 0, "message": "No text to chunk"}

        # Step 3: Embed
        logger.info("Generating embeddings for %d chunks", len(chunks))
        texts = [c.content for c in chunks]
        embs = await embeddings.embed_texts(texts, input_type="document")

        # zip() stops at the shorter input, so a short embedding result would
        # silently drop chunks and still mark the document "completed".
        # Fail the run instead; the handler below records the failure.
        if len(embs) != len(chunks):
            raise RuntimeError(
                f"Embedding count mismatch: got {len(embs)} embeddings "
                f"for {len(chunks)} chunks"
            )

        # Step 4: Store
        chunk_dicts = [
            {
                "content": c.content,
                "section_type": c.section_type,
                "embedding": emb,
                "page_number": c.page_number,
                "chunk_index": c.chunk_index,
            }
            for c, emb in zip(chunks, embs)
        ]

        stored = await db.store_chunks(document_id, case_id, chunk_dicts)
        await db.update_document(document_id, extraction_status="completed")

        logger.info("Document processed: %d chunks stored", stored)
        return {
            "status": "completed",
            "chunks": stored,
            "pages": page_count,
            "text_length": len(text),
        }

    except Exception as e:
        # Boundary handler: record the failure on the document and report it
        # to the caller rather than propagating.
        logger.exception("Document processing failed: %s", e)
        await db.update_document(document_id, extraction_status="failed")
        return {"status": "failed", "error": str(e)}
|
||||
Reference in New Issue
Block a user