# legal-ai/mcp-server/src/legal_mcp/services/processor.py

"""Document processing pipeline: extract → classify → chunk → embed → store → extract references."""

from __future__ import annotations

import logging
from uuid import UUID

from legal_mcp.services import chunker, classifier, db, embeddings, extractor, references_extractor

logger = logging.getLogger(__name__)


async def process_document(document_id: UUID, case_id: UUID) -> dict:
    """Run the full processing pipeline for a stored document.

    Steps, in order:

    1. Extract text from the document's file and persist it with the page count.
    2. Classify the document and identify parties; persist the result as
       the document's metadata.
    3. Split the text into chunks.
    4. Generate one embedding per chunk.
    5. Store chunks + embeddings in the DB.
    6. Extract and link legal references (plans, case law, legislation).

    The document's ``extraction_status`` is set to ``"processing"`` before the
    pipeline starts and to ``"completed"`` or ``"failed"`` when it ends.

    Args:
        document_id: ID of the document to process.
        case_id: ID of the owning case. When falsy (or when the case cannot be
            loaded) classification runs with an empty case number.

    Returns:
        A processing summary dict:

        * success: ``status="completed"`` plus ``chunks``, ``pages``,
          ``text_length``, ``classification`` and ``references``;
        * nothing to chunk: ``status="completed"``, ``chunks=0``, ``message``;
        * any error inside the pipeline: ``status="failed"`` and ``error``.

    Raises:
        ValueError: If no document exists for ``document_id``.

    Errors from the initial document lookup and from setting the
    ``"processing"`` status happen before the guarded section and propagate
    to the caller.
    """
    doc = await db.get_document(document_id)
    if not doc:
        raise ValueError(f"Document {document_id} not found")

    await db.update_document(document_id, extraction_status="processing")

    try:
        # Step 1: Extract text
        logger.info("Extracting text from %s", doc["file_path"])
        text, page_count = await extractor.extract_text(doc["file_path"])

        await db.update_document(
            document_id,
            extracted_text=text,
            page_count=page_count,
        )

        # Step 1.5: Classify document and identify parties
        logger.info("Classifying document")
        case_number = ""
        if case_id:
            case = await db.get_case(case_id)
            if case:
                case_number = case.get("case_number", "")
        classification_result = await classifier.classify_and_identify(text, case_number)
        await db.update_document(
            document_id,
            metadata=classification_result,
        )

        # The summary below is purely diagnostic, so read it defensively: a
        # missing section in the classifier output must not abort a pipeline
        # whose metadata has already been persisted.
        classification = classification_result.get("classification") or {}
        parties = classification_result.get("parties") or {}
        logger.info(
            "Classification: %s (confidence: %.2f), parties found: %d appellants, %d respondents",
            classification.get("doc_type", "?"),
            classification.get("confidence", 0),
            len(parties.get("appellants") or []),
            len(parties.get("respondents") or []),
        )

        # Step 2: Chunk
        logger.info("Chunking document (%d chars)", len(text))
        chunks = chunker.chunk_document(text)

        if not chunks:
            # Nothing to embed or store; the document still counts as processed.
            await db.update_document(document_id, extraction_status="completed")
            return {"status": "completed", "chunks": 0, "message": "No text to chunk"}

        # Step 3: Embed
        logger.info("Generating embeddings for %d chunks", len(chunks))
        texts = [c.content for c in chunks]
        embs = await embeddings.embed_texts(texts, input_type="document")

        # zip() stops at the shorter input, so a short (or long) embedding
        # batch would silently drop or misalign chunks while the document is
        # still reported as "completed". Fail loudly instead.
        if len(embs) != len(chunks):
            raise ValueError(
                f"Embedding count mismatch: got {len(embs)} embeddings "
                f"for {len(chunks)} chunks"
            )

        # Step 4: Store
        chunk_dicts = [
            {
                "content": c.content,
                "section_type": c.section_type,
                "embedding": emb,
                "page_number": c.page_number,
                "chunk_index": c.chunk_index,
            }
            for c, emb in zip(chunks, embs)
        ]

        stored = await db.store_chunks(document_id, case_id, chunk_dicts)

        # Step 5: Extract references (plans, case law, legislation)
        logger.info("Extracting legal references")
        refs_result = await references_extractor.extract_and_link_references(
            document_id, case_id, text,
        )
        logger.info(
            "References found: %d plans, %d case law (%d linked), %d legislation",
            refs_result["plans"], refs_result["case_law"],
            refs_result["case_law_linked"], refs_result["legislation"],
        )

        await db.update_document(document_id, extraction_status="completed")

        logger.info("Document processed: %d chunks stored", stored)
        return {
            "status": "completed",
            "chunks": stored,
            "pages": page_count,
            "text_length": len(text),
            "classification": classification_result,
            "references": {
                "plans": refs_result["plans"],
                "case_law": refs_result["case_law"],
                "legislation": refs_result["legislation"],
            },
        }

    except Exception as e:
        # Top-level pipeline boundary: record the failure and report it in the
        # summary rather than raising.
        logger.exception("Document processing failed: %s", e)
        try:
            await db.update_document(document_id, extraction_status="failed")
        except Exception:
            # If the status write itself fails (e.g. the DB is the thing that
            # broke), keep the original error as the reported cause instead of
            # letting this secondary failure replace it.
            logger.exception("Could not mark document %s as failed", document_id)
        return {"status": "failed", "error": str(e)}