"""Document processing pipeline: extract → chunk → embed → store.""" from __future__ import annotations import logging from uuid import UUID from legal_mcp.services import chunker, classifier, db, embeddings, extractor, references_extractor logger = logging.getLogger(__name__) async def process_document(document_id: UUID, case_id: UUID) -> dict: """Full processing pipeline for a document. 1. Extract text from file 2. Split into chunks 3. Generate embeddings 4. Store chunks + embeddings in DB Returns processing summary. """ doc = await db.get_document(document_id) if not doc: raise ValueError(f"Document {document_id} not found") await db.update_document(document_id, extraction_status="processing") try: # Step 1: Extract text logger.info("Extracting text from %s", doc["file_path"]) text, page_count = await extractor.extract_text(doc["file_path"]) await db.update_document( document_id, extracted_text=text, page_count=page_count, ) # Step 1.5: Classify document and identify parties logger.info("Classifying document") case_number = "" if case_id: case = await db.get_case(case_id) if case: case_number = case.get("case_number", "") classification_result = await classifier.classify_and_identify(text, case_number) await db.update_document( document_id, metadata=classification_result, ) logger.info( "Classification: %s (confidence: %.2f), parties found: %d appellants, %d respondents", classification_result["classification"].get("doc_type", "?"), classification_result["classification"].get("confidence", 0), len(classification_result["parties"].get("appellants", [])), len(classification_result["parties"].get("respondents", [])), ) # Step 2: Chunk logger.info("Chunking document (%d chars)", len(text)) chunks = chunker.chunk_document(text) if not chunks: await db.update_document(document_id, extraction_status="completed") return {"status": "completed", "chunks": 0, "message": "No text to chunk"} # Step 3: Embed logger.info("Generating embeddings for %d chunks", len(chunks)) texts = [c.content for c in chunks] embs = await embeddings.embed_texts(texts, input_type="document") # Step 4: Store chunk_dicts = [ { "content": c.content, "section_type": c.section_type, "embedding": emb, "page_number": c.page_number, "chunk_index": c.chunk_index, } for c, emb in zip(chunks, embs) ] stored = await db.store_chunks(document_id, case_id, chunk_dicts) # Step 5: Extract references (plans, case law, legislation) logger.info("Extracting legal references") refs_result = await references_extractor.extract_and_link_references( document_id, case_id, text, ) logger.info( "References found: %d plans, %d case law (%d linked), %d legislation", refs_result["plans"], refs_result["case_law"], refs_result["case_law_linked"], refs_result["legislation"], ) await db.update_document(document_id, extraction_status="completed") logger.info("Document processed: %d chunks stored", stored) return { "status": "completed", "chunks": stored, "pages": page_count, "text_length": len(text), "classification": classification_result, "references": { "plans": refs_result["plans"], "case_law": refs_result["case_law"], "legislation": refs_result["legislation"], }, } except Exception as e: logger.exception("Document processing failed: %s", e) await db.update_document(document_id, extraction_status="failed") return {"status": "failed", "error": str(e)}