New services (11 files): - classifier.py: auto doc-type classification + party identification (Claude Haiku) - claims_extractor.py: claim extraction from pleadings (Claude Sonnet + regex) - references_extractor.py: plan/case-law/legislation detection (regex) - brainstorm.py: direction generation with 2-3 options (Claude Sonnet) - block_writer.py: 12-block decision writer (template + Claude Sonnet/Opus) - docx_exporter.py: DOCX export with David font, RTL, headings - qa_validator.py: 6 QA checks with export blocking on critical failure - learning_loop.py: draft vs final comparison + lesson extraction - metrics.py: KPIs dashboard per case and global - audit.py: action audit log - cli.py: standalone CLI with 11 commands Updated pipeline: extract → classify → chunk → embed → store → extract_references New MCP tools: 29 total (was 16) New DB tables: audit_log, decisions CRUD, claims CRUD Config: Infisical support, external service allowlist Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
118 lines
4.1 KiB
Python
118 lines
4.1 KiB
Python
"""Document processing pipeline: extract → chunk → embed → store."""
|
|
|
|
from __future__ import annotations
|
|
|
|
import logging
|
|
from uuid import UUID
|
|
|
|
from legal_mcp.services import chunker, classifier, db, embeddings, extractor, references_extractor
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
async def process_document(document_id: UUID, case_id: UUID) -> dict:
    """Run the full processing pipeline for a single document.

    Steps, in order:

    1. Extract text from the document's file
    2. Classify the document and identify parties
    3. Split the text into chunks
    4. Generate embeddings for the chunks
    5. Store chunks + embeddings in DB
    6. Extract legal references (plans, case law, legislation)

    The document's ``extraction_status`` is set to ``"processing"`` before the
    pipeline starts and to ``"completed"`` or ``"failed"`` when it ends.

    Args:
        document_id: ID of the document to process.
        case_id: ID of the owning case. When falsy, the case lookup is
            skipped and an empty case number is passed to the classifier.

    Returns:
        A processing summary dict:

        * on success: ``status="completed"`` plus ``chunks``, ``pages``,
          ``text_length``, ``classification`` and ``references``;
        * when chunking yields nothing: ``status="completed"``, ``chunks=0``,
          a ``message``, plus ``pages``, ``text_length`` and
          ``classification`` (no ``references``; that step is not run);
        * on any pipeline error: ``status="failed"`` and ``error`` (the
          stringified exception).

    Raises:
        ValueError: If no document with ``document_id`` exists. This check
            runs before the pipeline's error handling, so it propagates.
    """
    doc = await db.get_document(document_id)
    if not doc:
        raise ValueError(f"Document {document_id} not found")

    await db.update_document(document_id, extraction_status="processing")

    try:
        # Step 1: Extract text
        logger.info("Extracting text from %s", doc["file_path"])
        text, page_count = await extractor.extract_text(doc["file_path"])

        await db.update_document(
            document_id,
            extracted_text=text,
            page_count=page_count,
        )

        # Step 2: Classify document and identify parties
        logger.info("Classifying document")
        case_number = ""
        if case_id:
            case = await db.get_case(case_id)
            if case:
                case_number = case.get("case_number", "")
        classification_result = await classifier.classify_and_identify(text, case_number)
        await db.update_document(
            document_id,
            metadata=classification_result,
        )
        logger.info(
            "Classification: %s (confidence: %.2f), parties found: %d appellants, %d respondents",
            classification_result["classification"].get("doc_type", "?"),
            classification_result["classification"].get("confidence", 0),
            len(classification_result["parties"].get("appellants", [])),
            len(classification_result["parties"].get("respondents", [])),
        )

        # Step 3: Chunk
        logger.info("Chunking document (%d chars)", len(text))
        chunks = chunker.chunk_document(text)

        if not chunks:
            await db.update_document(document_id, extraction_status="completed")
            # Report what is already known so this result lines up with the
            # full-success summary as far as possible.
            return {
                "status": "completed",
                "chunks": 0,
                "message": "No text to chunk",
                "pages": page_count,
                "text_length": len(text),
                "classification": classification_result,
            }

        # Step 4: Embed
        logger.info("Generating embeddings for %d chunks", len(chunks))
        texts = [c.content for c in chunks]
        embs = await embeddings.embed_texts(texts, input_type="document")

        # Step 5: Store
        chunk_dicts = [
            {
                "content": c.content,
                "section_type": c.section_type,
                "embedding": emb,
                "page_number": c.page_number,
                "chunk_index": c.chunk_index,
            }
            for c, emb in zip(chunks, embs)
        ]
        # zip() stops at the shorter input, so a short embedding batch would
        # silently drop the trailing chunks. Fail loudly instead of storing a
        # partial document and reporting it as completed.
        if len(chunk_dicts) != len(chunks):
            raise ValueError(
                f"Embedding count mismatch: got {len(chunk_dicts)} embeddings "
                f"for {len(chunks)} chunks"
            )

        stored = await db.store_chunks(document_id, case_id, chunk_dicts)

        # Step 6: Extract references (plans, case law, legislation)
        logger.info("Extracting legal references")
        refs_result = await references_extractor.extract_and_link_references(
            document_id, case_id, text,
        )
        logger.info(
            "References found: %d plans, %d case law (%d linked), %d legislation",
            refs_result["plans"], refs_result["case_law"],
            refs_result["case_law_linked"], refs_result["legislation"],
        )

        await db.update_document(document_id, extraction_status="completed")

        logger.info("Document processed: %d chunks stored", stored)
        return {
            "status": "completed",
            "chunks": stored,
            "pages": page_count,
            "text_length": len(text),
            "classification": classification_result,
            "references": {
                "plans": refs_result["plans"],
                "case_law": refs_result["case_law"],
                "legislation": refs_result["legislation"],
            },
        }

    except Exception as e:
        logger.exception("Document processing failed: %s", e)
        try:
            await db.update_document(document_id, extraction_status="failed")
        except Exception:
            # The status write is best-effort: if it fails too (e.g. the DB
            # is what broke), still report the original error to the caller
            # rather than masking it with this secondary failure.
            logger.exception(
                "Could not mark document %s as failed", document_id
            )
        return {"status": "failed", "error": str(e)}
|