Make classification and reference extraction non-fatal in document pipeline

Text extraction, chunking, and embedding still proceed even if Claude API classification or reference extraction fails (e.g. API quota exceeded).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-04 13:00:34 +00:00
parent 40406b5fde
commit 9e7492e761
1 changed files with 48 additions and 40 deletions
--- a/mcp-server/src/legal_mcp/services/processor.py
+++ b/mcp-server/src/legal_mcp/services/processor.py
@@ -37,37 +37,41 @@ async def process_document(document_id: UUID, case_id: UUID) -> dict:
            page_count=page_count,
        )

-        # Step 1.5: Classify document and identify parties
-        logger.info("Classifying document")
-        case_number = ""
-        if case_id:
-            case = await db.get_case(case_id)
-            if case:
-                case_number = case.get("case_number", "")
-        classification_result = await classifier.classify_and_identify(text, case_number)
-        await db.update_document(
-            document_id,
-            metadata=classification_result,
-        )
-        logger.info(
-            "Classification: %s (confidence: %.2f), parties found: %d appellants, %d respondents",
-            classification_result["classification"].get("doc_type", "?"),
-            classification_result["classification"].get("confidence", 0),
-            len(classification_result["parties"].get("appellants", [])),
-            len(classification_result["parties"].get("respondents", [])),
-        )
+        # Step 1.5: Classify document and identify parties (non-fatal)
+        classification_result = {}
+        try:
+            logger.info("Classifying document")
+            case_number = ""
+            if case_id:
+                case = await db.get_case(case_id)
+                if case:
+                    case_number = case.get("case_number", "")
+            classification_result = await classifier.classify_and_identify(text, case_number)
+            await db.update_document(
+                document_id,
+                metadata=classification_result,
+            )
+            logger.info(
+                "Classification: %s (confidence: %.2f), parties found: %d appellants, %d respondents",
+                classification_result["classification"].get("doc_type", "?"),
+                classification_result["classification"].get("confidence", 0),
+                len(classification_result["parties"].get("appellants", [])),
+                len(classification_result["parties"].get("respondents", [])),
+            )

-        # Step 1.6: Update case parties if empty
-        if case_id and case:
-            parties = classification_result.get("parties", {})
-            updates = {}
-            if not case.get("appellants") and parties.get("appellants"):
-                updates["appellants"] = parties["appellants"]
-            if not case.get("respondents") and parties.get("respondents"):
-                updates["respondents"] = parties["respondents"]
-            if updates:
-                await db.update_case(case_id, **updates)
-                logger.info("Updated case parties: %s", updates)
+            # Update case parties if empty
+            if case_id and case:
+                parties = classification_result.get("parties", {})
+                updates = {}
+                if not case.get("appellants") and parties.get("appellants"):
+                    updates["appellants"] = parties["appellants"]
+                if not case.get("respondents") and parties.get("respondents"):
+                    updates["respondents"] = parties["respondents"]
+                if updates:
+                    await db.update_case(case_id, **updates)
+                    logger.info("Updated case parties: %s", updates)
+        except Exception as e:
+            logger.warning("Classification failed (non-fatal): %s", e)

        # Step 2: Chunk
        logger.info("Chunking document (%d chars)", len(text))
@@ -96,16 +100,20 @@ async def process_document(document_id: UUID, case_id: UUID) -> dict:

        stored = await db.store_chunks(document_id, case_id, chunk_dicts)

-        # Step 5: Extract references (plans, case law, legislation)
-        logger.info("Extracting legal references")
-        refs_result = await references_extractor.extract_and_link_references(
-            document_id, case_id, text,
-        )
-        logger.info(
-            "References found: %d plans, %d case law (%d linked), %d legislation",
-            refs_result["plans"], refs_result["case_law"],
-            refs_result["case_law_linked"], refs_result["legislation"],
-        )
+        # Step 5: Extract references (plans, case law, legislation) — non-fatal
+        refs_result = {"plans": 0, "case_law": 0, "case_law_linked": 0, "legislation": 0}
+        try:
+            logger.info("Extracting legal references")
+            refs_result = await references_extractor.extract_and_link_references(
+                document_id, case_id, text,
+            )
+            logger.info(
+                "References found: %d plans, %d case law (%d linked), %d legislation",
+                refs_result["plans"], refs_result["case_law"],
+                refs_result["case_law_linked"], refs_result["legislation"],
+            )
+        except Exception as e:
+            logger.warning("Reference extraction failed (non-fatal): %s", e)

        await db.update_document(document_id, extraction_status="completed")