Make classification and reference extraction non-fatal in document pipeline

Text extraction, chunking and embedding now proceed even if the Claude API
classification or reference extraction step fails (e.g. API quota exceeded);
the failure is logged as a warning instead of aborting the pipeline.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-04-04 13:00:34 +00:00
parent 40406b5fde
commit 9e7492e761

View File

@@ -37,37 +37,41 @@ async def process_document(document_id: UUID, case_id: UUID) -> dict:
page_count=page_count, page_count=page_count,
) )
# Step 1.5: Classify document and identify parties # Step 1.5: Classify document and identify parties (non-fatal)
logger.info("Classifying document") classification_result = {}
case_number = "" try:
if case_id: logger.info("Classifying document")
case = await db.get_case(case_id) case_number = ""
if case: if case_id:
case_number = case.get("case_number", "") case = await db.get_case(case_id)
classification_result = await classifier.classify_and_identify(text, case_number) if case:
await db.update_document( case_number = case.get("case_number", "")
document_id, classification_result = await classifier.classify_and_identify(text, case_number)
metadata=classification_result, await db.update_document(
) document_id,
logger.info( metadata=classification_result,
"Classification: %s (confidence: %.2f), parties found: %d appellants, %d respondents", )
classification_result["classification"].get("doc_type", "?"), logger.info(
classification_result["classification"].get("confidence", 0), "Classification: %s (confidence: %.2f), parties found: %d appellants, %d respondents",
len(classification_result["parties"].get("appellants", [])), classification_result["classification"].get("doc_type", "?"),
len(classification_result["parties"].get("respondents", [])), classification_result["classification"].get("confidence", 0),
) len(classification_result["parties"].get("appellants", [])),
len(classification_result["parties"].get("respondents", [])),
)
# Step 1.6: Update case parties if empty # Update case parties if empty
if case_id and case: if case_id and case:
parties = classification_result.get("parties", {}) parties = classification_result.get("parties", {})
updates = {} updates = {}
if not case.get("appellants") and parties.get("appellants"): if not case.get("appellants") and parties.get("appellants"):
updates["appellants"] = parties["appellants"] updates["appellants"] = parties["appellants"]
if not case.get("respondents") and parties.get("respondents"): if not case.get("respondents") and parties.get("respondents"):
updates["respondents"] = parties["respondents"] updates["respondents"] = parties["respondents"]
if updates: if updates:
await db.update_case(case_id, **updates) await db.update_case(case_id, **updates)
logger.info("Updated case parties: %s", updates) logger.info("Updated case parties: %s", updates)
except Exception as e:
logger.warning("Classification failed (non-fatal): %s", e)
# Step 2: Chunk # Step 2: Chunk
logger.info("Chunking document (%d chars)", len(text)) logger.info("Chunking document (%d chars)", len(text))
@@ -96,16 +100,20 @@ async def process_document(document_id: UUID, case_id: UUID) -> dict:
stored = await db.store_chunks(document_id, case_id, chunk_dicts) stored = await db.store_chunks(document_id, case_id, chunk_dicts)
# Step 5: Extract references (plans, case law, legislation) # Step 5: Extract references (plans, case law, legislation) — non-fatal
logger.info("Extracting legal references") refs_result = {"plans": 0, "case_law": 0, "case_law_linked": 0, "legislation": 0}
refs_result = await references_extractor.extract_and_link_references( try:
document_id, case_id, text, logger.info("Extracting legal references")
) refs_result = await references_extractor.extract_and_link_references(
logger.info( document_id, case_id, text,
"References found: %d plans, %d case law (%d linked), %d legislation", )
refs_result["plans"], refs_result["case_law"], logger.info(
refs_result["case_law_linked"], refs_result["legislation"], "References found: %d plans, %d case law (%d linked), %d legislation",
) refs_result["plans"], refs_result["case_law"],
refs_result["case_law_linked"], refs_result["legislation"],
)
except Exception as e:
logger.warning("Reference extraction failed (non-fatal): %s", e)
await db.update_document(document_id, extraction_status="completed") await db.update_document(document_id, extraction_status="completed")