Make classification and reference extraction non-fatal in document pipeline
Text extraction, chunking, and embedding now proceed even if the Claude API classification or reference extraction step fails (e.g., when the API quota is exceeded). Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -37,37 +37,41 @@ async def process_document(document_id: UUID, case_id: UUID) -> dict:
|
|||||||
page_count=page_count,
|
page_count=page_count,
|
||||||
)
|
)
|
||||||
|
|
||||||
# Step 1.5: Classify document and identify parties
|
# Step 1.5: Classify document and identify parties (non-fatal)
|
||||||
logger.info("Classifying document")
|
classification_result = {}
|
||||||
case_number = ""
|
try:
|
||||||
if case_id:
|
logger.info("Classifying document")
|
||||||
case = await db.get_case(case_id)
|
case_number = ""
|
||||||
if case:
|
if case_id:
|
||||||
case_number = case.get("case_number", "")
|
case = await db.get_case(case_id)
|
||||||
classification_result = await classifier.classify_and_identify(text, case_number)
|
if case:
|
||||||
await db.update_document(
|
case_number = case.get("case_number", "")
|
||||||
document_id,
|
classification_result = await classifier.classify_and_identify(text, case_number)
|
||||||
metadata=classification_result,
|
await db.update_document(
|
||||||
)
|
document_id,
|
||||||
logger.info(
|
metadata=classification_result,
|
||||||
"Classification: %s (confidence: %.2f), parties found: %d appellants, %d respondents",
|
)
|
||||||
classification_result["classification"].get("doc_type", "?"),
|
logger.info(
|
||||||
classification_result["classification"].get("confidence", 0),
|
"Classification: %s (confidence: %.2f), parties found: %d appellants, %d respondents",
|
||||||
len(classification_result["parties"].get("appellants", [])),
|
classification_result["classification"].get("doc_type", "?"),
|
||||||
len(classification_result["parties"].get("respondents", [])),
|
classification_result["classification"].get("confidence", 0),
|
||||||
)
|
len(classification_result["parties"].get("appellants", [])),
|
||||||
|
len(classification_result["parties"].get("respondents", [])),
|
||||||
|
)
|
||||||
|
|
||||||
# Step 1.6: Update case parties if empty
|
# Update case parties if empty
|
||||||
if case_id and case:
|
if case_id and case:
|
||||||
parties = classification_result.get("parties", {})
|
parties = classification_result.get("parties", {})
|
||||||
updates = {}
|
updates = {}
|
||||||
if not case.get("appellants") and parties.get("appellants"):
|
if not case.get("appellants") and parties.get("appellants"):
|
||||||
updates["appellants"] = parties["appellants"]
|
updates["appellants"] = parties["appellants"]
|
||||||
if not case.get("respondents") and parties.get("respondents"):
|
if not case.get("respondents") and parties.get("respondents"):
|
||||||
updates["respondents"] = parties["respondents"]
|
updates["respondents"] = parties["respondents"]
|
||||||
if updates:
|
if updates:
|
||||||
await db.update_case(case_id, **updates)
|
await db.update_case(case_id, **updates)
|
||||||
logger.info("Updated case parties: %s", updates)
|
logger.info("Updated case parties: %s", updates)
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning("Classification failed (non-fatal): %s", e)
|
||||||
|
|
||||||
# Step 2: Chunk
|
# Step 2: Chunk
|
||||||
logger.info("Chunking document (%d chars)", len(text))
|
logger.info("Chunking document (%d chars)", len(text))
|
||||||
@@ -96,16 +100,20 @@ async def process_document(document_id: UUID, case_id: UUID) -> dict:
|
|||||||
|
|
||||||
stored = await db.store_chunks(document_id, case_id, chunk_dicts)
|
stored = await db.store_chunks(document_id, case_id, chunk_dicts)
|
||||||
|
|
||||||
# Step 5: Extract references (plans, case law, legislation)
|
# Step 5: Extract references (plans, case law, legislation) — non-fatal
|
||||||
logger.info("Extracting legal references")
|
refs_result = {"plans": 0, "case_law": 0, "case_law_linked": 0, "legislation": 0}
|
||||||
refs_result = await references_extractor.extract_and_link_references(
|
try:
|
||||||
document_id, case_id, text,
|
logger.info("Extracting legal references")
|
||||||
)
|
refs_result = await references_extractor.extract_and_link_references(
|
||||||
logger.info(
|
document_id, case_id, text,
|
||||||
"References found: %d plans, %d case law (%d linked), %d legislation",
|
)
|
||||||
refs_result["plans"], refs_result["case_law"],
|
logger.info(
|
||||||
refs_result["case_law_linked"], refs_result["legislation"],
|
"References found: %d plans, %d case law (%d linked), %d legislation",
|
||||||
)
|
refs_result["plans"], refs_result["case_law"],
|
||||||
|
refs_result["case_law_linked"], refs_result["legislation"],
|
||||||
|
)
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning("Reference extraction failed (non-fatal): %s", e)
|
||||||
|
|
||||||
await db.update_document(document_id, extraction_status="completed")
|
await db.update_document(document_id, extraction_status="completed")
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user