feat(digests): digest_kind classification — robust extraction for all issue types (X12)
~2% מגיליונות "כל יום" הם לא-הכרעות (עדכוני-חקיקה/הודעות/ברכות) ללא ruling → החילוץ ה-decision-centric החזיר ריק → both-empty → מחזורי ב-self-heal.

- SCHEMA_V32: `digest_kind` (decision/announcement/other) + backfill legacy בזול (יש citation→decision, אחרת announcement) — לפני שה-self-heal מסתמך עליו.
- extractor: prompt מסווג + מחלץ תמיד concept/headline/summary; underlying_* רק ל-decision. extract מנרמל digest_kind.
- enrich: שומר digest_kind; חילוץ מוצלח תמיד מסתיים ב-kind לא-ריק (ברירת-מחדל לפי citation אם המודל השמיט).
- drain self-heal: הגדרת-כשל = completed עם digest_kind='' (במקום both-empty) → הודעות לא מנוסות-מחדש לנצח.
- db: digest_kind ב-_DIGEST_COLS + update-whitelist (זורם ל-search/list/API).
- X12 spec: תיעוד digest_kind + הגדרת-הכשל המתוקנת.

אומת: V32 סיווג 533 (525 decision + 8 announcement, 0 unclassified — self-heal לא נוגע בהם).
extract: 5163→decision+citation · 5060→announcement+concept, citation ריק (לא both-empty).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -1383,6 +1383,24 @@ CREATE INDEX IF NOT EXISTS idx_court_fetch_jobs_digest ON court_fetch_jobs(diges
|
||||
WHERE digest_id IS NOT NULL;
|
||||
"""
|
||||
|
||||
# Schema v32 (X12): add the digests.digest_kind column, backfill it for rows
# that were already extracted, and index it. The script holds three statements
# and is passed as a single string to conn.execute() by _run_schema_migrations.
#
# NOTE(review): the backfill UPDATE matches every row with
# extraction_status='completed' and digest_kind=''. That is the same state the
# SQL comment below describes as "a genuine extraction failure once enriched".
# If this script is re-executed on a later startup (presumably it is, since the
# DDL uses IF NOT EXISTS — confirm), such failed rows would be re-labelled
# 'decision'/'announcement' before the self-heal can see them. Verify the
# ordering/gating against the drain self-heal.
SCHEMA_V32_SQL = """
-- digest_kind (X12): classify each "כל יום" issue. Most are decision-summaries
-- (point at a ruling → underlying_citation set), but some are non-decision
-- ANNOUNCEMENTS (legislative/planning updates, new-year notices) that legitimately
-- have no ruling. Classifying explicitly lets enrich treat an announcement as a
-- SUCCESS (concept+summary, no citation) instead of a both-empty "failure" that
-- the drain self-heal would retry forever. '' = not yet classified (= a genuine
-- extraction failure once enriched).
ALTER TABLE digests ADD COLUMN IF NOT EXISTS digest_kind TEXT NOT NULL DEFAULT '';
-- Backfill legacy rows cheaply (no LLM): a row with a citation is a decision,
-- otherwise an announcement. MUST run before the new self-heal keys on
-- digest_kind='' (else it would reset every legacy row). Idempotent.
UPDATE digests SET digest_kind =
    CASE WHEN coalesce(underlying_citation,'') <> '' THEN 'decision' ELSE 'announcement' END
    WHERE coalesce(digest_kind,'') = '' AND extraction_status = 'completed';
CREATE INDEX IF NOT EXISTS idx_digests_kind ON digests(digest_kind);
"""
|
||||
|
||||
|
||||
async def _run_schema_migrations(pool: asyncpg.Pool) -> None:
|
||||
async with pool.acquire() as conn:
|
||||
@@ -1418,7 +1436,8 @@ async def _run_schema_migrations(pool: asyncpg.Pool) -> None:
|
||||
await conn.execute(SCHEMA_V29_SQL)
|
||||
await conn.execute(SCHEMA_V30_SQL)
|
||||
await conn.execute(SCHEMA_V31_SQL)
|
||||
logger.info("Database schema initialized (v1-v31)")
|
||||
await conn.execute(SCHEMA_V32_SQL)
|
||||
logger.info("Database schema initialized (v1-v32)")
|
||||
|
||||
|
||||
async def init_schema() -> None:
|
||||
@@ -3600,7 +3619,7 @@ _DIGEST_COLS = (
|
||||
"headline_holding, analysis_text, summary, underlying_citation, "
|
||||
"underlying_court, underlying_date, underlying_judge, practice_area, "
|
||||
"appeal_subtype, subject_tags, linked_case_law_id, source_document_path, "
|
||||
"content_hash, extraction_status, created_at, updated_at"
|
||||
"content_hash, extraction_status, digest_kind, created_at, updated_at"
|
||||
)
|
||||
|
||||
_DIGEST_UPDATE_ALLOWED = {
|
||||
@@ -3608,7 +3627,7 @@ _DIGEST_UPDATE_ALLOWED = {
|
||||
"headline_holding", "analysis_text", "summary", "underlying_citation",
|
||||
"underlying_court", "underlying_date", "underlying_judge", "practice_area",
|
||||
"appeal_subtype", "subject_tags", "source_document_path", "content_hash",
|
||||
"extraction_status",
|
||||
"extraction_status", "digest_kind",
|
||||
}
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user