fix(extraction): reconcile לתיקים-יתומים בתור — pending+requested_at=NULL (#139)
תיק יכול להיות <kind>_extraction_status='pending' עם _requested_at=NULL — מעולם-לא-נכנס-לתור (מסלולי bulk/מיגרציה, או status שנכתב לפני החותם), והדריינר (סורק requested_at IS NOT NULL) עיוור אליו לנצח → ה-backlog מתנקז בשקט לאפס. requeue_stale_processing_extractions מרפא רק 'processing'. נצפה 2026-06-14: 96 תיקים pending אך 0 בתור (תוקנו ידנית).

תיקון (G1 — שחזור invariant במקור, G2 — predicate יחיד):
- db.reconcile_orphaned_pending_extractions(kind=) — kind-agnostic, מחזיר את invariant "שורה ברת-חילוץ ⇒ בתור": חותם requested_at ל-rows שהם pending + requested_at IS NULL + EXTRACTION_ELIGIBLE_PREDICATE (אותו מסנן של #140 — cited_only/chunkless לעולם לא נדחפים). אידמפוטנטי (rows מסומנים לא נתפסים).
- precedent_library.process_pending_extractions קורא reconcile אחרי requeue_stale ולפני list — תיקים-משוחזרים נקלטים באותו pass. מנגנון-ריפוי יחיד (G2), לא מסלול מקביל; requeue_stale='processing', reconcile='pending'.
- request_halacha_extraction מציב status='pending' עם החותם (סימטרי ל-metadata) — סוגר את חלון-ה-drift שמייצר pending+NULL מלכתחילה.

מצב חי נקי (0 יתומים-כשירים אחרי התיקון-הידני); זהו תיקון מונע — הדריינר יְרַפֵּא יתומים עתידיים אוטומטית.

בדיקות: test_extraction_orphan_reconcile (predicate משותף, pending+NULL בלבד, מובחן מ-requeue_stale, request_halacha סימטרי), שני ה-kinds. כל 349 עוברות.

Invariants: G1, G2 (predicate משותף עם #140, ריפוי יחיד), INV-G3/INV-DUR1 (X16), INV-G4 (אין בליעה שקטה — reconcile מתעד), G12.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -6736,10 +6736,17 @@ async def request_metadata_extraction(case_law_id: UUID) -> bool:
|
||||
|
||||
async def request_halacha_extraction(case_law_id: UUID) -> bool:
|
||||
"""Same but for halacha extraction. See note on
|
||||
:func:`request_metadata_extraction` re: opening to all source kinds."""
|
||||
:func:`request_metadata_extraction` re: opening to all source kinds.
|
||||
|
||||
Sets ``halacha_extraction_status='pending'`` alongside the timestamp —
|
||||
symmetric to :func:`request_metadata_extraction` — so status and queue-stamp
|
||||
are written together and a re-request never leaves a stale terminal badge.
|
||||
This also closes the drift window where a row could end up 'pending' with a
|
||||
NULL requested_at (orphaned from the queue) (#139)."""
|
||||
pool = await get_pool()
|
||||
result = await pool.execute(
|
||||
"UPDATE case_law SET halacha_extraction_requested_at = now() "
|
||||
"UPDATE case_law SET halacha_extraction_requested_at = now(), "
|
||||
"halacha_extraction_status = 'pending' "
|
||||
"WHERE id = $1",
|
||||
case_law_id,
|
||||
)
|
||||
@@ -6828,6 +6835,43 @@ async def requeue_stale_processing_extractions(kind: str = "halacha") -> int:
|
||||
return 0
|
||||
|
||||
|
||||
async def reconcile_orphaned_pending_extractions(kind: str = "halacha") -> int:
    """Stamp a queue timestamp on eligible 'pending' rows that lack one.

    Targets ``case_law`` rows whose ``<kind>_extraction_status`` is
    ``'pending'`` while ``<kind>_extraction_requested_at`` is NULL, and which
    also satisfy ``EXTRACTION_ELIGIBLE_PREDICATE``. In a single UPDATE the
    matching rows get ``requested_at = now()`` and their status written as
    ``'pending'`` again (#139).

    This complements ``requeue_stale_processing_extractions``, which is
    described as handling only ``status='processing'`` rows; this function
    handles ``status='pending'`` rows that have no queue stamp.

    Once a row is stamped it no longer satisfies the ``IS NULL`` condition,
    so a repeated call does not match it again.

    Args:
        kind: ``"metadata"`` selects the metadata columns; any other value
            selects the halacha columns.

    Returns:
        The integer parsed from the last whitespace-separated token of the
        value returned by ``pool.execute`` (presumably a command tag such as
        ``"UPDATE 3"`` — confirm against the pool implementation), or 0 when
        that token is missing or not an integer.
    """
    # Both column names share a prefix that depends only on ``kind``.
    prefix = "metadata" if kind == "metadata" else "halacha"
    status_col = f"{prefix}_extraction_status"
    req_col = f"{prefix}_extraction_requested_at"

    pool = await get_pool()

    # Column names come from the fixed choices above, never from caller
    # input, so interpolating them into the statement is safe.
    sql = " ".join(
        (
            f"UPDATE case_law SET {req_col} = now(), {status_col} = 'pending'",
            f"WHERE {status_col} = 'pending' AND {req_col} IS NULL",
            f"AND {EXTRACTION_ELIGIBLE_PREDICATE}",
        )
    )
    tag = await pool.execute(sql)

    try:
        tokens = str(tag).split()
        return int(tokens[-1])
    except (ValueError, IndexError):
        # Unparsable or empty result: report zero rows rather than raising.
        return 0
|
||||
|
||||
|
||||
async def extraction_queue_status() -> dict:
|
||||
"""Pending-extraction queue depth per kind (INV-TOOL4 visibility / GAP-45).
|
||||
|
||||
|
||||
@@ -231,6 +231,17 @@ async def process_pending_extractions(kind: str = "metadata", limit: int = 20) -
|
||||
if healed:
|
||||
logger.warning("self-healed %d stale '%s' processing row(s)", healed, kind)
|
||||
|
||||
# Re-enqueue eligible 'pending' rows that never got a queue stamp (orphaned
|
||||
# from the queue — bulk/migration paths). requeue_stale only covers
|
||||
# 'processing'; this covers 'pending' with requested_at IS NULL (#139). Runs
|
||||
# before the list below so reclaimed rows drain in this same pass.
|
||||
reconciled = await db.reconcile_orphaned_pending_extractions(kind=kind)
|
||||
if reconciled:
|
||||
logger.warning(
|
||||
"reconciled %d orphaned 'pending' (no queue stamp) '%s' row(s)",
|
||||
reconciled, kind,
|
||||
)
|
||||
|
||||
pending = await db.list_pending_extraction_requests(kind=kind, limit=limit)
|
||||
if not pending:
|
||||
return {"status": "no_pending", "kind": kind, "processed": 0, "results": []}
|
||||
|
||||
Reference in New Issue
Block a user