feat(halacha): crash-safe incremental extraction + resume (A + resume)
Halacha extraction held ALL chunk results in memory and stored them once at the very end, so a crash/interrupt mid-run (e.g. the 2026-05-31 freeze) lost everything and re-paid the full LLM cost on retry. Now each chunk's halachot are stored AND the chunk is checkpointed (precedent_chunks.halacha_extracted_at) the moment it finishes:
- V25 schema: precedent_chunks.halacha_extracted_at (per-chunk checkpoint).
- db.store_halachot_for_chunk: atomic per-chunk insert (halacha_index continues from MAX; the caller serializes via an in-process store-lock) + checkpoint mark.
- db.reset_halacha_extraction (force) / mark_all_chunks_extracted (legacy backfill).
- _extract_impl rewritten: resume by default (skip checkpointed chunks; failed chunks stay pending and are retried; status stays 'processing' until all are done); force=True wipes + redoes all. reextract_halachot passes force=True; the queue drain (process_pending) resumes by default.
- Legacy guard: a pre-V25 precedent (halachot exist, no checkpoints) is backfilled and treated as complete — never re-extracted (that would duplicate).

Verified on 9002-24 (55 halachot, legacy): resume → legacy-backfill, NO duplication (stays 55), all chunks checkpointed. Index continuation: store at 55,56 after max 54, no collision.

Tracks #72.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -1149,6 +1149,14 @@ CREATE TABLE IF NOT EXISTS halacha_citation_corroboration (
|
||||
CREATE INDEX IF NOT EXISTS idx_hcc_halacha ON halacha_citation_corroboration(halacha_id);
|
||||
"""
|
||||
|
||||
SCHEMA_V25_SQL = """
|
||||
-- Crash-safe halacha extraction: per-chunk checkpoint enables incremental store
|
||||
-- + resume. A chunk with halacha_extracted_at set has been processed; a resumed
|
||||
-- run skips it (so a crash never loses completed chunks or re-pays for them).
|
||||
ALTER TABLE precedent_chunks
|
||||
ADD COLUMN IF NOT EXISTS halacha_extracted_at TIMESTAMPTZ;
|
||||
"""
|
||||
|
||||
|
||||
async def _run_schema_migrations(pool: asyncpg.Pool) -> None:
|
||||
async with pool.acquire() as conn:
|
||||
@@ -1177,7 +1185,8 @@ async def _run_schema_migrations(pool: asyncpg.Pool) -> None:
|
||||
await conn.execute(SCHEMA_V22_SQL)
|
||||
await conn.execute(SCHEMA_V23_SQL)
|
||||
await conn.execute(SCHEMA_V24_SQL)
|
||||
logger.info("Database schema initialized (v1-v24)")
|
||||
await conn.execute(SCHEMA_V25_SQL)
|
||||
logger.info("Database schema initialized (v1-v25)")
|
||||
|
||||
|
||||
async def init_schema() -> None:
|
||||
@@ -3199,7 +3208,8 @@ async def list_precedent_chunks(
|
||||
pool = await get_pool()
|
||||
if section_types:
|
||||
rows = await pool.fetch(
|
||||
"""SELECT id, chunk_index, content, section_type, page_number
|
||||
"""SELECT id, chunk_index, content, section_type, page_number,
|
||||
halacha_extracted_at
|
||||
FROM precedent_chunks
|
||||
WHERE case_law_id = $1 AND section_type = ANY($2::text[])
|
||||
ORDER BY chunk_index""",
|
||||
@@ -3207,7 +3217,8 @@ async def list_precedent_chunks(
|
||||
)
|
||||
else:
|
||||
rows = await pool.fetch(
|
||||
"""SELECT id, chunk_index, content, section_type, page_number
|
||||
"""SELECT id, chunk_index, content, section_type, page_number,
|
||||
halacha_extracted_at
|
||||
FROM precedent_chunks
|
||||
WHERE case_law_id = $1
|
||||
ORDER BY chunk_index""",
|
||||
@@ -3280,6 +3291,89 @@ async def store_halachot(case_law_id: UUID, halachot: list[dict]) -> int:
|
||||
return len(halachot)
|
||||
|
||||
|
||||
async def reset_halacha_extraction(case_law_id: UUID) -> None:
    """Prepare a precedent for a forced, from-scratch halacha extraction.

    Deletes every ``halachot`` row of the precedent and sets
    ``halacha_extracted_at`` back to NULL on all of its chunks. Both
    statements run inside one transaction, so the halachot and the
    checkpoints are cleared together or not at all.

    Args:
        case_law_id: Precedent whose halachot and chunk checkpoints are wiped.
    """
    delete_halachot_sql = "DELETE FROM halachot WHERE case_law_id = $1"
    clear_checkpoints_sql = (
        "UPDATE precedent_chunks SET halacha_extracted_at = NULL "
        "WHERE case_law_id = $1"
    )
    pool = await get_pool()
    async with pool.acquire() as connection, connection.transaction():
        await connection.execute(delete_halachot_sql, case_law_id)
        await connection.execute(clear_checkpoints_sql, case_law_id)
|
||||
|
||||
|
||||
async def mark_all_chunks_extracted(case_law_id: UUID) -> int:
    """Set the extraction checkpoint on every chunk that does not have one yet.

    Intended as a backfill for precedents extracted before the per-chunk
    checkpoint column existed: their halachot are already stored, so marking
    the chunks lets a resume run skip them rather than extract (and
    duplicate) them again.

    Args:
        case_law_id: Precedent whose un-checkpointed chunks are marked.

    Returns:
        The row count parsed from the last token of the command status
        returned by ``pool.execute`` (e.g. ``"UPDATE 3"`` -> 3), or 0 when the
        status is empty or its last token is not an integer.
    """
    pool = await get_pool()
    status = await pool.execute(
        "UPDATE precedent_chunks SET halacha_extracted_at = now() "
        "WHERE case_law_id = $1 AND halacha_extracted_at IS NULL",
        case_law_id,
    )
    tokens = status.split()
    if not tokens:
        # Empty status: nothing to parse, report zero rows.
        return 0
    try:
        return int(tokens[-1])
    except ValueError:
        return 0
|
||||
|
||||
|
||||
async def store_halachot_for_chunk(
    case_law_id: UUID, chunk_id: UUID, halachot: list[dict],
) -> int:
    """Store the halachot of a single chunk and checkpoint that chunk.

    The inserts and the checkpoint update share one transaction, so a chunk
    is never flagged as extracted without its halachot (or vice versa). New
    rows get ``halacha_index`` values starting right after the precedent's
    current maximum (0 when it has none). The chunk's
    ``halacha_extracted_at`` is set even when ``halachot`` is empty, so a
    resumed run also skips chunks that yielded nothing.

    The MAX read is not protected against concurrent writers here; the
    caller is expected to serialize calls (a single in-process store-lock).

    Args:
        case_law_id: Precedent that owns the halachot.
        chunk_id: ``precedent_chunks.id`` of the chunk to checkpoint.
        halachot: Extracted halacha dicts. ``rule_statement`` and
            ``supporting_quote`` are required keys; every other field falls
            back to a default.

    Returns:
        ``len(halachot)``.

    Raises:
        KeyError: If an item lacks ``rule_statement`` or ``supporting_quote``.
    """
    threshold = config.HALACHA_AUTO_APPROVE_THRESHOLD
    pool = await get_pool()
    async with pool.acquire() as connection, connection.transaction():
        next_index = await connection.fetchval(
            "SELECT COALESCE(MAX(halacha_index), -1) + 1 FROM halachot "
            "WHERE case_law_id = $1", case_law_id,
        )
        for offset, item in enumerate(halachot):
            confidence = float(item.get("confidence", 0.0))
            if confidence >= threshold:
                # Confident enough: approve immediately and stamp the review.
                review_status = "approved"
                reviewer = f"auto-approved (confidence ≥ {threshold:.2f})"
                reviewed_at_sql = "now()"
            else:
                review_status = "pending_review"
                reviewer = None
                reviewed_at_sql = "NULL"
            # reviewed_at_sql is one of two fixed SQL fragments chosen above,
            # never caller data, so interpolating it into the statement is safe.
            await connection.execute(
                f"""INSERT INTO halachot
                    (case_law_id, halacha_index, rule_statement, rule_type,
                     reasoning_summary, supporting_quote, page_reference,
                     practice_areas, subject_tags, cites, confidence,
                     quote_verified, embedding, review_status,
                     reviewer, reviewed_at)
                   VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11,
                           $12, $13, $14, $15, {reviewed_at_sql})""",
                case_law_id,
                next_index + offset,
                item["rule_statement"],
                item.get("rule_type", "binding"),
                item.get("reasoning_summary", ""),
                item["supporting_quote"],
                item.get("page_reference", ""),
                item.get("practice_areas", []),
                item.get("subject_tags", []),
                item.get("cites", []),
                confidence,
                item.get("quote_verified", False),
                item.get("embedding"),
                review_status,
                reviewer,
            )
        # Checkpoint inside the same transaction as the inserts.
        await connection.execute(
            "UPDATE precedent_chunks SET halacha_extracted_at = now() "
            "WHERE id = $1", chunk_id,
        )
    return len(halachot)
|
||||
|
||||
|
||||
async def list_halachot(
|
||||
case_law_id: UUID | None = None,
|
||||
review_status: str | None = None,
|
||||
|
||||
Reference in New Issue
Block a user