FU-2a: idempotent ingest + write-time normalization + searchable flag (GAP-03/06/13) #12

Merged
chaim merged 9 commits from fix/fu2a-idempotent-ingest into main 2026-05-30 21:06:33 +00:00
Showing only changes of commit 4b8bbc3794 - Show all commits

View File

@@ -1094,6 +1094,18 @@ CREATE INDEX IF NOT EXISTS idx_case_law_meta_tsv
""" """
# ── V21: explicit `searchable` flag (GAP-13 / INV-DM1) ─────────────
# Materialized completeness flag: a case_law row is exposed to search only
# when it satisfies the completeness contract (02-data-model §2a). The flag is
# recomputed on ingest / metadata completion via recompute_searchable(); it is
# stored, not inferred at query time.
#
# The column defaults to false, so a freshly inserted row stays excluded until
# it is proven complete. Health-check surfaces
# count(*) FILTER (WHERE NOT searchable).
#
# Both statements use IF NOT EXISTS, so executing this migration again on an
# already-migrated database is a no-op.
SCHEMA_V21_SQL = """
ALTER TABLE case_law ADD COLUMN IF NOT EXISTS searchable boolean NOT NULL DEFAULT false;
CREATE INDEX IF NOT EXISTS idx_case_law_searchable ON case_law (searchable);
"""
async def _run_schema_migrations(pool: asyncpg.Pool) -> None: async def _run_schema_migrations(pool: asyncpg.Pool) -> None:
async with pool.acquire() as conn: async with pool.acquire() as conn:
await conn.execute(SCHEMA_SQL) await conn.execute(SCHEMA_SQL)
@@ -1117,7 +1129,8 @@ async def _run_schema_migrations(pool: asyncpg.Pool) -> None:
await conn.execute(SCHEMA_V18_SQL) await conn.execute(SCHEMA_V18_SQL)
await conn.execute(SCHEMA_V19_SQL) await conn.execute(SCHEMA_V19_SQL)
await conn.execute(SCHEMA_V20_SQL) await conn.execute(SCHEMA_V20_SQL)
logger.info("Database schema initialized (v1-v20)") await conn.execute(SCHEMA_V21_SQL)
logger.info("Database schema initialized (v1-v21)")
async def init_schema() -> None: async def init_schema() -> None:
@@ -2706,6 +2719,68 @@ async def create_internal_committee_decision(
return _row_to_case_law(row) return _row_to_case_law(row)
def _compute_searchable(row: dict, has_embedded_chunk: bool) -> bool:
"""Completeness contract (INV-DM1 / 02-data-model §2a).
A row is searchable IFF: canonical id present · case_name/practice_area/
source_kind present · ≥1 chunk with a non-null embedding · extraction
completed · metadata non-empty (≥1 of headnote/summary/subject_tags).
Pure — `has_embedded_chunk` is supplied by the caller (cross-table check).
"""
if not has_embedded_chunk:
return False
if (row.get("extraction_status") or "") != "completed":
return False
if not (row.get("case_number") or "").strip():
return False
if not (row.get("case_name") or "").strip():
return False
if not (row.get("practice_area") or "").strip():
return False
if not (row.get("source_kind") or "").strip():
return False
tags = row.get("subject_tags") or []
has_meta = bool((row.get("headnote") or "").strip()) \
or bool((row.get("summary") or "").strip()) \
or (len(tags) > 0)
return has_meta
async def recompute_searchable(case_law_id: "UUID | str | None" = None) -> int:
    """Recompute and persist the `searchable` flag. Idempotent / reversible.

    Args:
        case_law_id: Row to recompute, as a UUID or its string form. When
            None, ALL case_law rows are recomputed (used by the V21 backfill
            and the dry-run).

    Returns:
        The number of processed rows that are now searchable=true.

    Raises:
        ValueError: If ``case_law_id`` is a string that is not a valid UUID.
    """
    # The embedded-chunk check is folded into the row fetch, so a full
    # recompute costs one SELECT instead of one EXISTS round trip per row.
    select_sql = (
        "SELECT c.*, EXISTS(SELECT 1 FROM precedent_chunks p "
        "WHERE p.case_law_id = c.id AND p.embedding IS NOT NULL) "
        "AS has_embedded_chunk FROM case_law c"
    )
    pool = await get_pool()
    async with pool.acquire() as conn:
        if case_law_id is not None:
            cid = case_law_id if isinstance(case_law_id, UUID) else UUID(str(case_law_id))
            rows = await conn.fetch(select_sql + " WHERE c.id = $1", cid)
        else:
            rows = await conn.fetch(select_sql)

        n_true = 0
        changed = []  # (id, new_flag) pairs whose stored flag differs
        for r in rows:
            row = dict(r)
            has_chunk = bool(row.pop("has_embedded_chunk"))
            # subject_tags may come back as JSON text; decode it so the
            # completeness check sees a real collection.
            tags = row.get("subject_tags")
            if isinstance(tags, str):
                try:
                    tags = json.loads(tags)
                except (ValueError, TypeError):
                    tags = []
            row["subject_tags"] = tags or []
            val = _compute_searchable(row, has_chunk)
            if val:
                n_true += 1
            # Skip rows that already hold the right value: a re-run then
            # writes nothing, and the backfill touches only rows that flip.
            if row.get("searchable") != val:
                changed.append((row["id"], val))

        if changed:
            # One batched statement instead of one UPDATE round trip per row.
            await conn.executemany(
                "UPDATE case_law SET searchable = $2 WHERE id = $1", changed)
    return n_true
async def update_case_law(case_law_id: UUID, **fields) -> dict | None: async def update_case_law(case_law_id: UUID, **fields) -> dict | None:
"""Patch metadata fields on a case_law row. """Patch metadata fields on a case_law row.