FU-2a: idempotent ingest + write-time normalization + searchable flag (GAP-03/06/13) #12
@@ -1094,6 +1094,18 @@ CREATE INDEX IF NOT EXISTS idx_case_law_meta_tsv
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
# ── V21: explicit `searchable` flag (GAP-13 / INV-DM1) ─────────────
|
||||||
|
# Materialized completeness flag — a case_law row is exposed to search only
|
||||||
|
# when it satisfies the completeness contract (02-data-model §2a). Recomputed
|
||||||
|
# on ingest/metadata completion via recompute_searchable(); not inferred at
|
||||||
|
# query time. Default false so a freshly-inserted row is excluded until proven
|
||||||
|
# complete. Health-check surfaces count(*) FILTER (WHERE NOT searchable).
|
||||||
|
SCHEMA_V21_SQL = """
|
||||||
|
ALTER TABLE case_law ADD COLUMN IF NOT EXISTS searchable boolean NOT NULL DEFAULT false;
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_case_law_searchable ON case_law (searchable);
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
async def _run_schema_migrations(pool: asyncpg.Pool) -> None:
|
async def _run_schema_migrations(pool: asyncpg.Pool) -> None:
|
||||||
async with pool.acquire() as conn:
|
async with pool.acquire() as conn:
|
||||||
await conn.execute(SCHEMA_SQL)
|
await conn.execute(SCHEMA_SQL)
|
||||||
@@ -1117,7 +1129,8 @@ async def _run_schema_migrations(pool: asyncpg.Pool) -> None:
|
|||||||
await conn.execute(SCHEMA_V18_SQL)
|
await conn.execute(SCHEMA_V18_SQL)
|
||||||
await conn.execute(SCHEMA_V19_SQL)
|
await conn.execute(SCHEMA_V19_SQL)
|
||||||
await conn.execute(SCHEMA_V20_SQL)
|
await conn.execute(SCHEMA_V20_SQL)
|
||||||
logger.info("Database schema initialized (v1-v20)")
|
await conn.execute(SCHEMA_V21_SQL)
|
||||||
|
logger.info("Database schema initialized (v1-v21)")
|
||||||
|
|
||||||
|
|
||||||
async def init_schema() -> None:
|
async def init_schema() -> None:
|
||||||
@@ -2706,6 +2719,68 @@ async def create_internal_committee_decision(
|
|||||||
return _row_to_case_law(row)
|
return _row_to_case_law(row)
|
||||||
|
|
||||||
|
|
||||||
|
def _compute_searchable(row: dict, has_embedded_chunk: bool) -> bool:
|
||||||
|
"""Completeness contract (INV-DM1 / 02-data-model §2a).
|
||||||
|
|
||||||
|
A row is searchable IFF: canonical id present · case_name/practice_area/
|
||||||
|
source_kind present · ≥1 chunk with a non-null embedding · extraction
|
||||||
|
completed · metadata non-empty (≥1 of headnote/summary/subject_tags).
|
||||||
|
Pure — `has_embedded_chunk` is supplied by the caller (cross-table check).
|
||||||
|
"""
|
||||||
|
if not has_embedded_chunk:
|
||||||
|
return False
|
||||||
|
if (row.get("extraction_status") or "") != "completed":
|
||||||
|
return False
|
||||||
|
if not (row.get("case_number") or "").strip():
|
||||||
|
return False
|
||||||
|
if not (row.get("case_name") or "").strip():
|
||||||
|
return False
|
||||||
|
if not (row.get("practice_area") or "").strip():
|
||||||
|
return False
|
||||||
|
if not (row.get("source_kind") or "").strip():
|
||||||
|
return False
|
||||||
|
tags = row.get("subject_tags") or []
|
||||||
|
has_meta = bool((row.get("headnote") or "").strip()) \
|
||||||
|
or bool((row.get("summary") or "").strip()) \
|
||||||
|
or (len(tags) > 0)
|
||||||
|
return has_meta
|
||||||
|
|
||||||
|
|
||||||
|
async def recompute_searchable(case_law_id: "UUID | str | None" = None) -> int:
    """Recompute and persist the `searchable` flag. Idempotent / reversible.

    Each selected case_law row is evaluated with ``_compute_searchable`` and
    the result is written back to its ``searchable`` column. The flag can
    flip in either direction, so running this again after data changes
    brings the column back in line with the completeness contract.

    Args:
        case_law_id: Row to recompute, as a ``UUID`` or anything whose
            ``str()`` parses as one. If None, ALL rows are recomputed
            (used by the V21 backfill and the dry-run).

    Returns:
        The number of processed rows whose computed flag is true.

    Raises:
        ValueError: If ``case_law_id`` is not None and cannot be parsed as
            a UUID.
    """
    # The embedded-chunk check is folded into the row query so a full
    # recompute costs one SELECT instead of one extra round trip per row.
    select_sql = (
        "SELECT c.*, EXISTS(SELECT 1 FROM precedent_chunks p "
        "WHERE p.case_law_id = c.id AND p.embedding IS NOT NULL) "
        "AS has_embedded_chunk FROM case_law c"
    )
    # IS DISTINCT FROM skips rows whose stored flag already matches, so an
    # unchanged row is not rewritten on every recompute.
    update_sql = (
        "UPDATE case_law SET searchable = $2 "
        "WHERE id = $1 AND searchable IS DISTINCT FROM $2"
    )

    pool = await get_pool()
    async with pool.acquire() as conn:
        if case_law_id is not None:
            cid = case_law_id if isinstance(case_law_id, UUID) else UUID(str(case_law_id))
            rows = await conn.fetch(select_sql + " WHERE c.id = $1", cid)
        else:
            rows = await conn.fetch(select_sql)

        n_true = 0
        for r in rows:
            row = dict(r)
            # Remove the helper column so the predicate sees only table data.
            has_chunk = bool(row.pop("has_embedded_chunk", False))

            # subject_tags may come back as JSON text; decode it so that an
            # encoded empty list is not mistaken for real tags.
            tags = row.get("subject_tags")
            if isinstance(tags, str):
                try:
                    tags = json.loads(tags)
                except (ValueError, TypeError):
                    tags = []
            row["subject_tags"] = tags or []

            val = _compute_searchable(row, has_chunk)
            await conn.execute(update_sql, row["id"], val)
            if val:
                n_true += 1
    return n_true
|
||||||
|
|
||||||
|
|
||||||
async def update_case_law(case_law_id: UUID, **fields) -> dict | None:
|
async def update_case_law(case_law_id: UUID, **fields) -> dict | None:
|
||||||
"""Patch metadata fields on a case_law row.
|
"""Patch metadata fields on a case_law row.
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user