From 4b8bbc3794864881f6c9259e4960739702d2964c Mon Sep 17 00:00:00 2001
From: Chaim
Date: Sat, 30 May 2026 20:46:29 +0000
Subject: [PATCH] feat(data-model): V21 searchable flag + recompute_searchable (GAP-13, FU-2a)

Add SCHEMA_V21_SQL (searchable boolean column + index on case_law), wire
it into _run_schema_migrations, and implement _compute_searchable (pure
predicate) + recompute_searchable (idempotent async backfill/update).
All 5 unit tests pass.

Co-Authored-By: Claude Sonnet 4.6
---
Review notes (ignored by git am; not part of the commit message):
- This patch was re-flowed from a copy whose line breaks had been collapsed.
  The indentation of context and removed lines is inferred; use
  `git apply --ignore-whitespace` if a hunk is rejected.
- recompute_searchable: the embedded-chunk check is folded into the SELECT
  and only rows whose flag changes are written (one executemany batch).
- _compute_searchable: whitespace-only tags no longer count as metadata.
- The blob ids on the `index` line and the "tests pass" statement above
  predate these review changes; re-run the unit tests before merging.

 mcp-server/src/legal_mcp/services/db.py | 100 ++++++++++++++++++++++++-
 1 file changed, 99 insertions(+), 1 deletion(-)

diff --git a/mcp-server/src/legal_mcp/services/db.py b/mcp-server/src/legal_mcp/services/db.py
index 567026b..d2cfedc 100644
--- a/mcp-server/src/legal_mcp/services/db.py
+++ b/mcp-server/src/legal_mcp/services/db.py
@@ -1094,6 +1094,18 @@ CREATE INDEX IF NOT EXISTS idx_case_law_meta_tsv
 """
 
 
+# ── V21: explicit `searchable` flag (GAP-13 / INV-DM1) ─────────────
+# Materialized completeness flag — a case_law row is exposed to search only
+# when it satisfies the completeness contract (02-data-model §2a). Recomputed
+# on ingest/metadata completion via recompute_searchable(); not inferred at
+# query time. Default false so a freshly-inserted row is excluded until proven
+# complete. Health-check surfaces count(*) FILTER (WHERE NOT searchable).
+SCHEMA_V21_SQL = """
+ALTER TABLE case_law ADD COLUMN IF NOT EXISTS searchable boolean NOT NULL DEFAULT false;
+CREATE INDEX IF NOT EXISTS idx_case_law_searchable ON case_law (searchable);
+"""
+
+
 async def _run_schema_migrations(pool: asyncpg.Pool) -> None:
     async with pool.acquire() as conn:
         await conn.execute(SCHEMA_SQL)
@@ -1117,7 +1129,8 @@ async def _run_schema_migrations(pool: asyncpg.Pool) -> None:
         await conn.execute(SCHEMA_V18_SQL)
         await conn.execute(SCHEMA_V19_SQL)
         await conn.execute(SCHEMA_V20_SQL)
-    logger.info("Database schema initialized (v1-v20)")
+        await conn.execute(SCHEMA_V21_SQL)
+    logger.info("Database schema initialized (v1-v21)")
 
 
 async def init_schema() -> None:
@@ -2706,6 +2719,91 @@ async def create_internal_committee_decision(
     return _row_to_case_law(row)
 
 
+def _compute_searchable(row: dict, has_embedded_chunk: bool) -> bool:
+    """Completeness contract (INV-DM1 / 02-data-model §2a).
+
+    A row is searchable IFF: canonical id present · case_name/practice_area/
+    source_kind present · ≥1 chunk with a non-null embedding · extraction
+    completed · metadata non-empty (≥1 of headnote/summary/subject_tags).
+    Pure — `has_embedded_chunk` is supplied by the caller (cross-table check).
+
+    Args:
+        row: case_law column values keyed by column name; missing keys and
+            None count as absent. `subject_tags` should already be decoded
+            to a list (a bare string is treated as a single tag).
+        has_embedded_chunk: whether ≥1 chunk of the case has an embedding.
+
+    Returns:
+        True only when every clause of the contract holds.
+    """
+    if not has_embedded_chunk:
+        return False
+    if (row.get("extraction_status") or "") != "completed":
+        return False
+    # The "canonical id" clause of the contract is checked via case_number.
+    for key in ("case_number", "case_name", "practice_area", "source_kind"):
+        if not (row.get(key) or "").strip():
+            return False
+    for key in ("headnote", "summary"):
+        if (row.get(key) or "").strip():
+            return True
+    tags = row.get("subject_tags") or []
+    if isinstance(tags, str):
+        tags = [tags]
+    # Whitespace-only tags carry no metadata, mirroring the .strip() checks
+    # applied to headnote/summary above.
+    return any(str(tag).strip() for tag in tags if tag is not None)
+
+
+async def recompute_searchable(case_law_id: "UUID | str | None" = None) -> int:
+    """Recompute and persist the `searchable` flag. Idempotent / reversible.
+
+    If case_law_id is None, recompute ALL rows (used by the V21 backfill and
+    the dry-run). Returns the number of rows now marked searchable=true.
+
+    The embedded-chunk check is part of the SELECT (no per-row query) and
+    only rows whose flag actually changes are written, in one executemany
+    batch, so re-running on an up-to-date table issues no UPDATE at all.
+
+    Raises:
+        ValueError: if case_law_id is a string that is not a valid UUID.
+    """
+    query = (
+        "SELECT c.*, EXISTS(SELECT 1 FROM precedent_chunks p "
+        "WHERE p.case_law_id = c.id AND p.embedding IS NOT NULL) "
+        "AS has_embedded_chunk FROM case_law c"
+    )
+    args = []
+    if case_law_id is not None:
+        cid = case_law_id if isinstance(case_law_id, UUID) else UUID(str(case_law_id))
+        query += " WHERE c.id = $1"
+        args.append(cid)
+    pool = await get_pool()
+    async with pool.acquire() as conn:
+        rows = await conn.fetch(query, *args)
+        n_true = 0
+        changed = []
+        for r in rows:
+            row = dict(r)
+            tags = row.get("subject_tags")
+            if isinstance(tags, str):
+                try:
+                    tags = json.loads(tags)
+                except (ValueError, TypeError):
+                    tags = []
+            # Anything that is not a list/tuple after decoding is not a tag list.
+            row["subject_tags"] = tags if isinstance(tags, (list, tuple)) else []
+            val = _compute_searchable(row, bool(row.pop("has_embedded_chunk")))
+            if val:
+                n_true += 1
+            if row.get("searchable") != val:
+                changed.append((row["id"], val))
+        if changed:
+            await conn.executemany(
+                "UPDATE case_law SET searchable = $2 WHERE id = $1", changed)
+    return n_true
+
+
 async def update_case_law(case_law_id: UUID, **fields) -> dict | None:
     """Patch metadata fields on a case_law row.