feat(corpus): corpus redesign — eliminate halacha queue, verified-by-citation layer, rank-at-retrieval (#153)
All checks were successful
G12 Leak-Guard / leak-guard (pull_request) Successful in 5s
Lint — undefined names / undefined-names (pull_request) Successful in 12s

Implements chaim's 2026-06-20 directive (5 steps; step 6 deferred):
1. No review queue — HALACHA_NO_REVIEW_QUEUE=true (auto-approve all → background);
   migration cleared 2,416 pending_review → approved.
2. Verified layer — halachot.verified/cite_count from chair citations
   (db.refresh_verified_layer + scripts/build_verified_layer.py runs citator on
   ALL committee decisions). 2,775 verified / 137 precedents.
3. Retrieval ranks verified ≫ background — HALACHA_VERIFIED_BOOST in both semantic
   + lexical halacha queries; filter now includes background (<> rejected).
5. Disabled destructive panel cap/novelty — HALACHA_PANEL_REGIME_ENABLED=false
   (8508/1049/1200 proved it lost 22-30 genuine principles incl. Lustrenik).
4. Ingest contract — going-forward already queues metadata; backfill_practice_area.py
   + 206 re-queued to the metadata drain.

Source of truth: docs/precedent-corpus-redesign/00-final-synthesis.md. Quality flags
are 97% false-positive (nli-audit), so they no longer gate approval. UI queue removal →
Claude Design gate. 429 tests green (no regressions).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-06-20 13:55:00 +00:00
parent afe6894441
commit b9fa74b875
6 changed files with 255 additions and 11 deletions

View File

@@ -1669,6 +1669,20 @@ ALTER TABLE halachot
CREATE INDEX IF NOT EXISTS idx_halachot_gold ON halachot(gold_chair, gold_digest)
WHERE gold_chair OR gold_digest;
-- Corpus redesign (#153, chaim 2026-06-20: "trusted = citation, not review").
-- Two-layer retrieval model:
-- • verified = the principle's SOURCE precedent was actually cited by a chair
-- (precedent_internal_citations). The ONLY trust signal — never
-- from human review (the halacha review queue is eliminated).
-- • cite_count = # distinct chair decisions citing the source precedent → the
-- importance/ranking signal (verified ≫ background at retrieval).
-- Refreshed by db.refresh_verified_layer() (scripts/build_verified_layer.py), and
-- grows automatically as new chair decisions are ingested (active-learning).
ALTER TABLE halachot
ADD COLUMN IF NOT EXISTS verified BOOLEAN NOT NULL DEFAULT false,
ADD COLUMN IF NOT EXISTS cite_count INT NOT NULL DEFAULT 0;
CREATE INDEX IF NOT EXISTS idx_halachot_verified ON halachot(verified) WHERE verified;
-- halacha_citation_corroboration (X11) gains canonical_id so the signal
-- aggregates at the principle level rather than the per-instance level.
-- Backfill: UPDATE halacha_citation_corroboration SET canonical_id =
@@ -5148,7 +5162,8 @@ async def store_halachot(case_law_id: UUID, halachot: list[dict]) -> int:
async with pool.acquire() as conn:
for i, h in enumerate(halachot):
confidence = float(h.get("confidence", 0.0))
auto_approve = confidence >= threshold
# #153: no review queue → everything is available background (approved).
auto_approve = config.HALACHA_NO_REVIEW_QUEUE or confidence >= threshold
review_status = "approved" if auto_approve else "pending_review"
reviewer = (
f"auto-approved (confidence ≥ {threshold:.2f})"
@@ -5353,7 +5368,9 @@ async def store_halachot_for_chunk(
instance_type = "citation"
confidence = float(h.get("confidence", 0.0))
auto_approve = confidence >= threshold and not flags
# #153: no review queue → everything is available background (approved);
# quality flags become ranking signals, not an approval gate (nli 97% FP).
auto_approve = config.HALACHA_NO_REVIEW_QUEUE or (confidence >= threshold and not flags)
review_status = "approved" if auto_approve else "pending_review"
reviewer = (
f"auto-approved (confidence ≥ {threshold:.2f})"
@@ -6417,6 +6434,38 @@ async def gold_coverage_stats() -> dict:
return dict(row)
async def refresh_verified_layer() -> dict:
    """Recompute the verified/cite_count layer from chair citations (#153).

    'verified' = the principle's SOURCE precedent was cited by a chair (any
    case_law row with source_kind='internal_committee' that appears as the
    citing side in precedent_internal_citations). 'cite_count' = # distinct
    such decisions citing it. This is the ONLY trust signal — never human
    review.

    The recompute is a single diff-style UPDATE: the target state of every
    halacha is derived once, and only rows whose verified/cite_count actually
    differ are written. Re-running with unchanged citations therefore writes
    nothing, and updated_at moves exactly when the layer value of a row
    changes (including when a row loses its verified status).

    Returns:
        {"verified_principles": <# halachot with verified = true>,
         "verified_precedents": <# distinct case_law_id among them>}
    """
    refresh_sql = (
        # cc: one row per cited precedent with the number of distinct
        # committee decisions that cite it (always >= 1).
        "WITH cc AS ("
        "  SELECT pic.cited_case_law_id AS id, "
        "         count(DISTINCT pic.source_case_law_id) AS n "
        "  FROM precedent_internal_citations pic "
        "  JOIN case_law src ON src.id = pic.source_case_law_id "
        "  WHERE src.source_kind='internal_committee' "
        "    AND pic.cited_case_law_id IS NOT NULL "
        "  GROUP BY pic.cited_case_law_id), "
        # want: the desired layer state for EVERY halacha; no match in cc
        # means background (verified=false, cite_count=0).
        "want AS ("
        "  SELECT h.id, (cc.id IS NOT NULL) AS verified, "
        "         COALESCE(cc.n, 0) AS cite_count "
        "  FROM halachot h "
        "  LEFT JOIN cc ON cc.id = h.case_law_id) "
        # Touch only rows whose stored state differs from the desired one.
        "UPDATE halachot h "
        "SET verified=want.verified, cite_count=want.cite_count, updated_at=now() "
        "FROM want "
        "WHERE h.id = want.id "
        "  AND (h.verified IS DISTINCT FROM want.verified "
        "       OR h.cite_count IS DISTINCT FROM want.cite_count)"
    )
    stats_sql = (
        "SELECT count(*) FILTER (WHERE verified) AS vp, "
        "       count(DISTINCT case_law_id) FILTER (WHERE verified) AS vc "
        "FROM halachot"
    )
    pool = await get_pool()
    async with pool.acquire() as conn:
        # Refresh and read the totals in one transaction so the reported
        # numbers describe exactly the state this call produced.
        async with conn.transaction():
            await conn.execute(refresh_sql)
            row = await conn.fetchrow(stats_sql)
    return {"verified_principles": row["vp"], "verified_precedents": row["vc"]}
async def list_canonical_instances(canonical_id: "UUID") -> list[dict]:
"""List all halachot (instances) sharing a canonical_id — used by the UI accordion."""
pool = await get_pool()
@@ -6943,7 +6992,7 @@ async def search_precedent_library_semantic(
"""
pool = await get_pool()
halacha_filters = [
"h.review_status IN ('approved', 'published')",
"h.review_status <> 'rejected'", # #153: include background; rank verified higher
f"cl.source_kind = '{source_kind}'",
"cl.searchable = true",
]
@@ -7007,18 +7056,22 @@ async def search_precedent_library_semantic(
c_params.append(chair_name)
c_idx += 1
# #153: verified (chair-cited) principles float above background.
vboost = (f"(CASE WHEN h.verified THEN {config.HALACHA_VERIFIED_BOOST} ELSE 0 END "
f"+ LEAST(h.cite_count, {config.HALACHA_CITE_BOOST_CAP}) * {config.HALACHA_CITE_BOOST_PER})")
halacha_sql = f"""
SELECT h.id AS halacha_id, h.case_law_id, h.rule_statement,
h.reasoning_summary, h.supporting_quote, h.page_reference,
h.practice_areas, h.subject_tags, h.confidence, h.rule_type,
cl.case_number, cl.case_name, cl.court, cl.date AS decision_date,
cl.precedent_level, cl.chair_name, cl.district,
1 - (h.embedding <=> $1) AS score
h.verified, h.cite_count,
(1 - (h.embedding <=> $1)) + {vboost} AS score
FROM halachot h
JOIN case_law cl ON cl.id = h.case_law_id
WHERE {' AND '.join(halacha_filters)}
AND h.embedding IS NOT NULL
ORDER BY h.embedding <=> $1
ORDER BY score DESC
LIMIT $2
"""
@@ -7182,7 +7235,7 @@ async def search_precedent_library_lexical(
pool = await get_pool()
halacha_filters = [
"h.review_status IN ('approved', 'published')",
"h.review_status <> 'rejected'", # #153: include background; rank verified higher
f"cl.source_kind = '{source_kind}'",
"cl.searchable = true",
]
@@ -7247,18 +7300,23 @@ async def search_precedent_library_lexical(
c_params.append(chair_name)
c_idx += 1
# #153: verified (chair-cited) principles float above background.
vboost = (f"(CASE WHEN h.verified THEN {config.HALACHA_VERIFIED_BOOST} ELSE 0 END "
f"+ LEAST(h.cite_count, {config.HALACHA_CITE_BOOST_CAP}) * {config.HALACHA_CITE_BOOST_PER})")
halacha_sql = f"""
SELECT h.id AS halacha_id, h.case_law_id, h.rule_statement,
h.reasoning_summary, h.supporting_quote, h.page_reference,
h.practice_areas, h.subject_tags, h.confidence, h.rule_type,
cl.case_number, cl.case_name, cl.court, cl.date AS decision_date,
cl.precedent_level, cl.chair_name, cl.district,
h.verified, h.cite_count,
GREATEST(
ts_rank_cd(h.rule_tsv, plainto_tsquery('simple', $1)),
ts_rank_cd(cl.meta_tsv, plainto_tsquery('simple', $1))
)
+ CASE WHEN cl.meta_tsv @@ plainto_tsquery('simple', $1)
THEN 1.0 ELSE 0.0 END AS score
THEN 1.0 ELSE 0.0 END
+ {vboost} AS score
FROM halachot h
JOIN case_law cl ON cl.id = h.case_law_id
WHERE {' AND '.join(halacha_filters)}