Merge pull request 'feat(halachot): canonical principles model — V41 schema + backfill (Phase 1+2)' (#298) from worktree-canonical-halachot into main
This commit was merged in pull request #298.
This commit is contained in:
@@ -1585,6 +1585,90 @@ CREATE INDEX IF NOT EXISTS idx_missing_precedents_citation_norm
|
||||
"""
|
||||
|
||||
|
||||
# ── V41: canonical_halachot ──────────────────────────────────────────
|
||||
# Replaces the equivalent_halachot bidirectional-link model (V28) with a
|
||||
# first-class canonical entity. Instead of recording that halacha A ≡ halacha B,
|
||||
# we now have ONE canonical_halachot row that BOTH A and B point to.
|
||||
#
|
||||
# Each legal PRINCIPLE is defined ONCE here (canonical_statement = LLM-
|
||||
# synthesized abstraction, grounded in source statements per INV-AH). The
|
||||
# per-precedent halachot rows become INSTANCES that link to the canonical and
|
||||
# carry only their own quote, treatment, and context.
|
||||
#
|
||||
# Extraction pipeline change (Phase 3, separate PR): lookup-before-insert —
|
||||
# embed new extraction, cosine-search canonical_halachot (≥0.85); if match,
|
||||
# store a thin 'citation' instance; if not, create new canonical + 'original'
|
||||
# instance. This eliminates per-extraction duplication of the same principle.
|
||||
#
|
||||
# INV-DM7: authority (binding/persuasive) derived from
|
||||
# first_established_in.precedent_level — never stored on canonical.
|
||||
# INV-G10: only 'published' canonicals reach drafting agents.
|
||||
# INV-AH: canonical_statement grounded in source statements, never invented;
|
||||
# review_status='pending_synthesis' until chair verifies.
|
||||
# G2: equivalent_halachot (V28) deprecated post-backfill (no parallel path).
|
||||
# V41 DDL: canonical_halachot table, halachot linkage columns, and the
# canonical_id column on halacha_citation_corroboration.
#
# NOTE(review): halacha_citation_corroboration.canonical_id is declared with
# ON DELETE SET NULL rather than CASCADE. With CASCADE, deleting a canonical
# principle silently deleted every corroboration row pointing at it, even
# though the underlying halachot row survives (its own canonical_id is
# SET NULL) and halacha_id is meant to be kept for the audit trail.
# Databases that already applied the earlier V41 keep the old constraint,
# because ADD COLUMN IF NOT EXISTS is a no-op there; they need a follow-up
# migration to swap the foreign key.
SCHEMA_V41_SQL = """
-- One row per unique legal principle across all precedents.
CREATE TABLE IF NOT EXISTS canonical_halachot (
    id                   UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    canonical_statement  TEXT NOT NULL,
    rule_type            TEXT NOT NULL DEFAULT 'interpretive',
    practice_areas       TEXT[] NOT NULL DEFAULT '{}',
    subject_tags         TEXT[] NOT NULL DEFAULT '{}',
    embedding            vector(1024),
    review_status        TEXT NOT NULL DEFAULT 'pending_synthesis'
        CHECK (review_status IN
            ('pending_synthesis','pending_review','approved','published','rejected')),
    first_established_in UUID REFERENCES case_law(id) ON DELETE SET NULL,
    instance_count       INT NOT NULL DEFAULT 0,
    created_at           TIMESTAMPTZ NOT NULL DEFAULT now(),
    updated_at           TIMESTAMPTZ NOT NULL DEFAULT now()
);
CREATE INDEX IF NOT EXISTS idx_canonical_halachot_status
    ON canonical_halachot(review_status);
CREATE INDEX IF NOT EXISTS idx_canonical_halachot_practice
    ON canonical_halachot USING gin(practice_areas);
CREATE INDEX IF NOT EXISTS idx_canonical_halachot_tags
    ON canonical_halachot USING gin(subject_tags);
CREATE INDEX IF NOT EXISTS idx_canonical_halachot_vec
    ON canonical_halachot USING ivfflat (embedding vector_cosine_ops)
    WITH (lists = 30);

-- halachot: canonical linkage + role columns.
-- canonical_id: NULL until backfill_canonical_halachot.py runs; 100% filled after.
-- instance_type: role of this precedent's mention of the principle.
--   'original' = the precedent that FIRST established the principle (source)
--   'citation' = a later precedent that cites/applies the principle
--   'application'= a later precedent that applies the principle to new facts
-- treatment: how this precedent's mention relates to the canonical principle.
--   Parallels halacha_citation_corroboration.treatment (X11) but for precedents
--   (X11 tracks citations from internal decisions; this tracks per-precedent treatment).
-- rule_statement + embedding become nullable: citation instances inherit these
--   from canonical_halachot. 'original' instances keep their own stored values.
ALTER TABLE halachot
    ADD COLUMN IF NOT EXISTS canonical_id UUID
        REFERENCES canonical_halachot(id) ON DELETE SET NULL,
    ADD COLUMN IF NOT EXISTS instance_type TEXT NOT NULL DEFAULT 'original'
        CHECK (instance_type IN ('original','citation','application')),
    ADD COLUMN IF NOT EXISTS treatment TEXT NOT NULL DEFAULT 'mentioned';
ALTER TABLE halachot ALTER COLUMN rule_statement DROP NOT NULL;
ALTER TABLE halachot ALTER COLUMN embedding DROP NOT NULL;
CREATE INDEX IF NOT EXISTS idx_halachot_canonical ON halachot(canonical_id);
CREATE INDEX IF NOT EXISTS idx_halachot_instance_type ON halachot(instance_type);

-- halacha_citation_corroboration (X11) gains canonical_id so the signal
-- aggregates at the principle level rather than the per-instance level.
-- Backfill: UPDATE halacha_citation_corroboration SET canonical_id =
--   (SELECT canonical_id FROM halachot WHERE id = halacha_id).
-- halacha_id is retained for audit trail.
-- The foreign key nulls the link (it does not cascade) so that removing a
-- canonical never deletes the corroboration row itself.
ALTER TABLE halacha_citation_corroboration
    ADD COLUMN IF NOT EXISTS canonical_id UUID
        REFERENCES canonical_halachot(id) ON DELETE SET NULL;
CREATE INDEX IF NOT EXISTS idx_hcc_canonical
    ON halacha_citation_corroboration(canonical_id)
    WHERE canonical_id IS NOT NULL;
"""
|
||||
|
||||
|
||||
# Stable, arbitrary key for the session-level advisory lock that serialises
|
||||
# schema DDL across processes. Every short-lived process (cron drains, services)
|
||||
# re-runs the idempotent migrations on startup; without this lock two processes
|
||||
@@ -1602,7 +1686,7 @@ async def _run_schema_migrations(pool: asyncpg.Pool) -> None:
|
||||
await _apply_schema_ddl(conn)
|
||||
finally:
|
||||
await conn.execute("SELECT pg_advisory_unlock($1)", _MIGRATION_LOCK_KEY)
|
||||
logger.info("Database schema initialized (v1-v40)")
|
||||
logger.info("Database schema initialized (v1-v41)")
|
||||
|
||||
|
||||
async def _apply_schema_ddl(conn: asyncpg.Connection) -> None:
|
||||
@@ -1647,6 +1731,7 @@ async def _apply_schema_ddl(conn: asyncpg.Connection) -> None:
|
||||
await conn.execute(SCHEMA_V38_SQL)
|
||||
await conn.execute(SCHEMA_V39_SQL)
|
||||
await conn.execute(SCHEMA_V40_SQL)
|
||||
await conn.execute(SCHEMA_V41_SQL)
|
||||
|
||||
|
||||
async def init_schema() -> None:
|
||||
@@ -5764,12 +5849,20 @@ async def store_corroboration(
|
||||
s_id = _UUID(source_id) if isinstance(source_id, str) else source_id
|
||||
cl_id = _UUID(citing_case_law_id) if (citing_case_law_id and isinstance(citing_case_law_id, str)) else citing_case_law_id
|
||||
d_id = _UUID(citing_decision_id) if (citing_decision_id and isinstance(citing_decision_id, str)) else citing_decision_id
|
||||
# INSERT ... SELECT so we can pull canonical_id from halachot in one round-trip.
|
||||
# canonical_id is NULL until backfill_canonical_halachot.py runs; COALESCE keeps
|
||||
# existing canonical_id on conflict so a pre-backfill row is upgraded when the
|
||||
# same corroboration is re-stored post-backfill.
|
||||
await pool.execute(
|
||||
"INSERT INTO halacha_citation_corroboration "
|
||||
"(halacha_id, citing_case_law_id, citing_decision_id, source_citation_id, treatment, match_score, match_context) "
|
||||
"VALUES ($1,$2,$3,$4,$5,$6,$7) "
|
||||
"(halacha_id, canonical_id, citing_case_law_id, citing_decision_id, "
|
||||
" source_citation_id, treatment, match_score, match_context) "
|
||||
"SELECT $1, h.canonical_id, $2, $3, $4, $5, $6, $7 "
|
||||
"FROM halachot h WHERE h.id = $1 "
|
||||
"ON CONFLICT (halacha_id, source_citation_id) DO UPDATE SET "
|
||||
"treatment=EXCLUDED.treatment, match_score=EXCLUDED.match_score",
|
||||
"treatment=EXCLUDED.treatment, match_score=EXCLUDED.match_score, "
|
||||
"canonical_id=COALESCE(EXCLUDED.canonical_id, "
|
||||
" halacha_citation_corroboration.canonical_id)",
|
||||
h_id, cl_id, d_id, s_id, treatment, score, context,
|
||||
)
|
||||
|
||||
@@ -5846,6 +5939,101 @@ async def list_equivalent_for_halacha(halacha_id: UUID) -> list[dict]:
|
||||
]
|
||||
|
||||
|
||||
# ── Canonical halachot (V41) ─────────────────────────────────────────────────
|
||||
|
||||
async def create_canonical_halacha(
    statement: str,
    rule_type: str = "interpretive",
    practice_areas: list[str] | None = None,
    subject_tags: list[str] | None = None,
    embedding: list[float] | None = None,
    first_established_in: "UUID | None" = None,
    review_status: str = "pending_synthesis",
) -> "UUID":
    """Create one ``canonical_halachot`` row and hand back its id.

    Args:
        statement: Text stored in ``canonical_statement``.
        rule_type: Value for the ``rule_type`` column.
        practice_areas: Stored in ``practice_areas``; ``None`` or an empty
            list is written as an empty array.
        subject_tags: Stored in ``subject_tags``; ``None`` or an empty list
            is written as an empty array.
        embedding: Value for the ``embedding`` column, or ``None``.
        first_established_in: ``case_law`` id of the establishing precedent,
            or ``None``.
        review_status: Initial value for the ``review_status`` column.

    Returns:
        The ``id`` produced by the INSERT's ``RETURNING`` clause.
    """
    insert_sql = (
        "INSERT INTO canonical_halachot "
        "(canonical_statement, rule_type, practice_areas, subject_tags, "
        " embedding, first_established_in, review_status) "
        "VALUES ($1,$2,$3,$4,$5,$6,$7) RETURNING id"
    )
    # The array columns are NOT NULL, so a missing list becomes an empty one.
    areas = practice_areas if practice_areas else []
    tags = subject_tags if subject_tags else []

    db = await get_pool()
    inserted = await db.fetchrow(
        insert_sql,
        statement,
        rule_type,
        areas,
        tags,
        embedding,
        first_established_in,
        review_status,
    )
    return inserted["id"]
|
||||
|
||||
|
||||
async def nearest_canonical_halacha(
    vec: list[float],
    threshold: float = 0.85,
    status_filter: tuple[str, ...] = ("approved", "published"),
) -> "tuple[str, float] | None":
    """Find the closest canonical principle to ``vec`` by cosine similarity.

    Only rows that have an embedding and whose ``review_status`` is one of
    ``status_filter`` are considered. The single nearest row is fetched and
    then checked against ``threshold``.

    Intended for the extractor's lookup-before-insert step (Phase 3), which
    needs to know whether a freshly extracted principle is already present
    in the registry.

    Args:
        vec: Query embedding.
        threshold: Minimum cosine similarity (inclusive) for a match.
        status_filter: Review statuses eligible for matching.

    Returns:
        ``(canonical_id, similarity)`` with the id as text, or ``None`` when
        no eligible row exists or the best one falls below ``threshold``.
    """
    lookup_sql = (
        "SELECT id::text AS id, 1 - (embedding <=> $1) AS sim "
        "FROM canonical_halachot "
        "WHERE embedding IS NOT NULL AND review_status = ANY($2::text[]) "
        "ORDER BY embedding <=> $1 LIMIT 1"
    )
    db = await get_pool()
    best = await db.fetchrow(lookup_sql, vec, list(status_filter))
    if best:
        similarity = float(best["sim"])
        # The nearest row is the only candidate: if it misses the bar,
        # nothing else can clear it.
        if similarity >= threshold:
            return (best["id"], similarity)
    return None
|
||||
|
||||
|
||||
async def refresh_canonical_instance_count(canonical_id: "UUID") -> None:
    """Recompute ``instance_count`` for one canonical principle.

    Counts the ``halachot`` rows whose ``canonical_id`` equals the given id,
    writes that number to the canonical row, and sets ``updated_at`` to
    ``now()``, all in a single UPDATE statement.

    Args:
        canonical_id: Id of the ``canonical_halachot`` row to refresh.
    """
    recount_sql = (
        "UPDATE canonical_halachot SET "
        "instance_count = (SELECT COUNT(*) FROM halachot WHERE canonical_id = $1), "
        "updated_at = now() "
        "WHERE id = $1"
    )
    db = await get_pool()
    await db.execute(recount_sql, canonical_id)
|
||||
|
||||
|
||||
async def get_canonical_halacha(canonical_id: "UUID") -> "dict | None":
    """Load a canonical principle together with the halachot that link to it.

    Args:
        canonical_id: Id of the ``canonical_halachot`` row to load.

    Returns:
        ``None`` when no canonical row has this id. Otherwise a dict of the
        selected canonical columns (including ``first_established_case``,
        the ``case_number`` of the establishing precedent when one is linked)
        plus an ``instances`` key: a list of dicts, one per linked halachot
        row that joins to ``case_law``, ordered by instance type and then
        case number.
    """
    principle_sql = (
        "SELECT ch.id::text, ch.canonical_statement, ch.rule_type, "
        " ch.practice_areas, ch.subject_tags, ch.review_status, "
        " ch.instance_count, ch.created_at, ch.updated_at, "
        " cl.case_number AS first_established_case "
        "FROM canonical_halachot ch "
        "LEFT JOIN case_law cl ON cl.id = ch.first_established_in "
        "WHERE ch.id = $1"
    )
    instances_sql = (
        "SELECT h.id::text, h.instance_type, h.treatment, h.supporting_quote, "
        " h.page_reference, h.review_status AS instance_status, "
        " cl.case_number, cl.case_name "
        "FROM halachot h JOIN case_law cl ON cl.id = h.case_law_id "
        "WHERE h.canonical_id = $1 ORDER BY h.instance_type, cl.case_number"
    )

    db = await get_pool()
    principle = await db.fetchrow(principle_sql, canonical_id)
    if not principle:
        return None

    # Instances are fetched only once the canonical row is known to exist.
    instance_rows = await db.fetch(instances_sql, canonical_id)
    result = dict(principle)
    result["instances"] = [dict(record) for record in instance_rows]
    return result
|
||||
|
||||
|
||||
async def _annotate_equivalents(pool, out: list[dict]) -> None:
|
||||
"""Attach an `equivalents` list to each row (#84.2) — parallel-authority links.
|
||||
|
||||
|
||||
Reference in New Issue
Block a user