feat(halacha): strict-rubric quality gate + dedup-on-insert (#81,#82)

Bake the 2026-06-03 strict-cleanup rubric into the extraction pipeline so the
corpus stays clean at the source instead of accumulating duplicates, obiter
dicta, truncated quotes and thin restatements that clog the review queue.

#81 — quality gate:
- New pure module halacha_quality.py with unit-tested validators:
  non-decision/obiter (Wambaugh markers), truncated-quote (mid-word cut),
  thin-restatement (rule≈quote), quote-unverified.
- Validators run in halacha_extractor._process; a non-decision is re-typed as
  obiter; flags persist in the new halachot.quality_flags column.
- Auto-approve now requires confidence>=threshold AND no quality flags;
  flagged items route to pending_review regardless of confidence.
- Both extraction prompts hardened: reject undecided dicta, exclude
  case-specific applications, require abstraction, forbid over-splitting.

#82 — dedup-on-insert (store_halachot_for_chunk):
- Within the same precedent, skip a halacha whose normalized supporting_quote
  already exists, or whose rule-embedding has cosine>=HALACHA_DEDUP_COSINE
  (0.93) against an already-stored one. Makes re-runs idempotent.

Migration: halachot.quality_flags TEXT[] (additive, idempotent ALTER).
Tests: 19 new unit tests; full suite 156 passed. Validated end-to-end against
dev DB (dedup skips dups, flag blocks auto-approve, re-run inserts 0).
Calibration: flags fire on only ~10% of current survivors (low false-positive
rate).

Spec: docs/halacha-strict-rubric.md

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-06-03 12:30:38 +00:00
parent b0ec24a9d5
commit ca959d4a9c
6 changed files with 386 additions and 18 deletions

View File

@@ -14,6 +14,7 @@ import asyncpg
from pgvector.asyncpg import register_vector
from legal_mcp import config
from legal_mcp.services import halacha_quality
logger = logging.getLogger(__name__)
@@ -661,10 +662,14 @@ CREATE TABLE IF NOT EXISTS halachot (
-- pending_review | approved | rejected | published
reviewer TEXT DEFAULT '',
reviewed_at TIMESTAMPTZ,
quality_flags TEXT[] DEFAULT '{}',
-- non_decision | truncated_quote | thin_restatement | quote_unverified
-- (any flag blocks auto-approve → routes to pending_review)
embedding vector(1024),
created_at TIMESTAMPTZ DEFAULT now(),
updated_at TIMESTAMPTZ DEFAULT now()
);
ALTER TABLE halachot ADD COLUMN IF NOT EXISTS quality_flags TEXT[] DEFAULT '{}';
CREATE INDEX IF NOT EXISTS idx_halachot_case_law ON halachot(case_law_id);
CREATE INDEX IF NOT EXISTS idx_halachot_status ON halachot(review_status);
CREATE INDEX IF NOT EXISTS idx_halachot_practice ON halachot USING gin(practice_areas);
@@ -3333,18 +3338,61 @@ async def store_halachot_for_chunk(
across chunks never collide. The chunk is marked even when ``halachot`` is
empty (so resume skips genuinely-empty chunks too). Caller serializes calls
(a single in-process store-lock) so the MAX read stays race-free.
Two gates encode the strict rubric (docs/halacha-strict-rubric.md) so the
corpus stays clean at the source instead of accumulating noise:
* Auto-approve gate — a halacha auto-approves only if confidence ≥ threshold
AND it carries no ``quality_flags`` (non_decision / truncated_quote /
thin_restatement / quote_unverified). Flagged items route to
``pending_review`` regardless of confidence.
* Dedup-on-insert — within the SAME precedent, a halacha is skipped if its
normalized ``supporting_quote`` already exists, or its rule-embedding has
cosine ≥ ``HALACHA_DEDUP_COSINE`` against an already-stored halacha.
Returns the number of halachot actually INSERTED (after dedup skips).
"""
threshold = config.HALACHA_AUTO_APPROVE_THRESHOLD
dedup_distance = 1.0 - config.HALACHA_DEDUP_COSINE # cosine sim → distance
pool = await get_pool()
inserted = 0
skipped = 0
async with pool.acquire() as conn:
async with conn.transaction():
base = await conn.fetchval(
"SELECT COALESCE(MAX(halacha_index), -1) + 1 FROM halachot "
"WHERE case_law_id = $1", case_law_id,
)
for j, h in enumerate(halachot):
# Existing normalized quotes for exact-dedup (incl. within-batch).
existing_quotes = {
halacha_quality.normalize_text(r["supporting_quote"])
for r in await conn.fetch(
"SELECT supporting_quote FROM halachot WHERE case_law_id = $1",
case_law_id,
)
}
for h in halachot:
norm_quote = halacha_quality.normalize_text(h["supporting_quote"])
# 1) exact normalized-quote duplicate within this precedent
if norm_quote and norm_quote in existing_quotes:
skipped += 1
continue
# 2) semantic near-duplicate (rule embedding cosine)
emb = h.get("embedding")
if emb is not None and config.HALACHA_DEDUP_COSINE <= 1.0:
dup = await conn.fetchval(
"SELECT 1 FROM halachot WHERE case_law_id = $1 "
"AND embedding IS NOT NULL AND (embedding <=> $2) <= $3 "
"LIMIT 1",
case_law_id, emb, dedup_distance,
)
if dup:
skipped += 1
continue
confidence = float(h.get("confidence", 0.0))
auto_approve = confidence >= threshold
flags = h.get("quality_flags") or []
auto_approve = confidence >= threshold and not flags
review_status = "approved" if auto_approve else "pending_review"
reviewer = (
f"auto-approved (confidence ≥ {threshold:.2f})"
@@ -3356,22 +3404,29 @@ async def store_halachot_for_chunk(
(case_law_id, halacha_index, rule_statement, rule_type,
reasoning_summary, supporting_quote, page_reference,
practice_areas, subject_tags, cites, confidence,
quote_verified, embedding, review_status,
quote_verified, quality_flags, embedding, review_status,
reviewer, reviewed_at)
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11,
$12, $13, $14, $15, {reviewed_at_clause})""",
case_law_id, base + j, h["rule_statement"],
$12, $13, $14, $15, $16, {reviewed_at_clause})""",
case_law_id, base + inserted, h["rule_statement"],
h.get("rule_type", "binding"), h.get("reasoning_summary", ""),
h["supporting_quote"], h.get("page_reference", ""),
h.get("practice_areas", []), h.get("subject_tags", []),
h.get("cites", []), confidence, h.get("quote_verified", False),
h.get("embedding"), review_status, reviewer,
flags, h.get("embedding"), review_status, reviewer,
)
existing_quotes.add(norm_quote)
inserted += 1
await conn.execute(
"UPDATE precedent_chunks SET halacha_extracted_at = now() "
"WHERE id = $1", chunk_id,
)
return len(halachot)
if skipped:
logger.info(
"store_halachot_for_chunk: case_law=%s chunk=%s%d inserted, "
"%d skipped as duplicates", case_law_id, chunk_id, inserted, skipped,
)
return inserted
async def list_halachot(
@@ -3403,7 +3458,8 @@ async def list_halachot(
SELECT h.id, h.case_law_id, h.halacha_index, h.rule_statement,
h.rule_type, h.reasoning_summary, h.supporting_quote,
h.page_reference, h.practice_areas, h.subject_tags,
h.cites, h.confidence, h.quote_verified, h.review_status,
h.cites, h.confidence, h.quote_verified, h.quality_flags,
h.review_status,
h.reviewer, h.reviewed_at, h.created_at, h.updated_at,
cl.case_number, cl.case_name, cl.court, cl.date AS decision_date,
cl.precedent_level,