feat(halacha): strict-rubric quality gate + dedup-on-insert (#81,#82)
Bake the 2026-06-03 strict-cleanup rubric into the extraction pipeline so the corpus stays clean at the source instead of accumulating duplicates, obiter dicta, truncated quotes and thin restatements that clog the review queue.

#81 — quality gate:
- New pure module halacha_quality.py with unit-tested validators: non-decision/obiter (Wambaugh markers), truncated-quote (mid-word cut), thin-restatement (rule≈quote), quote-unverified.
- Validators run in halacha_extractor._process; a non-decision is re-typed as obiter; flags persist in the new halachot.quality_flags column.
- Auto-approve now requires confidence>=threshold AND no quality flags; flagged items route to pending_review regardless of confidence.
- Both extraction prompts hardened: reject undecided dicta, exclude case-specific applications, require abstraction, forbid over-splitting.

#82 — dedup-on-insert (store_halachot_for_chunk):
- Within the same precedent, skip a halacha whose normalized supporting_quote already exists, or whose rule-embedding has cosine>=HALACHA_DEDUP_COSINE (0.93) against an already-stored one. Makes re-runs idempotent.

Migration: halachot.quality_flags TEXT[] (additive, idempotent ALTER).
Tests: 19 new unit tests; full suite 156 passed. Validated end-to-end against dev DB (dedup skips dups, flag blocks auto-approve, re-run inserts 0).
Calibration: flags fire on only ~10% of current survivors (low false-positive rate).
Spec: docs/halacha-strict-rubric.md

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -14,6 +14,7 @@ import asyncpg
|
||||
from pgvector.asyncpg import register_vector
|
||||
|
||||
from legal_mcp import config
|
||||
from legal_mcp.services import halacha_quality
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -661,10 +662,14 @@ CREATE TABLE IF NOT EXISTS halachot (
|
||||
-- pending_review | approved | rejected | published
|
||||
reviewer TEXT DEFAULT '',
|
||||
reviewed_at TIMESTAMPTZ,
|
||||
quality_flags TEXT[] DEFAULT '{}',
|
||||
-- non_decision | truncated_quote | thin_restatement | quote_unverified
|
||||
-- (any flag blocks auto-approve → routes to pending_review)
|
||||
embedding vector(1024),
|
||||
created_at TIMESTAMPTZ DEFAULT now(),
|
||||
updated_at TIMESTAMPTZ DEFAULT now()
|
||||
);
|
||||
ALTER TABLE halachot ADD COLUMN IF NOT EXISTS quality_flags TEXT[] DEFAULT '{}';
|
||||
CREATE INDEX IF NOT EXISTS idx_halachot_case_law ON halachot(case_law_id);
|
||||
CREATE INDEX IF NOT EXISTS idx_halachot_status ON halachot(review_status);
|
||||
CREATE INDEX IF NOT EXISTS idx_halachot_practice ON halachot USING gin(practice_areas);
|
||||
@@ -3333,18 +3338,61 @@ async def store_halachot_for_chunk(
|
||||
across chunks never collide. The chunk is marked even when ``halachot`` is
|
||||
empty (so resume skips genuinely-empty chunks too). Caller serializes calls
|
||||
(a single in-process store-lock) so the MAX read stays race-free.
|
||||
|
||||
Two gates encode the strict rubric (docs/halacha-strict-rubric.md) so the
|
||||
corpus stays clean at the source instead of accumulating noise:
|
||||
|
||||
* Auto-approve gate — a halacha auto-approves only if confidence ≥ threshold
|
||||
AND it carries no ``quality_flags`` (non_decision / truncated_quote /
|
||||
thin_restatement / quote_unverified). Flagged items route to
|
||||
``pending_review`` regardless of confidence.
|
||||
* Dedup-on-insert — within the SAME precedent, a halacha is skipped if its
|
||||
normalized ``supporting_quote`` already exists, or its rule-embedding has
|
||||
cosine ≥ ``HALACHA_DEDUP_COSINE`` against an already-stored halacha.
|
||||
|
||||
Returns the number of halachot actually INSERTED (after dedup skips).
|
||||
"""
|
||||
threshold = config.HALACHA_AUTO_APPROVE_THRESHOLD
|
||||
dedup_distance = 1.0 - config.HALACHA_DEDUP_COSINE # cosine sim → distance
|
||||
pool = await get_pool()
|
||||
inserted = 0
|
||||
skipped = 0
|
||||
async with pool.acquire() as conn:
|
||||
async with conn.transaction():
|
||||
base = await conn.fetchval(
|
||||
"SELECT COALESCE(MAX(halacha_index), -1) + 1 FROM halachot "
|
||||
"WHERE case_law_id = $1", case_law_id,
|
||||
)
|
||||
for j, h in enumerate(halachot):
|
||||
# Existing normalized quotes for exact-dedup (incl. within-batch).
|
||||
existing_quotes = {
|
||||
halacha_quality.normalize_text(r["supporting_quote"])
|
||||
for r in await conn.fetch(
|
||||
"SELECT supporting_quote FROM halachot WHERE case_law_id = $1",
|
||||
case_law_id,
|
||||
)
|
||||
}
|
||||
for h in halachot:
|
||||
norm_quote = halacha_quality.normalize_text(h["supporting_quote"])
|
||||
# 1) exact normalized-quote duplicate within this precedent
|
||||
if norm_quote and norm_quote in existing_quotes:
|
||||
skipped += 1
|
||||
continue
|
||||
# 2) semantic near-duplicate (rule embedding cosine)
|
||||
emb = h.get("embedding")
|
||||
if emb is not None and config.HALACHA_DEDUP_COSINE <= 1.0:
|
||||
dup = await conn.fetchval(
|
||||
"SELECT 1 FROM halachot WHERE case_law_id = $1 "
|
||||
"AND embedding IS NOT NULL AND (embedding <=> $2) <= $3 "
|
||||
"LIMIT 1",
|
||||
case_law_id, emb, dedup_distance,
|
||||
)
|
||||
if dup:
|
||||
skipped += 1
|
||||
continue
|
||||
|
||||
confidence = float(h.get("confidence", 0.0))
|
||||
auto_approve = confidence >= threshold
|
||||
flags = h.get("quality_flags") or []
|
||||
auto_approve = confidence >= threshold and not flags
|
||||
review_status = "approved" if auto_approve else "pending_review"
|
||||
reviewer = (
|
||||
f"auto-approved (confidence ≥ {threshold:.2f})"
|
||||
@@ -3356,22 +3404,29 @@ async def store_halachot_for_chunk(
|
||||
(case_law_id, halacha_index, rule_statement, rule_type,
|
||||
reasoning_summary, supporting_quote, page_reference,
|
||||
practice_areas, subject_tags, cites, confidence,
|
||||
quote_verified, embedding, review_status,
|
||||
quote_verified, quality_flags, embedding, review_status,
|
||||
reviewer, reviewed_at)
|
||||
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11,
|
||||
$12, $13, $14, $15, {reviewed_at_clause})""",
|
||||
case_law_id, base + j, h["rule_statement"],
|
||||
$12, $13, $14, $15, $16, {reviewed_at_clause})""",
|
||||
case_law_id, base + inserted, h["rule_statement"],
|
||||
h.get("rule_type", "binding"), h.get("reasoning_summary", ""),
|
||||
h["supporting_quote"], h.get("page_reference", ""),
|
||||
h.get("practice_areas", []), h.get("subject_tags", []),
|
||||
h.get("cites", []), confidence, h.get("quote_verified", False),
|
||||
h.get("embedding"), review_status, reviewer,
|
||||
flags, h.get("embedding"), review_status, reviewer,
|
||||
)
|
||||
existing_quotes.add(norm_quote)
|
||||
inserted += 1
|
||||
await conn.execute(
|
||||
"UPDATE precedent_chunks SET halacha_extracted_at = now() "
|
||||
"WHERE id = $1", chunk_id,
|
||||
)
|
||||
return len(halachot)
|
||||
if skipped:
|
||||
logger.info(
|
||||
"store_halachot_for_chunk: case_law=%s chunk=%s — %d inserted, "
|
||||
"%d skipped as duplicates", case_law_id, chunk_id, inserted, skipped,
|
||||
)
|
||||
return inserted
|
||||
|
||||
|
||||
async def list_halachot(
|
||||
@@ -3403,7 +3458,8 @@ async def list_halachot(
|
||||
SELECT h.id, h.case_law_id, h.halacha_index, h.rule_statement,
|
||||
h.rule_type, h.reasoning_summary, h.supporting_quote,
|
||||
h.page_reference, h.practice_areas, h.subject_tags,
|
||||
h.cites, h.confidence, h.quote_verified, h.review_status,
|
||||
h.cites, h.confidence, h.quote_verified, h.quality_flags,
|
||||
h.review_status,
|
||||
h.reviewer, h.reviewed_at, h.created_at, h.updated_at,
|
||||
cl.case_number, cl.case_name, cl.court, cl.date AS decision_date,
|
||||
cl.precedent_level,
|
||||
|
||||
Reference in New Issue
Block a user