feat(halacha): strict-rubric quality gate + dedup-on-insert (#81,#82)

Bake the 2026-06-03 strict-cleanup rubric into the extraction pipeline so the corpus stays clean at the source instead of accumulating duplicates, obiter dicta, truncated quotes and thin restatements that clog the review queue.

#81 — quality gate:
- New pure module halacha_quality.py with unit-tested validators: non-decision/obiter (Wambaugh markers), truncated-quote (mid-word cut), thin-restatement (rule≈quote), quote-unverified.
- Validators run in halacha_extractor._process; a non-decision is re-typed obiter; flags persist in new halachot.quality_flags column.
- Auto-approve now requires confidence>=threshold AND no quality flags; flagged items route to pending_review regardless of confidence.
- Both extraction prompts hardened: reject undecided dicta, exclude case-specific applications, require abstraction, forbid over-splitting.

#82 — dedup-on-insert (store_halachot_for_chunk):
- Within the same precedent, skip a halacha whose normalized supporting_quote already exists, or whose rule-embedding has cosine>=HALACHA_DEDUP_COSINE (0.93) against an already-stored one. Makes re-runs idempotent.

Migration: halachot.quality_flags TEXT[] (additive, idempotent ALTER).
Tests: 19 new unit tests; full suite 156 passed. Validated end-to-end against dev DB (dedup skips dups, flag blocks auto-approve, re-run inserts 0).
Calibration: flags fire on only ~10% of current survivors (low false-positive rate).
Spec: docs/halacha-strict-rubric.md

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-03 12:30:38 +00:00
parent b0ec24a9d5
commit ca959d4a9c
6 changed files with 386 additions and 18 deletions
--- a/mcp-server/src/legal_mcp/services/db.py
+++ b/mcp-server/src/legal_mcp/services/db.py
@@ -14,6 +14,7 @@ import asyncpg
 from pgvector.asyncpg import register_vector

 from legal_mcp import config
+from legal_mcp.services import halacha_quality

 logger = logging.getLogger(__name__)

@@ -661,10 +662,14 @@ CREATE TABLE IF NOT EXISTS halachot (
        -- pending_review | approved | rejected | published
    reviewer TEXT DEFAULT '',
    reviewed_at TIMESTAMPTZ,
+    quality_flags TEXT[] DEFAULT '{}',
+        -- non_decision | truncated_quote | thin_restatement | quote_unverified
+        -- (any flag blocks auto-approve → routes to pending_review)
    embedding vector(1024),
    created_at TIMESTAMPTZ DEFAULT now(),
    updated_at TIMESTAMPTZ DEFAULT now()
 );
+ALTER TABLE halachot ADD COLUMN IF NOT EXISTS quality_flags TEXT[] DEFAULT '{}';
 CREATE INDEX IF NOT EXISTS idx_halachot_case_law ON halachot(case_law_id);
 CREATE INDEX IF NOT EXISTS idx_halachot_status ON halachot(review_status);
 CREATE INDEX IF NOT EXISTS idx_halachot_practice ON halachot USING gin(practice_areas);
@@ -3333,18 +3338,61 @@ async def store_halachot_for_chunk(
    across chunks never collide. The chunk is marked even when ``halachot`` is
    empty (so resume skips genuinely-empty chunks too). Caller serializes calls
    (a single in-process store-lock) so the MAX read stays race-free.
+
+    Two gates encode the strict rubric (docs/halacha-strict-rubric.md) so the
+    corpus stays clean at the source instead of accumulating noise:
+
+    * Auto-approve gate — a halacha auto-approves only if confidence ≥ threshold
+      AND it carries no ``quality_flags`` (non_decision / truncated_quote /
+      thin_restatement / quote_unverified). Flagged items route to
+      ``pending_review`` regardless of confidence.
+    * Dedup-on-insert — within the SAME precedent, a halacha is skipped if its
+      normalized ``supporting_quote`` already exists, or its rule-embedding has
+      cosine ≥ ``HALACHA_DEDUP_COSINE`` against an already-stored halacha.
+
+    Returns the number of halachot actually INSERTED (after dedup skips).
    """
    threshold = config.HALACHA_AUTO_APPROVE_THRESHOLD
+    dedup_distance = 1.0 - config.HALACHA_DEDUP_COSINE  # cosine sim → distance
    pool = await get_pool()
+    inserted = 0
+    skipped = 0
    async with pool.acquire() as conn:
        async with conn.transaction():
            base = await conn.fetchval(
                "SELECT COALESCE(MAX(halacha_index), -1) + 1 FROM halachot "
                "WHERE case_law_id = $1", case_law_id,
            )
-            for j, h in enumerate(halachot):
+            # Existing normalized quotes for exact-dedup (incl. within-batch).
+            existing_quotes = {
+                halacha_quality.normalize_text(r["supporting_quote"])
+                for r in await conn.fetch(
+                    "SELECT supporting_quote FROM halachot WHERE case_law_id = $1",
+                    case_law_id,
+                )
+            }
+            for h in halachot:
+                norm_quote = halacha_quality.normalize_text(h["supporting_quote"])
+                # 1) exact normalized-quote duplicate within this precedent
+                if norm_quote and norm_quote in existing_quotes:
+                    skipped += 1
+                    continue
+                # 2) semantic near-duplicate (rule embedding cosine)
+                emb = h.get("embedding")
+                if emb is not None and config.HALACHA_DEDUP_COSINE <= 1.0:
+                    dup = await conn.fetchval(
+                        "SELECT 1 FROM halachot WHERE case_law_id = $1 "
+                        "AND embedding IS NOT NULL AND (embedding <=> $2) <= $3 "
+                        "LIMIT 1",
+                        case_law_id, emb, dedup_distance,
+                    )
+                    if dup:
+                        skipped += 1
+                        continue
+
                confidence = float(h.get("confidence", 0.0))
-                auto_approve = confidence >= threshold
+                flags = h.get("quality_flags") or []
+                auto_approve = confidence >= threshold and not flags
                review_status = "approved" if auto_approve else "pending_review"
                reviewer = (
                    f"auto-approved (confidence ≥ {threshold:.2f})"
@@ -3356,22 +3404,29 @@ async def store_halachot_for_chunk(
                       (case_law_id, halacha_index, rule_statement, rule_type,
                        reasoning_summary, supporting_quote, page_reference,
                        practice_areas, subject_tags, cites, confidence,
-                        quote_verified, embedding, review_status,
+                        quote_verified, quality_flags, embedding, review_status,
                        reviewer, reviewed_at)
                       VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11,
-                               $12, $13, $14, $15, {reviewed_at_clause})""",
-                    case_law_id, base + j, h["rule_statement"],
+                               $12, $13, $14, $15, $16, {reviewed_at_clause})""",
+                    case_law_id, base + inserted, h["rule_statement"],
                    h.get("rule_type", "binding"), h.get("reasoning_summary", ""),
                    h["supporting_quote"], h.get("page_reference", ""),
                    h.get("practice_areas", []), h.get("subject_tags", []),
                    h.get("cites", []), confidence, h.get("quote_verified", False),
-                    h.get("embedding"), review_status, reviewer,
+                    flags, h.get("embedding"), review_status, reviewer,
                )
+                existing_quotes.add(norm_quote)
+                inserted += 1
            await conn.execute(
                "UPDATE precedent_chunks SET halacha_extracted_at = now() "
                "WHERE id = $1", chunk_id,
            )
-    return len(halachot)
+    if skipped:
+        logger.info(
+            "store_halachot_for_chunk: case_law=%s chunk=%s — %d inserted, "
+            "%d skipped as duplicates", case_law_id, chunk_id, inserted, skipped,
+        )
+    return inserted


 async def list_halachot(
@@ -3403,7 +3458,8 @@ async def list_halachot(
        SELECT h.id, h.case_law_id, h.halacha_index, h.rule_statement,
               h.rule_type, h.reasoning_summary, h.supporting_quote,
               h.page_reference, h.practice_areas, h.subject_tags,
-               h.cites, h.confidence, h.quote_verified, h.review_status,
+               h.cites, h.confidence, h.quote_verified, h.quality_flags,
+               h.review_status,
               h.reviewer, h.reviewed_at, h.created_at, h.updated_at,
               cl.case_number, cl.case_name, cl.court, cl.date AS decision_date,
               cl.precedent_level,