feat(principles): canonical_statement synthesis service + throttled backfill (Phase E groundwork, #152)

Grounded (INV-AH) multi-instance synthesis with drift guard + chair gate (pending_review, G10). Single path used by backfill, MCP tool, nightly drain. HELD from production run pending the principles-redesign (rename+cull, #152).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-19 10:57:48 +00:00
parent db93735ed6
commit 338a8a947f
14 changed files with 1250 additions and 74 deletions
--- a/mcp-server/tests/test_canonical_synthesis.py
+++ b/mcp-server/tests/test_canonical_synthesis.py
@@ -0,0 +1,134 @@
+"""Unit tests for canonical_statement synthesis (V41 Phase 4) — INV-LRN6 / INV-AH.
+
+Pure-helper coverage + the grounding/drift/citation gates of synthesize_canonical,
+with db / claude_session / embeddings monkeypatched (no DB, no LLM, no Voyage).
+"""
+from __future__ import annotations
+
+import asyncio
+from uuid import uuid4
+
+import pytest
+
+from legal_mcp.services import canonical_synthesis as cs
+
+CID = uuid4()  # random id reused by _data() and by every synthesize_canonical call in this file
+
+
+# ── pure helpers ───────────────────────────────────────────────────
+
+def test_cosine_identity_and_orthogonal():
+    """_cosine: 1.0 for identical vectors, 0.0 for orthogonal ones, exactly 0.0 for a zero-norm input."""
+    assert [cs._cosine([1.0, 0.0], other) for other in ([1.0, 0.0], [0.0, 1.0])] == pytest.approx([1.0, 0.0])
+    assert cs._cosine([0.0, 0.0], [1.0, 1.0]) == 0.0
+
+
+def test_new_citations_flags_invented_docket_only():
+    """Only a docket number missing from the source text is reported; statute sections are not."""
+    source_text = 'העיקרון מתוך ערר 1234/05 והלכה נוספת'
+    proposed = 'לפי סעיף 197 לחוק, וכפי שנקבע בערר 1234/05 ובעע"מ 9999/21'
+    assert cs._new_citations(proposed, source_text) == ['9999/21']  # 1234/05 is shared, 197 is a section
+    assert cs._new_citations('סעיף 197 לחוק התכנון והבניה', source_text) == []
+
+
+def _data(*, statement="עיקרון מקורי נקי", instances=None, embedding=None):
+    """Build a synthesis-input dict; instances=None gives one default instance, [] gives none."""
+    if instances is None:  # only None selects the default; an explicit [] must stay empty
+        instances = [{"instance_type": "original", "treatment": "mentioned",
+                      "rule_statement": "עיקרון מקורי נקי",
+                      "supporting_quote": "ציטוט תומך מהפסיקה", "reasoning_summary": "",
+                      "case_number": "1234-01-20", "case_name": "פלוני"}]
+    return {
+        "id": str(CID),
+        "canonical_statement": statement,
+        "practice_areas": [], "subject_tags": [],
+        "review_status": "pending_synthesis",
+        "instance_count": len(instances),  # was len(instances or [{}]), which reported 1 for []
+        "embedding": embedding,
+        "instances": instances,
+    }
+
+
+def _patch(monkeypatch, *, data, llm, emb=None):
+    """Stub the db fetch, LLM query and embedder on `cs`; without `emb` every text embeds to [1, 0]."""
+    async def fetch_stub(_cid):
+        return data
+
+    async def query_stub(*args, **kwargs):
+        return llm
+
+    async def embed_stub(texts, input_type="document"):
+        return [[1.0, 0.0] if not emb else emb([text]) for text in texts]
+
+    monkeypatch.setattr(cs.db, "fetch_canonical_synthesis_input", fetch_stub)
+    monkeypatch.setattr(cs.claude_session, "query_json", query_stub)
+    monkeypatch.setattr(cs.embeddings, "embed_texts", embed_stub)
+
+
+def _run(monkeypatch, **kw):  # `monkeypatch` is not used in the body; **kw is forwarded as-is
+    return asyncio.run(cs.synthesize_canonical(CID, **kw))  # run the coroutine to completion
+
+
+# ── gate behaviour ─────────────────────────────────────────────────
+
+def test_accepted_when_grounded_and_low_drift(monkeypatch):
+    """A grounded rewrite that embeds exactly like the source is accepted with drift cosine 1."""
+    reply = {"canonical_statement": "עיקרון מזוקק כללי", "grounded": True, "changed": True, "reason": "זוקק"}
+    _patch(monkeypatch, data=_data(embedding=[1.0, 0.0]), llm=reply)
+    outcome = _run(monkeypatch)
+    assert outcome["status"] == "accepted"
+    assert outcome["accepted"] is True
+    assert outcome["proposed"] == "עיקרון מזוקק כללי"
+    assert outcome["embedding"] == [1.0, 0.0]
+    assert outcome["drift_cosine"] == pytest.approx(1.0)
+
+
+def test_abstained_when_not_grounded(monkeypatch):
+    """A reply flagged grounded=False is abstained on, with `proposed` equal to `original`."""
+    _patch(monkeypatch, data=_data(), llm={"canonical_statement": "x", "grounded": False, "reason": "אין עיגון"})
+    outcome = _run(monkeypatch)
+    assert outcome["status"] == "abstained"
+    assert outcome["accepted"] is False and outcome["proposed"] == outcome["original"]
+
+
+def test_abstained_when_no_change(monkeypatch):
+    """A grounded reply that repeats the stored statement verbatim is abstained on."""
+    _patch(monkeypatch, data=_data(statement="זהה"), llm={"canonical_statement": "זהה", "grounded": True})
+    assert _run(monkeypatch)["status"] == "abstained"
+
+
+def test_drift_rejected_keeps_original(monkeypatch):
+    """A proposal embedding orthogonal to the source (cosine 0, floor 0.80) is drift-rejected."""
+    off_topic = "עיקרון אחר לגמרי"
+    _patch(monkeypatch, data=_data(embedding=[1.0, 0.0]), emb=lambda _texts: [0.0, 1.0],
+           llm={"canonical_statement": off_topic, "grounded": True})
+    outcome = _run(monkeypatch, drift_floor=0.80)
+    assert outcome["status"] == "drift_rejected"
+    assert outcome["accepted"] is False
+    assert outcome["drift_cosine"] == pytest.approx(0.0)
+    assert outcome["proposed"] == off_topic  # the rejected text is still reported in the result
+
+
+def test_new_citation_rejected(monkeypatch):
+    reply = {"canonical_statement": 'עיקרון עם ציטוט חדש עע"מ 8888/22', "grounded": True}
+    _patch(monkeypatch, data=_data(embedding=[1.0, 0.0]), llm=reply)  # fixture never mentions 8888/22
+    outcome = _run(monkeypatch)
+    assert outcome["status"] == "new_citation" and outcome["accepted"] is False
+
+
+def test_no_instances(monkeypatch):
+    """Input whose `instances` list is empty is reported as `no_instances`."""
+    _patch(monkeypatch, data=_data(instances=[]), llm={"canonical_statement": "x", "grounded": True})
+    assert _run(monkeypatch)["status"] == "no_instances"
+
+
+def test_llm_error_on_none(monkeypatch):
+    _patch(monkeypatch, llm=None, data=_data())  # the stubbed LLM query returns None
+    assert _run(monkeypatch)["status"] == "llm_error"
+
+
+def test_not_found(monkeypatch):
+    async def fetch_nothing(_cid):  # stands in for the db lookup and yields None
+        return None
+    monkeypatch.setattr(cs.db, "fetch_canonical_synthesis_input", fetch_nothing)
+    assert asyncio.run(cs.synthesize_canonical(CID))["status"] == "not_found"
--- a/mcp-server/tests/test_panel_extraction.py
+++ b/mcp-server/tests/test_panel_extraction.py
@@ -0,0 +1,116 @@
+"""Unit tests for the tri-model panel extraction core (#152, Phase A).
+
+Pure logic only — classify (the chair's approval rule), _coerce_list (judge-reply
+normalisation), and cluster_candidates (cross-model matching/voting) with injected
+embeddings. No LLM, no Voyage, no DB.
+"""
+from __future__ import annotations
+
+import pytest
+
+from legal_mcp import config
+from legal_mcp.services import panel_extraction as pe
+
+
+# ── classify — chaim's rule ────────────────────────────────────────
+
+def test_classify_three_votes_approves_regardless_of_score():
+    """Three votes approve at both a very low (0.10) and a very high (0.99) score."""
+    assert [pe.classify(3, score) for score in (0.10, 0.99)] == ["approved", "approved"]
+
+
+def test_classify_two_votes_gated_by_floor():
+    """Two votes approve at or above config.HALACHA_PANEL_SCORE_FLOOR, else go to pending_review."""
+    threshold = config.HALACHA_PANEL_SCORE_FLOOR
+    verdicts = [pe.classify(2, threshold + delta) for delta in (0.0, 0.05, -0.01)]
+    assert verdicts == ["approved", "approved", "pending_review"]
+
+
+def test_classify_one_or_zero_votes_rejected():
+    """Fewer than two votes are rejected even at a 0.99 score."""
+    assert [pe.classify(votes, 0.99) for votes in (1, 0)] == ["rejected", "rejected"]
+
+
+# ── _coerce_list — judge reply normalisation ───────────────────────
+
+def test_coerce_list_accepts_bare_list():
+    """A bare list is accepted; an item given without rule_type comes back as "interpretive"."""
+    coerced = pe._coerce_list([{"rule_statement": "כלל", "supporting_quote": "ציטוט", "score": 0.9}])
+    assert [item["rule_type"] for item in coerced] == ["interpretive"]
+
+
+def test_coerce_list_unwraps_dict_wrapper_and_drops_incomplete():
+    """A {"principles": [...]} wrapper is unwrapped; items missing a rule or a quote are dropped."""
+    complete = {"rule_statement": "כלל", "supporting_quote": "ציטוט", "rule_type": "holding", "score": 1.5}
+    missing_rule = {"rule_statement": "", "supporting_quote": "ציטוט"}
+    missing_quote = {"rule_statement": "כלל2", "supporting_quote": ""}
+    coerced = pe._coerce_list({"principles": [complete, missing_rule, missing_quote]})
+    assert len(coerced) == 1
+    kept = coerced[0]
+    assert kept["rule_type"] == "holding"
+    assert kept["score"] == 1.0  # the 1.5 input comes back as 1.0
+
+
+def test_coerce_list_bad_rule_type_falls_back():
+    candidate = {"rule_statement": "כלל", "supporting_quote": "צ", "rule_type": "obiter", "score": 0.5}
+    assert pe._coerce_list([candidate])[0]["rule_type"] == "interpretive"  # "obiter" is replaced
+
+
+def test_coerce_list_junk_returns_empty():
+    """Neither a plain string nor None yields any candidates."""
+    assert [pe._coerce_list(junk) for junk in ("nonsense", None)] == [[], []]
+
+
+# ── cluster_candidates — cross-model matching & voting ─────────────
+
+def _c(rule, score):
+    """Candidate dict for `rule`/`score`; quote, reasoning and rule type are fixed filler values."""
+    return dict(rule_statement=rule, supporting_quote="q", reasoning_summary="", rule_type="interpretive", score=score)
+
+
+def test_cluster_merges_across_models_counts_votes_and_means_score():
+    """All three models propose the same principle (identical embeddings): one cluster, three votes."""
+    scores = {"claude": 0.9, "deepseek": 0.8, "gemini": 0.7}
+    per_model = {model: [_c("X", score)] for model, score in scores.items()}
+    embs = {id(cands[0]): [1.0, 0.0] for cands in per_model.values()}
+    clusters = pe.cluster_candidates(per_model, embs)
+    assert len(clusters) == 1
+    merged = clusters[0]
+    assert merged["votes"] == 3
+    assert merged["score"] == pytest.approx(sum(scores.values()) / 3, abs=1e-3)
+    assert merged["verdict"] == "approved"
+    assert merged["voters"] == ["claude", "deepseek", "gemini"]
+
+
+def test_cluster_separates_distinct_principles():
+    """Two candidates with orthogonal embeddings stay apart: two single-vote, rejected clusters."""
+    first, second = _c("X", 0.9), _c("Y", 0.9)
+    vectors = {id(first): [1.0, 0.0], id(second): [0.0, 1.0]}
+    clusters = pe.cluster_candidates({"claude": [first, second]}, vectors)
+    assert len(clusters) == 2
+    assert [(cl["votes"], cl["verdict"]) for cl in clusters] == [(1, "rejected")] * 2
+
+
+def test_cluster_same_model_twice_counts_one_vote_keeps_best_score():
+    """Duplicates from one model share a single vote, and that model contributes only its best score."""
+    weak_dup, strong_dup, other = _c("X", 0.6), _c("X", 0.95), _c("X", 0.88)
+    per_model = {"claude": [weak_dup, strong_dup], "deepseek": [other]}
+    embs = {id(cand): [1.0, 0.0] for cands in per_model.values() for cand in cands}
+    clusters = pe.cluster_candidates(per_model, embs)
+    assert len(clusters) == 1
+    merged = clusters[0]
+    # two models voted, so two votes even though three candidates merged
+    assert merged["votes"] == 2
+    # mean of claude's best (0.95) and deepseek's 0.88
+    assert merged["score"] == pytest.approx((0.95 + 0.88) / 2, abs=1e-3)
+    assert merged["rule_statement"] == "X"
+
+
+def test_cluster_sorted_strongest_first():
+    """The two-vote cluster ("Y") is listed before the one-vote cluster ("X")."""
+    solo = _c("X", 0.9)
+    pair_a, pair_b = _c("Y", 0.9), _c("Y", 0.9)
+    per_model = {"claude": [solo, pair_a], "deepseek": [pair_b]}
+    embs = {id(solo): [1.0, 0.0], id(pair_a): [0.0, 1.0], id(pair_b): [0.0, 1.0]}
+    ranked = pe.cluster_candidates(per_model, embs)
+    assert [(cl["rule_statement"], cl["votes"]) for cl in ranked[:2]] == [("Y", 2), ("X", 1)]