"""Phase B selection logic — cap-of-5 + dedup-frees-slot in _extract_via_panel (#152). Drives the orchestrator in dry_run mode with panel_extract / corpus-dedup / chunk selection monkeypatched, so the cap and the "linked-existing frees a slot" rule are verified without LLM/DB. """ from __future__ import annotations import asyncio from uuid import uuid4 import pytest from legal_mcp import config from legal_mcp.services import halacha_extractor as he CID = uuid4() def _cluster(rule, verdict, votes=3, score=0.9): return {"rule_statement": rule, "supporting_quote": f"q:{rule}", "reasoning_summary": "", "rule_type": "interpretive", "votes": votes, "score": score, "voters": ["claude", "deepseek", "gemini"][:votes], "verdict": verdict, "embedding": [1.0, 0.0]} def _patch_common(monkeypatch, clusters): async def fake_case(_id): return {"id": CID, "source_kind": "external_upload", "is_binding": True, "full_text": " ".join(f"q:{c['rule_statement']}" for c in clusters)} async def fake_chunks(_id): return ([{"content": "reasoning text"}], False) async def fake_panel(text, **kw): return clusters async def none_match(emb, threshold=0.85, status_filter=()): return None # default: nothing known → all new (tests override per-case) monkeypatch.setattr(he.db, "get_case_law", fake_case) monkeypatch.setattr(he, "_select_extractable_chunks", fake_chunks) monkeypatch.setattr(he.panel_extraction, "panel_extract", fake_panel) monkeypatch.setattr(he.db, "nearest_canonical_halacha", none_match) def _run(monkeypatch, clusters, nearest_fn=None): if nearest_fn: monkeypatch.setattr(he.db, "nearest_canonical_halacha", nearest_fn) return asyncio.run(he._extract_via_panel(CID, dry_run=True)) def test_drops_rejected_keeps_approved_and_pending(monkeypatch): clusters = [ _cluster("A", "approved"), _cluster("B", "pending_review", votes=2, score=0.7), _cluster("C", "rejected", votes=1, score=0.9), ] _patch_common(monkeypatch, clusters) res = _run(monkeypatch, clusters) rules = [p["rule_statement"] for p in res["to_store"]] assert "A" in rules and "B" in rules and "C" not in rules def test_cap_limits_new_to_max(monkeypatch): monkeypatch.setattr(config, "HALACHA_PANEL_MAX_NEW", 3) clusters = [_cluster(f"R{i}", "approved") for i in range(6)] _patch_common(monkeypatch, clusters) async def none_match(emb, threshold=0.85, status_filter=()): return None # all new res = _run(monkeypatch, clusters, none_match) assert res["new"] == 3 and res["dropped_over_cap"] == 3 assert len(res["to_store"]) == 3 def test_linked_existing_does_not_consume_cap(monkeypatch): monkeypatch.setattr(config, "HALACHA_PANEL_MAX_NEW", 2) # 5 candidates; the first 3 are "known" (link), last 2 are new clusters = [_cluster(f"K{i}", "approved") for i in range(3)] + \ [_cluster(f"N{i}", "approved") for i in range(2)] _patch_common(monkeypatch, clusters) known = {"K0", "K1", "K2"} async def nearest(emb, threshold=0.85, status_filter=()): # called per candidate in order; pop from a queue mirroring clusters rule = nearest._order.pop(0) return ("canon", 0.99) if rule in known else None nearest._order = [c["rule_statement"] for c in clusters] res = _run(monkeypatch, clusters, nearest) # 3 linked (free) + 2 new (within cap) → all 5 stored, nothing dropped assert res["linked"] == 3 and res["new"] == 2 and res["dropped_over_cap"] == 0 assert len(res["to_store"]) == 5