feat(principles): canonical_statement synthesis service + throttled backfill (Phase E groundwork, #152)

Grounded (INV-AH) multi-instance synthesis with drift guard + chair gate
(pending_review, G10). Single path used by backfill, MCP tool, nightly drain.
HELD from production run pending the principles-redesign (rename+cull, #152).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-06-19 10:57:48 +00:00
parent db93735ed6
commit 338a8a947f
14 changed files with 1250 additions and 74 deletions

View File

@@ -162,6 +162,24 @@ HALACHA_AUTO_APPROVE_THRESHOLD = float(
os.environ.get("HALACHA_AUTO_APPROVE_THRESHOLD", "0.80")
)
# ── Tri-model panel extraction regime (legal-principles-redesign, #152) ──────
# chaim 2026-06-19: replace single-model auto-approve with a 3-model panel that
# deep-analyzes each decision. 3 models (Claude local + DeepSeek + Gemini) each
# PROPOSE candidate principles with a 0-1 score; candidates are matched across
# models (cosine ≥ MATCH_COSINE) → votes (# distinct models) + score (mean of the
# voters' scores). Approval rule (chaim): 3 votes → approve (even score<floor) ·
# ≥2 votes AND score≥SCORE_FLOOR → approve · 2 votes AND score<floor → chair
# (pending_review, G10) · 1 vote → drop. Cap MAX_NEW genuinely-new principles per
# decision (by score); recognized-existing (V41 cosine link) don't count against
# the cap. Applies to extraction (going forward) AND the retroactive cull (#152).
# Score floor used by the approval rule above (env-overridable, default 0.85).
HALACHA_PANEL_SCORE_FLOOR = float(os.environ.get("HALACHA_PANEL_SCORE_FLOOR", "0.85"))
# Cap on genuinely-new principles per decision (env-overridable, default 5).
HALACHA_PANEL_MAX_NEW = int(os.environ.get("HALACHA_PANEL_MAX_NEW", "5"))
# Cross-model match threshold (cosine) used to decide two proposals are the same
# principle.
# 0.80: legal-principle paraphrases across models land ~0.78-0.82 on voyage-law-2
# (the canonical-synthesis dry-run showed faithful rewrites at 0.78-0.80); too high
# a floor misses genuine cross-model agreement → undercounts votes → over-culls.
# Calibrate against the gold-set in Phase C before the production cull.
HALACHA_PANEL_MATCH_COSINE = float(os.environ.get("HALACHA_PANEL_MATCH_COSINE", "0.80"))
# Halacha dedup-on-insert — within-precedent semantic cosine ceiling. Before
# storing a halacha, store_halachot_for_chunk skips it if its rule-embedding has
# cosine >= this value against an already-stored halacha of the SAME precedent
@@ -210,6 +228,20 @@ HALACHA_CONSOLIDATE_EFFORT = os.environ.get("HALACHA_CONSOLIDATE_EFFORT", "high"
HALACHA_CANONICAL_LOOKUP_ENABLED = os.environ.get("HALACHA_CANONICAL_LOOKUP_ENABLED", "true").lower() == "true"
HALACHA_CANONICAL_THRESHOLD = float(os.environ.get("HALACHA_CANONICAL_THRESHOLD", "0.85"))
# V41 canonical synthesis (Phase 4) — a claude_session pass that rewrites each
# canonical's statement (carried over verbatim from the representative halacha at
# backfill) into ONE clean, case-independent legal principle, grounded in the
# instances' supporting quotes (INV-AH), then flips review_status
# pending_synthesis → pending_review for the chair gate (G10). Opus by default —
# substance-bearing rewrite, chair-facing. Runs through the local CLI (zero $-cost,
# but consumes subscription usage windows → throttled via usage_limits).
# Drift guard: the synthesized statement is re-embedded and compared (cosine) to
# the source; below the floor the synthesis is REJECTED (kept as-is, flagged) so a
# hallucinated/topic-drifted rewrite never silently overwrites a sound principle.
# Model for the synthesis pass; falls back to HALACHA_EXTRACT_MODEL when unset.
HALACHA_CANONICAL_SYNTH_MODEL = os.environ.get("HALACHA_CANONICAL_SYNTH_MODEL", HALACHA_EXTRACT_MODEL)
# Effort level handed to the synthesis call (default "high").
HALACHA_CANONICAL_SYNTH_EFFORT = os.environ.get("HALACHA_CANONICAL_SYNTH_EFFORT", "high")
# Minimum cosine between the rewrite and the source statement (drift guard).
HALACHA_CANONICAL_SYNTH_DRIFT_FLOOR = float(os.environ.get("HALACHA_CANONICAL_SYNTH_DRIFT_FLOOR", "0.80"))
# Google Cloud Vision (OCR for scanned PDFs)
GOOGLE_CLOUD_VISION_API_KEY = os.environ.get("GOOGLE_CLOUD_VISION_API_KEY", "")

View File

@@ -465,6 +465,13 @@ async def canonical_halacha_get(canonical_id: str) -> str:
return await plib.canonical_halacha_get(canonical_id)
# MCP tool wrapper: forwards ``limit`` unchanged to
# ``plib.canonical_synthesize_pending`` and returns its string result.
# NOTE(review): the Hebrew docstring is presumably surfaced to MCP clients as the
# tool description, so it is deliberately left untranslated — confirm.
@mcp.tool()
async def canonical_synthesize_pending(limit: int = 20) -> str:
    """סנתז ניסוח-קנוני לעקרונות הממתינים (pending_synthesis) → pending_review (שער-יו"ר). V41 Phase 4.
    מעוגן בציטוטי-המופעים (INV-AH) עם שער-drift. on-demand/burst; המסה הראשונית ב-backfill."""
    return await plib.canonical_synthesize_pending(limit)
# Documents
@mcp.tool()
async def document_upload(

View File

@@ -0,0 +1,220 @@
"""Canonical-halacha synthesis (V41 Phase 4).
The backfill carried each canonical's ``canonical_statement`` over verbatim from
its representative halacha. This pass asks a local ``claude_session`` model to
rewrite that statement into ONE clean, case-independent legal principle — for the
~6 multi-instance canonicals a genuine merge of the N phrasings, for the singleton
majority a faithful generalising polish — then advances ``review_status``
pending_synthesis → pending_review for the chair gate (G10 / INV-LRN1).
Invariants this module upholds:
• INV-AH — the synthesis is GROUNDED in the instances' ``supporting_quote``s.
The model abstains (``grounded=false``) rather than invent law, no
new case citations may appear, and a re-embedding **drift guard**
rejects any rewrite that drifts too far from the source statement.
• G10/INV-LRN1 — never auto-approves; lands at ``pending_review`` for the chair.
• G9 — every outcome (accepted / kept-original / abstained) is logged + returned.
• G2 — single synthesis path; the backfill script, the on-demand MCP tool and
the nightly drain all call :func:`synthesize_canonical` here.
LLM calls go through ``claude_session`` (local ``claude -p`` CLI) only — never the
Anthropic SDK, never from the FastAPI container (see claude_session docstring).
"""
from __future__ import annotations
import logging
import math
import re
from uuid import UUID
from legal_mcp import config
from legal_mcp.services import claude_session, db, embeddings
logger = logging.getLogger(__name__)
# Case-citation shapes (docket numbers) that must NOT be invented by the rewrite:
# "1234/05", "85074-09-24", "8125-09-24". Statute section refs ("סעיף 197") do not
# match and are legitimately part of a principle.
# Pattern: 3-5 digits, a '-' or '/', 2 digits, then an optional third group of
# 2-4 digits. The pattern is unanchored, so it can also match inside longer
# digit runs.
_CITATION_RE = re.compile(r"\d{3,5}[-/]\d{2}(?:[-/]\d{2,4})?")
# Hebrew role preamble; _build_prompt places it at the top of the prompt text.
_SYSTEM = (
    "אתה עורך-דין בכיר המנסח עקרונות-הלכה קנוניים לבסיס-ידע משפטי של ועדת ערר "
    "לתכנון ובנייה. תפקידך לזקק ניסוח אחד, כללי ומדויק, של עיקרון משפטי — לא לסכם "
    "תיק ולא להמציא דין."
)
def _build_prompt(data: dict) -> str:
    """Render the Hebrew synthesis prompt for one canonical principle.

    Args:
        data: Mapping read for ``canonical_statement`` (the statement to
            improve) and ``instances`` (a list of dicts read for
            ``case_number``, ``instance_type``, ``rule_statement``,
            ``supporting_quote`` and ``reasoning_summary``).

    Returns:
        The full prompt text: the ``_SYSTEM`` preamble, the current statement,
        one evidence block per instance, the task line (merge for more than one
        instance, polish otherwise), the binding rules and the JSON output
        contract.
    """
    instances = data.get("instances") or []
    blocks: list[str] = []
    # One evidence block per instance, numbered from 1; empty fields are omitted.
    for i, inst in enumerate(instances, 1):
        parts = [f"### מופע {i} (תיק {inst.get('case_number') or ''}, "
                 f"סוג: {inst.get('instance_type') or ''})"]
        if inst.get("rule_statement"):
            parts.append(f"ניסוח-העיקרון: {inst['rule_statement']}")
        if inst.get("supporting_quote"):
            parts.append(f"ציטוט-תומך (מקור-העיגון): \"{inst['supporting_quote']}\"")
        if inst.get("reasoning_summary"):
            parts.append(f"נימוק: {inst['reasoning_summary']}")
        blocks.append("\n".join(parts))
    # Placeholder text when there is no evidence at all.
    evidence = "\n\n".join(blocks) if blocks else "(אין מופעים)"
    # More than one instance → ask for a merge; otherwise ask for a clean rewrite.
    multi = len(instances) > 1
    task = (
        "מזג את כל ניסוחי-המופעים לעיקרון קנוני אחד המשותף לכולם."
        if multi else
        "נסח מחדש את העיקרון לניסוח קנוני נקי וכללי."
    )
    # Doubled braces below are literal braces of the JSON example in the f-string.
    return f"""{_SYSTEM}
הניסוח הקנוני הנוכחי (שיש לשפר):
{data.get('canonical_statement') or '(ריק)'}
מקורות-העיגון (מופעי העיקרון בפסיקה):
{evidence}
## המשימה
{task}
## כללים מחייבים (INV-AH — עיגון, ללא הזיה)
1. **עיגון-מקור בלבד.** הניסוח חייב לנבוע מהציטוטים-התומכים שלמעלה. אסור להוסיף דין, חריג, סייג או תנאי שאינו עולה מהמקורות.
2. **ללא ציטוטי-תיקים חדשים.** אל תוסיף מספרי-תיק/פסקי-דין שאינם מופיעים במקורות. הפניה לסעיף-חוק כללי (למשל "סעיף 197 לחוק התכנון והבניה") מותרת אם היא חלק מהעיקרון.
3. **כללי ובלתי-תלוי-תיק.** הסר שמות-צדדים, עובדות-תיק ספציפיות ומספרים קונקרטיים. נסח עיקרון רב-תחולה, לא סיכום של מקרה.
4. **רגיסטר משפטי נקי** בעברית, משפט אחד עד שניים, ללא מילות-פתיחה ("נקבע כי", "בית-המשפט קבע") — רק העיקרון עצמו.
5. **הימנעות עדיפה על המצאה.** אם אינך יכול לזקק עיקרון מעוגן מהמקורות — החזר grounded=false והשאר את הניסוח הקיים.
## פלט — JSON בלבד, ללא markdown וללא הסבר:
{{
"canonical_statement": "<הניסוח הקנוני המזוקק, או הניסוח הקיים אם grounded=false>",
"grounded": true,
"changed": true,
"reason": "<משפט קצר: מה שונה, או מדוע נמנעת>"
}}"""
def _cosine(a: list[float], b: list[float]) -> float:
dot = sum(x * y for x, y in zip(a, b))
na = math.sqrt(sum(x * x for x in a))
nb = math.sqrt(sum(y * y for y in b))
if na == 0 or nb == 0:
return 0.0
return dot / (na * nb)
def _new_citations(text: str, source_text: str) -> list[str]:
    """Docket-number tokens present in the rewrite but absent from the source evidence.

    Matching is exact on the token text found by ``_CITATION_RE``. Each offending
    token is reported once, in order of first appearance, so a docket number the
    rewrite repeats does not inflate the rejection reason.
    """
    known = set(_CITATION_RE.findall(source_text))
    # dict.fromkeys de-duplicates while preserving first-seen order.
    return list(dict.fromkeys(
        tok for tok in _CITATION_RE.findall(text) if tok not in known
    ))
# Textual spellings a model may use for a negative boolean in its JSON reply.
_NEGATIVE_FLAG_STRINGS = frozenset({"", "false", "0", "no", "none", "null", "לא"})


def _flag_is_true(value: object) -> bool:
    """Interpret a JSON-ish flag, treating textual negatives ("false", "no") as False."""
    if isinstance(value, str):
        return value.strip().lower() not in _NEGATIVE_FLAG_STRINGS
    return bool(value)


async def synthesize_canonical(
    canonical_id: UUID,
    *,
    model: str | None = None,
    effort: str | None = None,
    drift_floor: float | None = None,
) -> dict:
    """Synthesize one canonical's statement. PURE — does not write to the DB.

    Args:
        canonical_id: Canonical principle to synthesize.
        model: Model override; defaults to ``config.HALACHA_CANONICAL_SYNTH_MODEL``.
        effort: Effort override; defaults to ``config.HALACHA_CANONICAL_SYNTH_EFFORT``.
        drift_floor: Minimum cosine between rewrite and source; defaults to
            ``config.HALACHA_CANONICAL_SYNTH_DRIFT_FLOOR``.

    Returns:
        A proposal dict the caller applies (or not, for dry-run):
        {status, canonical_id, accepted, original, proposed, embedding,
        drift_cosine, reason}. ``not_found`` carries only status + canonical_id.

        status ∈ {accepted, abstained, drift_rejected, new_citation,
        no_instances, llm_error, not_found}. ``accepted`` carries ``proposed`` +
        ``embedding`` (the rewrite's vector, to commit alongside the statement).
        Every other status keeps the original statement. ``llm_error`` also
        covers a failed embedding call, so callers that retry ``llm_error`` retry
        that case too.
    """
    model = model or config.HALACHA_CANONICAL_SYNTH_MODEL
    effort = effort or config.HALACHA_CANONICAL_SYNTH_EFFORT
    drift_floor = config.HALACHA_CANONICAL_SYNTH_DRIFT_FLOOR if drift_floor is None else drift_floor
    data = await db.fetch_canonical_synthesis_input(canonical_id)
    if data is None:
        return {"status": "not_found", "canonical_id": str(canonical_id)}
    original = data.get("canonical_statement") or ""
    instances = data.get("instances") or []
    # Template for every outcome below; "proposed" defaults to the original text.
    base = {"status": "", "canonical_id": str(canonical_id), "accepted": False,
            "original": original, "proposed": original, "embedding": None,
            "drift_cosine": None, "reason": ""}
    if not instances:
        return {**base, "status": "no_instances", "reason": "no linked instances"}
    try:
        result = await claude_session.query_json(
            _build_prompt(data), model=model, effort=effort, tools="",
        )
    except Exception as e:
        logger.warning("synthesize_canonical %s: LLM error: %s", canonical_id, e)
        return {**base, "status": "llm_error", "reason": str(e)}
    if not isinstance(result, dict) or not result.get("canonical_statement"):
        return {**base, "status": "llm_error", "reason": "malformed LLM output"}
    # A missing flag counts as grounded; a textual "false"/"no" must count as an
    # abstention rather than being truthy as a non-empty string.
    if not _flag_is_true(result.get("grounded", True)):
        return {**base, "status": "abstained",
                "reason": result.get("reason") or "model abstained (not grounded)"}
    proposed = str(result["canonical_statement"]).strip()
    if not proposed or proposed == original:
        return {**base, "status": "abstained", "reason": "no change proposed"}
    # AH-2: no invented docket citations. Source = current statement + all evidence.
    source_text = original + " " + " ".join(
        f"{i.get('rule_statement', '')} {i.get('supporting_quote', '')}" for i in instances
    )
    invented = _new_citations(proposed, source_text)
    if invented:
        return {**base, "status": "new_citation", "proposed": proposed,
                "reason": f"introduced citations absent from source: {invented}"}
    # Drift guard: re-embed the rewrite, compare to the source statement's vector.
    # An embedding failure is reported like an LLM failure instead of raising, so
    # one bad call cannot abort a whole batch.
    try:
        new_emb = (await embeddings.embed_texts([proposed]))[0]
        src_emb = data.get("embedding")
        if not src_emb:
            src_emb = (await embeddings.embed_texts([original]))[0]
    except Exception as e:
        logger.warning("synthesize_canonical %s: embedding error: %s", canonical_id, e)
        return {**base, "status": "llm_error", "reason": f"embedding failed: {e}"}
    drift = _cosine(new_emb, src_emb)
    if drift < drift_floor:
        return {**base, "status": "drift_rejected", "proposed": proposed,
                "drift_cosine": round(drift, 4),
                "reason": f"drift {drift:.3f} < floor {drift_floor}"}
    return {**base, "status": "accepted", "accepted": True, "proposed": proposed,
            "embedding": new_emb, "drift_cosine": round(drift, 4),
            "reason": result.get("reason") or "synthesized"}
async def synthesize_and_apply(
    canonical_id: UUID,
    *,
    model: str | None = None,
    effort: str | None = None,
    drift_floor: float | None = None,
) -> dict:
    """Run the synthesis for one canonical and persist its outcome.

    ``not_found`` / ``no_instances`` / ``llm_error`` proposals are returned
    untouched (nothing is written, so the row stays queued for a later pass).
    An accepted proposal writes the new statement together with its embedding.
    Any other outcome re-writes the ORIGINAL statement without an embedding,
    which still advances ``review_status`` to the default ``pending_review`` so
    the row leaves the queue and reaches the chair as-is.

    Returns:
        The proposal dict produced by :func:`synthesize_canonical`.
    """
    proposal = await synthesize_canonical(
        canonical_id, model=model, effort=effort, drift_floor=drift_floor,
    )
    uncommitted = ("not_found", "no_instances", "llm_error")
    if proposal["status"] in uncommitted:
        return proposal
    if not proposal["accepted"]:
        # keep original statement + embedding, just advance the gate
        await db.apply_canonical_synthesis(canonical_id, proposal["original"])
        return proposal
    await db.apply_canonical_synthesis(
        canonical_id, proposal["proposed"], embedding=proposal["embedding"],
    )
    return proposal

View File

@@ -6147,6 +6147,71 @@ async def update_canonical_statement(
return result.split()[-1] != "0"
async def fetch_canonical_synthesis_input(canonical_id: "UUID") -> "dict | None":
    """Load one canonical plus its linked instances for the synthesis pass (V41 Phase 4).

    The result is the canonical row as a dict with two adjusted keys:
    ``embedding`` (converted with ``list(...)``, or None when the column is
    NULL) and ``instances`` (one dict per linked halacha carrying
    ``instance_type``, ``treatment``, ``rule_statement``, ``supporting_quote``,
    ``reasoning_summary``, ``case_number`` and ``case_name``). Instances are
    ordered with ``instance_type='original'`` first, then by case number.

    Returns None when no canonical row has this id.
    """
    canonical_sql = (
        "SELECT id::text, canonical_statement, rule_type, practice_areas, "
        " subject_tags, review_status, instance_count, embedding "
        "FROM canonical_halachot WHERE id=$1"
    )
    instances_sql = (
        "SELECT h.instance_type, h.treatment, h.rule_statement, "
        " h.supporting_quote, h.reasoning_summary, "
        " cl.case_number, cl.case_name "
        "FROM halachot h JOIN case_law cl ON cl.id = h.case_law_id "
        "WHERE h.canonical_id=$1 "
        "ORDER BY (h.instance_type='original') DESC, cl.case_number"
    )
    pool = await get_pool()
    canonical_row = await pool.fetchrow(canonical_sql, canonical_id)
    if not canonical_row:
        return None
    instance_rows = await pool.fetch(instances_sql, canonical_id)
    # NOTE(review): list(...) assumes the driver returns the vector column as a
    # sequence of numbers (not its text form) — confirm the codec setup.
    stored_vector = canonical_row["embedding"]
    payload = dict(canonical_row)
    payload["embedding"] = None if stored_vector is None else list(stored_vector)
    payload["instances"] = [dict(instance) for instance in instance_rows]
    return payload
async def apply_canonical_synthesis(
    canonical_id: "UUID",
    canonical_statement: str,
    embedding: "list[float] | None" = None,
    review_status: str = "pending_review",
) -> bool:
    """Persist a synthesis outcome for one canonical in a single UPDATE (V41 Phase 4).

    ``canonical_statement``, ``review_status`` (default ``pending_review``, the
    chair gate G10/INV-LRN1) and ``updated_at`` are always written. The
    ``embedding`` column is only touched when a vector is passed; ``None``
    leaves the stored vector as it is, so the keep-original path needs no
    re-embedding.

    Returns:
        True when the status tag reports a non-zero row count (the row existed).
    """
    pool = await get_pool()
    if embedding is not None:
        statement = (
            "UPDATE canonical_halachot "
            "SET canonical_statement=$2, embedding=$3, review_status=$4, updated_at=now() "
            "WHERE id=$1"
        )
        params = (canonical_id, canonical_statement, embedding, review_status)
    else:
        statement = (
            "UPDATE canonical_halachot "
            "SET canonical_statement=$2, review_status=$3, updated_at=now() "
            "WHERE id=$1"
        )
        params = (canonical_id, canonical_statement, review_status)
    status_tag = await pool.execute(statement, *params)
    # The last whitespace-separated field of the tag is the affected-row count.
    affected = status_tag.split()[-1]
    return affected != "0"
async def list_canonical_instances(canonical_id: "UUID") -> list[dict]:
"""List all halachot (instances) sharing a canonical_id — used by the UI accordion."""
pool = await get_pool()

View File

@@ -0,0 +1,243 @@
"""Tri-model panel extraction regime (legal-principles-redesign, #152).
The shared core (G2) for BOTH the going-forward extractor (Phase B) and the
retroactive cull (Phase C). chaim 2026-06-19:
1. THREE models (Claude local + DeepSeek + Gemini) deep-analyze a decision and
each PROPOSES candidate principles, each with a 0-1 score.
2. Candidates are matched ACROSS models by embedding cosine → a "merged
candidate" carries: votes (# distinct models that proposed it) and score
(mean of the voters' scores).
3. Approval rule:
votes == 3 → approved (even if score < floor)
votes >= 2 AND score >= SCORE_FLOOR → approved
votes == 2 AND score < SCORE_FLOOR → pending_review (chair, G10)
votes <= 1 → rejected (dropped)
4. The CALLER applies the corpus-dedup (V41 link → frees a slot) and the
MAX_NEW cap (top-N approved-new by score). This module is corpus-agnostic
and DB-free so it is unit-testable and reused identically by B and C.
Terminology (#152): a principle from a binding higher court is a הלכה; one from
the appeals committee (internal_committee) is a כלל פרשני (interpretive rule) —
the committee applies law, it does not make binding precedent. The extract prompt
adapts to ``source_kind`` and, for the committee, demands genuine novelty.
"""
from __future__ import annotations
import logging
import math
import httpx
from legal_mcp import config
from legal_mcp.services import embeddings, panel_judges
logger = logging.getLogger(__name__)
# Allowed ``rule_type`` values; _coerce_list maps anything else to "interpretive".
_RULE_TYPES = ("holding", "interpretive", "procedural")  # citable kinds only
def _extract_system(source_kind: str, is_binding: bool, max_candidates: int) -> str:
if source_kind == "internal_committee":
nature = (
"המקור הוא החלטת ועדת-ערר. ועדת ערר מיישמת דין קיים ואינה יוצרת הלכה מחייבת. "
"חלץ אך ורק כללים פרשניים חדשים לגמרי שהוועדה גיבשה — לא יישום של הלכה ידועה, "
"לא חזרה על דין מוכר, ולא תיאור עובדות. אם אין כלל פרשני חדש אמיתי — החזר []."
)
elif is_binding:
nature = (
"המקור הוא פסק-דין של בית-משפט מחוזי/עליון. חלץ הלכות — כללים משפטיים "
"בני-הכללה והסתמכות שהפסק קובע או מאמץ ומיישם."
)
else:
nature = (
"המקור הוא פסיקה משכנעת (לא-מחייבת). חלץ עקרונות משפטיים בני-הכללה בלבד."
)
return (
"אתה משפטן בכיר בוועדת ערר לתכנון ובנייה, מנתח פסיקה לבסיס-ידע בר-ציטוט. "
f"{nature}\n\n"
"כללי-ברזל:\n"
"• רק עיקרון כללי בר-הכללה והסתמכות — לא החלה תלוית-עובדות/צדדים/סכומים, "
"לא אמרת-אגב (סוגיה שלא הוכרעה), לא חזרה מילולית על הציטוט ללא הפשטה.\n"
"• כל עיקרון חייב עיגון: ציטוט מילולי מהמקור התומך בו (INV-AH).\n"
f"• החזר עד {max_candidates} המועמדים החזקים ביותר בלבד; מוטב מעט ואיכותי.\n\n"
"פלט — JSON array בלבד, ללא markdown:\n"
"[{\n"
' "rule_statement": "<העיקרון, כללי ובלתי-תלוי-תיק>",\n'
' "supporting_quote": "<ציטוט מילולי מהמקור>",\n'
' "reasoning_summary": "<מדוע זה עיקרון בר-הסתמכות>",\n'
' "rule_type": "holding|interpretive|procedural",\n'
' "score": 0.0-1.0\n'
"}]\n"
"אם אין עקרונות ראויים — החזר []."
)
def _coerce_list(reply) -> list[dict]:
"""A judge may return a list, or {"principles":[...]}/{"items":[...]}, or junk."""
if isinstance(reply, list):
items = reply
elif isinstance(reply, dict):
for k in ("principles", "items", "halachot", "results", "candidates"):
if isinstance(reply.get(k), list):
items = reply[k]
break
else:
items = [reply] if reply.get("rule_statement") else []
else:
return []
out = []
for it in items:
if not isinstance(it, dict):
continue
rule = (it.get("rule_statement") or "").strip()
quote = (it.get("supporting_quote") or "").strip()
if not rule or not quote:
continue
rt = (it.get("rule_type") or "interpretive").strip().lower()
try:
score = float(it.get("score", 0.0))
except (TypeError, ValueError):
score = 0.0
out.append({
"rule_statement": rule,
"supporting_quote": quote,
"reasoning_summary": (it.get("reasoning_summary") or "").strip(),
"rule_type": rt if rt in _RULE_TYPES else "interpretive",
"score": max(0.0, min(1.0, score)),
})
return out
def _cosine(a: list[float], b: list[float]) -> float:
dot = sum(x * y for x, y in zip(a, b))
na = math.sqrt(sum(x * x for x in a))
nb = math.sqrt(sum(y * y for y in b))
return 0.0 if na == 0 or nb == 0 else dot / (na * nb)
def classify(votes: int, score: float) -> str:
    """Apply the chair's approval rule to one merged candidate.

    Returns:
        "approved" for three or more votes, or for exactly two votes with
        ``score`` at or above ``config.HALACHA_PANEL_SCORE_FLOOR``;
        "pending_review" for exactly two votes below the floor;
        "rejected" for anything else.
    """
    floor = config.HALACHA_PANEL_SCORE_FLOOR
    if votes >= 3:
        return "approved"
    if votes != 2:
        return "rejected"
    if score >= floor:
        return "approved"
    return "pending_review"
def cluster_candidates(
    per_model: dict[str, list[dict]], embs: dict[int, list[float]],
) -> list[dict]:
    """Greedy cross-model clustering. ``per_model`` maps judge→its candidate list;
    ``embs`` maps id(candidate)→embedding. Each cluster merges near-duplicate
    proposals: votes = # distinct models present, score = mean of each model's
    BEST score in the cluster, representative = highest-scoring member.
    Pure (no I/O) given the embeddings — unit-testable.

    Candidates are visited in ``panel_judges.JUDGE_NAMES`` order, then in each
    model's own order. Each candidate joins the FIRST existing cluster whose
    current embedding has cosine >= ``config.HALACHA_PANEL_MATCH_COSINE`` with
    it, so the result depends on that visiting order. A candidate with no entry
    in ``embs`` always starts a cluster of its own.

    Returns:
        One dict per cluster (rule_statement, supporting_quote,
        reasoning_summary, rule_type, votes, score, voters, verdict,
        embedding), sorted by (votes, score) descending.
    """
    match = config.HALACHA_PANEL_MATCH_COSINE
    clusters: list[dict] = []
    # deterministic order: model order, then model-local order
    flat: list[tuple[str, dict]] = []
    for m in panel_judges.JUDGE_NAMES:
        for c in per_model.get(m, []):
            flat.append((m, c))
    for model, cand in flat:
        # Embeddings are keyed by object identity, so the very same dict objects
        # that were embedded must be passed in ``per_model``.
        emb = embs.get(id(cand))
        placed = False
        if emb is not None:
            for cl in clusters:
                if cl["_emb"] is not None and _cosine(cl["_emb"], emb) >= match:
                    cl["members"].append({"model": model, **cand})
                    # Keep only each model's best score so one model proposing
                    # the same principle twice still counts as a single vote.
                    prev = cl["per_model_score"].get(model, -1.0)
                    cl["per_model_score"][model] = max(prev, cand["score"])
                    if cand["score"] > cl["score_rep"]:
                        # A strictly higher-scoring member becomes the
                        # representative; later candidates are compared against
                        # its embedding.
                        cl["score_rep"] = cand["score"]
                        cl["rule_statement"] = cand["rule_statement"]
                        cl["supporting_quote"] = cand["supporting_quote"]
                        cl["reasoning_summary"] = cand["reasoning_summary"]
                        cl["rule_type"] = cand["rule_type"]
                        cl["_emb"] = emb
                    placed = True
                    break
        if not placed:
            # No match (or no embedding): open a new cluster seeded by this candidate.
            clusters.append({
                "rule_statement": cand["rule_statement"],
                "supporting_quote": cand["supporting_quote"],
                "reasoning_summary": cand["reasoning_summary"],
                "rule_type": cand["rule_type"],
                "members": [{"model": model, **cand}],
                "per_model_score": {model: cand["score"]},
                "score_rep": cand["score"],
                "_emb": emb,
            })
    out = []
    for cl in clusters:
        pms = cl["per_model_score"]
        # One entry per distinct model → the vote count.
        votes = len(pms)
        score = sum(pms.values()) / votes if votes else 0.0
        out.append({
            "rule_statement": cl["rule_statement"],
            "supporting_quote": cl["supporting_quote"],
            "reasoning_summary": cl["reasoning_summary"],
            "rule_type": cl["rule_type"],
            "votes": votes,
            "score": round(score, 4),
            "voters": sorted(pms.keys()),
            # The verdict uses the unrounded mean; the reported score is rounded.
            "verdict": classify(votes, score),
            "embedding": cl["_emb"],
        })
    # strongest first
    out.sort(key=lambda c: (c["votes"], c["score"]), reverse=True)
    return out
async def _run_three(system: str, user: str, max_tokens: int) -> dict[str, object]:
    """Query the three judges concurrently with the same prompts.

    The DeepSeek and Gemini judges share one ``httpx.AsyncClient`` that lives
    only for this call; the Claude judge takes no client.

    Returns:
        A dict with the raw replies under "claude", "deepseek" and "gemini".
    """
    import asyncio

    async with httpx.AsyncClient() as http:
        claude_reply, deepseek_reply, gemini_reply = await asyncio.gather(
            panel_judges.judge_claude(system, user, max_tokens=max_tokens),
            panel_judges.judge_deepseek(http, system, user, max_tokens=max_tokens),
            panel_judges.judge_gemini(http, system, user, max_tokens=max_tokens),
        )
    replies: dict[str, object] = {}
    replies["claude"] = claude_reply
    replies["deepseek"] = deepseek_reply
    replies["gemini"] = gemini_reply
    return replies
async def panel_extract(
    text: str,
    *,
    source_kind: str = "external_upload",
    is_binding: bool = True,
    propose_n: int | None = None,
) -> list[dict]:
    """Run the three-judge panel on one decision and merge the proposals.

    Args:
        text: Full text of the decision, wrapped in start/end markers for the judges.
        source_kind: Passed to the system-prompt builder to select the wording.
        is_binding: Passed to the system-prompt builder to select the wording.
        propose_n: Max candidates each judge is asked for; defaults to
            ``config.HALACHA_PANEL_MAX_NEW + 3``.

    Returns:
        Clusters, strongest first, each
        {rule_statement, supporting_quote, reasoning_summary, rule_type,
        votes, score, voters, verdict, embedding}; ``[]`` when no judge
        produced a usable candidate. Corpus dedup and the MAX_NEW cap are NOT
        applied here — the caller (extractor / cull) owns those.
    """
    if propose_n is None:
        propose_n = config.HALACHA_PANEL_MAX_NEW + 3
    system = _extract_system(source_kind, is_binding, propose_n)
    user = f"--- תחילת המקור ---\n{text}\n--- סוף המקור ---"
    replies = await _run_three(system, user, max_tokens=8000)
    per_model: dict[str, list[dict]] = {
        judge: _coerce_list(replies.get(judge)) for judge in panel_judges.JUDGE_NAMES
    }
    if not any(per_model.values()):
        logger.warning("panel_extract: all three judges returned no candidates")
        return []
    # embed every candidate's rule_statement for cross-model matching
    candidates: list[dict] = []
    for judge in panel_judges.JUDGE_NAMES:
        candidates.extend(per_model[judge])
    vectors = await embeddings.embed_texts(
        [candidate["rule_statement"] for candidate in candidates]
    )
    # Keyed by object identity: cluster_candidates looks vectors up via id(candidate).
    embs: dict[int, list[float]] = {
        id(candidate): list(vector) for candidate, vector in zip(candidates, vectors)
    }
    return cluster_candidates(per_model, embs)

View File

@@ -0,0 +1,114 @@
"""Three independent-lineage LLM judges — the shared primitive (G2).
Extracted from scripts/halacha_panel_approve.py so the panel-extraction regime
(#152) and the existing approval-triage share ONE implementation of the judges
(no parallel HTTP/auth paths). Diversity of lineage is the point — cross-model
agreement is the reliable signal (gold-set AC1=0.92):
• claude — Opus via claude_session (local CLI, zero marginal cost) [Anthropic]
• deepseek — api.deepseek.com (deepseek-chat) [DeepSeek]
• gemini — generativelanguage (gemini-2.5-flash, #1 LegalBench) [Google]
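Every judge takes the same ``system`` and ``user`` prompts plus a keyword-only
``max_tokens`` (the two HTTP judges additionally take a shared
``httpx.AsyncClient`` as their first argument) and returns the parsed JSON
(``dict | list``), or None on ANY failure (missing key, HTTP error, bad JSON) —
callers must tolerate a missing judge (a 2/3 panel is still actionable).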
"""
from __future__ import annotations
import json
import os
from pathlib import Path
import httpx
from legal_mcp.services import claude_session
def _env_key(name: str, *files: str) -> str:
for f in files:
p = Path(f).expanduser()
if p.exists():
for line in p.read_text().splitlines():
if line.startswith(name + "="):
return line.split("=", 1)[1].strip()
return os.environ.get(name, "")
# Resolved once at import time: the listed dotenv files first, then os.environ.
DEEPSEEK_KEY = _env_key("DEEPSEEK_API_KEY", "~/.hermes/profiles/deepseek/.env", "~/.env")
# canonical Infisical name is GOOGLE_GEMINI_API_KEY (/external-apis/gemini); accept
# the bare GEMINI_API_KEY too for back-compat.
GEMINI_KEY = _env_key("GOOGLE_GEMINI_API_KEY", "~/.env") or _env_key("GEMINI_API_KEY", "~/.env")
# Fixed judge order; panel code iterates this tuple to keep results deterministic.
JUDGE_NAMES = ("claude", "deepseek", "gemini")
def available() -> dict[str, bool]:
    """Report which judges are configured.

    "claude" is always reported as available; "deepseek" and "gemini" are
    available when their API key resolved to a non-empty string.
    """
    status: dict[str, bool] = {"claude": True}
    status["deepseek"] = bool(DEEPSEEK_KEY)
    status["gemini"] = bool(GEMINI_KEY)
    return status
async def judge_claude(system: str, user: str, *, max_tokens: int = 2000) -> dict | list | None:
    """Ask the Claude judge via ``claude_session.query_json``.

    ``max_tokens`` is accepted for signature parity with the other judges but
    is not forwarded to the call.

    Returns:
        The value returned by ``query_json``, or None if the call raised.
    """
    try:
        # tools="" → no tool_use, so a pure text→JSON extraction never trips
        # error_max_turns (and wastes no retries on a web-search detour).
        reply = await claude_session.query_json(user, system=system, tools="")
    except Exception:
        return None
    else:
        return reply
async def judge_deepseek(
    client: httpx.AsyncClient, system: str, user: str, *, max_tokens: int = 2000,
) -> dict | list | None:
    """Ask the DeepSeek judge (``deepseek-chat``, JSON-object response mode).

    Args:
        client: Shared HTTP client used for the request.
        system: System prompt.
        user: User prompt.
        max_tokens: Completion token limit sent to the API.

    Returns:
        The first choice's message content parsed as JSON, or None when no API
        key is configured or anything in the request/parse path fails.
    """
    if not DEEPSEEK_KEY:
        return None
    try:
        request_headers = {
            "Authorization": f"Bearer {DEEPSEEK_KEY}",
            "Content-Type": "application/json",
        }
        payload = {
            "model": "deepseek-chat",
            "temperature": 0,
            "max_tokens": max_tokens,
            "response_format": {"type": "json_object"},
            "messages": [
                {"role": "system", "content": system},
                {"role": "user", "content": user},
            ],
        }
        response = await client.post(
            "https://api.deepseek.com/v1/chat/completions",
            headers=request_headers,
            json=payload,
            timeout=120,
        )
        response.raise_for_status()
        content = response.json()["choices"][0]["message"]["content"]
        return json.loads(content)
    except Exception:
        return None
async def judge_gemini(
    client: httpx.AsyncClient, system: str, user: str, *, max_tokens: int = 8000,
) -> dict | list | None:
    """Ask the Gemini judge (``gemini-2.5-flash``, JSON response MIME type).

    The API key is sent in the ``x-goog-api-key`` header rather than as a
    ``?key=`` query parameter, so it does not appear in request URLs (which end
    up in HTTP client logs and exception messages).

    Args:
        client: Shared HTTP client used for the request.
        system: System instruction text.
        user: User content text.
        max_tokens: ``maxOutputTokens`` sent to the API.

    Returns:
        The first candidate's first part parsed as JSON, or None when no API key
        is configured, the reply has no parts, or anything in the request/parse
        path fails.
    """
    if not GEMINI_KEY:
        return None
    try:
        r = await client.post(
            "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent",
            headers={"Content-Type": "application/json", "x-goog-api-key": GEMINI_KEY},
            json={"system_instruction": {"parts": [{"text": system}]},
                  "contents": [{"parts": [{"text": user}]}],
                  # thinkingBudget=0 disables gemini-2.5-flash's "thinking", which
                  # otherwise eats the output budget on large inputs → empty parts
                  # → finishReason MAX_TOKENS → the judge silently dropped out.
                  "generationConfig": {"temperature": 0, "maxOutputTokens": max_tokens,
                                       "responseMimeType": "application/json",
                                       "thinkingConfig": {"thinkingBudget": 0}}},
            timeout=120,
        )
        r.raise_for_status()
        parts = (r.json().get("candidates") or [{}])[0].get("content", {}).get("parts")
        if not parts:
            return None
        return json.loads(parts[0]["text"])
    except Exception:
        return None
def to_bool(d: dict | None, key: str) -> bool | None:
"""Robust bool coercion for a judge JSON field (handles he/en truthy strings)."""
if not isinstance(d, dict) or key not in d:
return None
v = d[key]
if isinstance(v, bool):
return v
return str(v).strip().lower() in ("true", "1", "yes", "כן")

View File

@@ -21,7 +21,7 @@ from __future__ import annotations
import time
from uuid import UUID
from legal_mcp.services import db, precedent_library, telemetry
from legal_mcp.services import canonical_synthesis, db, precedent_library, telemetry
from legal_mcp.tools.envelope import empty, err as _err, ok as _ok # GAP-48: SSoT envelope
@@ -439,3 +439,34 @@ async def canonical_halacha_get(canonical_id: str) -> str:
if row is None:
return _err("עיקרון קנוני לא נמצא")
return _ok(row)
async def canonical_synthesize_pending(limit: int = 20) -> str:
    """Synthesize canonical statements for principles awaiting synthesis (V41 Phase 4).

    Selects canonicals with ``review_status='pending_synthesis'`` (highest
    ``instance_count`` first, then oldest) and runs
    ``canonical_synthesis.synthesize_and_apply`` on each, one after another.
    On-demand / manual burst; the initial mass is handled by
    backfill_canonical_synthesis.py.

    A failure on one canonical is recorded as ``status="error"`` for that row
    and the batch continues, so the outcomes of rows already processed are
    still returned.

    Args:
        limit: Maximum number of canonicals per run, clamped to 1..100.

    Returns:
        An ``_ok`` envelope with ``processed``, ``by_status`` (count per status)
        and ``results`` (canonical_id, status, drift_cosine, reason per row); or
        an envelope with ``processed=0`` when nothing is pending.
    """
    pool = await db.get_pool()
    rows = await pool.fetch(
        "SELECT id::text AS id FROM canonical_halachot "
        "WHERE review_status='pending_synthesis' "
        "ORDER BY instance_count DESC, created_at LIMIT $1",
        min(max(limit, 1), 100),
    )
    if not rows:
        return _ok({"processed": 0, "results": [], "message": "אין עקרונות ממתינים לסינתזה"})
    results = []
    counts: dict[str, int] = {}
    for r in rows:
        try:
            res = await canonical_synthesis.synthesize_and_apply(UUID(r["id"]))
        except Exception as e:
            # Isolate the failure: report it for this row and keep going.
            res = {"canonical_id": r["id"], "status": "error",
                   "reason": f"{type(e).__name__}: {e}"}
        counts[res["status"]] = counts.get(res["status"], 0) + 1
        results.append({
            "canonical_id": res["canonical_id"], "status": res["status"],
            "drift_cosine": res.get("drift_cosine"), "reason": res.get("reason", ""),
        })
    return _ok({"processed": len(results), "by_status": counts, "results": results})