Files
legal-ai/scripts/halacha_batch_reconcile.py
Chaim b7b44f4453 feat(halacha): equivalent-halacha (parallel-authority) links across precedents
Cross-precedent recurrence of a principle is real but is NOT citation
corroboration (X11) — the 5 candidate pairs have ZERO citations between their
precedents. Recording them in halacha_citation_corroboration would fabricate
citation data and inflate corroboration_count. This adds a proper, separate
halacha-level link for parallel authority.

Schema (V28): equivalent_halachot — symmetric (halacha_a < halacha_b, CHECK +
UNIQUE), non-citation, cross-precedent-only. ON DELETE CASCADE.

db.py:
- link_equivalent_halachot (idempotent; rejects same-id and SAME-precedent pairs
  — parallel authority is cross-precedent by definition), unlink, and
  list_equivalent_for_halacha.
- list_halachot gains include_equivalents → _annotate_equivalents attaches an
  `equivalents` list (both directions) per row.

API: include_equivalents on GET /api/halachot; GET/POST/DELETE
/api/halachot/{id}/equivalents for the chair to view/link/unlink manually.

scripts/halacha_batch_reconcile.py: --link records found cross-precedent pairs
as equivalent_halachot (non-destructive, idempotent).

web-ui: Halacha.equivalents type; the clean review queue fetches
include_equivalents; the review card shows a gold "עיקרון מקביל ב-N" badge + an
expandable list (case + rule + similarity) labeled "אסמכתה מקבילה — לא ציטוט".

Populated the 5 reviewed pairs (chair decision: keep all + link as parallel
authority). Verified: 5 rows; the 1023-20 hub annotates 3 of its halachot with
equivalents; tsc --noEmit exits 0.

Invariants: G1 (model recurrence at source in its own table, not by abusing the
citator); G2 (no parallel path — extends list_halachot); citator integrity
preserved (corroboration stays citation-only).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-06 21:29:46 +00:00

124 lines
5.6 KiB
Python

#!/usr/bin/env python3
"""#82.7 — offline CROSS-precedent halacha dedup (conservative, dry-run reporter).
Dedup-on-insert (db.store_halachot_for_chunk) only compares within a single
precedent — the 2026-06-03 audit showed cosine ≥0.90 is reliable only
within-precedent. Across precedents the same principle legitimately recurs, so
this batch job is deliberately STRICTER (cosine ≥0.95) and NON-DESTRUCTIVE: by
default it only reports candidate cross-precedent near-duplicate pairs to a CSV
for the chair to review. Nothing is skipped, merged, or deleted; with ``--link``
the found pairs are additionally recorded as equivalent_halachot links.
Pairs are found with pgvector's exact cosine (``<=>``) per halacha against
halachot in OTHER precedents; a secondary lexical check (Jaccard/Levenshtein)
is reported alongside so the reviewer can tell "same rule" from "same topic".
cd ~/legal-ai/mcp-server
.venv/bin/python ../scripts/halacha_batch_reconcile.py # cosine ≥0.95
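.venv/bin/python ../scripts/halacha_batch_reconcile.py --cosine 0.97
.venv/bin/python ../scripts/halacha_batch_reconcile.py --link # also record pairs as equivalent_halachot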
"""
from __future__ import annotations
import argparse
import asyncio
import csv
import sys
from datetime import datetime, timezone
from pathlib import Path
from legal_mcp.services import db, halacha_quality as hq
# Repository root: this script sits in <repo>/scripts/, so the root is two
# levels above the resolved script path.
REPO_ROOT = Path(__file__).resolve().parent.parent
# Directory that receives the timestamped CSV reports.
AUDIT_DIR = REPO_ROOT.joinpath("data", "audit")
def _format_pair_preview(pair: dict) -> str:
    """Return the one-line console preview for a candidate pair."""
    # The two case numbers need a visible separator; printed back-to-back they
    # merge into a single unreadable token.
    return (
        f" cos={pair['cosine']} jac={pair['jaccard']} lev={pair['levenshtein']} "
        f"{pair['case_a']} ↔ {pair['case_b']}: {pair['rule_a'][:60]}..."
    )


def _write_csv_report(pairs: list[dict]) -> Path:
    """Write *pairs* to a timestamped CSV under AUDIT_DIR and return its path.

    *pairs* must be non-empty: the CSV header is taken from the keys of the
    first pair.
    """
    ts = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
    AUDIT_DIR.mkdir(parents=True, exist_ok=True)
    out = AUDIT_DIR / f"halacha-cross-precedent-{ts}.csv"
    with out.open("w", encoding="utf-8", newline="") as f:
        w = csv.DictWriter(f, fieldnames=list(pairs[0].keys()))
        w.writeheader()
        w.writerows(pairs)
    return out


async def main(args: argparse.Namespace) -> int:
    """Scan halachot for cross-precedent near-duplicates and report them.

    For every halacha with an embedding and an eligible review status, the
    single nearest halacha belonging to a DIFFERENT precedent is fetched; the
    pair is kept when its cosine similarity is at least ``args.cosine``. Kept
    pairs are printed (top 30), written to a CSV under ``AUDIT_DIR`` and, when
    ``args.link`` is set, passed to ``db.link_equivalent_halachot``.

    Args:
        args: Parsed CLI namespace. Reads ``cosine`` (minimum similarity),
            ``include_pending`` (also scan ``pending_review`` rows) and
            ``link`` (record the pairs as equivalent_halachot).

    Returns:
        Always ``0`` (process exit code).
    """
    cosine = args.cosine
    # ``<=>`` is used as a cosine distance, so the similarity floor becomes a
    # distance ceiling.
    max_dist = 1.0 - cosine
    statuses = ("approved", "published") if not args.include_pending else (
        "approved", "published", "pending_review")

    pool = await db.get_pool()
    async with pool.acquire() as conn:
        rows = await conn.fetch(
            "SELECT h.id, h.case_law_id, cl.case_number, h.rule_statement "
            "FROM halachot h JOIN case_law cl ON cl.id = h.case_law_id "
            "WHERE h.embedding IS NOT NULL AND h.review_status = ANY($1::text[]) "
            "ORDER BY h.case_law_id, h.halacha_index",
            list(statuses),
        )
        print(f"scanning {len(rows)} halachot for cross-precedent pairs "
              f"(cosine ≥ {cosine})...", flush=True)

        # A pair can be discovered from either side (A→B and B→A); the
        # unordered key makes sure it is reported once.
        seen: set[frozenset] = set()
        pairs: list[dict] = []
        for r in rows:
            # nearest neighbor in a DIFFERENT precedent
            nb = await conn.fetchrow(
                "SELECT h2.id, cl2.case_number, h2.rule_statement, "
                " (h2.embedding <=> (SELECT embedding FROM halachot WHERE id = $1)) AS dist "
                "FROM halachot h2 JOIN case_law cl2 ON cl2.id = h2.case_law_id "
                "WHERE h2.embedding IS NOT NULL AND h2.case_law_id <> $2 "
                " AND h2.review_status = ANY($3::text[]) "
                "ORDER BY h2.embedding <=> (SELECT embedding FROM halachot WHERE id = $1) "
                "LIMIT 1",
                r["id"], r["case_law_id"], list(statuses),
            )
            if nb is None or float(nb["dist"]) > max_dist:
                continue
            key = frozenset({str(r["id"]), str(nb["id"])})
            if key in seen:
                continue
            seen.add(key)
            # Key order here defines the CSV column order.
            pairs.append({
                "case_a": r["case_number"], "id_a": r["id"], "rule_a": r["rule_statement"],
                "case_b": nb["case_number"], "id_b": nb["id"], "rule_b": nb["rule_statement"],
                "cosine": round(1.0 - float(nb["dist"]), 4),
                # Lexical scores are reported next to the cosine so the
                # reviewer can tell "same rule" from "same topic".
                "jaccard": round(hq.jaccard_shingles(r["rule_statement"], nb["rule_statement"]), 3),
                "levenshtein": round(hq.normalized_levenshtein(r["rule_statement"], nb["rule_statement"]), 3),
            })

    # Most similar pairs first, both on the console and in the CSV.
    pairs.sort(key=lambda p: -p["cosine"])
    print(f"found {len(pairs)} cross-precedent candidate pair(s)", flush=True)
    for p in pairs[:30]:
        print(_format_pair_preview(p), flush=True)

    if pairs:
        out = _write_csv_report(pairs)
        print(f"\nreport: {out}", flush=True)

    if args.link and pairs:
        # #84.2 — record each pair as parallel authority (equivalent_halachot).
        # Non-destructive: links only, never merges/deletes. Idempotent.
        linked = 0
        for p in pairs:
            # Only calls that return a truthy value are counted as linked.
            if await db.link_equivalent_halachot(
                p["id_a"], p["id_b"], cosine=p["cosine"],
                note="cross-precedent parallel authority (halacha_batch_reconcile)",
                created_by="batch_reconcile",
            ):
                linked += 1
        print(f"linked {linked}/{len(pairs)} pairs as equivalent_halachot", flush=True)
    elif pairs:
        print("(review-only — pass --link to record them as equivalent_halachot)", flush=True)
    return 0
if __name__ == "__main__":
    # CLI entry point. The module docstring doubles as the --help description;
    # the raw formatter keeps its line breaks and usage examples intact.
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=__doc__,
    )
    parser.add_argument(
        "--cosine",
        default=0.95,
        type=float,
        help="min cosine for a cross-precedent candidate (default 0.95)",
    )
    parser.add_argument(
        "--include-pending",
        action="store_true",
        help="also scan pending_review halachot (default: approved/published only)",
    )
    parser.add_argument(
        "--link",
        action="store_true",
        help="record found pairs as equivalent_halachot (parallel authority, #84.2)",
    )
    args = parser.parse_args()
    # Run the async scan to completion and use its return value as exit status.
    exit_code = asyncio.run(main(args))
    sys.exit(exit_code)