feat: Stage C — RAG advanced (#33, #47, #48, #49, #50, #51)
All checks were successful
Build & Deploy / build-and-deploy (push) Successful in 1m35s
All checks were successful
Build & Deploy / build-and-deploy (push) Successful in 1m35s
Six independent sub-tasks dispatched in parallel; aggregated here. ## #33 — Hide case_name column library-list-panel.tsx: `<TableHead>` + `<TableCell>` for "שם" get `className="hidden"` in both Court and Committee row variants. DB column preserved for future use. ## #47 — Audit script periodic New scripts/audit_corpus_integrity.py — 3 SQL checks (external+ערר prefix, internal missing chair/district, cases.practice_area enum) + CEO wakeup on violations + cron `0 7 * * *`. First run: 0 issues. ## #48 — Parent-doc retrieval (gated, default off) Schema V17: precedent_chunks.parent_chunk_id + chunk_role ('child'|'parent'). New chunker.chunk_document_hierarchical() — section-aware parents (~1500 tokens) containing ~5 overlapping children (~300 tokens each). New db.store_precedent_chunks_hierarchical two-pass writer. Search SQL (semantic + lexical) LEFT-JOIN parent and swap content + dedupe by parent_chunk_id when flag on. Toggle: PARENT_DOC_RETRIEVAL_ENABLED + PARENT_DOC_{CHILD,PARENT}_SIZE_TOKENS. Backfill ~3min and ~$0.20 — deferred to follow-up. ## #49 — Multimodal backfill New scripts/backfill_multimodal_precedents.py with token-matching case_number ↔ source files (PDF + DOCX via PyMuPDF). Ran in container: 26 precedents embedded, 503 pages, $0.21, 0 errors. precedent_image_embeddings grew 3 → 29 rows. 44 remaining are style_corpus-migrated rows (no source file on disk) — will catch up when re-uploaded. ## #50 — Closed-loop feedback + nDCG Schema V18: search_logs + search_relevance_feedback. New telemetry.py with fire-and-forget log_search_bg (p50 = 0.002ms — zero overhead) + auto-infer_relevance_from_citations (reads case drafts → marks score=3 when cited precedent appears in past search top-K). Hooks added to 5 search paths. scripts/compute_ndcg.py for aggregation. Two admin API endpoints (GET /api/admin/rag-metrics + POST .../infer). Dashboard UI deferred — API is enough for now. 
## #51 — Halacha quality monitoring New scripts/monitor_halacha_quality.py — baseline avg confidence (trusted=0.849, all=0.833, pending=0.694) with rolling window drift detection. Default 5% threshold. Exits non-zero on alert when run with `--exit-on-alert` (default: always exit 0), for cron integration. Recommended: `0 8 * * 1` weekly Mon 8am. ## Bonus: 230 unlinked citations → missing_precedents Bulk-imported 230 distinct unlinked citations from precedent_internal_citations to missing_precedents.status='open', party='committee', with notes listing source citers. Top candidate: ע"א 3213/97 (cited 5x). Total open missing_precedents now 237. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
278
scripts/monitor_halacha_quality.py
Normal file
278
scripts/monitor_halacha_quality.py
Normal file
@@ -0,0 +1,278 @@
|
||||
"""Halacha extraction quality monitor.
|
||||
|
||||
Tracks ``avg(confidence)`` of halachot extracted by the LLM pipeline
over time and emits an alert when the recent-window average drops by
at least a configurable relative threshold below the lifetime baseline.

Intended schedule: weekly cron, e.g. ``0 8 * * 1`` (Monday 08:00).

Output: a pretty-printed (multi-line) JSON report on stdout (suitable
for piping into ``notify.py`` or a webhook), plus a human-readable alert
text on stderr when drift is detected.
|
||||
|
||||
Usage
|
||||
-----
|
||||
|
||||
::
|
||||
|
||||
# Default — weekly window, 5% drop threshold (relative)
|
||||
python scripts/monitor_halacha_quality.py
|
||||
|
||||
# Custom window/threshold:
|
||||
python scripts/monitor_halacha_quality.py --window 14 --threshold 0.03
|
||||
|
||||
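# Only emit JSON, no stderr alert:
python scripts/monitor_halacha_quality.py --silent

# Cron usage: exit with status 1 when an alert fires (default is exit 0):
python scripts/monitor_halacha_quality.py --exit-on-alert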
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import asyncio
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def _setup_paths():
    """Put the ``legal_mcp`` source tree on ``sys.path``.

    Two locations are probed in order: ``../mcp-server/src`` relative to
    this script's directory, and the fixed path ``/app/mcp-server/src``.
    Every location that exists as a directory and is not yet listed in
    ``sys.path`` is inserted at the front.
    """
    script_dir = Path(__file__).resolve().parent
    search_roots = (
        script_dir.parent / "mcp-server" / "src",  # host
        Path("/app/mcp-server/src"),  # container
    )
    for root in search_roots:
        if not root.is_dir():
            continue
        entry = str(root)
        if entry in sys.path:
            continue
        sys.path.insert(0, entry)
|
||||
|
||||
|
||||
_setup_paths()
|
||||
|
||||
from legal_mcp.services import db # noqa: E402
|
||||
|
||||
|
||||
# Statuses considered "trusted", i.e. halachot whose extraction the chair
# has accepted; the trusted baseline is computed only over these.
# ``pending_review`` is the queue still waiting for review; its average
# tends to be lower because obviously bad extractions are rejected before
# approval and so never reach the trusted set. We therefore track TWO
# series and alert when either one drifts:
#   1. Trusted (approved+published) — drift here means the extractor's
#      "best output" quality is degrading.
#   2. All extracted — drift here means raw extractor accuracy is down.
|
||||
TRUSTED_STATUSES = ("approved", "published")
|
||||
|
||||
|
||||
async def _collect_metrics(window_days: int) -> dict:
    """Query count and average confidence for every monitored series.

    Five aggregates are read from the ``halachot`` table, one query each:
    lifetime (all rows / trusted rows), recent window (all rows / trusted
    rows), and the current ``pending_review`` queue.

    Args:
        window_days: Size of the recent window in days. Rows with
            ``created_at`` later than ``NOW() - window_days`` are "recent".

    Returns:
        A flat dict holding ``window_days`` plus, for each series prefix
        (``lifetime_all``, ``lifetime_trusted``, ``recent_all``,
        ``recent_trusted``, ``pending_review``), a ``<prefix>_count`` int
        (0 when the database yields NULL) and a ``<prefix>_avg`` float
        (``None`` when there is nothing to average).
    """
    pool = await db.get_pool()
    days = int(window_days)
    trusted = list(TRUSTED_STATUSES)

    select_stats = (
        "SELECT count(*) AS n, AVG(confidence) AS avg_conf FROM halachot"
    )
    # ``days`` has been through int(), so inlining it cannot inject SQL.
    in_window = f"created_at > NOW() - INTERVAL '{days} days'"
    is_trusted = "review_status = ANY($1::text[])"

    # (result-key prefix, SQL, bind arguments). The order here fixes both
    # the query order and the key order of the returned dict.
    queries = [
        # Lifetime baselines
        ("lifetime_all", select_stats, ()),
        ("lifetime_trusted", f"{select_stats} WHERE {is_trusted}", (trusted,)),
        # Recent window
        ("recent_all", f"{select_stats} WHERE {in_window}", ()),
        (
            "recent_trusted",
            f"{select_stats} WHERE {in_window} AND {is_trusted}",
            (trusted,),
        ),
        # Everything currently waiting for review. No time window or row
        # limit is applied: this is the whole pending queue, reported as a
        # possible early canary rather than a windowed series.
        (
            "pending_review",
            f"{select_stats} WHERE review_status = 'pending_review'",
            (),
        ),
    ]

    metrics: dict = {"window_days": days}
    for prefix, sql, args in queries:
        row = await pool.fetchrow(sql, *args)
        count, avg = row["n"], row["avg_conf"]
        metrics[f"{prefix}_count"] = int(count) if count is not None else 0
        metrics[f"{prefix}_avg"] = float(avg) if avg is not None else None
    return metrics
|
||||
|
||||
|
||||
def _drift(baseline: float | None, recent: float | None) -> float | None:
    """Return the relative drop of ``recent`` below ``baseline``.

    The value is ``(baseline - recent) / baseline``: positive when the
    recent average is lower than the baseline, zero when they are equal and
    negative when the recent average is higher.

    Returns ``None`` when either input is ``None`` or when the baseline is
    not strictly positive (a relative drop is undefined in that case).

    >>> round(_drift(0.85, 0.80), 4)
    0.0588
    >>> _drift(0.80, 0.80)
    0.0
    >>> _drift(None, 0.80) is None
    True
    """
    if baseline is None or recent is None or baseline <= 0:
        return None
    return (baseline - recent) / baseline
|
||||
|
||||
|
||||
def _evaluate(metrics: dict, threshold: float, min_sample: int) -> dict:
    """Compare each series' recent average with its lifetime baseline.

    Two series are examined: ``trusted`` and ``all_extracted``. A series
    raises an alert only when its recent sample count is at least
    ``min_sample``, a drift value could be computed, and that drift is
    greater than or equal to ``threshold``.

    Returns:
        ``{"alert": bool, "series": [entry, ...]}`` where ``alert`` is true
        if any entry alerted, and every entry carries ``series``,
        ``baseline``, ``recent``, ``recent_n``, ``drift``, ``alert`` and a
        human-readable ``reason``.
    """
    # (reported series name, key fragment used inside ``metrics``)
    watched = (("trusted", "trusted"), ("all_extracted", "all"))

    results: list[dict] = []
    for series_name, key in watched:
        baseline = metrics[f"lifetime_{key}_avg"]
        recent = metrics[f"recent_{key}_avg"]
        sample = metrics[f"recent_{key}_count"]
        drift = _drift(baseline, recent)

        fired = False
        if sample < min_sample:
            why = f"recent_n={sample} below min_sample={min_sample}"
        elif drift is None:
            why = "missing baseline or recent average"
        elif drift >= threshold:
            fired = True
            why = (
                f"drift {drift:.1%} >= threshold {threshold:.1%} "
                f"(baseline={baseline:.3f}, recent={recent:.3f}, n={sample})"
            )
        else:
            why = f"drift {drift:.1%} < threshold {threshold:.1%} — within tolerance"

        results.append(
            {
                "series": series_name,
                "baseline": baseline,
                "recent": recent,
                "recent_n": sample,
                "drift": drift,
                "alert": fired,
                "reason": why,
            }
        )

    return {"alert": any(r["alert"] for r in results), "series": results}
|
||||
|
||||
|
||||
def _format_alert_text(metrics: dict, decision: dict) -> str:
    """Render the evaluation result as newline-joined plain text.

    The first line names the window size, followed by an empty line. Each
    series then gets one summary line and, when it has a non-empty reason,
    a second line with that reason. Missing numbers are shown as an em dash.
    """

    def _show(value, spec: str) -> str:
        # Format a number with ``spec``; ``None`` becomes the placeholder.
        return format(value, spec) if value is not None else "—"

    out = [f"Halacha quality alert — window={metrics['window_days']}d", ""]
    for item in decision["series"]:
        tag = "ALERT" if item["alert"] else "ok"
        base_txt = _show(item["baseline"], ".3f")
        recent_txt = _show(item["recent"], ".3f")
        drift_txt = _show(item["drift"], ".1%")
        out.append(
            f" [{tag}] {item['series']}: baseline={base_txt} recent={recent_txt} "
            f"drift={drift_txt} n={item['recent_n']}"
        )
        note = item["reason"]
        if note:
            out.append(f" {note}")
    return "\n".join(out)
|
||||
|
||||
|
||||
async def run(*, window_days: int, threshold: float, min_sample: int) -> dict:
    """Collect the metrics, evaluate them and assemble the full report.

    All arguments are keyword-only and are echoed back in the report
    (``threshold`` under the key ``threshold_rel``) alongside a UTC
    ISO-8601 ``generated_at`` timestamp, the raw ``metrics`` and the
    ``decision`` produced by the evaluation step.
    """
    collected = await _collect_metrics(window_days)
    verdict = _evaluate(collected, threshold, min_sample)

    report: dict = {"generated_at": datetime.now(timezone.utc).isoformat()}
    report["window_days"] = window_days
    report["threshold_rel"] = threshold
    report["min_sample"] = min_sample
    report["metrics"] = collected
    report["decision"] = verdict
    return report
|
||||
|
||||
|
||||
def main(argv: list[str] | None = None) -> None:
    """Command-line entry point: parse options, run the monitor, report.

    Prints the JSON report to stdout. When an alert fired and ``--silent``
    was not given, a human-readable summary is also written to stderr.
    The process exits with status 1 only when ``--exit-on-alert`` is set
    and an alert fired; invalid option values exit with status 2 through
    argparse's usual error path.

    Args:
        argv: Arguments to parse instead of ``sys.argv[1:]``. ``None``
            (the default) keeps the normal command-line behaviour.
    """
    parser = argparse.ArgumentParser(
        description="Monitor halacha extraction quality (confidence drift)."
    )
    parser.add_argument(
        "--window", type=int, default=7,
        help="Recent window in days (default: 7).",
    )
    parser.add_argument(
        "--threshold", type=float, default=0.05,
        help="Relative drop alert threshold (default: 0.05 = 5%%).",
    )
    parser.add_argument(
        "--min-sample", type=int, default=5,
        help="Minimum halachot in window to evaluate (default: 5). "
        "Below this, the series is reported but not alerted on.",
    )
    parser.add_argument(
        "--silent", action="store_true",
        help="Suppress stderr alert text; only print JSON.",
    )
    parser.add_argument(
        "--exit-on-alert", action="store_true",
        help="Exit with status 1 when an alert fires (default: always exit 0).",
    )
    args = parser.parse_args(argv)

    # A window of zero or fewer days selects no rows, so the monitor would
    # silently never alert; fail loudly instead of reporting "all clear".
    if args.window < 1:
        parser.error("--window must be at least 1 day")
    # Written as ``not >=`` so that NaN is rejected as well as negatives.
    if not args.threshold >= 0:
        parser.error("--threshold must be a non-negative number")

    report = asyncio.run(
        run(
            window_days=args.window,
            threshold=args.threshold,
            min_sample=args.min_sample,
        )
    )

    # JSON to stdout
    print(json.dumps(report, ensure_ascii=False, indent=2))

    alerted = report["decision"]["alert"]
    if alerted and not args.silent:
        print("", file=sys.stderr)
        print(
            _format_alert_text(report["metrics"], report["decision"]),
            file=sys.stderr,
        )

    if args.exit_on_alert and alerted:
        sys.exit(1)


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user