Files
legal-ai/mcp-server/src/legal_mcp/services/metrics.py
Chaim 420cb819f5 feat(halacha-triage): quality-gated + prioritized review queue + metrics (#84)
Backend for the halacha approval-queue triage (#84). The keyboard UI, batch
actions and defer/reject (#84.4–6) already shipped; this adds the gating,
prioritization and metrics the queue was missing.

db.list_halachot — two opt-in triage controls:
  * exclude_low_quality (#84.1): drop items carrying ANY quality_flag
    (application / quote_unverified / truncated / non_decision / thin /
    nli_unsupported / near_duplicate) — they belong in a 'needs extraction fix'
    bucket, not the chair's approve queue.
  * order_by_priority (#84.3): active-learning order — negatively-treated
    first, then most-uncertain (lowest confidence), then oldest — instead of
    FIFO, so the highest-value decisions surface first.

halachot_pending (MCP) — now gated + prioritized BY DEFAULT; include_low_quality=
true reveals the needs-fix bucket. The agent review path benefits immediately.

GET /api/halachot — same two params, default OFF (non-breaking; the UI opts in).

metrics.halacha_backlog (#84.7) — splits pending into clean vs flagged, adds
deferred, reviewed_total, approve_ratio, and a pending_by_flag breakdown, so the
backlog distinguishes real review work from extraction noise.

Deferred (documented): #84.2 near-duplicate cluster cards and wiring the UI
fetch to the new params require frontend work + an api:types regen AFTER this
deploys (the new query params aren't in prod's OpenAPI until then) — a clean
follow-up. The backend fully supports both now.

Verified against the live DB (read-only):
- pending 177 → gated-clean 110, 0 flagged items leak into the clean queue.
- priority order surfaces the lowest-confidence items first (0.55, 0.55, ...).
- backlog: pending_clean=110 / pending_flagged=67 / approve_ratio=0.916,
  pending_by_flag={nli_unsupported:59, quote_unverified:3, thin:3, truncated:2}.
- pytest tests/test_halacha_quality.py — 52 passed (no regression).

Invariants: G1 (gate at source — SQL filter, not post-hoc); G2 (no parallel
path — same list_halachot); §6 (flagged items routed to a bucket, never dropped).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-06 20:00:52 +00:00

227 lines
8.9 KiB
Python

"""מדדי הצלחה (KPIs) לתהליך כתיבת החלטות.
מדדים:
1. אחוז שינוי — השוואת טיוטה לגרסה סופית (יעד: <10%)
2. אפס הזיות — ספירת הפניות לא מבוססות
3. מענה לכל טענה — כיסוי טענות בדיון
4. משקלות בטווח — עמידה ביחסי הזהב
5. רקע ניטרלי — ללא מילות שיפוט
6. זמן עיבוד — מקליטה עד טיוטה
"""
from __future__ import annotations
import json
import logging
from datetime import datetime
from uuid import UUID
from legal_mcp.services import db
logger = logging.getLogger(__name__)
def _summarize_qa_rows(qa_rows) -> dict:
    """Collapse ``qa_results`` rows into a per-check summary keyed by check name.

    The ``errors`` column is accepted either as a JSON-encoded string or as
    an already-decoded value; a missing/empty value counts as zero errors.
    """
    summary = {}
    for row in qa_rows:
        errors = row["errors"]
        if isinstance(errors, str):
            errors = json.loads(errors)
        summary[row["check_name"]] = {
            "passed": row["passed"],
            "severity": row["severity"],
            "error_count": len(errors) if errors else 0,
        }
    return summary


def _processing_hours(docs, decision):
    """Return hours from the earliest document to the decision, or ``None``.

    ``None`` is returned when there are no documents, no decision, or when
    either timestamp is missing.
    """
    if not docs or not decision:
        return None
    # default=None: documents can exist without any usable ``created_at``;
    # min() over an empty iterable would otherwise raise ValueError.
    first_doc_time = min(
        (d["created_at"] for d in docs if d.get("created_at")),
        default=None,
    )
    decision_time = decision.get("created_at")
    if not first_doc_time or not decision_time:
        return None
    # NOTE(review): assumes both timestamps share the same tz-awareness;
    # mixing naive and aware datetimes would raise TypeError — confirm in db.
    delta = decision_time - first_doc_time
    return round(delta.total_seconds() / 3600, 1)


async def get_case_metrics(case_id: UUID) -> dict:
    """Compute the KPI metrics for a single case.

    Args:
        case_id: Identifier of the case to report on.

    Returns:
        A dict with the case header (``case_number``, ``title``, ``status``),
        ``draft_words`` / ``change_percent``, the QA roll-up (``qa``,
        ``qa_passed``, ``qa_critical_failures`` — all ``None`` when the case
        has no QA rows), ``total_claims``, ``total_documents`` and
        ``processing_hours``.

    Raises:
        ValueError: If no case exists for ``case_id``.
    """
    case = await db.get_case(case_id)
    if not case:
        raise ValueError(f"Case {case_id} not found")

    decision = await db.get_decision_by_case(case_id)
    pool = await db.get_pool()

    metrics = {
        "case_number": case["case_number"],
        "title": case.get("title", ""),
        "status": case.get("status", ""),
    }

    # 1. Change percentage (only meaningful once a final version exists).
    metrics["draft_words"] = 0
    # change_percent is produced by the learning loop, not computed here.
    metrics["change_percent"] = None
    if decision and decision.get("status") == "final":
        async with pool.acquire() as conn:
            draft_words = await conn.fetchval(
                "SELECT SUM(word_count) FROM decision_blocks WHERE decision_id = $1",
                # str() first so the id may be either a string or a UUID.
                UUID(str(decision["id"])),
            )
        # SUM() yields NULL when the decision has no blocks.
        metrics["draft_words"] = draft_words or 0

    # 2. QA results
    async with pool.acquire() as conn:
        qa_rows = await conn.fetch(
            "SELECT check_name, passed, severity, errors FROM qa_results WHERE case_id = $1",
            case_id,
        )
    if qa_rows:
        qa_results = _summarize_qa_rows(qa_rows)
        metrics["qa"] = qa_results
        metrics["qa_passed"] = all(r["passed"] for r in qa_results.values())
        metrics["qa_critical_failures"] = sum(
            1 for r in qa_results.values()
            if not r["passed"] and r["severity"] == "critical"
        )
    else:
        metrics["qa"] = None
        metrics["qa_passed"] = None
        # Same key set in both branches, so callers can index the key
        # without a KeyError when QA has not run yet.
        metrics["qa_critical_failures"] = None

    # 3. Claims coverage
    claims = await db.get_claims(case_id)
    metrics["total_claims"] = len(claims)

    # 4. Documents
    docs = await db.list_documents(case_id)
    metrics["total_documents"] = len(docs)

    # 5. Processing time (first document -> decision)
    metrics["processing_hours"] = _processing_hours(docs, decision)

    return metrics
async def halacha_backlog(conn) -> dict:
    """Summarize the halacha approval queue (GAP-14 / INV-QA1 / G10).

    Halachot enter as ``pending_review`` and stay invisible to search until
    the chair approves them; without a visible count a missing approval goes
    unnoticed. An already-open connection is taken as the argument so the
    figures can be folded into an existing snapshot (``get_dashboard``,
    ``/api/system/diagnostics``).

    Args:
        conn: Open DB connection exposing awaitable ``fetch`` / ``fetchval``.

    Returns:
        Per-status counts, the clean/flagged split of the pending bucket
        (#84.7), ``reviewed_total``, ``approve_ratio`` (``None`` when nothing
        has been reviewed), a per-flag breakdown of pending items and the
        ISO timestamp of the oldest pending item (or ``None``).
    """
    status_rows = await conn.fetch(
        "SELECT review_status, COUNT(*) AS n FROM halachot GROUP BY review_status"
    )
    by_status = {}
    for row in status_rows:
        by_status[row["review_status"]] = row["n"]

    oldest_pending = await conn.fetchval(
        "SELECT MIN(created_at) FROM halachot WHERE review_status = 'pending_review'"
    )

    # #84.7 — pending items with no quality flag are the genuine review
    # candidates; the rest form the 'needs extraction fix' bucket.
    clean_count = await conn.fetchval(
        "SELECT COUNT(*) FROM halachot WHERE review_status = 'pending_review' "
        "AND COALESCE(array_length(quality_flags, 1), 0) = 0"
    )
    per_flag_rows = await conn.fetch(
        "SELECT flag, COUNT(*) AS n FROM ("
        "  SELECT unnest(quality_flags) AS flag FROM halachot "
        "  WHERE review_status = 'pending_review'"
        ") t GROUP BY flag ORDER BY n DESC"
    )

    def tally(status):
        # Statuses with no rows are absent from the GROUP BY result.
        return by_status.get(status, 0)

    pending = tally("pending_review")
    approved = tally("approved")
    rejected = tally("rejected")
    published = tally("published")
    reviewed_total = approved + rejected + published

    if reviewed_total:
        approve_ratio = round(approved / reviewed_total, 3)
    else:
        approve_ratio = None

    flag_breakdown = {}
    for row in per_flag_rows:
        flag_breakdown[row["flag"]] = row["n"]

    if oldest_pending:
        oldest_iso = oldest_pending.isoformat()
    else:
        oldest_iso = None

    return {
        "pending_review": pending,
        "pending_clean": clean_count,  # real review candidates (#84.1)
        "pending_flagged": pending - clean_count,  # needs-fix bucket
        "approved": approved,
        "rejected": rejected,
        "deferred": tally("deferred"),
        "published": published,
        "total": sum(by_status.values()),
        "reviewed_total": reviewed_total,
        "approve_ratio": approve_ratio,
        "pending_by_flag": flag_breakdown,
        "oldest_pending_at": oldest_iso,
    }
async def get_dashboard() -> dict:
    """Build the overall dashboard: aggregate metrics across all cases.

    Every query runs on a single connection acquired from the pool, and the
    halacha review backlog is computed on that same connection.

    Returns:
        A dict with ``summary`` (scalar counters), ``cases_by_status``,
        ``halacha_backlog``, ``qa`` (validated / passed / pass rate) and
        ``avg_decision_words``.
    """
    # Plain scalar counters as (summary key, SQL), listed in execution order.
    counter_queries = (
        ("total_cases", "SELECT COUNT(*) FROM cases"),
        ("total_documents", "SELECT COUNT(*) FROM documents"),
        ("total_claims", "SELECT COUNT(*) FROM claims"),
        ("total_chunks", "SELECT COUNT(*) FROM document_chunks"),
        ("total_decisions", "SELECT COUNT(*) FROM decisions"),
        ("style_corpus", "SELECT COUNT(*) FROM style_corpus"),
        ("style_patterns", "SELECT COUNT(*) FROM style_patterns"),
        ("case_law_entries", "SELECT COUNT(*) FROM case_law"),
        (
            "non_searchable_case_law",
            "SELECT COUNT(*) FROM case_law WHERE NOT searchable",
        ),
        (
            "cases_with_stale_blocks",
            "SELECT COUNT(*) FROM cases WHERE blocks_stale",
        ),
        (
            "stale_embedding_case_law",
            "SELECT COUNT(*) FROM case_law "
            "WHERE coalesce(full_text,'') <> '' AND content_hash IS DISTINCT FROM indexed_hash",
        ),
    )
    # Key order of the emitted ``summary`` section.
    summary_keys = (
        "total_cases",
        "total_documents",
        "total_claims",
        "total_chunks",
        "total_decisions",
        "final_decisions",
        "style_corpus",
        "style_patterns",
        "case_law_entries",
        "non_searchable_case_law",
        "cases_with_stale_blocks",
        "stale_embedding_case_law",
    )

    pool = await db.get_pool()
    async with pool.acquire() as conn:
        # Case counts by status
        grouped = await conn.fetch(
            "SELECT status, COUNT(*) as cnt FROM cases GROUP BY status ORDER BY cnt DESC"
        )
        by_status = {}
        for row in grouped:
            by_status[row["status"]] = row["cnt"]

        # Total counts
        counters = {}
        for key, sql in counter_queries:
            counters[key] = await conn.fetchval(sql)

        # QA summary: a case passes when it has no failed critical check.
        validated = await conn.fetchval(
            "SELECT COUNT(DISTINCT case_id) FROM qa_results"
        )
        passed = await conn.fetchval(
            "SELECT COUNT(DISTINCT case_id) FROM qa_results\n"
            "WHERE case_id NOT IN (\n"
            "SELECT case_id FROM qa_results WHERE passed = false AND severity = 'critical'\n"
            ")"
        )

        # Final decisions
        counters["final_decisions"] = await conn.fetchval(
            "SELECT COUNT(*) FROM decisions WHERE status = 'final'"
        )

        # Average words per decision
        mean_words = await conn.fetchval(
            "SELECT AVG(total_words) FROM decisions WHERE total_words > 0"
        )

        # Halacha review backlog (GAP-14 / INV-QA1 / G10)
        backlog = await halacha_backlog(conn)

    if validated:
        pass_rate = round(passed / validated * 100, 1)
    else:
        pass_rate = None

    if mean_words:
        avg_decision_words = round(mean_words)
    else:
        avg_decision_words = None

    return {
        "summary": {key: counters[key] for key in summary_keys},
        "cases_by_status": by_status,
        "halacha_backlog": backlog,
        "qa": {
            "cases_validated": validated,
            "cases_passed": passed,
            "pass_rate": pass_rate,
        },
        "avg_decision_words": avg_decision_words,
    }