Files
legal-ai/mcp-server/src/legal_mcp/services/metrics.py
Chaim 6ff2e36bf9 feat(eval): FU-5 — retrieval eval harness + halacha backlog visibility (#63)
Covers GAP-11 (INV-RET4/G8) and GAP-14 (INV-QA1/G10). Retrieval quality was
never measured (only telemetry observation) and the halacha review backlog was
invisible (the 10/19 gap was found by accident).

Unit B — backlog visibility (pure code, container):
- metrics.halacha_backlog(conn) → {pending_review, approved, rejected, published,
  total, oldest_pending_at}; surfaced in metrics.get_dashboard() (get_metrics MCP
  tool) and /api/system/diagnostics. Live count revealed 178 pending / 1552 total,
  oldest from 2026-05-03 — previously invisible.

Unit A — retrieval eval harness (host-side scripts):
- scripts/eval_gold_bootstrap.py — seeds data/eval/gold-set.jsonl. Two sources:
  citations (cited==relevant via search_relevance_feedback — empty until decisions
  cite precedents) and known_item (query=case_name → relevant=self; a real
  citation-free signal, the methodology #52 checked by hand). Idempotent; preserves
  source='chair' rows.
- scripts/eval_retrieval.py — runs the production retrieval path (search_library /
  search_internal) over the gold-set; computes precision@k, recall@k, MRR, nDCG@k
  (k=5,10); aggregates overall + per-corpus + per-practice_area; writes a report and
  a delta vs committed baseline.json (which records the retrieval_config it reflects).
  --self-test unit-checks the metric math offline.

Gold-set strategy = hybrid (chair decision): bootstrap + chair review. The citation
source is empty today (0 cited precedents in decisions), so the seed is known-item
(77 queries: 54 internal_decisions + 23 precedent_library). The gold-set is
PROVISIONAL until Dafna reviews it (the domain chair-gate).

Baseline (production config: multimodal+rerank on): R@10=0.987, MRR=0.837,
nDCG@10=0.872. Finding: MULTIMODAL_ENABLED=true slightly lowers known-item recall
(image-page results displace exact name matches) — relevant to #15. The
precedent_library corpus is weaker than internal_decisions (R@10 0.957 vs 1.0) —
one external precedent cannot be found by name.

"CI gate" realized as discipline (re-runnable harness + committed baseline + run
before/after any retrieval-layer change) — retrieval needs prod DB + Voyage, no CI
runner has that access.

Spec: docs/superpowers/specs/2026-05-31-fu5-eval-harness-design.md

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-05-31 14:58:13 +00:00

206 lines
7.7 KiB
Python

"""מדדי הצלחה (KPIs) לתהליך כתיבת החלטות.
מדדים:
1. אחוז שינוי — השוואת טיוטה לגרסה סופית (יעד: <10%)
2. אפס הזיות — ספירת הפניות לא מבוססות
3. מענה לכל טענה — כיסוי טענות בדיון
4. משקלות בטווח — עמידה ביחסי הזהב
5. רקע ניטרלי — ללא מילות שיפוט
6. זמן עיבוד — מקליטה עד טיוטה
"""
from __future__ import annotations
import json
import logging
from datetime import datetime
from uuid import UUID
from legal_mcp.services import db
logger = logging.getLogger(__name__)
async def get_case_metrics(case_id: UUID) -> dict:
    """Compute the KPI snapshot for a single case.

    Args:
        case_id: Identifier of the case to measure.

    Returns:
        A dict with ``case_number``, ``title``, ``status``, ``draft_words``,
        ``change_percent``, ``qa``, ``qa_passed``, ``total_claims``,
        ``total_documents`` and ``processing_hours``. The key
        ``qa_critical_failures`` is present only when QA rows exist for the
        case.

    Raises:
        ValueError: If ``db.get_case`` returns nothing for ``case_id``.
    """
    case = await db.get_case(case_id)
    if not case:
        raise ValueError(f"Case {case_id} not found")
    decision = await db.get_decision_by_case(case_id)
    pool = await db.get_pool()
    metrics = {
        "case_number": case["case_number"],
        "title": case.get("title", ""),
        "status": case.get("status", ""),
    }

    # 1. Draft size / change percentage (only once a final version exists).
    if decision and decision.get("status") == "final":
        async with pool.acquire() as conn:
            draft_words = await conn.fetchval(
                "SELECT SUM(word_count) FROM decision_blocks WHERE decision_id = $1",
                UUID(decision["id"]),
            )
        # SUM over zero rows yields NULL, so fall back to 0.
        metrics["draft_words"] = draft_words or 0
        # Not computed here: per the original note it is populated from the
        # learning-loop results, so this function always reports None.
        metrics["change_percent"] = None
    else:
        metrics["draft_words"] = 0
        metrics["change_percent"] = None

    # 2. QA results, keyed by check name.
    async with pool.acquire() as conn:
        qa_rows = await conn.fetch(
            "SELECT check_name, passed, severity, errors FROM qa_results WHERE case_id = $1",
            case_id,
        )
    if qa_rows:
        qa_results = {}
        for row in qa_rows:
            # ``errors`` may arrive as a JSON string or as an already-decoded value.
            errors = json.loads(row["errors"]) if isinstance(row["errors"], str) else row["errors"]
            qa_results[row["check_name"]] = {
                "passed": row["passed"],
                "severity": row["severity"],
                "error_count": len(errors) if errors else 0,
            }
        metrics["qa"] = qa_results
        metrics["qa_passed"] = all(r["passed"] for r in qa_results.values())
        metrics["qa_critical_failures"] = sum(
            1 for r in qa_results.values()
            if not r["passed"] and r["severity"] == "critical"
        )
    else:
        metrics["qa"] = None
        metrics["qa_passed"] = None

    # 3. Claims coverage
    claims = await db.get_claims(case_id)
    metrics["total_claims"] = len(claims)

    # 4. Documents
    docs = await db.list_documents(case_id)
    metrics["total_documents"] = len(docs)

    # 5. Processing time: earliest document timestamp to decision creation.
    metrics["processing_hours"] = None
    if docs and decision:
        # ``default=None`` matters: when no document carries a timestamp the
        # filtered iterable is empty and a bare ``min()`` raises ValueError.
        first_doc_time = min(
            (d["created_at"] for d in docs if d.get("created_at")),
            default=None,
        )
        decision_time = decision.get("created_at")
        if first_doc_time and decision_time:
            delta = decision_time - first_doc_time
            metrics["processing_hours"] = round(delta.total_seconds() / 3600, 1)
    return metrics
async def halacha_backlog(conn) -> dict:
    """Report the halacha approval queue (GAP-14 / INV-QA1 / G10).

    Makes the human review backlog visible. Halachot enter as
    ``pending_review`` and stay invisible to search until the chair approves
    them; without a visible count a missing approval goes unnoticed (the 10/19
    gap was found by accident).

    Args:
        conn: An already-open connection, so the figures can be folded into an
            existing snapshot (``get_dashboard``, ``/api/system/diagnostics``).

    Returns:
        A dict with the row count for each of ``pending_review``,
        ``approved``, ``rejected`` and ``published`` (0 when a status has no
        rows), ``total`` across every status returned by the query, and
        ``oldest_pending_at`` as an ISO-8601 string, or ``None`` when nothing
        is pending.
    """
    tally = {}
    for record in await conn.fetch(
        "SELECT review_status, COUNT(*) AS n FROM halachot GROUP BY review_status"
    ):
        tally[record["review_status"]] = record["n"]

    earliest_pending = await conn.fetchval(
        "SELECT MIN(created_at) FROM halachot WHERE review_status = 'pending_review'"
    )

    report = {
        status: tally.get(status, 0)
        for status in ("pending_review", "approved", "rejected", "published")
    }
    # The total also counts statuses that are not broken out above.
    report["total"] = sum(tally.values())
    if earliest_pending:
        report["oldest_pending_at"] = earliest_pending.isoformat()
    else:
        report["oldest_pending_at"] = None
    return report
async def get_dashboard() -> dict:
    """Assemble the global dashboard: aggregate metrics across all cases.

    All queries are awaited one after another on a single pooled connection.

    Returns:
        A dict with ``summary`` (row counts and staleness counters),
        ``cases_by_status``, ``halacha_backlog``, ``qa`` (``cases_validated``,
        ``cases_passed`` and ``pass_rate`` as a percentage rounded to one
        decimal, or ``None`` when no case was validated) and
        ``avg_decision_words`` (rounded, or ``None`` when the average is
        missing or zero).
    """
    # (summary key, SQL) pairs, listed in the order the queries are issued.
    # They are split in two so "final_decisions" can be placed between them
    # when the summary is assembled.
    leading_counts = (
        ("total_cases", "SELECT COUNT(*) FROM cases"),
        ("total_documents", "SELECT COUNT(*) FROM documents"),
        ("total_claims", "SELECT COUNT(*) FROM claims"),
        ("total_chunks", "SELECT COUNT(*) FROM document_chunks"),
        ("total_decisions", "SELECT COUNT(*) FROM decisions"),
    )
    trailing_counts = (
        ("style_corpus", "SELECT COUNT(*) FROM style_corpus"),
        ("style_patterns", "SELECT COUNT(*) FROM style_patterns"),
        ("case_law_entries", "SELECT COUNT(*) FROM case_law"),
        ("non_searchable_case_law", "SELECT COUNT(*) FROM case_law WHERE NOT searchable"),
        ("cases_with_stale_blocks", "SELECT COUNT(*) FROM cases WHERE blocks_stale"),
        (
            "stale_embedding_case_law",
            "SELECT COUNT(*) FROM case_law "
            "WHERE coalesce(full_text,'') <> '' AND content_hash IS DISTINCT FROM indexed_hash",
        ),
    )

    pool = await db.get_pool()
    async with pool.acquire() as conn:
        # Case counts by status, largest group first.
        cases_by_status = {}
        for record in await conn.fetch(
            "SELECT status, COUNT(*) as cnt FROM cases GROUP BY status ORDER BY cnt DESC"
        ):
            cases_by_status[record["status"]] = record["cnt"]

        # Table-level totals
        head = {}
        for key, sql in leading_counts:
            head[key] = await conn.fetchval(sql)
        tail = {}
        for key, sql in trailing_counts:
            tail[key] = await conn.fetchval(sql)

        # QA summary: a case counts as passed when it has no failed check of
        # critical severity.
        cases_validated = await conn.fetchval(
            "SELECT COUNT(DISTINCT case_id) FROM qa_results"
        )
        cases_passed = await conn.fetchval(
            "SELECT COUNT(DISTINCT case_id) FROM qa_results\n"
            "WHERE case_id NOT IN (\n"
            "SELECT case_id FROM qa_results WHERE passed = false AND severity = 'critical'\n"
            ")"
        )

        # Final decisions
        final_decisions = await conn.fetchval(
            "SELECT COUNT(*) FROM decisions WHERE status = 'final'"
        )

        # Average words per decision
        mean_words = await conn.fetchval(
            "SELECT AVG(total_words) FROM decisions WHERE total_words > 0"
        )

        # Halacha review backlog (GAP-14 / INV-QA1 / G10)
        backlog = await halacha_backlog(conn)

    if cases_validated:
        pass_rate = round(cases_passed / cases_validated * 100, 1)
    else:
        pass_rate = None

    if mean_words:
        avg_decision_words = round(mean_words)
    else:
        avg_decision_words = None

    return {
        "summary": {**head, "final_decisions": final_decisions, **tail},
        "cases_by_status": cases_by_status,
        "halacha_backlog": backlog,
        "qa": {
            "cases_validated": cases_validated,
            "cases_passed": cases_passed,
            "pass_rate": pass_rate,
        },
        "avg_decision_words": avg_decision_words,
    }