Merge pull request 'feat(style-acq T7): מדד מרחק-סגנון — סוגר את ה-MVP' (#76) from worktree-style-acquisition-mvp into main

2026-06-06 17:33:50 +00:00
parent caeaf51db4 a3451775fa
commit dd46ffb3e3
3 changed files with 155 additions and 0 deletions
--- a/mcp-server/src/legal_mcp/server.py
+++ b/mcp-server/src/legal_mcp/server.py
@@ -528,6 +528,16 @@ async def get_style_guide() -> str:
    return await drafting.get_style_guide()
@mcp.tool()
async def style_distance(case_number: str) -> str:
    """מדד מרחק-סגנון (T7) — האם הטיוטה מתכנסת לסגנון דפנה: סטיית יחסי-זהב,
    ספירת אנטי-דפוסים, ושיעור-השינוי draft→final מפנקס-ההתאמה. ללא LLM."""
    # English gloss of the docstring above: style-distance metric (T7) — whether
    # the draft is converging on Dafna's style: golden-ratio deviation,
    # anti-pattern count, and the draft→final change rate taken from the
    # alignment ledger; no LLM involved.
    # NOTE(review): the docstring is kept verbatim because @mcp.tool() presumably
    # publishes it as the tool description — confirm before translating it.
    #
    # Both imports are function-local and privately aliased so that neither name
    # collides with this tool function, which shares the service module's name.
    from legal_mcp.services import style_distance as _service
    import json as _json

    # The service returns a plain dict; hand it back as pretty-printed JSON with
    # non-ASCII text left unescaped.
    report = await _service.style_distance(case_number)
    return _json.dumps(report, indent=2, ensure_ascii=False)
@mcp.tool()
 async def draft_section(
    case_number: str,
--- a/mcp-server/src/legal_mcp/services/lessons.py
+++ b/mcp-server/src/legal_mcp/services/lessons.py
@@ -42,6 +42,25 @@ GOLDEN_RATIOS: dict[str, dict[str, tuple[int, int]]] = {
    "partial_acceptance": {"background": (25, 35), "claims": (25, 30), "discussion": (40, 47), "summary": (2, 3)},
 }
 # ── Anti-patterns: detectable signals of what Dafna avoids (style-distance, T7) ──
 # Source: daphna-voice-fingerprint.md §3, as corrected on 2026-06-06. Sequential
 # paragraph numbering is REQUIRED and is applied as Word auto-numbering at export,
 # so the flagged anti-pattern is numbers typed MANUALLY as text — not numbering
 # as such. Every entry carries a ``name``, the ``regex`` that detects it, and a
 # human-readable ``note``.
 ANTI_PATTERNS: list[dict] = [
    {"name": name, "regex": regex, "note": note}
    for name, regex, note in (
        # Manual numbers typed as text at the head of a paragraph (should be
        # auto-numbering at export).
        (
            "manual_paragraph_numbers",
            r"(?m)^\s*\d{1,3}\.\s",
            "מספרים ידניים כטקסט בראש פסקה — אמורים להיות auto-numbering בייצוא",
        ),
        # An argument split into a mini-list "(1) ... (2)" on a single line.
        (
            "inline_numbered_fragments",
            r"\([0-9]\)[^\n]{0,200}\([0-9]\)",
            "פיצול טיעון לרשימת-מיני (1)...(2) בתוך פסקת-אנליזה",
        ),
        # Markdown headers, which are not part of the final decision.
        (
            "markdown_headers",
            r"(?m)^#{1,6}\s",
            "כותרות markdown — אינן בהחלטה הסופית",
        ),
        # Bullet lists in the analysis; Dafna writes continuous narrative.
        (
            "bullet_lists",
            r"(?m)^\s*[-*•]\s",
            "רשימות תבליטים באנליזה — דפנה כותבת נרטיב רציף",
        ),
    )
 ]
 # ── Paragraph length guidance (word counts) ────────────────────────
 PARAGRAPH_LENGTHS = {
--- a/mcp-server/src/legal_mcp/services/style_distance.py
+++ b/mcp-server/src/legal_mcp/services/style_distance.py
@@ -0,0 +1,126 @@
 """מדד מרחק-סגנון (T7) — האם הטיוטות מתכנסות לדפנה לאורך זמן.
 שלושה רכיבים, כולם ללא LLM (דטרמיניסטי, זול):
 1. golden_ratio_adherence — סטיית אחוזי-הסעיפים מ-GOLDEN_RATIOS לפי תוצאה.
 2. anti_pattern_hits — ספירת אנטי-דפוסים (מ-lessons.ANTI_PATTERNS) בטקסט הטיוטה.
 3. draft_to_final_diff — change_percent מ-draft_final_pairs (ככל שיורד → מתכנס).
 זהו מטא-אות על בריאות-הלמידה (INV-LRN4) — נצרך ע"י לוח-מחוונים / QA, לא ע"י הכותב.
 """
 from __future__ import annotations
 import logging
 import re
 from uuid import UUID
 from legal_mcp.services import db
 from legal_mcp.services.lessons import ANTI_PATTERNS, GOLDEN_RATIOS, canonical_outcome
 logger = logging.getLogger(__name__)
 # block_id → golden-ratio section.
 # Keys are the ``block_id`` values used to index the per-block word counts;
 # values are the section names looked up in the GOLDEN_RATIOS targets.
 # golden_ratio_adherence() iterates this mapping, so only the block ids listed
 # here can produce a section entry in its report.
 _BLOCK_TO_SECTION: dict[str, str] = {
    "block-vav": "background",
    "block-zayin": "claims",
    "block-yod": "discussion",
    "block-yod-alef": "summary",
 }
 def count_anti_patterns(text: str, patterns: "list[dict] | None" = None) -> dict:
    """Count anti-pattern occurrences in ``text``. Lower = closer to Dafna.

    Args:
        text: Draft text to scan. ``None`` or an empty string counts as no text.
        patterns: Pattern specs to apply, each a dict with ``name``, ``regex``
            and ``note`` keys. Defaults to the module's ``ANTI_PATTERNS``; pass
            an explicit list to scan with a different pattern set.

    Returns:
        ``{"total": <sum of all hits>, "by_pattern": {name: {"count", "note"}}}``.
        Patterns with zero hits are left out of ``by_pattern``.
    """
    # `is None` rather than truthiness, so an explicitly empty list means
    # "apply no patterns" instead of silently falling back to the defaults.
    if patterns is None:
        patterns = ANTI_PATTERNS
    haystack = text or ""
    by_pattern = {}
    total = 0
    for spec in patterns:
        count = len(re.findall(spec["regex"], haystack))
        if count:
            by_pattern[spec["name"]] = {"count": count, "note": spec["note"]}
            total += count
    return {"total": total, "by_pattern": by_pattern}
 def golden_ratio_adherence(block_word_counts: dict[str, int], outcome: str) -> dict:
    """Compare each section's share of the words with its GOLDEN_RATIOS range.

    Args:
        block_word_counts: Word count per ``block_id``.
        outcome: Decision outcome; canonicalized before the target lookup.

    Returns:
        A dict with ``outcome`` (canonical), ``total_words``, ``sections`` and
        ``max_deviation``. Each ``sections`` entry holds ``actual_pct``, the
        ``target`` range and ``deviation_pp`` (0.0 when inside the range). When
        the outcome has no targets or there are no words, ``sections`` is empty
        and ``max_deviation`` is ``None``.
    """
    canonical = canonical_outcome(outcome)
    ranges = GOLDEN_RATIOS.get(canonical)
    # NOTE(review): the denominator counts every entry of block_word_counts,
    # including block ids that have no section in _BLOCK_TO_SECTION — confirm
    # that this is the intended base for the percentages.
    word_total = sum(block_word_counts.values())
    report = {
        "outcome": canonical,
        "total_words": word_total,
        "sections": {},
        "max_deviation": None,
    }
    if word_total == 0 or not ranges:
        return report

    worst = 0.0
    per_section = report["sections"]
    for block_id, section in _BLOCK_TO_SECTION.items():
        if section in ranges:
            # A block id missing from the counts is treated as zero words.
            share = round(block_word_counts.get(block_id, 0) / word_total * 100, 1)
            low, high = ranges[section]
            # Distance in percentage points to the nearest edge of the range.
            gap = (
                round(low - share, 1)
                if share < low
                else round(share - high, 1)
                if share > high
                else 0.0
            )
            worst = max(worst, gap)
            per_section[section] = {
                "actual_pct": share,
                "target": [low, high],
                "deviation_pp": gap,
            }
    report["max_deviation"] = worst
    return report
 def _parse_diff_stats(raw) -> "dict | None":
    """Normalize a ``draft_final_pairs.diff_stats`` value to a dict, else None."""
    if isinstance(raw, str):
        import json

        try:
            raw = json.loads(raw)
        except json.JSONDecodeError:
            # Best-effort metric: keep going without the diff, but leave a trace
            # instead of dropping the malformed value silently.
            logger.warning("style_distance: diff_stats is not valid JSON; ignoring it")
            return None
    # Anything that is not a JSON object cannot carry change_percent.
    return raw if isinstance(raw, dict) else None


 async def style_distance(case_number: str) -> dict:
    """Assemble the 3 style-distance components for one case (T7).

    Args:
        case_number: Case number used to look the case up.

    Returns:
        ``{"error": ...}`` when the case is not found. Otherwise a dict with
        ``case_number``, the canonical ``outcome``, ``golden_ratio_adherence``,
        ``anti_pattern_hits``, ``draft_to_final_diff`` (the parsed diff stats
        or ``None``), ``pair_status`` and a ``summary`` of the three headline
        numbers.
    """
    case = await db.get_case_by_number(case_number)
    if not case:
        return {"error": f"case {case_number} not found"}

    # str() first so the id is accepted whether the db layer returns text or an
    # already-built UUID (uuid.UUID() rejects a UUID instance).
    case_id = UUID(str(case["id"]))
    decision = await db.get_decision_by_case(case_id)
    # dict.get's default only covers a missing key; a decision whose outcome is
    # NULL/empty must fall back to "rejection" as well.
    outcome = (decision or {}).get("outcome") or "rejection"

    pool = await db.get_pool()
    async with pool.acquire() as conn:
        block_rows = []
        if decision:
            block_rows = await conn.fetch(
                "SELECT block_id, content, word_count FROM decision_blocks "
                "WHERE decision_id = $1 ORDER BY block_index",
                UUID(str(decision["id"])),
            )
        pair = await conn.fetchrow(
            "SELECT draft_text, diff_stats, status FROM draft_final_pairs "
            "WHERE case_id = $1 ORDER BY created_at DESC LIMIT 1",
            case_id,
        )

    # Prefer the immutable snapshot's draft text when present; otherwise use the
    # decision's non-empty blocks joined in block order.
    if pair and pair["draft_text"]:
        draft_text = pair["draft_text"]
    else:
        draft_text = "\n\n".join(b["content"] for b in block_rows if b["content"])

    word_counts = {b["block_id"]: (b["word_count"] or 0) for b in block_rows}
    ratios = golden_ratio_adherence(word_counts, outcome)
    anti = count_anti_patterns(draft_text)
    diff = _parse_diff_stats(pair["diff_stats"]) if pair and pair["diff_stats"] else None

    return {
        "case_number": case_number,
        # golden_ratio_adherence already canonicalized the outcome; reuse it.
        "outcome": ratios["outcome"],
        "golden_ratio_adherence": ratios,
        "anti_pattern_hits": anti,
        "draft_to_final_diff": diff,
        "pair_status": pair["status"] if pair else None,
        "summary": {
            "ratio_max_deviation_pp": ratios.get("max_deviation"),
            "anti_pattern_total": anti["total"],
            "change_percent": diff.get("change_percent") if diff else None,
        },
    }