Add style report dashboard — Dafna's style portrait

Visual dashboard at #/style-report with 4 sections:
- Hero: 24 decisions, char counts, subject donut, timeline
- Anatomy: average section-length breakdown (intro → ruling → conclusion)
- Signature Phrases Wall: pattern cards with real corpus frequencies, filter chips by type, click → modal with examples
- Contribution: per-decision "new vs confirmed" patterns, growth curve SVG

Backend:
- /api/training/style-report endpoint computes all 4 sections in one call
- Headlines in Hebrew are computed server-side from real data
- Backfill script for style_patterns.frequency using _strip_nikud + pattern-variant extraction (templates with [placeholders], / alternatives, ellipsis all handled)

Real findings from the 24-decision corpus:
- דיון משפטי = 49% of avg decision (the focus)
- 23/24 use "לפנינו ערר" opening formula
- 21/24 use "ניתנה פה אחד" closing
- After 7 decisions we already learned 85% of her style patterns

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-11 11:34:37 +00:00
parent 32f18de049
commit 858333b386
3 changed files with 1088 additions and 0 deletions
--- a/web/app.py
+++ b/web/app.py
@@ -390,6 +390,369 @@ async def training_analyze_style_status():
    return state


+# ── Style Report — visual dashboard data ─────────────────────────
+
+
+# Hebrew display label for each decision section type. The key order here is
+# also the order in which sections are presented.
+_SECTION_TYPE_HEBREW = dict(
+    intro="פתיחה",
+    facts="רקע",
+    appellant_claims="טענות העורר",
+    respondent_claims="טענות המשיב",
+    legal_analysis="דיון משפטי",
+    ruling="הכרעה",
+    conclusion="סוף דבר",
+)
+
+# Section keys in display order, taken from the label table's insertion order
+# so the two cannot drift apart.
+_SECTION_DISPLAY_ORDER = list(_SECTION_TYPE_HEBREW)
+
+
+def _strip_nikud(text: str) -> str:
+    """Return *text* NFD-decomposed with every combining mark removed.
+
+    Hebrew vowel points (nikud) are Unicode combining marks, so they are
+    dropped; combining accents in any other script are dropped the same way.
+    The result stays in decomposed form (it is not recomposed afterwards).
+    """
+    import unicodedata
+
+    decomposed = unicodedata.normalize("NFD", text)
+    # combining() returns 0 for base characters and a non-zero class for marks.
+    base_chars = [ch for ch in decomposed if unicodedata.combining(ch) == 0]
+    return "".join(base_chars)
+
+
+def _extract_pattern_variants(pattern_text: str) -> list[str]:
+    """Reduce a pattern template to the literal substrings used for matching.
+
+    Per the original note, this is meant to mirror the logic in
+    scripts/backfill_pattern_frequency.py, so the two should change together.
+
+    The template is split into alternatives on "/" or the standalone word
+    "או". Within each alternative, ``[placeholders]``, runs of two or more
+    dots and the "…" character act as separators; the longest remaining
+    fragment of at least 4 characters represents that alternative.
+
+    Returns the distinct fragments in first-seen order (possibly empty).
+    """
+    chosen: dict[str, None] = {}
+    for option in re.split(r"\s*/\s*|\s+או\s+", pattern_text):
+        option = option.strip()
+        if not option:
+            continue
+        # Turn every "variable" part of the template into a "|" separator.
+        marked = re.sub(r"\[[^\]]*\]", "|", option)
+        marked = re.sub(r"\.{2,}", "|", marked)
+        marked = marked.replace("…", "|")
+        fragments = (piece.strip(" ,.:;\"'") for piece in marked.split("|"))
+        usable = [piece for piece in fragments if len(piece) >= 4]
+        if usable:
+            # setdefault keeps the first occurrence, giving ordered de-duplication.
+            chosen.setdefault(max(usable, key=len), None)
+    return list(chosen)
+
+
+def _parse_subject_categories(raw) -> list:
+    """Coerce a ``subject_categories`` value into a list of subject labels.
+
+    Accepts either a JSON-encoded string or an already-decoded list/tuple.
+    Malformed JSON, ``None`` and any other value yield an empty list, so one
+    bad row cannot break the whole report.
+    """
+    if isinstance(raw, str):
+        try:
+            raw = json.loads(raw)
+        except ValueError:  # json.JSONDecodeError is a ValueError subclass
+            return []
+    return list(raw) if isinstance(raw, (list, tuple)) else []
+
+
+async def _compute_corpus_stats(conn) -> dict:
+    """Hero section: decision count, chars, subject distribution, timeline.
+
+    Args:
+        conn: database connection exposing awaitable ``fetchrow`` / ``fetch``
+            (presumably an asyncpg connection — confirm against the caller).
+
+    Returns:
+        A dict with ``decision_count``, ``total_chars``, ``avg_chars``,
+        ``date_range`` (``[min, max]`` as strings or ``None``), ``decisions``
+        (one entry per corpus row, ordered by date with undated rows last),
+        ``subject_distribution`` (top 6 subjects plus an "אחר" bucket for the
+        rest) and a Hebrew ``headline``.
+    """
+    from collections import Counter
+
+    stats = await conn.fetchrow(
+        "SELECT count(*) as n, "
+        "       sum(length(full_text)) as total_chars, "
+        "       avg(length(full_text))::int as avg_chars, "
+        "       min(decision_date) as min_date, "
+        "       max(decision_date) as max_date "
+        "FROM style_corpus"
+    )
+
+    decisions = await conn.fetch(
+        "SELECT decision_number, decision_date, length(full_text) as chars, "
+        "       subject_categories "
+        "FROM style_corpus ORDER BY decision_date NULLS LAST"
+    )
+
+    # Parse each row's subjects exactly once; the same parsed lists feed both
+    # the distribution and the per-decision payload.
+    subjects_per_decision = [
+        _parse_subject_categories(d["subject_categories"]) for d in decisions
+    ]
+    subject_counter: Counter = Counter(
+        label for subjects in subjects_per_decision for label in subjects
+    )
+
+    # Cap at top 6 subjects, collapse rest to "אחר"
+    top = subject_counter.most_common(6)
+    other_count = sum(subject_counter.values()) - sum(count for _, count in top)
+    subject_distribution = [{"label": label, "count": count} for label, count in top]
+    if other_count > 0:
+        subject_distribution.append({"label": "אחר", "count": other_count})
+
+    n = stats["n"]
+    # sum()/avg() are NULL when there are no rows (or no text at all); fall
+    # back to 0 so the ":," format below never receives None.
+    total_chars = stats["total_chars"] or 0
+    avg_chars = stats["avg_chars"] or 0
+
+    if not n:
+        headline = "אין עדיין החלטות בקורפוס"
+    else:
+        headline = f"קראתי {n} מההחלטות שלך. ממוצע {avg_chars:,} תווים לכל החלטה"
+        if top:
+            top_label, top_count = top[0]
+            headline += f", הנושא הנפוץ אצלך: {top_label} ({top_count} החלטות)"
+
+    return {
+        "decision_count": n,
+        "total_chars": total_chars,
+        "avg_chars": avg_chars,
+        "date_range": [
+            str(stats["min_date"]) if stats["min_date"] else None,
+            str(stats["max_date"]) if stats["max_date"] else None,
+        ],
+        "decisions": [
+            {
+                "number": d["decision_number"] or "",
+                "date": str(d["decision_date"]) if d["decision_date"] else "",
+                "chars": d["chars"],
+                "subjects": subjects,
+            }
+            for d, subjects in zip(decisions, subjects_per_decision)
+        ],
+        "subject_distribution": subject_distribution,
+        "headline": headline,
+    }
+
+
+async def _compute_anatomy(conn) -> dict:
+    """Section 2: how decision length is split between section types.
+
+    Aggregates chunk lengths per ``section_type`` over documents whose title
+    starts with '[קורפוס]'. For every type listed in
+    ``_SECTION_DISPLAY_ORDER`` that has data it reports:
+
+    * ``avg_chars`` — total characters divided by the number of documents
+      that contain that section type,
+    * ``pct`` — that type's share of all typed characters (4 decimals),
+    * ``coverage`` — how many documents contain it.
+
+    Returns a dict with ``sections``, ``total_coverage`` (documents that
+    have any chunk at all) and a Hebrew ``headline`` naming the largest
+    section. With no typed chunks an empty payload is returned.
+    """
+    rows = await conn.fetch(
+        """
+        SELECT dc.section_type,
+               sum(length(dc.content))::int as total_chars,
+               count(distinct dc.document_id) as docs
+        FROM document_chunks dc
+        JOIN documents d ON dc.document_id = d.id
+        WHERE d.title LIKE '[קורפוס]%'
+          AND dc.section_type IS NOT NULL
+        GROUP BY dc.section_type
+        """
+    )
+
+    if not rows:
+        return {
+            "sections": [],
+            "total_coverage": 0,
+            "headline": "אין עדיין נתונים על מבנה ההחלטות",
+        }
+
+    by_type = {row["section_type"]: row for row in rows}
+    # Denominator covers every section type returned, including any that are
+    # not part of the display order.
+    grand_total = sum(row["total_chars"] for row in rows)
+
+    def _describe(section_key: str) -> dict:
+        """Build the payload entry for one section type."""
+        row = by_type[section_key]
+        doc_count = row["docs"]
+        average = round(row["total_chars"] / doc_count) if doc_count else 0
+        share = row["total_chars"] / grand_total if grand_total else 0
+        return {
+            "type": section_key,
+            "label": _SECTION_TYPE_HEBREW.get(section_key, section_key),
+            "avg_chars": average,
+            "pct": round(share, 4),
+            "coverage": doc_count,
+        }
+
+    items = [_describe(key) for key in _SECTION_DISPLAY_ORDER if key in by_type]
+
+    # Number of corpus documents that have any chunks, typed or not.
+    total_coverage = await conn.fetchval(
+        "SELECT count(distinct dc.document_id) "
+        "FROM document_chunks dc JOIN documents d ON dc.document_id=d.id "
+        "WHERE d.title LIKE '[קורפוס]%'"
+    )
+
+    # Headline names the section with the largest share.
+    headline = ""
+    if items:
+        biggest = max(items, key=lambda entry: entry["pct"])
+        pct_int = round(biggest["pct"] * 100)
+        headline = f"{biggest['label']} הוא {pct_int}% מכל החלטה אצלך — זה המוקד שלך"
+
+    return {
+        "sections": items,
+        "total_coverage": total_coverage,
+        "headline": headline,
+    }
+
+
+async def _compute_signature_phrases(conn) -> dict:
+    """Section 3: every pattern with a positive frequency, most frequent first.
+
+    Returns a dict with ``items`` (type, text, context, frequency, examples
+    per pattern), ``total_decisions`` (row count of ``style_corpus``) and a
+    Hebrew ``headline`` built from the most frequent pattern, or a
+    placeholder headline when there are no patterns.
+    """
+    rows = await conn.fetch(
+        "SELECT pattern_type, pattern_text, context, frequency, examples "
+        "FROM style_patterns "
+        "WHERE frequency > 0 "
+        "ORDER BY frequency DESC"
+    )
+
+    def _to_item(row) -> dict:
+        """Convert one DB row to its payload dict, decoding JSON examples."""
+        raw_examples = row["examples"]
+        if isinstance(raw_examples, str):
+            try:
+                raw_examples = json.loads(raw_examples)
+            except Exception:
+                # Best effort: unreadable examples are shown as none.
+                raw_examples = []
+        return {
+            "type": row["pattern_type"],
+            "text": row["pattern_text"],
+            "context": row["context"] or "",
+            "frequency": row["frequency"],
+            "examples": raw_examples or [],
+        }
+
+    items = [_to_item(row) for row in rows]
+
+    # Denominator for the "N out of M decisions" wording.
+    total_decisions = await conn.fetchval("SELECT count(*) FROM style_corpus")
+
+    if not items:
+        headline = "טרם חולצו דפוסים — הרץ ניתוח קורפוס"
+    else:
+        leader = items[0]  # rows arrive sorted by frequency, highest first
+        # Display form: drop [placeholders], keep only the first alternative,
+        # trim punctuation and cap the length at 60 characters.
+        display = re.sub(r"\[[^\]]*\]", "", leader["text"])
+        display = display.replace("  ", " ").strip()
+        display = display.split(" / ")[0]
+        display = display.split(" או ")[0]
+        display = display.strip(" .,:;\"'")
+        if len(display) > 60:
+            display = display[:57] + "..."
+        headline = f'הפטרן האהוב עלייך: "{display}" — מופיע ב-{leader["frequency"]} מתוך {total_decisions} החלטות'
+
+    return {"items": items, "total_decisions": total_decisions, "headline": headline}
+
+
+async def _compute_contribution(conn) -> dict:
+    """Section 4: per-decision pattern contribution plus a growth curve.
+
+    Decisions are walked in chronological order (undated ones last, ties
+    broken by ``created_at``). A pattern counts as "new" for the first
+    decision whose nikud-stripped text contains any of its variants (plain
+    substring match) and as "confirmed" for every later decision that also
+    contains it. Patterns that match no decision are ignored.
+
+    Returns:
+        A dict with ``growth_curve`` (cumulative number of distinct patterns
+        after each decision), ``decision_contributions`` (per decision: new /
+        confirmed counts, up to 10 new patterns, and the first new pattern as
+        ``highlight``), ``total_patterns`` and a Hebrew ``headline``.
+    """
+    from collections import Counter, defaultdict
+
+    decisions = await conn.fetch(
+        "SELECT id, decision_number, decision_date, full_text, "
+        "       length(full_text) as chars, subject_categories "
+        "FROM style_corpus ORDER BY decision_date NULLS LAST, created_at"
+    )
+    patterns = await conn.fetch(
+        "SELECT id, pattern_type, pattern_text, context "
+        "FROM style_patterns WHERE frequency > 0"
+    )
+
+    if not decisions or not patterns:
+        return {
+            "growth_curve": [],
+            "decision_contributions": [],
+            "total_patterns": 0,  # same keys as the populated response
+            "headline": "אין עדיין מספיק נתונים",
+        }
+
+    def _subjects(raw) -> list:
+        """Decode subject_categories; malformed or missing values give []."""
+        if isinstance(raw, str):
+            try:
+                raw = json.loads(raw)
+            except ValueError:  # json.JSONDecodeError is a ValueError subclass
+                return []
+        return list(raw) if isinstance(raw, (list, tuple)) else []
+
+    # Normalize texts once; a NULL full_text is treated as empty text.
+    normalized_decisions = [
+        (d["id"], _strip_nikud(d["full_text"] or "")) for d in decisions
+    ]
+
+    # Single pass over the patterns: bucket each one under the decision that
+    # introduced it and count later sightings per decision, instead of
+    # rescanning every pattern for every decision.
+    new_by_decision: dict = defaultdict(list)
+    confirmed_by_decision: Counter = Counter()
+    total_patterns = 0
+
+    for p in patterns:
+        variants = _extract_pattern_variants(_strip_nikud(p["pattern_text"] or ""))
+        if not variants:
+            continue
+
+        matching_ids = [
+            dec_id
+            for dec_id, text in normalized_decisions
+            if any(v in text for v in variants)
+        ]
+        if not matching_ids:
+            continue
+
+        total_patterns += 1
+        first_seen, *later_seen = matching_ids
+        new_by_decision[first_seen].append({
+            "type": p["pattern_type"],
+            "text": p["pattern_text"],
+            "context": p["context"] or "",
+        })
+        confirmed_by_decision.update(later_seen)
+
+    decision_contributions = []
+    growth_curve = []
+    cumulative = 0
+
+    for d in decisions:
+        dec_id = d["id"]
+        new_patterns = new_by_decision.get(dec_id, [])
+        cumulative += len(new_patterns)
+
+        # The first new pattern doubles as the decision's "highlight".
+        highlight = new_patterns[0] if new_patterns else None
+        number = d["decision_number"] or ""
+        date = str(d["decision_date"]) if d["decision_date"] else ""
+
+        decision_contributions.append({
+            "decision_number": number,
+            "decision_date": date,
+            "chars": d["chars"],
+            "subjects": _subjects(d["subject_categories"]),
+            "new_count": len(new_patterns),
+            "confirmed_count": confirmed_by_decision[dec_id],
+            "new_patterns": new_patterns[:10],  # cap to keep payload small
+            "highlight": (
+                {"type": highlight["type"], "text": highlight["text"]}
+                if highlight else None
+            ),
+        })
+        growth_curve.append({
+            "decision_number": number,
+            "date": date,
+            "cumulative": cumulative,
+        })
+
+    # Headline: after how many decisions were at least 85% of the patterns
+    # seen? The threshold is rounded UP with integer arithmetic; flooring
+    # would give 0 for small totals and let the first decision "reach 85%"
+    # even when it contributed nothing.
+    n_decisions_to_85pct = None
+    if total_patterns:
+        threshold = (total_patterns * 85 + 99) // 100
+        n_decisions_to_85pct = next(
+            (
+                i
+                for i, point in enumerate(growth_curve, 1)
+                if point["cumulative"] >= threshold
+            ),
+            None,
+        )
+
+    if n_decisions_to_85pct:
+        headline = (
+            f"אחרי {n_decisions_to_85pct} החלטות כבר למדתי 85% "
+            f"מהסגנון שלך — השאר מיקד וחידד את הידע"
+        )
+    else:
+        headline = f"למדתי {total_patterns} דפוסים מ-{len(decisions)} החלטות"
+
+    return {
+        "growth_curve": growth_curve,
+        "decision_contributions": decision_contributions,
+        "total_patterns": total_patterns,
+        "headline": headline,
+    }
+
+
+@app.get("/api/training/style-report")
+async def training_style_report():
+    """Return all four style-report sections in a single response.
+
+    One pooled connection is acquired and reused for every section; the
+    sections are computed one after another in the order corpus, anatomy,
+    signature phrases, contribution.
+    """
+    pool = await db.get_pool()
+    async with pool.acquire() as conn:
+        report = {
+            "corpus": await _compute_corpus_stats(conn),
+            "anatomy": await _compute_anatomy(conn),
+            "signature_phrases": await _compute_signature_phrases(conn),
+            "contribution": await _compute_contribution(conn),
+        }
+
+    return report
+
+
@app.get("/api/training/corpus")
 async def training_corpus_list():
    """List all decisions currently in the style corpus."""