Add style report dashboard — Dafna's style portrait

Visual dashboard at #/style-report with 4 sections:
- Hero: 24 decisions, char counts, subject donut, timeline
- Anatomy: average section-length breakdown (intro → ruling → conclusion)
- Signature Phrases Wall: pattern cards with real corpus frequencies, filter
  chips by type, click → modal with examples
- Contribution: per-decision "new vs confirmed" patterns, growth curve SVG

Backend:
- /api/training/style-report endpoint computes all 4 sections in one call
- Headlines in Hebrew are computed server-side from real data
- Backfill script for style_patterns.frequency: matches each pattern against
  nikud-stripped text (_strip_nikud) via pattern-variant extraction, which
  handles templates with [placeholders], "/" alternatives, and ellipses

Real findings from the 24-decision corpus:
- דיון משפטי = 49% of avg decision (the focus)
- 23/24 use "לפנינו ערר" opening formula
- 21/24 use "ניתנה פה אחד" closing
- After 7 decisions we had already learned 85% of her style patterns

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-04-11 11:34:37 +00:00
parent 32f18de049
commit 858333b386
3 changed files with 1088 additions and 0 deletions

View File

@@ -390,6 +390,369 @@ async def training_analyze_style_status():
return state
# ── Style Report — visual dashboard data ─────────────────────────
# Hebrew display label for each decision section type. The insertion order of
# this mapping doubles as the display order (see _SECTION_DISPLAY_ORDER), so
# add new section types at the position where they should be shown.
_SECTION_TYPE_HEBREW = dict(
    intro="פתיחה",
    facts="רקע",
    appellant_claims="טענות העורר",
    respondent_claims="טענות המשיב",
    legal_analysis="דיון משפטי",
    ruling="הכרעה",
    conclusion="סוף דבר",
)

# Section type keys in the order they are presented.
_SECTION_DISPLAY_ORDER = list(_SECTION_TYPE_HEBREW)
def _strip_nikud(text: str) -> str:
    """Return *text* with all Unicode combining marks removed.

    The text is decomposed to NFD first so that marks fused into a
    precomposed character become separate code points; every code point
    with a non-zero combining class (Hebrew vowel points, Latin accents,
    ...) is then dropped. The result is left decomposed, not recomposed.
    """
    import unicodedata

    decomposed = unicodedata.normalize("NFD", text)
    base_chars = [ch for ch in decomposed if unicodedata.combining(ch) == 0]
    return "".join(base_chars)
def _extract_pattern_variants(pattern_text: str) -> list[str]:
    """Turn a pattern template into literal substrings usable for matching.

    Per the original note this is meant to mirror the logic in
    scripts/backfill_pattern_frequency.py (not visible here — keep the two
    in sync when changing either).

    The template is split into alternatives on "/" or the Hebrew word "או"
    ("or"). Inside each alternative, ``[placeholders]``, runs of two or more
    dots and the ellipsis character (U+2026) are treated as gaps; the longest
    remaining literal segment of at least 4 characters represents that
    alternative.

    Args:
        pattern_text: The pattern template as stored for a style pattern.

    Returns:
        De-duplicated variants in first-seen order; empty when no alternative
        has a literal segment of 4+ characters.
    """
    alternatives = re.split(r"\s*/\s*|\s+או\s+", pattern_text)
    variants: list[str] = []
    for alt in alternatives:
        alt = alt.strip()
        if not alt:
            continue
        alt = re.sub(r"\[[^\]]*\]", "|", alt)
        alt = re.sub(r"\.{2,}", "|", alt)
        # Must be the ellipsis character: replacing the empty string would
        # insert "|" between every character and leave no usable segment.
        alt = alt.replace("\u2026", "|")
        segments = [s.strip(" ,.:;\"'") for s in alt.split("|")]
        good = [s for s in segments if len(s) >= 4]
        if good:
            variants.append(max(good, key=len))
    # dict.fromkeys de-duplicates while preserving order.
    return list(dict.fromkeys(variants))
async def _compute_corpus_stats(conn) -> dict:
    """Build the hero section: corpus size, subject distribution, timeline.

    Args:
        conn: Database connection exposing awaitable ``fetchrow`` and
            ``fetch`` methods (presumably an asyncpg connection — confirm
            against ``db.get_pool``).

    Returns:
        A dict with ``decision_count``, ``total_chars``, ``avg_chars``,
        ``date_range`` (``[min, max]`` as strings or ``None``), ``decisions``
        (one entry per corpus row, ordered by date), ``subject_distribution``
        (top 6 subjects plus an "אחר" bucket for the rest) and a Hebrew
        ``headline``. For an empty corpus the character totals are ``0``.
    """
    from collections import Counter

    def _parse_subjects(raw) -> list:
        """Decode a subject_categories value; malformed JSON yields []."""
        if isinstance(raw, str):
            try:
                raw = json.loads(raw)
            except ValueError:
                return []
        return raw or []

    stats = await conn.fetchrow(
        "SELECT count(*) as n, "
        " sum(length(full_text)) as total_chars, "
        " avg(length(full_text))::int as avg_chars, "
        " min(decision_date) as min_date, "
        " max(decision_date) as max_date "
        "FROM style_corpus"
    )
    decisions = await conn.fetch(
        "SELECT decision_number, decision_date, length(full_text) as chars, "
        " subject_categories "
        "FROM style_corpus ORDER BY decision_date NULLS LAST"
    )

    # Parse every decision's subjects once so the distribution and the
    # per-decision payload treat malformed values the same way.
    subjects_per_decision = [
        _parse_subjects(d["subject_categories"]) for d in decisions
    ]
    subject_counter: Counter = Counter(
        subject for subjects in subjects_per_decision for subject in subjects
    )

    # Cap at top 6 subjects, collapse rest to "אחר"
    top = subject_counter.most_common(6)
    other_count = sum(subject_counter.values()) - sum(c for _, c in top)
    subject_distribution = [{"label": label, "count": count} for label, count in top]
    if other_count > 0:
        subject_distribution.append({"label": "אחר", "count": other_count})

    n = stats["n"]
    # sum()/avg() are NULL on an empty table; formatting None with "," raises.
    total_chars = stats["total_chars"] or 0
    avg_chars = stats["avg_chars"] or 0

    top_subject = top[0] if top else None
    headline = (
        f"קראתי {n} מההחלטות שלך. ממוצע {avg_chars:,} תווים לכל החלטה"
        + (f", הנושא הנפוץ אצלך: {top_subject[0]} ({top_subject[1]} החלטות)" if top_subject else "")
    )

    return {
        "decision_count": n,
        "total_chars": total_chars,
        "avg_chars": avg_chars,
        "date_range": [
            str(stats["min_date"]) if stats["min_date"] else None,
            str(stats["max_date"]) if stats["max_date"] else None,
        ],
        "decisions": [
            {
                "number": d["decision_number"] or "",
                "date": str(d["decision_date"]) if d["decision_date"] else "",
                "chars": d["chars"],
                "subjects": subjects,
            }
            for d, subjects in zip(decisions, subjects_per_decision)
        ],
        "subject_distribution": subject_distribution,
        "headline": headline,
    }
async def _compute_anatomy(conn) -> dict:
    """Build section 2: how long each section type is across the corpus.

    Reads chunks of documents whose title starts with "[קורפוס]", grouped by
    section type. Only section types listed in ``_SECTION_DISPLAY_ORDER`` are
    returned, in that order; ``pct`` is each type's share of the characters
    of *all* typed chunks (including types that are not displayed).

    Returns:
        A dict with ``sections`` (type, Hebrew label, ``avg_chars`` per
        document containing the section, ``pct`` and ``coverage``),
        ``total_coverage`` (distinct corpus documents having any chunk) and a
        Hebrew ``headline`` naming the largest section.
    """
    rows = await conn.fetch(
        """
        SELECT dc.section_type,
        sum(length(dc.content))::int as total_chars,
        count(distinct dc.document_id) as docs
        FROM document_chunks dc
        JOIN documents d ON dc.document_id = d.id
        WHERE d.title LIKE '[קורפוס]%'
        AND dc.section_type IS NOT NULL
        GROUP BY dc.section_type
        """
    )
    if not rows:
        return {
            "sections": [],
            "total_coverage": 0,
            "headline": "אין עדיין נתונים על מבנה ההחלטות",
        }

    by_type = {row["section_type"]: row for row in rows}
    grand_total = sum(row["total_chars"] for row in rows)

    sections = []
    for key in _SECTION_DISPLAY_ORDER:
        row = by_type.get(key)
        if row is None:
            continue
        doc_count = row["docs"]
        chars = row["total_chars"]
        # Average over the documents that actually contain this section.
        average = round(chars / doc_count) if doc_count else 0
        share = chars / grand_total if grand_total else 0
        sections.append({
            "type": key,
            "label": _SECTION_TYPE_HEBREW.get(key, key),
            "avg_chars": average,
            "pct": round(share, 4),
            "coverage": doc_count,
        })

    # Number of corpus documents that have any chunk at all.
    total_coverage = await conn.fetchval(
        "SELECT count(distinct dc.document_id) "
        "FROM document_chunks dc JOIN documents d ON dc.document_id=d.id "
        "WHERE d.title LIKE '[קורפוס]%'"
    )

    headline = ""
    if sections:
        biggest = max(sections, key=lambda item: item["pct"])
        pct_int = round(biggest["pct"] * 100)
        headline = f"{biggest['label']} הוא {pct_int}% מכל החלטה אצלך — זה המוקד שלך"

    return {
        "sections": sections,
        "total_coverage": total_coverage,
        "headline": headline,
    }
async def _compute_signature_phrases(conn) -> dict:
    """Build section 3: every pattern with a positive frequency, plus headline.

    Args:
        conn: Database connection exposing awaitable ``fetch`` and
            ``fetchval`` methods (presumably asyncpg — confirm).

    Returns:
        A dict with ``items`` (patterns ordered by descending frequency, each
        with ``type``, ``text``, ``context``, ``frequency``, ``examples``),
        ``total_decisions`` (row count of ``style_corpus``) and a Hebrew
        ``headline`` about the most frequent pattern.
    """
    rows = await conn.fetch(
        "SELECT pattern_type, pattern_text, context, frequency, examples "
        "FROM style_patterns "
        "WHERE frequency > 0 "
        "ORDER BY frequency DESC"
    )
    items = []
    for r in rows:
        examples = r["examples"]
        if isinstance(examples, str):
            try:
                examples = json.loads(examples)
            except ValueError:
                # Best effort: malformed JSON simply shows no examples.
                examples = []
        items.append({
            "type": r["pattern_type"],
            "text": r["pattern_text"],
            "context": r["context"] or "",
            "frequency": r["frequency"],
            "examples": examples or [],
        })

    # Total decision count for denominator
    total_decisions = await conn.fetchval("SELECT count(*) FROM style_corpus")

    if items:
        top = items[0]
        # Clean up for display: drop [placeholders], collapse the runs of
        # spaces they leave behind, and keep only the first alternative.
        display = re.sub(r"\[[^\]]*\]", "", top["text"])
        display = re.sub(r" {2,}", " ", display).strip()
        display = display.split(" / ")[0].split(" או ")[0].strip(" .,:;\"'")
        if len(display) > 60:
            display = display[:57] + "..."
        headline = f'הפטרן האהוב עלייך: "{display}" — מופיע ב-{top["frequency"]} מתוך {total_decisions} החלטות'
    else:
        headline = "טרם חולצו דפוסים — הרץ ניתוח קורפוס"

    return {"items": items, "total_decisions": total_decisions, "headline": headline}
async def _compute_contribution(conn) -> dict:
    """Build section 4: per-decision pattern contribution and growth curve.

    Decisions are taken in the query's order (by date, then creation time).
    A pattern is "new" for the first decision whose nikud-stripped text
    contains one of its variants, and "confirmed" for every other decision
    that also contains it. Patterns that match no decision are ignored.

    Args:
        conn: Database connection exposing an awaitable ``fetch`` method
            (presumably asyncpg — confirm).

    Returns:
        A dict with ``growth_curve`` (cumulative count of discovered patterns
        after each decision), ``decision_contributions`` (per-decision new and
        confirmed counts, up to 10 new patterns and one highlight),
        ``total_patterns`` (patterns matched at least once) and a Hebrew
        ``headline``.
    """

    def _parse_subjects(raw) -> list:
        """Decode a subject_categories value; malformed JSON yields []."""
        if isinstance(raw, str):
            try:
                raw = json.loads(raw)
            except ValueError:
                return []
        return raw or []

    decisions = await conn.fetch(
        "SELECT id, decision_number, decision_date, full_text, "
        " length(full_text) as chars, subject_categories "
        "FROM style_corpus ORDER BY decision_date NULLS LAST, created_at"
    )
    patterns = await conn.fetch(
        "SELECT id, pattern_type, pattern_text, context "
        "FROM style_patterns WHERE frequency > 0"
    )
    if not decisions or not patterns:
        return {
            "growth_curve": [],
            "decision_contributions": [],
            "total_patterns": 0,
            "headline": "אין עדיין מספיק נתונים",
        }

    # Normalize texts once; a NULL full_text is treated as empty.
    normalized_decisions = [
        (d["id"], _strip_nikud(d["full_text"] or "")) for d in decisions
    ]

    # pattern_id → first decision containing it + every decision containing it
    pattern_info: dict = {}
    for p in patterns:
        variants = _extract_pattern_variants(_strip_nikud(p["pattern_text"] or ""))
        if not variants:
            continue
        matches = [
            dec_id
            for dec_id, text in normalized_decisions
            if any(v in text for v in variants)
        ]
        if not matches:
            continue
        pattern_info[p["id"]] = {
            "first": matches[0],
            "all": set(matches),
            "type": p["pattern_type"],
            "text": p["pattern_text"],
            "context": p["context"] or "",
        }

    # Group once by decision instead of rescanning every pattern for every
    # decision. Iteration order keeps new patterns in pattern order.
    new_by_decision: dict = {}
    confirmed_counts: dict = {}
    for info in pattern_info.values():
        new_by_decision.setdefault(info["first"], []).append(info)
        for dec_id in info["all"]:
            if dec_id != info["first"]:
                confirmed_counts[dec_id] = confirmed_counts.get(dec_id, 0) + 1

    decision_contributions = []
    growth_curve = []
    cumulative = 0
    for d in decisions:
        dec_id = d["id"]
        number = d["decision_number"] or ""
        date = str(d["decision_date"]) if d["decision_date"] else ""
        new_patterns = new_by_decision.get(dec_id, [])
        # The first new pattern serves as the decision's highlight.
        highlight = new_patterns[0] if new_patterns else None
        decision_contributions.append({
            "decision_number": number,
            "decision_date": date,
            "chars": d["chars"],
            "subjects": _parse_subjects(d["subject_categories"]),
            "new_count": len(new_patterns),
            "confirmed_count": confirmed_counts.get(dec_id, 0),
            "new_patterns": [
                {"type": p["type"], "text": p["text"], "context": p["context"]}
                for p in new_patterns[:10]  # cap to keep payload small
            ],
            "highlight": (
                {"type": highlight["type"], "text": highlight["text"]}
                if highlight else None
            ),
        })
        # Each pattern is "new" for exactly one decision, so a running sum
        # equals the size of the cumulative set.
        cumulative += len(new_patterns)
        growth_curve.append({
            "decision_number": number,
            "date": date,
            "cumulative": cumulative,
        })

    # Headline: after how many decisions were at least 85% of patterns seen?
    total_patterns = len(pattern_info)
    n_decisions_to_85pct = None
    if total_patterns:
        # Integer ceiling of 85%, so the claim is never made below 85% and
        # is not subject to float rounding.
        threshold = (total_patterns * 85 + 99) // 100
        n_decisions_to_85pct = next(
            (
                i
                for i, point in enumerate(growth_curve, 1)
                if point["cumulative"] >= threshold
            ),
            None,
        )

    if n_decisions_to_85pct:
        headline = (
            f"אחרי {n_decisions_to_85pct} החלטות כבר למדתי 85% "
            f"מהסגנון שלך — השאר מיקד וחידד את הידע"
        )
    else:
        headline = f"למדתי {total_patterns} דפוסים מ-{len(decisions)} החלטות"

    return {
        "growth_curve": growth_curve,
        "decision_contributions": decision_contributions,
        "total_patterns": total_patterns,
        "headline": headline,
    }
@app.get("/api/training/style-report")
async def training_style_report():
    """Return the data for all four sections of the style-report dashboard.

    The sections are computed one after another on a single pooled
    connection and returned under the keys ``corpus``, ``anatomy``,
    ``signature_phrases`` and ``contribution``.
    """
    pool = await db.get_pool()
    report = {}
    async with pool.acquire() as conn:
        report["corpus"] = await _compute_corpus_stats(conn)
        report["anatomy"] = await _compute_anatomy(conn)
        report["signature_phrases"] = await _compute_signature_phrases(conn)
        report["contribution"] = await _compute_contribution(conn)
    return report
@app.get("/api/training/corpus")
async def training_corpus_list():
"""List all decisions currently in the style corpus."""