Add style report dashboard — Dafna's style portrait
Visual dashboard at #/style-report with 4 sections:
- Hero: 24 decisions, char counts, subject donut, timeline
- Anatomy: average section-length breakdown (intro → ruling → conclusion)
- Signature Phrases Wall: pattern cards with real corpus frequencies, filter chips by type, click → modal with examples
- Contribution: per-decision "new vs confirmed" patterns, growth curve SVG

Backend:
- /api/training/style-report endpoint computes all 4 sections in one call
- Headlines in Hebrew are computed server-side from real data
- Backfill script for style_patterns.frequency using _strip_nikud + pattern-variant extraction (templates with [placeholders], / alternatives, ellipsis all handled)

Real findings from the 24-decision corpus:
- דיון משפטי = 49% of avg decision (the focus)
- 23/24 use "לפנינו ערר" opening formula
- 21/24 use "ניתנה פה אחד" closing
- After 7 decisions we already learned 85% of her style patterns

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
363
web/app.py
363
web/app.py
@@ -390,6 +390,369 @@ async def training_analyze_style_status():
|
||||
return state
|
||||
|
||||
|
||||
# ── Style Report — visual dashboard data ─────────────────────────
|
||||
|
||||
|
||||
# Hebrew display label for each section-type key. _compute_anatomy looks labels
# up here and falls back to the raw key when a type has no entry.
_SECTION_TYPE_HEBREW = {
    "intro": "פתיחה",
    "facts": "רקע",
    "appellant_claims": "טענות העורר",
    "respondent_claims": "טענות המשיב",
    "legal_analysis": "דיון משפטי",
    "ruling": "הכרעה",
    "conclusion": "סוף דבר",
}
|
||||
|
||||
# Order in which _compute_anatomy emits section types. A section type that is
# not listed here never appears in that function's "sections" output.
_SECTION_DISPLAY_ORDER = [
    "intro", "facts", "appellant_claims", "respondent_claims",
    "legal_analysis", "ruling", "conclusion",
]
|
||||
|
||||
|
||||
def _strip_nikud(text: str) -> str:
    """Return *text* with every Unicode combining mark removed.

    The text is decomposed to NFD first so that marks attached to a base
    character (Hebrew vowel points, accents, ...) become separate code points
    that can be dropped. The result is left in decomposed form.
    """
    import unicodedata

    decomposed = unicodedata.normalize("NFD", text)
    # combining() returns 0 for base characters and a non-zero class for marks.
    base_chars = [ch for ch in decomposed if unicodedata.combining(ch) == 0]
    return "".join(base_chars)
|
||||
|
||||
|
||||
def _extract_pattern_variants(pattern_text: str) -> list[str]:
    """Turn a pattern template into literal substrings usable for matching.

    Mirror of scripts/backfill_pattern_frequency.py logic for matching.

    The template is split into alternatives on ``/`` or the standalone word
    "או". Within each alternative, ``[placeholders]``, runs of two or more
    dots and the ellipsis character act as gaps; the longest gap-free segment
    of at least 4 characters (after trimming surrounding punctuation) becomes
    that alternative's variant. Alternatives with no such segment are dropped.

    Returns the variants in first-seen order with duplicates removed.
    """
    # A dict keeps insertion order and gives de-duplication for free.
    seen: dict[str, None] = {}

    for raw_alt in re.split(r"\s*/\s*|\s+או\s+", pattern_text):
        candidate = raw_alt.strip()
        if not candidate:
            continue

        # Mark every kind of gap with the same separator before splitting.
        for gap_regex in (r"\[[^\]]*\]", r"\.{2,}"):
            candidate = re.sub(gap_regex, "|", candidate)
        candidate = candidate.replace("…", "|")

        longest = ""
        for piece in candidate.split("|"):
            piece = piece.strip(" ,.:;\"'")
            # Strict ">" keeps the earliest segment when lengths tie.
            if len(piece) >= 4 and len(piece) > len(longest):
                longest = piece

        if longest:
            seen.setdefault(longest, None)

    return list(seen)
|
||||
|
||||
|
||||
def _parse_subject_categories(raw) -> list:
    """Coerce a ``subject_categories`` column value into a list of labels.

    Accepts an already-decoded list/tuple, a JSON-encoded string, or ``None``.
    Malformed JSON and non-list payloads yield an empty list.
    """
    if isinstance(raw, str):
        try:
            raw = json.loads(raw)
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            return []
    return list(raw) if isinstance(raw, (list, tuple)) else []


async def _compute_corpus_stats(conn) -> dict:
    """Hero section: decision count, chars, subject distribution, timeline.

    Args:
        conn: database connection exposing awaitable ``fetchrow`` and ``fetch``
            (presumably an asyncpg connection — confirm against db.get_pool).

    Returns:
        A dict with ``decision_count``, ``total_chars``, ``avg_chars``,
        ``date_range`` (``[min, max]`` as strings or ``None``), ``decisions``
        (one entry per corpus row, ordered by date), ``subject_distribution``
        (top 6 subjects plus an "אחר" bucket for the rest) and a Hebrew
        ``headline``. On an empty corpus the numeric fields are 0 and the
        lists are empty.
    """
    from collections import Counter

    stats = await conn.fetchrow(
        "SELECT count(*) as n, "
        " sum(length(full_text)) as total_chars, "
        " avg(length(full_text))::int as avg_chars, "
        " min(decision_date) as min_date, "
        " max(decision_date) as max_date "
        "FROM style_corpus"
    )

    decisions = await conn.fetch(
        "SELECT decision_number, decision_date, length(full_text) as chars, "
        " subject_categories "
        "FROM style_corpus ORDER BY decision_date NULLS LAST"
    )

    # Parse each row's categories exactly once so the distribution and the
    # per-decision payload always agree, and a malformed value cannot crash
    # one code path while being tolerated by the other.
    subjects_per_decision = [
        _parse_subject_categories(d["subject_categories"]) for d in decisions
    ]

    subject_counter: Counter = Counter(
        label for subjects in subjects_per_decision for label in subjects
    )

    # Cap at top 6 subjects, collapse rest to "אחר"
    top = subject_counter.most_common(6)
    other_count = sum(subject_counter.values()) - sum(count for _, count in top)
    subject_distribution = [{"label": label, "count": count} for label, count in top]
    if other_count > 0:
        subject_distribution.append({"label": "אחר", "count": other_count})

    n = stats["n"]
    # sum()/avg() yield NULL when there are no rows (or no non-NULL texts);
    # normalise to 0 so formatting and JSON consumers get numbers.
    total_chars = stats["total_chars"] or 0
    avg_chars = stats["avg_chars"] or 0

    if n:
        headline = f"קראתי {n} מההחלטות שלך. ממוצע {avg_chars:,} תווים לכל החלטה"
        if top:
            top_label, top_count = top[0]
            headline += f", הנושא הנפוץ אצלך: {top_label} ({top_count} החלטות)"
    else:
        headline = "אין עדיין החלטות בקורפוס"

    return {
        "decision_count": n,
        "total_chars": total_chars,
        "avg_chars": avg_chars,
        "date_range": [
            str(stats["min_date"]) if stats["min_date"] else None,
            str(stats["max_date"]) if stats["max_date"] else None,
        ],
        "decisions": [
            {
                "number": d["decision_number"] or "",
                "date": str(d["decision_date"]) if d["decision_date"] else "",
                "chars": d["chars"],
                "subjects": subjects,
            }
            for d, subjects in zip(decisions, subjects_per_decision)
        ],
        "subject_distribution": subject_distribution,
        "headline": headline,
    }
|
||||
|
||||
|
||||
async def _compute_anatomy(conn) -> dict:
    """Section 2: average section lengths across the training corpus.

    Aggregates chunk lengths per ``section_type`` for documents whose title
    starts with the corpus marker, then reports — in display order — each
    section's average length per containing document, its share of all
    section characters, and how many documents contain it.

    Args:
        conn: database connection exposing awaitable ``fetch`` and ``fetchval``.

    Returns:
        ``{"sections": [...], "total_coverage": int, "headline": str}``.
        With no matching chunks, ``sections`` is empty and ``total_coverage``
        is 0.
    """
    per_type_rows = await conn.fetch(
        """
        SELECT dc.section_type,
               sum(length(dc.content))::int as total_chars,
               count(distinct dc.document_id) as docs
        FROM document_chunks dc
        JOIN documents d ON dc.document_id = d.id
        WHERE d.title LIKE '[קורפוס]%'
          AND dc.section_type IS NOT NULL
        GROUP BY dc.section_type
        """
    )

    if not per_type_rows:
        return {
            "sections": [],
            "total_coverage": 0,
            "headline": "אין עדיין נתונים על מבנה ההחלטות",
        }

    by_type = {row["section_type"]: row for row in per_type_rows}
    # Denominator covers every section type returned, listed or not.
    grand_total = sum(row["total_chars"] for row in per_type_rows)

    sections = []
    for key in _SECTION_DISPLAY_ORDER:
        row = by_type.get(key)
        if row is None:
            continue

        doc_count = row["docs"]
        chars = row["total_chars"]
        # Average is over the documents that actually contain this section.
        average = round(chars / doc_count) if doc_count else 0
        share = chars / grand_total if grand_total else 0

        sections.append({
            "type": key,
            "label": _SECTION_TYPE_HEBREW.get(key, key),
            "avg_chars": average,
            "pct": round(share, 4),
            "coverage": doc_count,
        })

    # Number of corpus documents that have any chunk at all.
    total_coverage = await conn.fetchval(
        "SELECT count(distinct dc.document_id) "
        "FROM document_chunks dc JOIN documents d ON dc.document_id=d.id "
        "WHERE d.title LIKE '[קורפוס]%'"
    )

    # The headline names the section with the largest share.
    headline = ""
    if sections:
        dominant = max(sections, key=lambda entry: entry["pct"])
        pct_int = round(dominant["pct"] * 100)
        headline = f"{dominant['label']} הוא {pct_int}% מכל החלטה אצלך — זה המוקד שלך"

    return {
        "sections": sections,
        "total_coverage": total_coverage,
        "headline": headline,
    }
|
||||
|
||||
|
||||
async def _compute_signature_phrases(conn) -> dict:
    """Section 3: all patterns with real frequencies, plus headline about top.

    Args:
        conn: database connection exposing awaitable ``fetch`` and ``fetchval``.

    Returns:
        ``{"items": [...], "total_decisions": int, "headline": str}`` where
        ``items`` holds every pattern with ``frequency > 0`` ordered by
        descending frequency. Each item has ``type``, ``text``, ``context``
        (``""`` when NULL), ``frequency`` and ``examples`` (``[]`` when NULL
        or not valid JSON).
    """
    rows = await conn.fetch(
        "SELECT pattern_type, pattern_text, context, frequency, examples "
        "FROM style_patterns "
        "WHERE frequency > 0 "
        "ORDER BY frequency DESC"
    )

    items = []
    for r in rows:
        examples = r["examples"]
        if isinstance(examples, str):
            try:
                examples = json.loads(examples)
            except ValueError:  # json.JSONDecodeError is a ValueError subclass
                examples = []
        items.append({
            "type": r["pattern_type"],
            "text": r["pattern_text"],
            "context": r["context"] or "",
            "frequency": r["frequency"],
            "examples": examples or [],
        })

    # Total decision count for denominator
    total_decisions = await conn.fetchval("SELECT count(*) FROM style_corpus")

    if items:
        top = items[0]
        # Clean up for display: drop [placeholder] brackets, then collapse the
        # whitespace run each removed placeholder leaves behind.
        display = re.sub(r"\[[^\]]*\]", "", top["text"])
        display = " ".join(display.split())
        # Show only the first alternative of an "a / b" or "a או b" template.
        display = display.split(" / ")[0].split(" או ")[0].strip(" .,:;\"'")
        if len(display) > 60:
            display = display[:57] + "..."
        headline = f'הפטרן האהוב עלייך: "{display}" — מופיע ב-{top["frequency"]} מתוך {total_decisions} החלטות'
    else:
        headline = "טרם חולצו דפוסים — הרץ ניתוח קורפוס"

    return {"items": items, "total_decisions": total_decisions, "headline": headline}
|
||||
|
||||
|
||||
async def _compute_contribution(conn) -> dict:
    """Section 4: per-decision contribution + growth curve.

    Decisions are walked in chronological order (NULL dates last). A pattern
    counts as "new" for the first decision whose nikud-stripped text contains
    any of its variants, and as "confirmed" for every later decision that
    contains it. Patterns that match no decision are ignored.

    Args:
        conn: database connection exposing an awaitable ``fetch``.

    Returns:
        A dict with ``growth_curve`` (cumulative count of discovered patterns
        after each decision), ``decision_contributions`` (per-decision new and
        confirmed counts, up to 10 new patterns and a single highlight),
        ``total_patterns`` and a Hebrew ``headline``.
    """
    decisions = await conn.fetch(
        "SELECT id, decision_number, decision_date, full_text, "
        " length(full_text) as chars, subject_categories "
        "FROM style_corpus ORDER BY decision_date NULLS LAST, created_at"
    )
    patterns = await conn.fetch(
        "SELECT id, pattern_type, pattern_text, context "
        "FROM style_patterns WHERE frequency > 0"
    )

    if not decisions or not patterns:
        return {
            "growth_curve": [],
            "decision_contributions": [],
            # Present in the populated response too, so the shape is stable.
            "total_patterns": 0,
            "headline": "אין עדיין מספיק נתונים",
        }

    def subjects_of(raw) -> list:
        """Decode a subject_categories value; bad or missing data gives []."""
        if isinstance(raw, str):
            try:
                raw = json.loads(raw)
            except ValueError:  # json.JSONDecodeError is a ValueError subclass
                return []
        return list(raw) if isinstance(raw, (list, tuple)) else []

    # Normalize texts once; a NULL full_text is treated as empty.
    normalized_decisions = [
        (d["id"], _strip_nikud(d["full_text"] or "")) for d in decisions
    ]

    # Index by decision id so the per-decision pass below does not have to
    # rescan every pattern.
    new_by_decision: dict = {}    # decision id -> patterns first seen there
    confirmed_counts: dict = {}   # decision id -> patterns seen again there
    total_patterns = 0

    for p in patterns:
        variants = _extract_pattern_variants(_strip_nikud(p["pattern_text"] or ""))
        if not variants:
            continue

        # normalized_decisions is chronological, so matches[0] is the
        # decision that introduced the pattern.
        matches = [
            dec_id
            for dec_id, text in normalized_decisions
            if any(v in text for v in variants)
        ]
        if not matches:
            continue

        total_patterns += 1
        first_seen, *later = matches
        new_by_decision.setdefault(first_seen, []).append({
            "type": p["pattern_type"],
            "text": p["pattern_text"],
            "context": p["context"] or "",
        })
        for dec_id in later:
            confirmed_counts[dec_id] = confirmed_counts.get(dec_id, 0) + 1

    # Per-decision: which patterns are new vs confirmed
    decision_contributions = []
    growth_curve = []
    cumulative = 0

    for d in decisions:
        dec_id = d["id"]
        new_patterns = new_by_decision.get(dec_id, [])
        # The first new pattern doubles as the decision's "highlight".
        highlight = new_patterns[0] if new_patterns else None
        number = d["decision_number"] or ""
        date = str(d["decision_date"]) if d["decision_date"] else ""

        decision_contributions.append({
            "decision_number": number,
            "decision_date": date,
            "chars": d["chars"],
            "subjects": subjects_of(d["subject_categories"]),
            "new_count": len(new_patterns),
            "confirmed_count": confirmed_counts.get(dec_id, 0),
            # cap to keep payload small
            "new_patterns": [dict(entry) for entry in new_patterns[:10]],
            "highlight": (
                {"type": highlight["type"], "text": highlight["text"]}
                if highlight else None
            ),
        })

        cumulative += len(new_patterns)
        growth_curve.append({
            "decision_number": number,
            "date": date,
            "cumulative": cumulative,
        })

    # Headline: after how many decisions had we seen at least 85% of patterns?
    # Integer arithmetic avoids rounding the threshold down (8 of 10 is 80%,
    # not 85%) and never fires when no pattern matched at all.
    n_decisions_to_85pct = None
    if total_patterns:
        n_decisions_to_85pct = next(
            (
                i
                for i, point in enumerate(growth_curve, 1)
                if point["cumulative"] * 100 >= total_patterns * 85
            ),
            None,
        )

    if n_decisions_to_85pct:
        headline = (
            f"אחרי {n_decisions_to_85pct} החלטות כבר למדתי 85% "
            f"מהסגנון שלך — השאר מיקד וחידד את הידע"
        )
    else:
        headline = f"למדתי {total_patterns} דפוסים מ-{len(decisions)} החלטות"

    return {
        "growth_curve": growth_curve,
        "decision_contributions": decision_contributions,
        "total_patterns": total_patterns,
        "headline": headline,
    }
|
||||
|
||||
|
||||
@app.get("/api/training/style-report")
async def training_style_report():
    """Return all four Style Portrait dashboard sections in one response.

    The sections are computed one after another on a single pooled
    connection and returned under the keys ``corpus``, ``anatomy``,
    ``signature_phrases`` and ``contribution``.
    """
    pool = await db.get_pool()
    async with pool.acquire() as conn:
        # Dict entries are evaluated top to bottom, so the queries still run
        # sequentially on the shared connection.
        report = {
            "corpus": await _compute_corpus_stats(conn),
            "anatomy": await _compute_anatomy(conn),
            "signature_phrases": await _compute_signature_phrases(conn),
            "contribution": await _compute_contribution(conn),
        }

    return report
|
||||
|
||||
|
||||
@app.get("/api/training/corpus")
|
||||
async def training_corpus_list():
|
||||
"""List all decisions currently in the style corpus."""
|
||||
|
||||
Reference in New Issue
Block a user