feat(training): Style Studio — upload, rich corpus, lessons, curator portrait, chat

Six-phase upgrade of /training from a read-only dashboard into a full Style Studio for managing Daphna's style corpus. - Upload Sheet on /training: file → proofread preview → commit (no more CLI-only `upload-training` skill). - Rich corpus metadata: GET /api/training/corpus returns summary, outcome, key_principles, page_count, parties (regex), legal_citation, lessons_count. PATCH endpoint for chair edits. CorpusDetailDrawer with 4 tabs (details/content/lessons/patterns) replaces the bare table row. - LLM metadata enrichment: style_metadata_extractor + MCP tools (style_corpus_enrich, style_corpus_pending_enrichment) fill summary/outcome/key_principles via claude_session (free, host-side). - Per-decision lessons: new decision_lessons table + 4 REST endpoints + LessonsTab in drawer; hermes-curator now auto-posts findings as decision_lessons(source=curator). - Curator Portrait tab: prompt rendered with link to Gitea, recent curator findings, style_analyzer training prompts, propose-change form that writes proposals to data/curator-proposals/ for manual chair review (no auto-mutation of the agent file). - Style chat tab: SSE-streamed conversations with the style agent. New host-side pm2 service (legal-chat-service, port 8770) wraps claude CLI with stream-json + --resume continuation; FastAPI proxies via host.docker.internal. Zero API cost — uses chaim's claude.ai subscription. chat_conversations + chat_messages persist history. Architecture: keeps the existing rule that claude_session only runs on the host (not the container). The new legal-chat-service is the canonical bridge between the container and the local CLI for the chat feature; everything else (upload, metadata, lessons) stays within the container's existing capabilities. Audit script (scripts/audit_training_corpus.py) included for verifying which corpus rows still need enrichment. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-27 10:06:22 +00:00
parent 0629f19d5f
commit bb0cd7c6a2
23 changed files with 4568 additions and 75 deletions
--- a/web/app.py
+++ b/web/app.py
@@ -12,6 +12,7 @@ import subprocess
 import sys
 import time
 from contextlib import asynccontextmanager
+from datetime import date as date_type
 from pathlib import Path
 from uuid import UUID, uuid4

@@ -945,32 +946,648 @@ async def training_corpus_delete(corpus_id: str):
    return result


+def _format_legal_citation(decision_number: str, decision_date: str) -> str:
+    """Build the appeals-committee citation label for a corpus row.
+
+    The label has the shape "ערר 1130-25 ועדת ערר ירושלים (26.4.2026)":
+    the decision number, the fixed committee name, and the date rendered as
+    day.month.year without zero padding.
+
+    Args:
+        decision_number: Decision number; an empty value yields "".
+        decision_date: ISO ``YYYY-MM-DD`` date string, or "" when unknown.
+
+    Returns:
+        The citation string. The date suffix is omitted when the date is
+        empty or cannot be parsed as an ISO date.
+    """
+    if not decision_number:
+        return ""
+    citation = f"ערר {decision_number} ועדת ערר ירושלים"
+    if not decision_date:
+        return citation
+    try:
+        parsed = date_type.fromisoformat(decision_date)
+    except ValueError:
+        # An unparseable date is dropped rather than failing the whole label.
+        return citation
+    return f"{citation} ({parsed.day}.{parsed.month}.{parsed.year})"
+
+
+# Appellant-side openers, tried in order. Each captures the remainder of the
+# line (3-120 characters) that follows the label and its colon/whitespace.
+_PARTIES_PATTERNS = (
+    re.compile(r"העורר(?:ים|ת)?[:\s]+([^\n]{3,120})"),
+    re.compile(r"המבקש(?:ים|ת)?[:\s]+([^\n]{3,120})"),
+    re.compile(r"בעניין[:\s]+([^\n]{3,120})"),
+)
+# Respondent-side openers: an explicit label, or the line after a
+# stand-alone "versus" line.
+_RESPONDENT_PATTERNS = (
+    re.compile(r"המשיב(?:ים|ה|ות)?[:\s]+([^\n]{3,120})"),
+    re.compile(r"נגד\s*\n+\s*([^\n]{3,120})"),
+)
+
+
+def _extract_parties(text: str) -> dict[str, str]:
+    """Guess the appellant and respondent names from the head of a decision.
+
+    Only the first 5000 characters of ``text`` are scanned. For each side the
+    patterns are tried in declaration order and the first one that matches
+    wins; its capture is trimmed of surrounding spaces, dots, commas and
+    dashes. The result is a display hint only.
+
+    Args:
+        text: Decision full text (``None`` or "" is tolerated).
+
+    Returns:
+        ``{"appellant": ..., "respondent": ...}`` with "" for any side that
+        no pattern matched. Never raises on a miss.
+    """
+    head = (text or "")[:5000]
+
+    def first_capture(patterns) -> str:
+        # Lazily search with each pattern and stop at the first hit.
+        hits = (pat.search(head) for pat in patterns)
+        found = next((hit for hit in hits if hit), None)
+        return found.group(1).strip(" .,-—") if found else ""
+
+    return {
+        "appellant": first_capture(_PARTIES_PATTERNS),
+        "respondent": first_capture(_RESPONDENT_PATTERNS),
+    }
+
+
@app.get("/api/training/corpus")
 async def training_corpus_list():
-    """List all decisions currently in the style corpus."""
+    """List all decisions currently in the style corpus, with enriched metadata.
+
+    Each ``style_corpus`` row is LEFT JOINed to ``documents`` on
+    ``style_corpus.document_id`` to pick up ``page_count`` and the document
+    title. Rows whose ``document_id`` is NULL (or unmatched) still appear,
+    with ``page_count`` reported as 0 and ``doc_title`` as "".
+
+    NOTE(review): an earlier wording of this docstring described a
+    title-token fallback for rows without ``document_id``; the query below
+    does not implement one — confirm whether it is still planned.
+
+    Returns:
+        A list of dicts, newest ``created_at`` first, one per corpus row.
+    """
    pool = await db.get_pool()
    async with pool.acquire() as conn:
        rows = await conn.fetch(
-            "SELECT id, decision_number, decision_date, subject_categories, "
-            "       length(full_text) as chars, created_at "
-            "FROM style_corpus "
-            "ORDER BY created_at DESC"
+            """
+            SELECT sc.id,
+                   sc.decision_number,
+                   sc.decision_date,
+                   sc.subject_categories,
+                   length(sc.full_text) AS chars,
+                   substring(sc.full_text from 1 for 5000) AS head_text,
+                   sc.summary,
+                   sc.outcome,
+                   sc.key_principles,
+                   sc.appeal_subtype,
+                   sc.practice_area,
+                   sc.document_id,
+                   sc.created_at,
+                   d.page_count AS page_count,
+                   d.title       AS doc_title
+            FROM style_corpus sc
+            LEFT JOIN documents d ON d.id = sc.document_id
+            ORDER BY sc.created_at DESC
+            """
        )
-    return [
-        {
+    # One lookup for every row's lesson count; keyed by str(corpus id) below.
+    lessons_counts = await db.count_decision_lessons_per_corpus()
+    out = []
+    for r in rows:
+        # The two JSON columns may arrive as raw strings; decode them and
+        # fall back to an empty list when the stored text is not valid JSON.
+        cats = r["subject_categories"]
+        if isinstance(cats, str):
+            try:
+                cats = json.loads(cats)
+            except json.JSONDecodeError:
+                cats = []
+        kp = r["key_principles"]
+        if isinstance(kp, str):
+            try:
+                kp = json.loads(kp)
+            except json.JSONDecodeError:
+                kp = []
+        decision_date = str(r["decision_date"]) if r["decision_date"] else ""
+        # Only the 5000-character head is fetched, so full_text never leaves the DB.
+        parties = _extract_parties(r["head_text"] or "")
+        out.append({
            "id": str(r["id"]),
            "decision_number": r["decision_number"] or "",
-            "decision_date": str(r["decision_date"]) if r["decision_date"] else "",
-            "subject_categories": (
-                json.loads(r["subject_categories"])
-                if isinstance(r["subject_categories"], str)
-                else r["subject_categories"] or []
-            ),
+            "decision_date": decision_date,
+            "subject_categories": cats or [],
            "chars": r["chars"],
            "created_at": r["created_at"].isoformat() if r["created_at"] else "",
+            # ── enriched fields ──
+            "summary": r["summary"] or "",
+            "outcome": r["outcome"] or "",
+            "key_principles": kp or [],
+            "appeal_subtype": r["appeal_subtype"] or "",
+            "practice_area": r["practice_area"] or "",
+            "page_count": r["page_count"] or 0,
+            "document_id": str(r["document_id"]) if r["document_id"] else None,
+            "doc_title": r["doc_title"] or "",
+            "parties": parties,
+            "legal_citation": _format_legal_citation(r["decision_number"] or "", decision_date),
+            "lessons_count": lessons_counts.get(str(r["id"]), 0),
+        })
+    return out
+
+
+# ── Style-agent chat (delegated to legal-chat-service on host) ─────
+
+
+class ChatConversationCreate(BaseModel):
+    """Request body for creating a style-agent chat conversation."""
+
+    # Display title; the default is Hebrew for "New conversation".
+    title: str = "שיחה חדשה"
+    style_corpus_id: str | None = None     # optional — scope chat to a decision
+
+
+class ChatMessageRequest(BaseModel):
+    """Request body carrying one user message for a chat conversation."""
+
+    # Message text; the send endpoint strips it and rejects blank content.
+    content: str
+
+
+def _conv_to_json(row: dict) -> dict:
+    """Turn a chat_conversations row into its JSON-ready API shape.
+
+    ``id`` is required; every other column is optional and falls back to
+    "" (strings, timestamps), ``None`` (ids) or 0 (``message_count``).
+    """
+    corpus_id = row.get("style_corpus_id")
+    created = row.get("created_at")
+    last_message = row.get("last_message_at")
+    return {
+        "id": str(row["id"]),
+        "title": row.get("title") or "",
+        "style_corpus_id": str(corpus_id) if corpus_id else None,
+        "decision_number": row.get("decision_number") or "",
+        "claude_session_id": row.get("claude_session_id"),
+        "message_count": row.get("message_count", 0),
+        "created_at": created.isoformat() if created else "",
+        "last_message_at": last_message.isoformat() if last_message else "",
+    }
+
+
+def _msg_to_json(row: dict) -> dict:
+    """Turn a chat_messages row into its JSON-ready API shape.
+
+    ``id``, ``role`` and ``content`` are required; a missing or empty
+    ``created_at`` is rendered as "".
+    """
+    payload = {
+        "id": str(row["id"]),
+        "role": row["role"],
+        "content": row["content"],
+    }
+    created = row.get("created_at")
+    payload["created_at"] = created.isoformat() if created else ""
+    return payload
+
+
+@app.post("/api/training/chat/conversations")
+async def chat_create_conversation(body: ChatConversationCreate):
+    """Create a style-agent chat conversation and return it serialized.
+
+    Raises:
+        HTTPException: 400 when ``style_corpus_id`` is not a valid UUID,
+            500 when the database layer returns no row.
+    """
+    scoped_to: UUID | None = None
+    if body.style_corpus_id:
+        try:
+            scoped_to = UUID(body.style_corpus_id)
+        except ValueError:
+            raise HTTPException(400, "invalid style_corpus_id")
+    # A blank title falls back to the default Hebrew "New conversation".
+    title = body.title.strip() or "שיחה חדשה"
+    created = await db.create_chat_conversation(
+        title=title,
+        style_corpus_id=scoped_to,
+    )
+    if created:
+        return _conv_to_json(created)
+    raise HTTPException(500, "failed to create conversation")
+
+
+@app.get("/api/training/chat/conversations")
+async def chat_list_conversations(limit: int = 50):
+    """Return up to ``limit`` chat conversations, serialized for the API."""
+    conversations = await db.list_chat_conversations(limit=limit)
+    return list(map(_conv_to_json, conversations))
+
+
+@app.get("/api/training/chat/conversations/{conv_id}")
+async def chat_get_conversation(conv_id: str):
+    """Return one conversation together with its messages.
+
+    Raises:
+        HTTPException: 400 when ``conv_id`` is not a UUID, 404 when no such
+            conversation exists.
+    """
+    try:
+        conversation_id = UUID(conv_id)
+    except ValueError:
+        raise HTTPException(400, "invalid conv_id")
+    conversation = await db.get_chat_conversation(conversation_id)
+    if not conversation:
+        raise HTTPException(404, "conversation not found")
+    history = await db.list_chat_messages(conversation_id)
+    serialized_messages = [_msg_to_json(message) for message in history]
+    return {
+        "conversation": _conv_to_json(conversation),
+        "messages": serialized_messages,
+    }
+
+
+@app.delete("/api/training/chat/conversations/{conv_id}")
+async def chat_delete_conversation(conv_id: str):
+    """Delete a conversation and return the database layer's result dict.
+
+    Raises:
+        HTTPException: 400 when ``conv_id`` is not a UUID, 404 when the
+            result does not report ``deleted``.
+    """
+    try:
+        conversation_id = UUID(conv_id)
+    except ValueError:
+        raise HTTPException(400, "invalid conv_id")
+    outcome = await db.delete_chat_conversation(conversation_id)
+    if outcome.get("deleted"):
+        return outcome
+    raise HTTPException(404, "conversation not found")
+
+
+@app.post("/api/training/chat/conversations/{conv_id}/messages")
+async def chat_send_message(conv_id: str, body: ChatMessageRequest):
+    """Accept a user message and hand it to the chat proxy for streaming.
+
+    The response is whatever ``web.chat_proxy.stream_chat_message`` returns;
+    that module is imported lazily, after validation has passed.
+
+    Raises:
+        HTTPException: 400 when ``conv_id`` is not a UUID or the message is
+            blank after stripping.
+    """
+    try:
+        conversation_id = UUID(conv_id)
+    except ValueError:
+        raise HTTPException(400, "invalid conv_id")
+    message = (body.content or "").strip()
+    if message == "":
+        raise HTTPException(400, "content is required")
+    from web import chat_proxy
+    return await chat_proxy.stream_chat_message(conversation_id, message)
+
+
+@app.get("/api/training/chat/health")
+async def chat_health():
+    """Report whether the chat service answers on its ``/health`` endpoint.
+
+    Issues one GET with a 5-second timeout against
+    ``chat_proxy.CHAT_SERVICE_URL``. Any failure is reported in the payload
+    rather than raised, so the UI can show a "chat unavailable" state instead
+    of letting messages fail mid-stream.
+
+    Returns:
+        ``{"reachable", "status", "url"}`` on a completed request, or
+        ``{"reachable": False, "error", "url"}`` when the request failed.
+    """
+    from web import chat_proxy
+
+    base_url = chat_proxy.CHAT_SERVICE_URL
+    try:
+        async with httpx.AsyncClient(timeout=httpx.Timeout(5.0)) as client:
+            response = await client.get(f"{base_url}/health")
+    except Exception as e:
+        # Deliberately broad: a health probe must never turn into a 500.
+        return {"reachable": False, "error": str(e), "url": base_url}
+    code = response.status_code
+    return {"reachable": code == 200, "status": code, "url": base_url}
+
+
+# ── Curator portrait — read prompt + stats + accept proposals ──────
+
+
+# The curator agent's prompt is symlinked into Paperclip, but the source
+# lives in the legal-ai repo. Resolve via env so the container (where the
+# agent file is mounted from a different path) and the host both work.
+# Default: <repo root>/.claude/agents, i.e. two levels above this file.
+_AGENTS_DIR = Path(os.environ.get(
+    "AGENTS_DIR",
+    str(Path(__file__).resolve().parent.parent / ".claude" / "agents"),
+))
+# Directory where proposed prompt changes are written as markdown files.
+# Default: <repo root>/data/curator-proposals; override with the env var.
+_CURATOR_PROPOSALS_DIR = Path(os.environ.get(
+    "CURATOR_PROPOSALS_DIR",
+    str(Path(__file__).resolve().parent.parent / "data" / "curator-proposals"),
+))
+# Base URL of the repository on Gitea, used only to build source links.
+_GITEA_REPO_BASE = os.environ.get(
+    "GITEA_REPO_BASE",
+    "https://gitea.nautilus.marcusgroup.org/ezer-mishpati/legal-ai",
+)
+
+
+@app.get("/api/training/curator/prompt")
+async def get_curator_prompt():
+    """Serve the hermes-curator prompt file read-only, with its Gitea link.
+
+    The file is read from ``_AGENTS_DIR / "hermes-curator.md"``. This
+    endpoint never writes to it.
+
+    Returns:
+        A dict with the file ``content``, ``filename``, size in ``bytes``,
+        ``last_modified`` (mtime as a POSIX timestamp) and ``gitea_url``.
+
+    Raises:
+        HTTPException: 404 when the file does not exist, 500 when it exists
+            but cannot be read.
+    """
+    prompt_file = _AGENTS_DIR / "hermes-curator.md"
+    if not prompt_file.exists():
+        raise HTTPException(404, f"curator prompt not found at {prompt_file}")
+    try:
+        text = prompt_file.read_text(encoding="utf-8")
+        info = prompt_file.stat()
+    except OSError as e:
+        raise HTTPException(500, f"failed to read curator prompt: {e}")
+    source_link = f"{_GITEA_REPO_BASE}/src/branch/main/.claude/agents/hermes-curator.md"
+    return {
+        "content": text,
+        "filename": prompt_file.name,
+        "bytes": info.st_size,
+        "last_modified": info.st_mtime,
+        "gitea_url": source_link,
+    }
+
+
+@app.get("/api/training/curator/style-analyzer-prompt")
+async def get_style_analyzer_prompt():
+    """Return the system prompt that style_analyzer.py uses to extract patterns.
+
+    Surfaces the *training-time* prompt (Claude Opus 1M context) so the
+    chair can compare it against the curator's post-export prompt. Both
+    are shown side-by-side in the curator-portrait tab.
+
+    Returns:
+        A dict with ``analysis_prompt``, ``single_decision_prompt``,
+        ``synthesis_prompt`` and ``max_input_tokens``, read from the
+        ``style_analyzer`` module's constants.
+
+    Raises:
+        HTTPException: 500 when the module cannot be imported or lacks one
+            of those attributes.
+    """
+    # Imported lazily inside the handler so that a missing or broken
+    # legal_mcp package surfaces as a 500 from this endpoint only. The
+    # prompts are the ones defined in
+    # mcp-server/src/legal_mcp/services/style_analyzer.py.
+    try:
+        from legal_mcp.services import style_analyzer
+        return {
+            "analysis_prompt": style_analyzer.ANALYSIS_PROMPT,
+            "single_decision_prompt": style_analyzer.SINGLE_DECISION_PROMPT,
+            "synthesis_prompt": style_analyzer.SYNTHESIS_PROMPT,
+            "max_input_tokens": style_analyzer.MAX_INPUT_TOKENS,
        }
-        for r in rows
-    ]
+    except Exception as e:
+        raise HTTPException(500, f"failed to load style_analyzer prompt: {e}")
+
+
+@app.get("/api/training/curator/stats")
+async def get_curator_stats():
+    """Aggregate curator-sourced lesson counts plus the ten newest findings.
+
+    All queries run on one pooled connection so the Curator-Portrait tab can
+    render its summary card from a single request.
+
+    Returns:
+        A dict with ``total_findings``, ``decisions_with_findings``,
+        ``decisions_total``, ``findings_applied`` (all ints, 0 when the
+        query yields nothing) and ``recent_findings`` (newest first, at
+        most 10 entries).
+    """
+    pool = await db.get_pool()
+    async with pool.acquire() as conn:
+        findings_total = await conn.fetchval(
+            "SELECT count(*) FROM decision_lessons WHERE source = 'curator'"
+        )
+        decisions_covered = await conn.fetchval(
+            "SELECT count(DISTINCT style_corpus_id) FROM decision_lessons "
+            "WHERE source = 'curator'"
+        )
+        corpus_size = await conn.fetchval("SELECT count(*) FROM style_corpus")
+        applied_total = await conn.fetchval(
+            "SELECT count(*) FROM decision_lessons "
+            "WHERE source = 'curator' AND applied_to_skill = true"
+        )
+        # Ten most recent curator findings, joined to their decision.
+        latest = await conn.fetch(
+            """
+            SELECT dl.id, dl.lesson_text, dl.category, dl.applied_to_skill,
+                   dl.created_at,
+                   sc.decision_number, sc.decision_date
+            FROM decision_lessons dl
+            JOIN style_corpus sc ON sc.id = dl.style_corpus_id
+            WHERE dl.source = 'curator'
+            ORDER BY dl.created_at DESC
+            LIMIT 10
+            """
+        )
+
+    recent = []
+    for finding in latest:
+        decided_on = finding["decision_date"]
+        created = finding["created_at"]
+        recent.append({
+            "id": str(finding["id"]),
+            "lesson_text": finding["lesson_text"],
+            "category": finding["category"],
+            "applied_to_skill": bool(finding["applied_to_skill"]),
+            "decision_number": finding["decision_number"] or "",
+            "decision_date": str(decided_on) if decided_on else "",
+            "created_at": created.isoformat() if created else "",
+        })
+    return {
+        "total_findings": findings_total or 0,
+        "decisions_with_findings": decisions_covered or 0,
+        "decisions_total": corpus_size or 0,
+        "findings_applied": applied_total or 0,
+        "recent_findings": recent,
+    }
+
+
+class CuratorProposal(BaseModel):
+    """Request body for proposing a change to the curator prompt."""
+
+    # Used for the file-name slug and the markdown header; must not be blank.
+    title: str
+    proposed_change: str       # markdown — what to change in the prompt
+    rationale: str             # markdown — why
+
+
+@app.post("/api/training/curator/proposals")
+async def create_curator_proposal(body: CuratorProposal):
+    """Save a proposed change to the curator prompt as a file on disk.
+
+    No automatic commit, no overwrite — the chair (chaim) reviews the
+    file manually and applies it through git. This is intentional: the
+    prompt is too load-bearing to mutate from a web UI.
+
+    The file is named ``<today>-<slug>.md`` inside ``_CURATOR_PROPOSALS_DIR``;
+    when that name is taken, ``-2``, ``-3``, ... is appended.
+
+    Returns:
+        ``{"saved": True, "filename", "path", "bytes"}`` for the new file.
+
+    Raises:
+        HTTPException: 400 when ``title`` or ``proposed_change`` is blank,
+            500 when the directory or file cannot be created.
+    """
+    title = (body.title or "").strip()
+    if not title:
+        raise HTTPException(400, "title is required")
+    if not body.proposed_change.strip():
+        raise HTTPException(400, "proposed_change is required")
+
+    # Slug-ish filename — strip anything that isn't a Hebrew letter, ASCII
+    # letter, digit, hyphen, or underscore. Hebrew letters are explicitly
+    # allowed because most proposals will be in Hebrew.
+    slug = re.sub(r"[^\w֐-׿\-]+", "-", title)[:60].strip("-_") or "proposal"
+    today = date_type.today().isoformat()
+
+    md = (
+        f"# הצעת שינוי לפרומפט hermes-curator\n\n"
+        f"- **תאריך:** {today}\n"
+        f"- **כותרת:** {title}\n\n"
+        f"## שינוי מוצע\n\n{body.proposed_change.strip()}\n\n"
+        f"## נימוק\n\n{body.rationale.strip() or '(לא ניתן)'}\n"
+    )
+
+    try:
+        _CURATOR_PROPOSALS_DIR.mkdir(parents=True, exist_ok=True)
+        idx = 1
+        while True:
+            suffix = "" if idx == 1 else f"-{idx}"
+            path = _CURATOR_PROPOSALS_DIR / f"{today}-{slug}{suffix}.md"
+            try:
+                # Mode "x" creates the file atomically and fails if it
+                # already exists, so two concurrent requests with the same
+                # title can never overwrite each other (an exists()-then-
+                # write check leaves a window for exactly that).
+                with path.open("x", encoding="utf-8") as fh:
+                    fh.write(md)
+                break
+            except FileExistsError:
+                # Name taken — move on to the next numeric suffix.
+                idx += 1
+    except OSError as e:
+        raise HTTPException(500, f"failed to write proposal: {e}")
+    return {
+        "saved": True,
+        "filename": path.name,
+        "path": str(path),
+        "bytes": len(md.encode("utf-8")),
+    }
+
+
+@app.get("/api/training/curator/proposals")
+async def list_curator_proposals():
+    """List proposed-change files in data/curator-proposals/, newest first.
+
+    Only regular files with a ``.md`` suffix (case-insensitive) are listed.
+
+    Returns:
+        A list of ``{"filename", "bytes", "modified_at"}`` dicts ordered by
+        modification time, newest first; ``[]`` when the directory does not
+        exist.
+    """
+    if not _CURATOR_PROPOSALS_DIR.exists():
+        return []
+    dated_items = []
+    for p in _CURATOR_PROPOSALS_DIR.iterdir():
+        if p.suffix.lower() != ".md":
+            continue
+        try:
+            if not p.is_file():
+                continue
+            # Stat once and reuse it for both sorting and the payload.
+            stat = p.stat()
+        except OSError:
+            # The file vanished (or became unreadable) between the directory
+            # listing and the stat; skip it instead of failing the listing.
+            continue
+        dated_items.append((stat.st_mtime, {
+            "filename": p.name,
+            "bytes": stat.st_size,
+            "modified_at": stat.st_mtime,
+        }))
+    dated_items.sort(key=lambda pair: pair[0], reverse=True)
+    return [item for _, item in dated_items]
+
+
+# ── Per-decision lessons (decision_lessons table) ──────────────────
+
+
+class LessonCreate(BaseModel):
+    """Request body for adding a lesson to a corpus decision."""
+
+    # Lesson body; the create endpoint strips it and rejects blank text.
+    lesson_text: str
+    # Must be one of _LESSON_CATEGORIES (checked by the create endpoint).
+    category: str = "general"
+    # Must be one of _LESSON_SOURCES (checked by the create endpoint).
+    source: str = "manual"
+
+
+class LessonPatch(BaseModel):
+    """Request body for editing a lesson; every field is optional."""
+
+    lesson_text: str | None = None
+    # When given, must be one of _LESSON_CATEGORIES (checked by the endpoint).
+    category: str | None = None
+    applied_to_skill: bool | None = None
+
+
+# Allowed values for a lesson's ``category`` / ``source``; the lesson
+# endpoints answer 400 for anything outside these sets.
+_LESSON_CATEGORIES = {"style", "structure", "lexicon", "tabular", "general"}
+_LESSON_SOURCES = {"manual", "curator", "chair", "style_analyzer"}
+
+
+def _lesson_to_json(row: dict) -> dict:
+    """Turn a decision_lessons row into its JSON-ready API shape.
+
+    Ids are stringified, ``applied_to_skill`` is coerced to ``bool``, and
+    missing or empty timestamps are rendered as "".
+    """
+    created = row.get("created_at")
+    updated = row.get("updated_at")
+    payload = {
+        "id": str(row["id"]),
+        "style_corpus_id": str(row["style_corpus_id"]),
+    }
+    for column in ("lesson_text", "category", "source"):
+        payload[column] = row[column]
+    payload["applied_to_skill"] = bool(row["applied_to_skill"])
+    payload["created_by"] = row.get("created_by", "")
+    payload["created_at"] = created.isoformat() if created else ""
+    payload["updated_at"] = updated.isoformat() if updated else ""
+    return payload
+
+
+@app.get("/api/training/corpus/{corpus_id}/lessons")
+async def list_corpus_lessons(corpus_id: str):
+    """Return the lessons attached to one corpus decision.
+
+    Raises:
+        HTTPException: 400 when ``corpus_id`` is not a valid UUID.
+    """
+    try:
+        corpus_uuid = UUID(corpus_id)
+    except ValueError:
+        raise HTTPException(400, "invalid corpus_id")
+    lessons = await db.list_decision_lessons(corpus_uuid)
+    return list(map(_lesson_to_json, lessons))
+
+
+@app.post("/api/training/corpus/{corpus_id}/lessons")
+async def add_corpus_lesson(corpus_id: str, body: LessonCreate):
+    """Attach a new lesson to a corpus decision and return it serialized.
+
+    Raises:
+        HTTPException: 400 for an invalid ``corpus_id``, blank
+            ``lesson_text``, or a ``category`` / ``source`` outside the
+            allowed sets; 500 when the insert returns no row.
+    """
+    try:
+        corpus_uuid = UUID(corpus_id)
+    except ValueError:
+        raise HTTPException(400, "invalid corpus_id")
+
+    lesson_body = (body.lesson_text or "").strip()
+    if lesson_body == "":
+        raise HTTPException(400, "lesson_text is required")
+    if body.category not in _LESSON_CATEGORIES:
+        raise HTTPException(400, f"invalid category; allowed: {sorted(_LESSON_CATEGORIES)}")
+    if body.source not in _LESSON_SOURCES:
+        raise HTTPException(400, f"invalid source; allowed: {sorted(_LESSON_SOURCES)}")
+
+    inserted = await db.add_decision_lesson(
+        corpus_uuid,
+        lesson_text=lesson_body,
+        category=body.category,
+        source=body.source,
+    )
+    if inserted:
+        return _lesson_to_json(inserted)
+    raise HTTPException(500, "failed to insert lesson")
+
+
+@app.patch("/api/training/lessons/{lesson_id}")
+async def patch_corpus_lesson(lesson_id: str, body: LessonPatch):
+    """Update the provided fields of a lesson.
+
+    ``lesson_text`` is held to the same rule as on creation: it is stripped,
+    and a blank value is rejected, so an edit cannot leave an empty lesson
+    behind.
+
+    Returns:
+        The database layer's result dict. When nothing was updated for a
+        reason other than "not found", that dict is returned with HTTP 200
+        so the caller can read its ``reason``.
+
+    Raises:
+        HTTPException: 400 for an invalid ``lesson_id``, an unknown
+            ``category`` or a blank ``lesson_text``; 404 when the lesson
+            does not exist.
+    """
+    try:
+        lid = UUID(lesson_id)
+    except ValueError:
+        raise HTTPException(400, "invalid lesson_id")
+    if body.category is not None and body.category not in _LESSON_CATEGORIES:
+        raise HTTPException(400, f"invalid category; allowed: {sorted(_LESSON_CATEGORIES)}")
+    lesson_text = body.lesson_text
+    if lesson_text is not None:
+        # None means "leave unchanged"; an explicit value must be non-blank.
+        lesson_text = lesson_text.strip()
+        if not lesson_text:
+            raise HTTPException(400, "lesson_text must not be empty")
+    result = await db.update_decision_lesson(
+        lid,
+        lesson_text=lesson_text,
+        category=body.category,
+        applied_to_skill=body.applied_to_skill,
+    )
+    if not result.get("updated") and result.get("reason") == "not found":
+        raise HTTPException(404, "lesson not found")
+    # Either updated, or "nothing to update" — 200 with reason.
+    return result
+
+
+@app.delete("/api/training/lessons/{lesson_id}")
+async def delete_corpus_lesson(lesson_id: str):
+    """Delete a lesson and return the database layer's result dict.
+
+    Raises:
+        HTTPException: 400 when ``lesson_id`` is not a UUID, 404 when the
+            result does not report ``deleted``.
+    """
+    try:
+        lesson_uuid = UUID(lesson_id)
+    except ValueError:
+        raise HTTPException(400, "invalid lesson_id")
+    outcome = await db.delete_decision_lesson(lesson_uuid)
+    if outcome.get("deleted"):
+        return outcome
+    raise HTTPException(404, "lesson not found")
+
+
+@app.get("/api/training/corpus/{corpus_id}/full-text")
+async def training_corpus_full_text(corpus_id: str):
+    """Fetch the stored ``full_text`` of a single corpus row.
+
+    Served separately from the list endpoint because the text is large and
+    the table view only needs counts; the drawer requests it on demand.
+
+    Returns:
+        ``{"id", "decision_number", "full_text"}``, where ``id`` echoes the
+        path parameter as given.
+
+    Raises:
+        HTTPException: 400 when ``corpus_id`` is not a UUID, 404 when no
+            such row exists.
+    """
+    try:
+        corpus_uuid = UUID(corpus_id)
+    except ValueError:
+        raise HTTPException(400, "invalid corpus_id")
+
+    pool = await db.get_pool()
+    async with pool.acquire() as conn:
+        record = await conn.fetchrow(
+            "SELECT decision_number, full_text FROM style_corpus WHERE id = $1",
+            corpus_uuid,
+        )
+    if record is None or not record:
+        raise HTTPException(404, "corpus row not found")
+
+    payload = {"id": corpus_id}
+    payload["decision_number"] = record["decision_number"] or ""
+    payload["full_text"] = record["full_text"] or ""
+    return payload
+
+
+class TrainingCorpusPatch(BaseModel):
+    """Editable metadata fields on a style_corpus row.
+
+    full_text is intentionally NOT editable — the corpus is write-once.
+    For corrections, re-upload the decision via /api/training/upload.
+
+    A field left as ``None`` is excluded from the update by the PATCH
+    endpoint, so ``None`` means "leave unchanged".
+    """
+    decision_number: str | None = None
+    decision_date: str | None = None       # ISO YYYY-MM-DD, or "" to clear
+    # The two list fields are JSON-encoded by the endpoint before the UPDATE.
+    subject_categories: list[str] | None = None
+    summary: str | None = None
+    outcome: str | None = None
+    key_principles: list[str] | None = None
+    appeal_subtype: str | None = None
+    practice_area: str | None = None
+
+
+@app.patch("/api/training/corpus/{corpus_id}")
+async def training_corpus_patch(corpus_id: str, patch: TrainingCorpusPatch):
+    """Apply a partial metadata update to one corpus row.
+
+    Only fields present (non-``None``) in ``patch`` are written. Column names
+    in the generated SQL come from the model's own field names; values are
+    always bound as positional parameters.
+
+    Returns:
+        ``{"updated": False, "reason": ...}`` when the patch is empty,
+        otherwise a summary of the updated row.
+
+    Raises:
+        HTTPException: 400 for an invalid ``corpus_id`` or ``decision_date``,
+            404 when no row has that id.
+    """
+    try:
+        corpus_uuid = UUID(corpus_id)
+    except ValueError:
+        raise HTTPException(400, "invalid corpus_id")
+
+    changes = patch.model_dump(exclude_none=True)
+    if not changes:
+        return {"updated": False, "reason": "no fields to update"}
+
+    # "" clears the date (SQL NULL); anything else must parse as an ISO date.
+    if "decision_date" in changes:
+        raw_date = changes["decision_date"]
+        if raw_date == "":
+            changes["decision_date"] = None
+        else:
+            try:
+                changes["decision_date"] = date_type.fromisoformat(raw_date)
+            except ValueError as e:
+                raise HTTPException(400, f"invalid decision_date: {e}")
+
+    # The list-valued fields are sent to the database as JSON text.
+    for jsonb_column in ("subject_categories", "key_principles"):
+        if jsonb_column in changes:
+            changes[jsonb_column] = json.dumps(changes[jsonb_column])
+
+    # $1 is reserved for the row id, so column placeholders start at $2.
+    assignments = []
+    params = []
+    for position, (column, value) in enumerate(changes.items(), start=2):
+        assignments.append(f"{column} = ${position}")
+        params.append(value)
+    statement = (
+        f"UPDATE style_corpus SET {', '.join(assignments)} "
+        "WHERE id = $1 "
+        "RETURNING id, decision_number, decision_date, summary, outcome"
+    )
+
+    pool = await db.get_pool()
+    async with pool.acquire() as conn:
+        updated_row = await conn.fetchrow(statement, corpus_uuid, *params)
+    if not updated_row:
+        raise HTTPException(404, "corpus row not found")
+
+    new_date = updated_row["decision_date"]
+    return {
+        "updated": True,
+        "id": str(updated_row["id"]),
+        "decision_number": updated_row["decision_number"] or "",
+        "decision_date": str(new_date) if new_date else "",
+        "summary_len": len(updated_row["summary"] or ""),
+        "outcome_len": len(updated_row["outcome"] or ""),
+    }


 # Headers that defeat proxy buffering for SSE streams. `X-Accel-Buffering: no`