# legal-ai/mcp-server/src/legal_mcp/tools/training_enrichment.py

"""MCP tool wrappers for the style_corpus metadata-enrichment flow.

The actual extractor lives in
``legal_mcp.services.style_metadata_extractor``; this module just exposes
it as MCP tools that the chair (or a future automation) can call from
Claude Code.

Why these tools matter: the upload pipeline (`/api/training/upload` →
`_process_proofread_training`) inserts a style_corpus row with
``summary=''``, ``outcome=''``, ``key_principles=[]`` because LLM
extraction can't run from the FastAPI container (no claude CLI there).
This module fills that gap — call it from the host, where ``claude``
CLI is available, and the row gets enriched.
"""

from __future__ import annotations

from uuid import UUID

from legal_mcp.services import db, style_metadata_extractor
from legal_mcp.tools.envelope import err as _err, ok as _ok  # GAP-48: SSoT envelope


async def extract_decision_metadata(corpus_id: str, overwrite: bool = False) -> str:
    """Extract metadata for a decision in the style corpus.

    The metadata fields are summary, outcome, key_principles and
    appeal_subtype. With the default ``overwrite=False`` only empty fields
    are filled; pass ``overwrite=true`` to refresh values that were already
    written.

    Args:
        corpus_id: Textual UUID identifying the style_corpus row.
        overwrite: Forwarded unchanged to
            ``style_metadata_extractor.extract_and_apply``.

    Returns:
        The ``_ok`` envelope wrapping the extractor's result, or an ``_err``
        envelope when ``corpus_id`` cannot be parsed as a UUID or the
        extractor raises.
    """
    try:
        cid = UUID(corpus_id)
    except (ValueError, TypeError, AttributeError):
        # UUID() raises ValueError for malformed text, TypeError for None and
        # AttributeError for other non-string input. Every one of these means
        # "invalid id" and must produce the error envelope, not a crash.
        return _err("corpus_id לא תקין")
    try:
        result = await style_metadata_extractor.extract_and_apply(cid, overwrite=overwrite)
    except Exception as e:
        # Tool boundary: report the failure through the envelope. Some
        # exceptions (timeouts, for instance) carry an empty message, so fall
        # back to the class name rather than returning a blank error.
        return _err(str(e) or type(e).__name__)
    return _ok(result)


async def list_corpus_pending_enrichment(limit: int = 50) -> str:
    """List style_corpus records that are candidates for enrichment.

    A record qualifies when at least one of summary, outcome or
    key_principles is missing (NULL/empty string, or NULL/empty JSON array).
    Rows are ordered by ascending ``decision_date`` with undated rows last.

    Args:
        limit: Maximum number of rows requested from the database.

    Returns:
        The ``_ok`` envelope wrapping ``{"count": ..., "items": [...]}``.
        Each item carries ``corpus_id``, ``decision_number``,
        ``decision_date``, ``chars`` (length of the full text) and
        ``missing`` (names of the fields that are still empty).
    """
    pool = await db.get_pool()
    async with pool.acquire() as conn:
        records = await conn.fetch(
            """
            SELECT id, decision_number, decision_date,
                   length(full_text) AS chars,
                   coalesce(summary, '') = '' AS missing_summary,
                   coalesce(outcome, '') = '' AS missing_outcome,
                   coalesce(jsonb_array_length(key_principles), 0) = 0 AS missing_principles
            FROM style_corpus
            WHERE coalesce(summary, '') = ''
               OR coalesce(outcome, '') = ''
               OR coalesce(jsonb_array_length(key_principles), 0) = 0
            ORDER BY decision_date NULLS LAST
            LIMIT $1
            """,
            limit,
        )

    items = []
    for record in records:
        # Collect the names of the fields whose "missing" flag is set,
        # always in the order summary, outcome, key_principles.
        absent_fields = []
        if record["missing_summary"]:
            absent_fields.append("summary")
        if record["missing_outcome"]:
            absent_fields.append("outcome")
        if record["missing_principles"]:
            absent_fields.append("key_principles")

        # A missing date is reported as an empty string, not "None".
        decided_on = record["decision_date"]
        date_text = str(decided_on) if decided_on else ""

        items.append(
            {
                "corpus_id": str(record["id"]),
                "decision_number": record["decision_number"] or "",
                "decision_date": date_text,
                "chars": record["chars"],
                "missing": absent_fields,
            }
        )

    payload = {"count": len(items), "items": items}
    return _ok(payload)