Add training corpus UI with Nevo proofreading pipeline

- New proofreader service strips Nevo editorial additions (front matter, postamble, page headers, watermarks, inline codes) from DOCX/PDF/MD - PDF pages use Google Vision OCR for clean Hebrew RTL extraction - New training page at #/training with drag-and-drop upload, automatic metadata extraction (decision number, date, categories), reviewable preview, and style pattern report grouped by type - API endpoints: /api/training/{analyze,upload,corpus,patterns, analyze-style,analyze-style/status} - Fix claude_session.query to pipe prompt via stdin, avoiding ARG_MAX overflow when analyzing 900K+ char corpus - CLI scripts for batch proofreading and corpus upload Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-11 11:04:58 +00:00
parent ecda95d610
commit 32f18de049
6 changed files with 1960 additions and 3 deletions
--- a/mcp-server/src/legal_mcp/services/claude_session.py
+++ b/mcp-server/src/legal_mcp/services/claude_session.py
@@ -24,6 +24,9 @@ LONG_TIMEOUT = 300  # For complex tasks like block writing
 def query(prompt: str, timeout: int = DEFAULT_TIMEOUT, max_turns: int = 1) -> str:
    """Send a prompt to Claude Code headless and return the text response.
    Passes the prompt via stdin (not argv) to avoid the OS ARG_MAX limit —
    prompts can be 500K+ chars when analyzing a full style corpus.
    Args:
        prompt: The prompt to send.
        timeout: Max seconds to wait.
@@ -36,14 +39,18 @@ def query(prompt: str, timeout: int = DEFAULT_TIMEOUT, max_turns: int = 1) -> st
        RuntimeError: If claude CLI is not available or fails.
    """
    cmd = [
-        "claude", "-p", prompt,
+        "claude", "-p",
        "--output-format", "json",
        "--max-turns", str(max_turns),
    ]
    try:
        result = subprocess.run(
-            cmd, capture_output=True, text=True, timeout=timeout,
+            cmd,
            input=prompt,
            capture_output=True,
            text=True,
            timeout=timeout,
        )
    except FileNotFoundError:
        raise RuntimeError("Claude CLI not found. Install Claude Code or add 'claude' to PATH.")
--- a/mcp-server/src/legal_mcp/services/proofreader.py
+++ b/mcp-server/src/legal_mcp/services/proofreader.py
@@ -0,0 +1,404 @@
 """Nevo proofreading service for training corpus.
 Strips Nevo editorial additions (front matter, back matter, page headers,
 watermarks, inline watermark codes) from legal decision DOCX/PDF/MD files.
 Also extracts metadata (decision number, date, subject categories) via
 heuristics on cleaned text.
 Used by:
  * CLI script: scripts/proofread_training_corpus.py
  * Web API:    /api/training/analyze
 """
 from __future__ import annotations
 import asyncio
 import re
 import time
 from datetime import date as date_type
 from pathlib import Path
 from typing import Any
 import fitz
 from docx import Document
 from google.cloud import vision
 from legal_mcp import config
 # ── Nevo pattern detection ────────────────────────────────────────
 NEVO_PREAMBLE_HEADERS = (
    "ספרות:",
    "חקיקה שאוזכרה:",
    "מיני-רציו:",
 )
 DECISION_OPENING = re.compile(
    r"^(עניינו\s|ענייננו\s|עסקינן\s|בפנינו\s|לפנינו\s|בערר\s+שלפנינו|זהו\s+ערר)"
 )
 DECISION_SECTION_HEADERS = {
    "רקע",
    "פתח דבר",
    "תמצית טענות הצדדים",
    "העובדות",
    "הרקע העובדתי",
    "מבוא",
 }
 NEVO_POSTAMBLE_MARKERS = (
    "5129371512937154678313",
    "בעניין עריכה ושינויים במסמכי פסיקה",
    "נוסח מסמך זה כפוף לשינויי ניסוח ועריכה",
 )
 NEVO_INLINE_CODE_RE = re.compile(r"^0?(5129371|54678313)\d*")
 PDF_PAGE_HEADER_RE = re.compile(
    r"\s*עמוד\s*\n?\s*\d+\s*\n?\s*(?:מתוך|בן)\s*\n?\s*\d+\s*"
 )
 PDF_PAGE_ORPHAN_RE = re.compile(r"(?m)^עמוד[^\n]{0,12}$")
 PDF_PAGE_NUM_LINE_RE = re.compile(r"(?m)^\s*עמוד\s*\n?\s*\d+[·.*]?\s*$")
 NEVO_URL_RE = re.compile(
    r"(nevo\.co\.il|neto\.co\.il|netocoal|neetocoal|nevocoal|nevo\.co|rawo\.co\.il)",
    re.IGNORECASE,
 )
 _FOOTER_JUNK_RE = re.compile(
    r"^("
    r"\s*|"
    r"[-·*.\"\'׳״]+|"
    r"\d{1,3}[\s\-·*.\"\'׳״]*|"
    r"עמוד[\s\d\-·*.\"\'׳״]*|"
    r"[-·*\s\"\'׳״]*[a-zA-Z][a-zA-Z0-9 .\-·*_]{0,30}"
    r")$"
 )
 # Hebrew abbreviation quote fixes — Google Vision renders ״ as 'יי'
 _HEBREW_ABBREV_FIXES: dict[str, str] = {
    "עוהייד": 'עוה"ד', "עוייד": 'עו"ד', "הנייל": 'הנ"ל', "מצייב": 'מצ"ב',
    "ביהמייש": 'ביהמ"ש', "תייז": 'ת"ז', "עייי": 'ע"י', "אחייכ": 'אח"כ',
    "סייק": 'ס"ק', "דייר": 'ד"ר', "חווייד": 'חוו"ד', "מייר": 'מ"ר',
    "יחייד": 'יח"ד', "בייכ": 'ב"כ', "בייה": 'ב"ה', "שייח": 'ש"ח',
    "יוייר": 'יו"ר', "בליימ": 'בל"מ', "תבייע": 'תב"ע', "תמייא": 'תמ"א',
    "סייה": 'ס"ה', "שייפ": 'ש"פ', "שצייפ": 'שצ"פ', "שבייצ": 'שב"צ',
    "עסיים": 'עס"ם', "הייה": 'ה"ה', "פסייד": 'פס"ד', "תיידא": 'תיד"א',
    "בגייץ": 'בג"ץ', "עתיים": 'עת"ם', "עעיים": 'עע"ם',
    "כייא": 'כ"א', "כייב": 'כ"ב', "כייג": 'כ"ג', "כייד": 'כ"ד',
    "כייה": 'כ"ה', "כייו": 'כ"ו', "כייז": 'כ"ז', "כייח": 'כ"ח', "כייט": 'כ"ט',
    "לייא": 'ל"א',
    "יייא": 'י"א', "יייב": 'י"ב', "יייג": 'י"ג', "יייד": 'י"ד',
    "טייו": 'ט"ו', "טייז": 'ט"ז', "יייז": 'י"ז', "יייח": 'י"ח', "יייט": 'י"ט',
    "תשפייא": 'תשפ"א', "תשפייב": 'תשפ"ב', "תשפייג": 'תשפ"ג',
    "תשפייד": 'תשפ"ד', "תשפייה": 'תשפ"ה', "תשפייו": 'תשפ"ו',
    "תשפיין": 'תשפ"ן',
 }
 _ABBREV_PATTERN = re.compile(
    "|".join(re.escape(k) for k in sorted(_HEBREW_ABBREV_FIXES, key=len, reverse=True))
 )
 def _fix_hebrew_quotes(text: str) -> str:
    """Restore the quote mark in known Hebrew abbreviations.

    Every substring matched by ``_ABBREV_PATTERN`` is swapped for its
    corrected spelling looked up in ``_HEBREW_ABBREV_FIXES``; all other
    text is returned unchanged.
    """
    def _corrected(match):
        # The pattern is built from the dict keys, so the lookup always hits.
        return _HEBREW_ABBREV_FIXES[match.group()]

    return _ABBREV_PATTERN.sub(_corrected, text)
 # ── Google Vision OCR ────────────────────────────────────────────
 _vision_client: vision.ImageAnnotatorClient | None = None
 def _get_vision_client() -> vision.ImageAnnotatorClient:
    """Return the module-wide Vision client, creating it on first use.

    Raises:
        RuntimeError: If ``config.GOOGLE_CLOUD_VISION_API_KEY`` is empty
            when the client has to be created.
    """
    global _vision_client
    if _vision_client is not None:
        return _vision_client
    if not config.GOOGLE_CLOUD_VISION_API_KEY:
        raise RuntimeError("GOOGLE_CLOUD_VISION_API_KEY not set")
    options = {"api_key": config.GOOGLE_CLOUD_VISION_API_KEY}
    _vision_client = vision.ImageAnnotatorClient(client_options=options)
    return _vision_client
 def _ocr_page_image(image_bytes: bytes, page_num: int) -> str:
    """OCR one page image with Google Vision and return its text.

    Args:
        image_bytes: Encoded image content passed to ``vision.Image``.
        page_num: Page number, used only in the error message.

    Returns:
        The full-text annotation (empty string when the response has
        none), passed through ``_fix_hebrew_quotes``.

    Raises:
        RuntimeError: If the Vision response carries an error message, or
            if the client cannot be created.
    """
    client = _get_vision_client()
    image = vision.Image(content=image_bytes)
    response = client.document_text_detection(
        image=image,
        # Hint Hebrew so the recognizer favors Hebrew script.
        image_context=vision.ImageContext(language_hints=["he"]),
    )
    if response.error.message:
        raise RuntimeError(f"Vision error page {page_num}: {response.error.message}")
    text = response.full_text_annotation.text if response.full_text_annotation else ""
    return _fix_hebrew_quotes(text)
 # ── DOCX proofreading ────────────────────────────────────────────
 def _find_decision_start(paragraphs: list[str]) -> int:
    """Return the index of the first paragraph of the real decision body.

    If none of the first ten paragraphs begins with a Nevo preamble header
    the document is treated as having no preamble and 0 is returned.
    Otherwise the start is located by, in order:

    1. the first paragraph that is a known section header or matches
       ``DECISION_OPENING``;
    2. the first long (>80 chars), non-``*`` paragraph within 14 paragraphs
       after the first "קבעה כלהלן" / "קבעה את הדברים הבאים" paragraph;
    3. a fixed fallback of index 10 (clamped to the last paragraph).

    Args:
        paragraphs: Non-empty paragraph texts in document order.

    Returns:
        Index into ``paragraphs`` where the decision body starts.
    """
    # Strip before testing: the remaining checks in this function already
    # ignore surrounding whitespace, and an indented header must not hide
    # the preamble and disable stripping altogether.
    has_nevo_preamble = any(
        p.strip().startswith(NEVO_PREAMBLE_HEADERS) for p in paragraphs[:10]
    )
    if not has_nevo_preamble:
        return 0
    for i, p in enumerate(paragraphs):
        stripped = p.strip()
        if stripped in DECISION_SECTION_HEADERS:
            return i
        if DECISION_OPENING.match(stripped):
            return i
    for i, p in enumerate(paragraphs):
        if "קבעה כלהלן" in p or "קבעה את הדברים הבאים" in p:
            for j in range(i + 1, min(i + 15, len(paragraphs))):
                if len(paragraphs[j]) > 80 and not paragraphs[j].strip().startswith("*"):
                    return j
            # Only the first such paragraph is considered.
            break
    return min(10, len(paragraphs) - 1)
 def _find_decision_end(paragraphs: list[str]) -> int:
    """Return the exclusive end index of the decision body.

    This is the index of the first paragraph containing any Nevo postamble
    marker, or ``len(paragraphs)`` when no marker is present.
    """
    return next(
        (
            idx
            for idx, para in enumerate(paragraphs)
            if any(marker in para for marker in NEVO_POSTAMBLE_MARKERS)
        ),
        len(paragraphs),
    )
 def _strip_inline_nevo_codes(paragraphs: list[str]) -> list[str]:
    """Remove Nevo code prefixes and drop paragraphs left empty.

    Each paragraph has ``NEVO_INLINE_CODE_RE`` matches removed and is then
    stripped of surrounding whitespace; empty results are discarded.
    """
    without_codes = (NEVO_INLINE_CODE_RE.sub("", para).strip() for para in paragraphs)
    return [para for para in without_codes if para]
 def proofread_docx(path: Path) -> tuple[str, dict]:
    """Extract the clean decision text from a Nevo DOCX file.

    Returns:
        ``(markdown, stats)`` where ``markdown`` is the kept paragraphs
        joined by blank lines and ``stats`` reports how many paragraphs
        were read, stripped from each end, and kept.
    """
    all_texts = [para.text for para in Document(str(path)).paragraphs]
    paragraphs = [t for t in all_texts if t.strip()]
    body_start = _find_decision_start(paragraphs)
    body_end = _find_decision_end(paragraphs)
    kept = _strip_inline_nevo_codes(paragraphs[body_start:body_end])
    stats = {
        "source_type": "docx",
        "total_paragraphs": len(paragraphs),
        "preamble_stripped": body_start,
        "postamble_stripped": len(paragraphs) - body_end,
        "clean_paragraphs": len(kept),
    }
    return "\n\n".join(kept), stats
 # ── PDF proofreading ─────────────────────────────────────────────
 def _clean_page_text(text: str) -> str:
    """Remove page-level artifacts from the OCR text of one PDF page.

    The steps run in a fixed order: page headers become a newline, trailing
    footer-junk lines are popped, then Nevo URLs, page-number lines and
    orphaned page-word lines are deleted. The result is stripped.
    """
    text = PDF_PAGE_HEADER_RE.sub("\n", text)
    lines = text.split("\n")
    # Drop junk from the bottom up; stop at the first real content line.
    while lines and _FOOTER_JUNK_RE.match(lines[-1].strip()):
        lines.pop()
    text = "\n".join(lines)
    text = NEVO_URL_RE.sub("", text)
    text = PDF_PAGE_NUM_LINE_RE.sub("", text)
    text = PDF_PAGE_ORPHAN_RE.sub("", text)
    return text.strip()
 async def proofread_pdf(path: Path) -> tuple[str, dict]:
    """Extract clean decision text from a Nevo PDF via Google Vision OCR.

    Each page is rendered to a 300-dpi PNG, OCR'd in a worker thread and
    cleaned with ``_clean_page_text``. Non-empty pages are joined by blank
    lines, whitespace is normalized, and everything from the earliest Nevo
    postamble marker onwards is removed.

    Args:
        path: Location of the PDF file.

    Returns:
        ``(body, stats)`` where ``stats`` holds the source type, the number
        of pages processed and the length of ``body``.

    Raises:
        RuntimeError: Propagated from ``_ocr_page_image`` on OCR failure.
    """
    pages: list[str] = []
    doc = fitz.open(str(path))
    try:
        for page_num, page in enumerate(doc, start=1):
            pix = page.get_pixmap(dpi=300)
            img_bytes = pix.tobytes("png")
            text = await asyncio.to_thread(_ocr_page_image, img_bytes, page_num)
            pages.append(_clean_page_text(text))
            # Short pause between pages — presumably to pace Vision API
            # requests; TODO confirm the intended rate limit.
            await asyncio.sleep(0.1)
    finally:
        # Release the document even when rendering or OCR raises.
        doc.close()
    body = "\n\n".join(p for p in pages if p)
    body = re.sub(r"\n{3,}", "\n\n", body)
    body = re.sub(r"[ \t]+\n", "\n", body)
    # Cut at the earliest marker in the text, not at the first marker in
    # tuple order, so no postamble text that precedes another marker
    # survives (matches the DOCX path's first-paragraph semantics).
    marker_positions = [
        pos
        for pos in (body.find(marker) for marker in NEVO_POSTAMBLE_MARKERS)
        if pos != -1
    ]
    if marker_positions:
        body = body[:min(marker_positions)].rstrip()
    return body, {
        "source_type": "pdf",
        "pages": len(pages),
        "chars": len(body),
    }
 # ── MD/TXT passthrough ───────────────────────────────────────────
 def proofread_md(path: Path) -> tuple[str, dict]:
    """Return the UTF-8 content of a .md/.txt file unchanged, with stats."""
    content = path.read_text(encoding="utf-8")
    stats = {"source_type": "md", "chars": len(content)}
    return content, stats
 async def proofread(path: Path) -> tuple[str, dict]:
    """Dispatch to the proofreader matching the file extension.

    Returns:
        ``(clean_text, stats)`` from the selected proofreader.

    Raises:
        ValueError: If the (case-insensitive) suffix is not .docx, .pdf,
            .md or .txt.
    """
    ext = path.suffix.lower()
    if ext == ".pdf":
        return await proofread_pdf(path)
    if ext == ".docx":
        return proofread_docx(path)
    if ext in {".md", ".txt"}:
        return proofread_md(path)
    raise ValueError(f"Unsupported file type: {ext}")
 # ── Metadata extraction ──────────────────────────────────────────
 FILENAME_NUMBER_PATTERNS = [
    re.compile(r"^ARAR-(\d{2})-(\d{3,4})"),
    re.compile(r"^ערר\s+(\d{3,4})-(\d{2})"),
    re.compile(r"^ערר\s+(\d{3,4})\s*-"),
 ]
 LEGACY_MULTI_PATTERN = re.compile(r"(\d{3,4})\+(\d{3,4})")
 def decision_number_from_filename(stem: str) -> str | None:
    """Derive a ``NUMBER/YY`` decision number from a filename stem.

    The year part is ``??`` when the filename carries no year; merged
    legacy files yield ``NNNN+NNNN/??``. Returns None if nothing matches.
    """
    hit = FILENAME_NUMBER_PATTERNS[0].match(stem)
    if hit is not None:
        # ARAR form stores the year first, the number second.
        yy, num = hit.group(1), hit.group(2)
        return f"{num}/{yy}"
    hit = FILENAME_NUMBER_PATTERNS[1].match(stem)
    if hit is not None:
        num, yy = hit.group(1), hit.group(2)
        return f"{num}/{yy}"
    hit = FILENAME_NUMBER_PATTERNS[2].match(stem)
    if hit is not None:
        return f"{hit.group(1)}/??"
    hit = LEGACY_MULTI_PATTERN.search(stem)
    if hit is not None:
        first, second = hit.group(1), hit.group(2)
        return f"{first}+{second}/??"
    return None
 HEBREW_MONTHS = {
    "ינואר": 1, "בינואר": 1, "פברואר": 2, "בפברואר": 2,
    "מרץ": 3, "מרס": 3, "במרץ": 3, "במרס": 3,
    "אפריל": 4, "באפריל": 4, "מאי": 5, "במאי": 5,
    "יוני": 6, "ביוני": 6, "יולי": 7, "ביולי": 7,
    "אוגוסט": 8, "באוגוסט": 8, "ספטמבר": 9, "בספטמבר": 9,
    "אוקטובר": 10, "באוקטובר": 10, "נובמבר": 11, "בנובמבר": 11,
    "דצמבר": 12, "בדצמבר": 12,
 }
 DATE_RE = re.compile(
    r"(\d{1,2})\s+(ב?(?:ינואר|פברואר|מרץ|מרס|אפריל|מאי|יוני|יולי|אוגוסט|ספטמבר|אוקטובר|נובמבר|דצמבר))\s*[,.]?\s*(\d{4})"
 )
 NITNA_RE = re.compile(r"ניתנ[הו]?\s+(?:פה\s+אחד|בדעת\s+רוב|היום)?")
 def decision_date_from_text(text: str) -> str | None:
    """Find the decision date in the last 2500 characters of ``text``.

    A date following the first ``NITNA_RE`` match is preferred; otherwise
    the first date anywhere in the tail is used.

    Returns:
        The date as ``YYYY-MM-DD``, or None when no valid date is found.
    """
    tail = text[-2500:]
    anchor = NITNA_RE.search(tail)
    found = None
    if anchor is not None:
        found = DATE_RE.search(tail[anchor.start():])
    if found is None:
        found = DATE_RE.search(tail)
    if found is None:
        return None
    day_text, month_word, year_text = found.group(1), found.group(2), found.group(3)
    month = HEBREW_MONTHS.get(month_word)
    if not month:
        return None
    try:
        return date_type(int(year_text), month, int(day_text)).isoformat()
    except ValueError:
        # Impossible calendar date such as day 31 in a 30-day month.
        return None
 def finalize_decision_number(number: str | None, date_iso: str | None) -> str:
    """Fill a missing year in a decision number from the decision date.

    The two-digit year is characters 2-3 of ``date_iso``. With no number,
    returns ``??/YY`` (or ``""`` without a date). A ``/??`` placeholder is
    replaced by ``/YY``, or removed when no date is available.
    """
    yy = date_iso[2:4] if date_iso else None
    if not number:
        return "" if yy is None else f"??/{yy}"
    if not number.endswith("/??"):
        return number
    year_part = "" if yy is None else f"/{yy}"
    return number.replace("/??", year_part)
 def categorize(text: str) -> list[str]:
    """Detect subject categories from the opening text and repetition.

    Each category has its own rule combining a match in the first 2000
    characters with a mention count over the whole text. Permit-type
    subjects additionally imply the general building category, which is
    also the fallback when nothing matches.
    """
    opening = text[:2000]
    betterment_re = r"היטל(?:י)?\s+השבחה"
    p197_re = r"פיצויים\s+לפי\s+(?:ס(?:עיף|')\s*)?197|סעיף\s*197|ס['\"]?\s*197"
    relief_re = r"\bהקלה\b|\bהקלות\b"
    plan_re = (
        r"הפקדת\s+ה?תכנית|אישור\s+ה?תכנית|המלצה\s+להפקיד|"
        r"להפקיד\s+את\s+ה?תכנית|לדון\s+בתכנית|דנה\s+בתכנית|"
        r"החלטה\s+לאשר\s+ה?תכנית"
    )
    # (label, matched) pairs, in the order categories are reported.
    checks = [
        ('תמ"א 38', bool(re.search(r'תמ[״"\']?א\s*38|תמא\s*38', text))),
        (
            "היטל השבחה",
            len(re.findall(betterment_re, text)) >= 3
            or bool(re.search(betterment_re, opening)),
        ),
        (
            "פיצויים 197",
            len(re.findall(p197_re, text)) >= 2 or bool(re.search(p197_re, opening)),
        ),
        ("שימוש חורג", text.count("שימוש חורג") >= 3 or "שימוש חורג" in opening),
        (
            "הקלה",
            len(re.findall(relief_re, text)) >= 3
            and bool(re.search(relief_re, opening)),
        ),
        (
            "חלוקה",
            bool(re.search(r"איחוד\s+וחלוקה|חלוקה\s+חדשה|תכנית\s+לחלוקה", text)),
        ),
        ("תכנית", bool(re.search(plan_re, opening))),
        ("היתר", bool(re.search(r"בקשה\s+להיתר|היתר\s+בני(?:י)?ה", opening))),
    ]
    cats = [label for label, matched in checks if matched]
    if any(label in cats for label in ("היתר", "הקלה", 'תמ"א 38')):
        cats.append("בנייה")
    return cats or ["בנייה"]
 async def analyze_file(path: Path) -> dict[str, Any]:
    """Proofread a file and gather its metadata for review.

    Returns:
        A dict with the file name, the full clean text, a 500-character
        preview, the decision number and date (empty string when unknown),
        the subject categories, the proofreading stats and the text length.
    """
    clean_text, stats = await proofread(path)
    d_iso = decision_date_from_text(clean_text)
    number = finalize_decision_number(decision_number_from_filename(path.stem), d_iso)
    report: dict[str, Any] = {
        "filename": path.name,
        "clean_text": clean_text,
        "preview": clean_text[:500],
        "decision_number": number,
        "decision_date": d_iso or "",
        "subject_categories": categorize(clean_text),
        "stats": stats,
        "chars": len(clean_text),
    }
    return report
--- a/scripts/batch_upload_training.py
+++ b/scripts/batch_upload_training.py
@@ -0,0 +1,349 @@
 """Batch upload proofread training corpus to style DB.
 Two-phase workflow:
  --preview    Extract metadata from all .md files, print review table, don't upload
  --upload     Actually upload all files (with optional --only FILE to run one)
 Metadata extraction:
  * decision_number: from filename (ARAR-YY-NNNN / ערר NNNN-YY) or decision date year
  * decision_date:   from "ניתנה ... <day> ב<Hebrew month> <YYYY>" near end of text
  * categories:      keyword heuristics on body text
 """
 from __future__ import annotations
 import argparse
 import asyncio
 import os
 import re
 import sys
 from pathlib import Path
 PROOFREAD_DIR = Path("/home/chaim/legal-ai/data/training/proofread")
 # Manual metadata overrides for files where auto-extraction can't determine values.
 METADATA_OVERRIDES: dict[str, dict] = {
    "ARAR-25-1067 - יחיעם יפה ואח׳.md": {
        "decision_date": "2025-11-27",  # no "ניתנה" signature in file; user-provided
    },
 }
 # Files to skip — already in style_corpus from legacy ingestion
 # (verified by exact character-count match with existing DB rows).
 SKIP_FILES = {
    "תמא 38-בית הכרם-1126+1141-החלטה.md",       # → corpus: 1126/1141
    "היתר בניה-בית שמש-1180+1181-החלטה.md",     # → corpus: 1180/1181
    "היתר בניה-הראל-1043+1054-החלטה.md",        # → corpus: 1043/1054
    "היתר בניה-הראל-1071+1077-החלטה.md",        # → corpus: 1071/1077
 }
 # Load env vars needed by mcp-server from ~/.env (runs at import time).
 ENV_FILE = Path.home() / ".env"
 if ENV_FILE.exists():
    for line in ENV_FILE.read_text().splitlines():
        # Only KEY=VALUE lines; lines beginning with "#" are comments.
        if "=" in line and not line.startswith("#"):
            # Split on the first "=" so values may themselves contain "=".
            k, v = line.split("=", 1)
            # setdefault: variables already set in the environment win.
            os.environ.setdefault(k.strip(), v.strip().strip('"').strip("'"))
 # Make mcp-server package importable
 sys.path.insert(0, "/home/chaim/legal-ai/mcp-server/src")
 # ── Decision number extraction ───────────────────────────────────
 FILENAME_NUMBER_PATTERNS = [
    # ARAR-YY-NNNN[-X] - title.md
    re.compile(r"^ARAR-(\d{2})-(\d{3,4})"),
    # ערר NNNN-YY title.md  or  ערר NNNN-YY title
    re.compile(r"^ערר\s+(\d{3,4})-(\d{2})"),
    # ערר NNNN - title (no year in filename — needs date lookup)
    re.compile(r"^ערר\s+(\d{3,4})\s*-"),
 ]
 LEGACY_MULTI_PATTERN = re.compile(r"(\d{3,4})\+(\d{3,4})")
 def decision_number_from_filename(stem: str) -> tuple[str | None, str | None]:
    """Extract the decision number and two-digit year from a filename stem.

    Returns:
        ``(number, year_short)`` when the filename carries a year,
        ``("NNNN/??", None)`` or ``("NNNN+NNNN/??", None)`` when the year
        must be completed from the decision date, and ``(None, None)``
        when no pattern matches.
    """
    arar = FILENAME_NUMBER_PATTERNS[0].match(stem)
    if arar is not None:
        # ARAR-YY-NNNN: year comes first in the filename.
        yy, num = arar.groups()
        return f"{num}/{yy}", yy
    with_year = FILENAME_NUMBER_PATTERNS[1].match(stem)
    if with_year is not None:
        # ערר NNNN-YY
        num, yy = with_year.groups()
        return f"{num}/{yy}", yy
    without_year = FILENAME_NUMBER_PATTERNS[2].match(stem)
    if without_year is not None:
        # ערר NNNN - title (no year in the filename)
        return f"{without_year.group(1)}/??", None
    merged = LEGACY_MULTI_PATTERN.search(stem)
    if merged is not None:
        # Legacy "NNNN+NNNN" merged decisions
        first, second = merged.groups()
        return f"{first}+{second}/??", None
    return None, None
 # ── Decision date extraction ─────────────────────────────────────
 HEBREW_MONTHS = {
    "ינואר": 1, "בינואר": 1,
    "פברואר": 2, "בפברואר": 2,
    "מרץ": 3, "מרס": 3, "במרץ": 3, "במרס": 3,
    "אפריל": 4, "באפריל": 4,
    "מאי": 5, "במאי": 5,
    "יוני": 6, "ביוני": 6,
    "יולי": 7, "ביולי": 7,
    "אוגוסט": 8, "באוגוסט": 8,
    "ספטמבר": 9, "בספטמבר": 9,
    "אוקטובר": 10, "באוקטובר": 10,
    "נובמבר": 11, "בנובמבר": 11,
    "דצמבר": 12, "בדצמבר": 12,
 }
 # Matches "<day> ב<month>, <year>" or "<day> <month>, <year>" (with optional commas)
 DATE_RE = re.compile(
    r"(\d{1,2})\s+(ב?(?:ינואר|פברואר|מרץ|מרס|אפריל|מאי|יוני|יולי|אוגוסט|ספטמבר|אוקטובר|נובמבר|דצמבר))\s*[,.]?\s*(\d{4})"
 )
 NITNA_RE = re.compile(r"ניתנ[הו]?\s+(?:פה\s+אחד|בדעת\s+רוב|היום)?")
 def decision_date_from_text(text: str) -> str | None:
    """Extract decision date in YYYY-MM-DD format from 'ניתנה... DATE' section.

    Searches the last 2500 chars, where the signing block lives.

    Returns:
        The ISO date string, or None when no date is found, the month word
        is unknown, or the day/month/year do not form a valid date.
    """
    tail = text[-2500:] if len(text) > 2500 else text
    # Prefer dates near the "ניתנה" marker
    nitna_match = NITNA_RE.search(tail)
    search_text = tail[nitna_match.start():] if nitna_match else tail
    m = DATE_RE.search(search_text)
    if not m:
        # Fall back: search the whole tail
        m = DATE_RE.search(tail)
    if not m:
        return None
    day = int(m.group(1))
    month = HEBREW_MONTHS.get(m.group(2))
    year = int(m.group(3))
    if not month:
        return None
    try:
        from datetime import date
        return date(year, month, day).isoformat()
    except ValueError:
        # Impossible calendar date (e.g. day out of range for the month).
        return None
 # ── Subject category extraction ──────────────────────────────────
 # Categories as defined in the tool signature.
 ALL_CATEGORIES = [
    "בנייה", "שימוש חורג", "תכנית", "היתר", "הקלה",
    "חלוקה", 'תמ"א 38', "היטל השבחה", "פיצויים 197",
 ]
 def categorize(text: str) -> list[str]:
    """Detect subject categories by subject matter, not incidental mentions.

    The real subject is expected in the opening 2000 characters; the
    secondary signal is how often a term repeats across the whole text,
    since casual mentions in law citations tend not to repeat.
    """
    opening = text[:2000]
    betterment_re = r"היטל(?:י)?\s+השבחה"
    p197_re = r"פיצויים\s+לפי\s+(?:ס(?:עיף|')\s*)?197|סעיף\s*197|ס['\"]?\s*197"
    relief_re = r"\bהקלה\b|\bהקלות\b"
    plan_re = (
        r"הפקדת\s+ה?תכנית|"
        r"אישור\s+ה?תכנית|"
        r"המלצה\s+להפקיד|"
        r"להפקיד\s+את\s+ה?תכנית|"
        r"לדון\s+בתכנית|"
        r"דנה\s+בתכנית|"
        r"החלטה\s+לאשר\s+ה?תכנית"
    )
    # (label, matched) pairs, in the order categories are reported.
    rules = [
        # Very specific marker: a single mention anywhere is enough.
        ('תמ"א 38', bool(re.search(r'תמ[״"\']?א\s*38|תמא\s*38', text))),
        # In the opening OR at least 3 mentions.
        (
            "היטל השבחה",
            len(re.findall(betterment_re, text)) >= 3
            or bool(re.search(betterment_re, opening)),
        ),
        # In the opening OR at least 2 mentions.
        (
            "פיצויים 197",
            len(re.findall(p197_re, text)) >= 2 or bool(re.search(p197_re, opening)),
        ),
        # In the opening OR at least 3 mentions.
        ("שימוש חורג", text.count("שימוש חורג") >= 3 or "שימוש חורג" in opening),
        # At least 3 mentions AND present in the opening.
        (
            "הקלה",
            len(re.findall(relief_re, text)) >= 3
            and bool(re.search(relief_re, opening)),
        ),
        # Specific phrases anywhere in the text.
        (
            "חלוקה",
            bool(re.search(r"איחוד\s+וחלוקה|חלוקה\s+חדשה|תכנית\s+לחלוקה", text)),
        ),
        # Plan-level appeal stated in the opening.
        ("תכנית", bool(re.search(plan_re, opening))),
        # Permit request stated in the opening.
        ("היתר", bool(re.search(r"בקשה\s+להיתר|היתר\s+בני(?:י)?ה", opening))),
    ]
    cats = [label for label, matched in rules if matched]
    # Permit-type subjects also count as building cases.
    if any(label in cats for label in ("היתר", "הקלה", 'תמ"א 38')):
        cats.append("בנייה")
    # Nothing matched: default to the building category.
    return cats or ["בנייה"]
 # ── Year fallback from date ──────────────────────────────────────
 def finalize_decision_number(number: str | None, date_iso: str | None) -> str:
    """Complete a decision number whose year is missing, using the date.

    The year is the two digits at positions 2-3 of the ISO date string.
    Without a number the result is ``??/YY`` (or ``""`` when there is no
    date either); a trailing ``/??`` becomes ``/YY`` or is dropped.
    """
    yy = date_iso[2:4] if date_iso else None
    if not number:
        return "" if yy is None else f"??/{yy}"
    if not number.endswith("/??"):
        return number
    replacement = "" if yy is None else f"/{yy}"
    return number.replace("/??", replacement)
 # ── Main metadata extraction ─────────────────────────────────────
 def extract_metadata(path: Path) -> dict:
    """Build the metadata record for one proofread file.

    The decision number comes from the filename, the date from the text
    and the categories from keyword heuristics. Entries in
    ``METADATA_OVERRIDES`` for this filename take precedence over the
    extracted values.

    Args:
        path: Location of the UTF-8 text file.

    Returns:
        A dict with keys ``file``, ``decision_number``, ``decision_date``
        (``"??"`` when unknown), ``categories`` and ``chars``.
    """
    text = path.read_text(encoding="utf-8")
    overrides = METADATA_OVERRIDES.get(path.name, {})
    num_from_name, _ = decision_number_from_filename(path.stem)
    # Resolve the date (manual override first) BEFORE finalizing the number,
    # so a filename without a year still gets its year from an overridden
    # date instead of losing it.
    date_iso = overrides.get("decision_date") or decision_date_from_text(text)
    decision_number = finalize_decision_number(num_from_name, date_iso)
    cats = categorize(text)
    meta = {
        "file": path.name,
        "decision_number": decision_number,
        "decision_date": date_iso or "??",
        "categories": cats,
        "chars": len(text),
    }
    # Apply manual overrides last so every overridden field wins.
    meta.update(overrides)
    return meta
 def print_preview(results: list[dict]) -> None:
    """Print the metadata review table, then list incomplete entries.

    An entry is incomplete when its date is ``"??"``, or its decision
    number is empty or still contains ``"??"``.
    """
    header = f"\n{'#':<3} {'FILE':<55} {'NUMBER':<15} {'DATE':<12} {'CATEGORIES'}"
    print(header)
    print("-" * 130)
    for row_no, row in enumerate(results, 1):
        name = row["file"]
        if len(name) > 53:
            # Truncate long names so the columns stay aligned.
            name = name[:50] + "..."
        joined = ", ".join(row["categories"])
        print(f"{row_no:<3} {name:<55} {row['decision_number']:<15} {row['decision_date']:<12} {joined}")
    print()

    def _incomplete(row):
        number = row["decision_number"]
        return row["decision_date"] == "??" or not number or "??" in number

    flagged = [row for row in results if _incomplete(row)]
    if not flagged:
        return
    print(f"⚠️  {len(flagged)} files with incomplete metadata:")
    for row in flagged:
        print(f"   - {row['file']}  → number={row['decision_number']!r} date={row['decision_date']!r}")
 # ── Upload ───────────────────────────────────────────────────────
 async def upload_one(meta: dict) -> dict:
    """Upload one proofread file with its metadata to the style corpus.

    The ``"??"`` date placeholder is sent as an empty string.

    Returns:
        ``{"file": <name>, "result": <value returned by the upload tool>}``.
    """
    # Imported lazily: the package path is only set up at module import time.
    from legal_mcp.tools.documents import document_upload_training
    filename = meta["file"]
    source = PROOFREAD_DIR / filename
    upload_args = {
        "file_path": str(source),
        "decision_number": meta["decision_number"],
        "decision_date": "" if meta["decision_date"] == "??" else meta["decision_date"],
        "subject_categories": meta["categories"],
        "title": source.stem,
    }
    outcome = await document_upload_training(**upload_args)
    return {"file": filename, "result": outcome}
 async def upload_all(results: list[dict]) -> None:
    """Upload every metadata record in order, printing progress per file.

    A failure on one file is printed and does not stop the remaining
    uploads.
    """
    for i, meta in enumerate(results, 1):
        try:
            r = await upload_one(meta)
            print(f"[{i}/{len(results)}] ✓ {meta['file']}")
            # Show only the first 200 characters of the result.
            print(f"    {r['result'][:200]}")
        except Exception as e:
            # Best-effort batch: report the error and continue.
            print(f"[{i}/{len(results)}] ✗ {meta['file']}: {e}")
 # ── CLI ──────────────────────────────────────────────────────────
 def main() -> int:
    """Command-line entry point: preview metadata and optionally upload.

    Without ``--upload`` the review table is printed and nothing is
    uploaded. With ``--upload`` all selected files are uploaded, preceded
    by the table when ``--preview`` is also given.

    Returns:
        Process exit code: 1 when ``--only`` matches no file, otherwise 0.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--preview", action="store_true", help="Show metadata table without uploading")
    parser.add_argument("--upload", action="store_true", help="Upload all files to style corpus")
    parser.add_argument("--only", help="Only process this specific filename")
    args = parser.parse_args()
    candidates = [
        f for f in sorted(PROOFREAD_DIR.glob("*.md")) if f.name not in SKIP_FILES
    ]
    if args.only:
        candidates = [f for f in candidates if f.name == args.only]
        if not candidates:
            print(f"File not found: {args.only}")
            return 1
    results = [extract_metadata(f) for f in candidates]
    if not args.upload:
        # Preview is the default action when no upload is requested.
        print_preview(results)
        return 0
    if args.preview:
        print_preview(results)
    print(f"\n>>> Uploading {len(results)} files to style corpus...\n")
    asyncio.run(upload_all(results))
    return 0
 if __name__ == "__main__":
    sys.exit(main())
--- a/scripts/proofread_training_corpus.py
+++ b/scripts/proofread_training_corpus.py
@@ -0,0 +1,382 @@
 """Proofread training corpus: strip Nevo additions from DOCX/PDF, output clean Markdown.
 Nevo DOCX additions:
  Front: ספרות / חקיקה שאוזכרה / מיני-רציו / topic tags / Nevo summary paragraphs
  Back:  5129371512937154678313 / "בעניין עריכה ושינויים" link / "54678313-..." / "נוסח מסמך זה כפוף"
 Nevo PDF additions:
  "עמוד X מתוך Y" header on every page
 PDF text extraction uses Google Cloud Vision OCR — PyMuPDF fragments Hebrew RTL
 text unusably (words split mid-word, reading order broken). OCR gives clean output.
 """
 from __future__ import annotations
 import io
 import os
 import re
 import sys
 import time
 from pathlib import Path
 import fitz
 from docx import Document
 # Load GOOGLE_CLOUD_VISION_API_KEY from ~/.env if not already set.
 # Only a line that starts exactly with "GOOGLE_CLOUD_VISION_API_KEY=" is
 # recognised; the first such line wins, and surrounding whitespace and
 # single/double quotes are removed from the value.
 if not os.environ.get("GOOGLE_CLOUD_VISION_API_KEY"):
    env_path = Path.home() / ".env"
    if env_path.exists():
        for line in env_path.read_text().splitlines():
            if line.startswith("GOOGLE_CLOUD_VISION_API_KEY="):
                os.environ["GOOGLE_CLOUD_VISION_API_KEY"] = line.split("=", 1)[1].strip().strip('"').strip("'")
                break
 from google.cloud import vision  # noqa: E402
 # Folder scanned by main() for source .docx/.pdf decisions (hard-coded absolute path).
 TRAINING_DIR = Path("/home/chaim/legal-ai/data/training")
 # Destination of the cleaned Markdown files written by main().
 OUTPUT_DIR = TRAINING_DIR / "proofread"
 # Created by main() but not otherwise referenced in this script.
 RAW_DIR = TRAINING_DIR / "raw"
 # ── Nevo pattern detection ────────────────────────────────────────
 NEVO_PREAMBLE_HEADERS = (
    "ספרות:",
    "חקיקה שאוזכרה:",
    "מיני-רציו:",
 )
 # Strong decision-opening patterns — highly distinctive first words of real decision
 # body. These rarely appear inside Nevo's own summary block, so first match wins.
 DECISION_OPENING = re.compile(
    r"^(עניינו\s|ענייננו\s|עסקינן\s|בפנינו\s|לפנינו\s|בערר\s+שלפנינו|זהו\s+ערר)"
 )
 # Section headers that definitively mark decision body start.
 DECISION_SECTION_HEADERS = {
    "רקע",
    "פתח דבר",
    "תמצית טענות הצדדים",
    "העובדות",
    "הרקע העובדתי",
    "מבוא",
 }
 # Nevo postamble markers — everything from first match onwards is stripped.
 NEVO_POSTAMBLE_MARKERS = (
    "5129371512937154678313",
    "בעניין עריכה ושינויים במסמכי פסיקה",
    "נוסח מסמך זה כפוף לשינויי ניסוח ועריכה",
 )
 # Nevo inline watermark codes — appear as prefixes embedded in real paragraphs
 # (e.g. "5129371ניתנה פה אחד" or "054678313האם ההיתר..."). These must be
 # stripped from paragraph content, not used as postamble boundaries.
 NEVO_INLINE_CODE_RE = re.compile(r"^0?(5129371|54678313)\d*")
 # Nevo PDF page header: "עמוד X מתוך Y" or "עמוד X בן Y" (Hebrew variants)
 PDF_PAGE_HEADER_RE = re.compile(
    r"\s*עמוד\s*\n?\s*\d+\s*\n?\s*(?:מתוך|בן)\s*\n?\s*\d+\s*"
 )
 # Short orphan lines starting with "עמוד" — OCR artifacts from merged footer text
 # (e.g. "עמודירבי", "עמוד :", "עמודי", "עמוד ר"). Conservative: up to 12 chars.
 PDF_PAGE_ORPHAN_RE = re.compile(r"(?m)^עמוד[^\n]{0,12}$")
 # "עמוד" followed by number (with optional garbled Nevo URL line after)
 PDF_PAGE_BLOCK_RE = re.compile(
    r"(?m)^\s*עמוד\s*\n\s*\d+[·.]?\s*\n[^\n]*\n", re.UNICODE
 )
 # Standalone "עמוד N" at line start
 PDF_PAGE_NUM_LINE_RE = re.compile(r"(?m)^\s*עמוד\s*\n?\s*\d+[·.]?\s*$")
 # Nevo watermark URL (and common OCR-garbled variants)
 NEVO_URL_RE = re.compile(
    r"(nevo\.co\.il|neto\.co\.il|netocoal|neetocoal|nevocoal|nevo\.co|rawo\.co\.il)",
    re.IGNORECASE,
 )
 def find_decision_start(paragraphs: list[str]) -> int:
    """Return the index of the first paragraph of the real decision body.

    If none of the first 10 paragraphs starts with a Nevo preamble header the
    document is treated as clean and 0 is returned. Otherwise:

      1. The first paragraph (searched from the very beginning of the
         document) whose stripped text is one of DECISION_SECTION_HEADERS or
         matches DECISION_OPENING wins.
      2. Failing that, the first paragraph containing Nevo's summary lead-in
         ("קבעה כלהלן" / "קבעה את הדברים הבאים") is located, and the first of
         the up-to-14 paragraphs after it that is longer than 80 characters
         and does not start with "*" is returned.
      3. Last resort: index 10, clamped to the last paragraph.
    """
    # str.startswith accepts the whole tuple of headers at once.
    if not any(p.startswith(NEVO_PREAMBLE_HEADERS) for p in paragraphs[:10]):
        return 0

    # Strong markers: section headers and distinctive opening words.
    for idx, para in enumerate(paragraphs):
        text = para.strip()
        if text in DECISION_SECTION_HEADERS or DECISION_OPENING.match(text):
            return idx

    # Fallback: anchor on the lead-in sentence of Nevo's summary block.
    anchor = next(
        (
            i
            for i, p in enumerate(paragraphs)
            if "קבעה כלהלן" in p or "קבעה את הדברים הבאים" in p
        ),
        None,
    )
    if anchor is not None:
        window_end = min(anchor + 15, len(paragraphs))
        for j in range(anchor + 1, window_end):
            candidate = paragraphs[j]
            # Length and the missing bullet prefix are the only criteria used
            # to tell body text from summary bullets.
            if len(candidate) > 80 and not candidate.strip().startswith("*"):
                return j

    return min(10, len(paragraphs) - 1)
 def find_decision_end(paragraphs: list[str]) -> int:
    """Return the exclusive end index of the decision body.

    That is the index of the first paragraph containing any Nevo postamble
    marker, or ``len(paragraphs)`` when no paragraph contains one.
    """
    for idx, para in enumerate(paragraphs):
        if any(marker in para for marker in NEVO_POSTAMBLE_MARKERS):
            return idx
    return len(paragraphs)
 # ── DOCX proofreading ─────────────────────────────────────────────
 def _strip_inline_nevo_codes(paragraphs: list[str]) -> list[str]:
    """Strip leading Nevo watermark codes from each paragraph.

    Every paragraph is also whitespace-stripped; paragraphs that end up empty
    (i.e. consisted only of a code) are dropped.
    """
    cleaned = (NEVO_INLINE_CODE_RE.sub("", para).strip() for para in paragraphs)
    return [text for text in cleaned if text]
 def proofread_docx(path: Path) -> tuple[str, dict]:
    """Extract the clean decision text from a Nevo DOCX file.

    Returns:
        ``(markdown, stats)``: the kept paragraphs joined by blank lines, and
        a dict counting total, preamble-stripped, postamble-stripped and
        remaining paragraphs.
    """
    document = Document(str(path))
    # Whitespace-only paragraphs are ignored; kept ones retain their raw text.
    paragraphs = []
    for para in document.paragraphs:
        if para.text.strip():
            paragraphs.append(para.text)

    first = find_decision_start(paragraphs)
    stop = find_decision_end(paragraphs)
    body = _strip_inline_nevo_codes(paragraphs[first:stop])

    stats = {
        "total_paragraphs": len(paragraphs),
        "preamble_stripped": first,
        "postamble_stripped": len(paragraphs) - stop,
        "clean_paragraphs": len(body),
    }
    return "\n\n".join(body), stats
 # ── PDF proofreading (Google Vision OCR) ──────────────────────────
 # Process-wide Vision client, created on first use by _get_vision_client().
 _vision_client: vision.ImageAnnotatorClient | None = None
 def _get_vision_client() -> vision.ImageAnnotatorClient:
    """Return the shared Vision client, creating it on the first call.

    Raises:
        RuntimeError: If GOOGLE_CLOUD_VISION_API_KEY is unset or empty.
    """
    global _vision_client
    if _vision_client is not None:
        return _vision_client

    key = os.environ.get("GOOGLE_CLOUD_VISION_API_KEY")
    if not key:
        raise RuntimeError("GOOGLE_CLOUD_VISION_API_KEY not set")
    _vision_client = vision.ImageAnnotatorClient(client_options={"api_key": key})
    return _vision_client
 # Hebrew abbreviation quote fixes — Google Vision renders ״ as 'יי'
 _HEBREW_ABBREV_FIXES: dict[str, str] = {
    "עוהייד": 'עוה"ד',
    "עוייד": 'עו"ד',
    "הנייל": 'הנ"ל',
    "מצייב": 'מצ"ב',
    "ביהמייש": 'ביהמ"ש',
    "תייז": 'ת"ז',
    "עייי": 'ע"י',
    "אחייכ": 'אח"כ',
    "סייק": 'ס"ק',
    "דייר": 'ד"ר',
    "חווייד": 'חוו"ד',
    "מייר": 'מ"ר',
    "יחייד": 'יח"ד',
    "בייכ": 'ב"כ',
    "בייה": 'ב"ה',
    "שייח": 'ש"ח',
    "יוייר": 'יו"ר',
    "בליימ": 'בל"מ',
    "תבייע": 'תב"ע',
    "תמייא": 'תמ"א',
    "סייה": 'ס"ה',
    "שייפ": 'ש"פ',
    "שצייפ": 'שצ"פ',
    "שבייצ": 'שב"צ',
    "עסיים": 'עס"ם',
    "הייה": 'ה"ה',
    "פסייד": 'פס"ד',
    "תיידא": 'תיד"א',
    "בגייץ": 'בג"ץ',
    "עתיים": 'עת"ם',
    "עעיים": 'עע"ם',
    # Hebrew calendar day prefixes (כ"א .. כ"ט etc.)
    "כייא": 'כ"א', "כייב": 'כ"ב', "כייג": 'כ"ג', "כייד": 'כ"ד',
    "כייה": 'כ"ה', "כייו": 'כ"ו', "כייז": 'כ"ז', "כייח": 'כ"ח', "כייט": 'כ"ט',
    "לייא": 'ל"א',
    "יייא": 'י"א', "יייב": 'י"ב', "יייג": 'י"ג', "יייד": 'י"ד',
    "טייו": 'ט"ו', "טייז": 'ט"ז', "יייז": 'י"ז', "יייח": 'י"ח', "יייט": 'י"ט',
    # Hebrew calendar years (תשפ"ה, תשפ"ד...)
    "תשפייא": 'תשפ"א', "תשפייב": 'תשפ"ב', "תשפייג": 'תשפ"ג',
    "תשפייד": 'תשפ"ד', "תשפייה": 'תשפ"ה', "תשפייו": 'תשפ"ו',
    "תשפיין": 'תשפ"ן',
 }
 _ABBREV_PATTERN = re.compile(
    "|".join(re.escape(k) for k in sorted(_HEBREW_ABBREV_FIXES, key=len, reverse=True))
 )
 def _fix_hebrew_quotes(text: str) -> str:
    """Replace every known OCR-garbled abbreviation in *text* with its quoted form.

    Only the spellings listed in _HEBREW_ABBREV_FIXES are touched.
    """
    def _restore(match):
        # The pattern is built from the table's keys, so the lookup always hits.
        return _HEBREW_ABBREV_FIXES[match.group()]

    return _ABBREV_PATTERN.sub(_restore, text)
 def _ocr_page_image(image_bytes: bytes, page_num: int) -> str:
    """OCR one rendered page image with Google Vision, hinting Hebrew.

    Args:
        image_bytes: Encoded image content sent to the Vision API.
        page_num: Page number, used only in the error message.

    Returns:
        The recognised text with Hebrew abbreviation quotes repaired, or an
        empty string when the response carries no full-text annotation.

    Raises:
        RuntimeError: If the Vision response reports an error message.
    """
    annotator = _get_vision_client()
    hebrew_hint = vision.ImageContext(language_hints=["he"])
    response = annotator.document_text_detection(
        image=vision.Image(content=image_bytes),
        image_context=hebrew_hint,
    )
    if response.error.message:
        raise RuntimeError(f"Vision error page {page_num}: {response.error.message}")

    raw_text = ""
    if response.full_text_annotation:
        raw_text = response.full_text_annotation.text
    return _fix_hebrew_quotes(raw_text)
 _FOOTER_JUNK_RE = re.compile(
    r"^("
    r"\s*|"                                        # blank
    r"[-·*.\"\'׳״]+|"                             # stray punctuation
    r"\d{1,3}[\s\-·*.\"\'׳״]*|"                  # page number with any stray char
    r"עמוד[\s\d\-·*.\"\'׳״]*|"                   # "עמוד" / "עמוד N" w/ trailing noise
    r"[-·*\s\"\'׳״]*[a-zA-Z][a-zA-Z0-9 .\-·*_]{0,30}"  # garbled latin (nevo URL variants)
    r")$"
 )
 def _clean_page_text(text: str) -> str:
    """Remove Nevo page headers, footers and watermarks from one page of OCR text.

    The Nevo footer is OCR'd at the end of each page as a short run of lines:
    the word "עמוד", the page number (possibly followed by a stray "·" or
    "*"), the nevo.co.il watermark (often garbled), and sometimes a lone dash.
    """
    # 1. Page header ("עמוד X מתוך Y") can appear anywhere; replace with a newline.
    text = PDF_PAGE_HEADER_RE.sub("\n", text)

    # 2. Trim footer junk by walking backwards from the last line.
    lines = text.split("\n")
    keep = len(lines)
    while keep > 0 and _FOOTER_JUNK_RE.match(lines[keep - 1].strip()):
        keep -= 1
    text = "\n".join(lines[:keep])

    # 3. Leftovers inside the page: watermark URLs, "עמוד N" lines, orphan lines.
    for pattern in (NEVO_URL_RE, PDF_PAGE_NUM_LINE_RE, PDF_PAGE_ORPHAN_RE):
        text = pattern.sub("", text)
    return text.strip()
 def proofread_pdf(path: Path) -> tuple[str, dict]:
    """Extract clean decision text from a Nevo PDF via Google Vision OCR.

    Each page is rendered at 300 dpi, OCR'd and cleaned. The non-empty pages
    are joined by blank lines, and everything from the earliest Nevo
    postamble marker onwards is cut.

    Returns:
        ``(body, stats)`` where ``stats`` holds the page count and the length
        of the final text.

    Raises:
        RuntimeError: Propagated from the OCR step (missing API key or a
            Vision error). The PDF handle is closed in that case too.
    """
    doc = fitz.open(str(path))
    pages: list[str] = []
    try:
        for i, page in enumerate(doc):
            pix = page.get_pixmap(dpi=300)
            img_bytes = pix.tobytes("png")
            text = _ocr_page_image(img_bytes, i + 1)
            pages.append(_clean_page_text(text))
            # Small delay between API calls to be safe
            time.sleep(0.1)
    finally:
        # Release the file handle even when OCR fails part-way through.
        doc.close()

    body = "\n\n".join(p for p in pages if p)
    body = re.sub(r"\n{3,}", "\n\n", body)
    body = re.sub(r"[ \t]+\n", "\n", body)

    # Cut at the EARLIEST marker occurrence in the text, not at the first
    # marker in tuple order; otherwise a postamble fragment that precedes the
    # matched marker would survive. This matches find_decision_end for DOCX.
    hits = [pos for pos in (body.find(m) for m in NEVO_POSTAMBLE_MARKERS) if pos != -1]
    if hits:
        body = body[: min(hits)].rstrip()

    return body, {
        "pages": len(pages),
        "chars": len(body),
    }
 # ── Orchestration ─────────────────────────────────────────────────
 # File names that main() never processes, even when they sit in TRAINING_DIR.
 SKIP_FILES = {
    "הכנת שאלות מחקר.docx",
    "סוכן_מנתח_ומחקר_משפטי_Paperclip_מדריך.docx",
    "README.md",
 }
 def output_filename(src: Path) -> str:
    """Return the Markdown file name for *src*: its stem plus ``.md``.

    The stem is used verbatim; no normalisation (spaces, Hebrew) is applied,
    so the case identifier in the name is preserved.
    """
    return src.stem + ".md"
 def main(argv: list[str]) -> int:
    """Proofread the .docx/.pdf files in TRAINING_DIR into OUTPUT_DIR.

    Args:
        argv: Full argument vector. ``argv[1:]`` optionally lists the exact
            file names to process; when empty, every eligible file is
            processed.

    Returns:
        0 when every selected file was converted and every requested name was
        found; 1 when a file failed or a requested name was not eligible.
        A failure on one file never stops the remaining files.
    """
    OUTPUT_DIR.mkdir(exist_ok=True)
    RAW_DIR.mkdir(exist_ok=True)

    # Filter files: regular, non-hidden, not skipped, supported suffix, and
    # (when names were given) explicitly requested.
    requested = set(argv[1:])
    files: list[Path] = [
        p
        for p in sorted(TRAINING_DIR.iterdir())
        if not p.is_dir()
        and not p.name.startswith(".")
        and p.name not in SKIP_FILES
        and p.suffix.lower() in (".docx", ".pdf")
        and (not requested or p.name in requested)
    ]

    # A requested name that matched nothing is an error, not a silent no-op.
    missing = sorted(requested - {p.name for p in files})
    for name in missing:
        print(f"✗ {name}: not found or not a processable .docx/.pdf")
    failures = len(missing)

    print(f"Processing {len(files)} files...\n")
    for path in files:
        try:
            if path.suffix.lower() == ".docx":
                md, stats = proofread_docx(path)
            else:
                md, stats = proofread_pdf(path)
            out_path = OUTPUT_DIR / output_filename(path)
            out_path.write_text(md, encoding="utf-8")
            print(f"✓ {path.name}")
            print(f"  → {out_path.name} ({len(md):,} chars)  {stats}")
        except Exception as e:
            # Best-effort batch: report, count, and move on to the next file.
            failures += 1
            print(f"✗ {path.name}: {e}")

    # Surface failures through the exit status so callers can detect them.
    return 1 if failures else 0
 # Script entry point: main()'s return value becomes the process exit status.
 if __name__ == "__main__":
    sys.exit(main(sys.argv))
--- a/web/app.py
+++ b/web/app.py
@@ -28,7 +28,7 @@ from pydantic import BaseModel
 import asyncpg
 from legal_mcp import config
-from legal_mcp.services import chunker, db, embeddings, extractor, processor
+from legal_mcp.services import chunker, db, embeddings, extractor, processor, proofreader
 from legal_mcp.tools import cases as cases_tools, search as search_tools, workflow as workflow_tools, drafting as drafting_tools
 # Import integration clients (same directory)
@@ -163,6 +163,261 @@ async def classify_file(req: ClassifyRequest):
    return {"task_id": task_id}
 # ── Training Corpus: Analyze & Upload ─────────────────────────────
@app.post("/api/training/analyze")
async def training_analyze(filename: str = Form(...)):
    """Proofread an uploaded file and extract its metadata for review.

    Input: name of a file sitting directly in UPLOAD_DIR (from /api/upload).
    Output: whatever ``proofreader.analyze_file`` returns for that file.

    Raises:
        HTTPException: 404 when the file does not exist or does not resolve
            to a direct child of UPLOAD_DIR; 500 when proofreading fails.
    """
    upload_path = UPLOAD_DIR / filename
    # The parent check rejects names that escape UPLOAD_DIR (e.g. "../x").
    in_uploads = upload_path.exists() and upload_path.parent.samefile(UPLOAD_DIR)
    if not in_uploads:
        raise HTTPException(404, "File not found in uploads")

    try:
        return await proofreader.analyze_file(upload_path)
    except Exception as exc:
        logger.exception("Proofread failed for %s", filename)
        raise HTTPException(500, f"Proofreading failed: {exc}")
 # Request body for POST /api/training/upload. Every field except `filename`
 # is optional and defaults to empty.
 class TrainingUploadRequest(BaseModel):
    filename: str                      # name in UPLOAD_DIR
    decision_number: str = ""          # when non-empty, checked against style_corpus for duplicates
    decision_date: str = ""            # YYYY-MM-DD
    subject_categories: list[str] = []
    title: str = ""                    # when empty, a title is derived from the file stem
# Strong references to in-flight upload tasks. The event loop keeps only weak
# references to tasks, so a fire-and-forget task can be garbage-collected
# before it finishes unless something else holds on to it.
_training_upload_tasks: set = set()


@app.post("/api/training/upload")
async def training_upload(req: TrainingUploadRequest):
    """Queue a file from UPLOAD_DIR for insertion into the style corpus.

    The background task runs proofreading again to guarantee clean text (not
    raw file content), then inserts into style_corpus + chunks + embeddings.

    Returns:
        ``{"task_id": ...}``; progress is recorded under that id in
        ``_progress``.

    Raises:
        HTTPException: 404 when the file is not a direct child of UPLOAD_DIR;
            409 when ``decision_number`` is given and already in the corpus.
    """
    source = UPLOAD_DIR / req.filename
    if not source.exists() or not source.parent.samefile(UPLOAD_DIR):
        raise HTTPException(404, "File not found in uploads")

    # Check for duplicate by decision_number
    if req.decision_number:
        pool = await db.get_pool()
        async with pool.acquire() as conn:
            exists = await conn.fetchval(
                "SELECT 1 FROM style_corpus WHERE decision_number = $1 LIMIT 1",
                req.decision_number,
            )
        if exists:
            raise HTTPException(
                409,
                f"החלטה {req.decision_number} כבר קיימת בקורפוס",
            )

    task_id = str(uuid4())
    _progress[task_id] = {"status": "queued", "filename": req.filename}

    # Keep the task alive until it completes, then drop the reference.
    task = asyncio.create_task(_process_proofread_training(task_id, source, req))
    _training_upload_tasks.add(task)
    task.add_done_callback(_training_upload_tasks.discard)
    return {"task_id": task_id}
 async def _process_proofread_training(
    task_id: str, source: Path, req: TrainingUploadRequest
 ):
    """Background task: proofread → store in corpus → chunk → embed.

    Progress and the final outcome are recorded in ``_progress[task_id]``.
    Any exception is logged and turned into a ``"failed"`` status instead of
    propagating.

    Args:
        task_id: Key under which progress is published.
        source: Uploaded file inside UPLOAD_DIR; deleted after success.
        req: Reviewed metadata supplied by the user.
    """
    from datetime import date as date_type

    def _mark(step: str) -> None:
        # Record the current pipeline step in the shared progress map.
        _progress[task_id] = {"status": "processing", "filename": req.filename, "step": step}

    try:
        title = req.title or source.stem.split("_", 1)[-1]

        # Validate the date first. A malformed value must fail before the
        # (potentially slow, OCR-backed) proofreading runs and before any
        # file is copied into the training directory.
        d_date = None
        if req.decision_date:
            d_date = date_type.fromisoformat(req.decision_date)

        # 1. Proofread (strip Nevo additions)
        _mark("proofreading")
        clean_text, stats = await proofreader.proofread(source)

        # 2. Save proofread .md to training dir (alongside original)
        _mark("saving")
        training_dir = config.TRAINING_DIR
        proofread_dir = training_dir / "proofread"
        training_dir.mkdir(parents=True, exist_ok=True)
        proofread_dir.mkdir(exist_ok=True)

        # Copy original to training dir, dropping a leading "<digits>_" prefix
        original_name = re.sub(r"^\d+_", "", source.name)
        orig_dest = training_dir / original_name
        shutil.copy2(str(source), str(orig_dest))

        # Save cleaned version
        proofread_name = Path(original_name).stem + ".md"
        proofread_dest = proofread_dir / proofread_name
        proofread_dest.write_text(clean_text, encoding="utf-8")

        # 3. Add to style corpus
        _mark("corpus")
        corpus_id = await db.add_to_style_corpus(
            document_id=None,
            decision_number=req.decision_number,
            decision_date=d_date,
            subject_categories=req.subject_categories,
            full_text=clean_text,
        )

        # 4. Chunk + embed (skipped entirely when chunking yields nothing)
        _mark("chunking")
        chunks = chunker.chunk_document(clean_text)
        chunk_count = 0
        if chunks:
            doc = await db.create_document(
                case_id=None,
                doc_type="decision",
                title=f"[קורפוס] {title}",
                file_path=str(orig_dest),
                page_count=stats.get("pages", 0),
            )
            doc_id = UUID(doc["id"])
            await db.update_document(
                doc_id, extracted_text=clean_text, extraction_status="completed"
            )

            _mark("embedding")
            texts = [c.content for c in chunks]
            embs = await embeddings.embed_texts(texts, input_type="document")
            chunk_dicts = [
                {
                    "content": c.content,
                    "section_type": c.section_type,
                    "embedding": emb,
                    "page_number": c.page_number,
                    "chunk_index": c.chunk_index,
                }
                for c, emb in zip(chunks, embs)
            ]
            await db.store_chunks(doc_id, None, chunk_dicts)
            chunk_count = len(chunks)

        # 5. Cleanup upload
        source.unlink(missing_ok=True)

        _progress[task_id] = {
            "status": "completed",
            "filename": req.filename,
            "result": {
                "corpus_id": str(corpus_id),
                "title": title,
                "chars": len(clean_text),
                "chunks": chunk_count,
                "proofread_stats": stats,
            },
        }
    except Exception as e:
        logger.exception("Training upload failed for %s", req.filename)
        _progress[task_id] = {"status": "failed", "error": str(e), "filename": req.filename}
@app.get("/api/training/patterns")
async def training_patterns():
    """List all extracted style patterns, grouped by pattern type.

    Returns:
        ``{"total": <row count>, "by_type": {<pattern_type>: [<pattern>, ...]}}``.
        Within each type the patterns keep the query order (frequency
        descending).
    """
    def _as_list(raw):
        # JSON text is decoded; undecodable text and empty values become [].
        if isinstance(raw, str):
            try:
                raw = json.loads(raw)
            except Exception:
                raw = []
        return raw or []

    pool = await db.get_pool()
    async with pool.acquire() as conn:
        rows = await conn.fetch(
            "SELECT pattern_type, pattern_text, frequency, context, examples "
            "FROM style_patterns "
            "ORDER BY pattern_type, frequency DESC"
        )

    by_type: dict[str, list] = {}
    for row in rows:
        bucket = by_type.setdefault(row["pattern_type"], [])
        bucket.append({
            "pattern_text": row["pattern_text"],
            "frequency": row["frequency"],
            "context": row["context"] or "",
            "examples": _as_list(row["examples"]),
        })
    return {"total": len(rows), "by_type": by_type}
 # In-memory status of the corpus style analysis: whether a run is active,
 # when it started (time.time()), and the last result or error string.
 _style_analysis_state = {"running": False, "started_at": None, "result": None, "error": None}
# Strong reference to the running analysis task. The event loop keeps only
# weak references to tasks, so without this the task could be
# garbage-collected before it finishes.
_style_analysis_tasks: set = set()


@app.post("/api/training/analyze-style")
async def training_analyze_style():
    """Kick off style analysis over the corpus. Returns immediately.

    Only one analysis may run at a time; the outcome is recorded in
    ``_style_analysis_state``.

    Raises:
        HTTPException: 409 when an analysis is already running.
    """
    if _style_analysis_state["running"]:
        raise HTTPException(409, "ניתוח סגנון כבר רץ")

    _style_analysis_state.update(
        {"running": True, "started_at": time.time(), "result": None, "error": None}
    )

    async def _run():
        try:
            # Imported inside the try block: if the import itself fails, the
            # finally clause must still clear "running", otherwise every
            # later request would be rejected with 409.
            from legal_mcp.services.style_analyzer import analyze_corpus

            result = await analyze_corpus()
            _style_analysis_state["result"] = result
        except Exception as e:
            logger.exception("Style analysis failed")
            _style_analysis_state["error"] = str(e)
        finally:
            _style_analysis_state["running"] = False

    # Keep the task alive until it completes, then drop the reference.
    task = asyncio.create_task(_run())
    _style_analysis_tasks.add(task)
    task.add_done_callback(_style_analysis_tasks.discard)
    return {"status": "started"}
@app.get("/api/training/analyze-style/status")
async def training_analyze_style_status():
    """Return a snapshot of the style-analysis state.

    When a start time is recorded, an ``elapsed`` field (whole seconds since
    that start) is added to the snapshot.
    """
    snapshot = {**_style_analysis_state}
    started = snapshot["started_at"]
    if started:
        snapshot["elapsed"] = int(time.time() - started)
    return snapshot
@app.get("/api/training/corpus")
async def training_corpus_list():
    """List all decisions currently in the style corpus, newest first.

    Returns:
        A list of dicts with ``id``, ``decision_number``, ``decision_date``,
        ``subject_categories``, ``chars`` and ``created_at``. Missing values
        are returned as empty strings (or an empty list for categories).
    """
    pool = await db.get_pool()
    async with pool.acquire() as conn:
        rows = await conn.fetch(
            "SELECT id, decision_number, decision_date, subject_categories, "
            "       length(full_text) as chars, created_at "
            "FROM style_corpus "
            "ORDER BY created_at DESC"
        )

    listing = []
    for row in rows:
        # Categories may come back as JSON text or as an already-decoded value.
        categories = row["subject_categories"]
        if isinstance(categories, str):
            categories = json.loads(categories)
        else:
            categories = categories or []

        decided = row["decision_date"]
        created = row["created_at"]
        listing.append({
            "id": str(row["id"]),
            "decision_number": row["decision_number"] or "",
            "decision_date": str(decided) if decided else "",
            "subject_categories": categories,
            "chars": row["chars"],
            "created_at": created.isoformat() if created else "",
        })
    return listing
@app.get("/api/progress/{task_id}")
 async def progress_stream(task_id: str):
    """SSE stream of processing progress."""
--- a/web/static/index.html
+++ b/web/static/index.html
@@ -283,6 +283,120 @@ header nav a:hover, header nav a.active { color: #fff; background: rgba(255,255,
 }
 .skill-install-result.error { background: #ffebee; border-color: #ffcdd2; }
 /* ── Training Corpus Upload ───────────────────────────── */
 .training-review {
  border: 1px solid #e5e5e5; border-radius: 8px; padding: 14px 16px;
  margin-bottom: 12px; background: #fafafa;
 }
 .training-review .review-header {
  display: flex; align-items: center; gap: 10px;
  padding-bottom: 10px; margin-bottom: 12px;
  border-bottom: 1px solid #eee;
 }
 .training-review .review-header strong { font-size: 0.95em; color: #1a1a2e; flex: 1; }
 .training-review .review-meta { font-size: 0.78em; color: #888; }
 .training-review .btn-icon {
  background: transparent; border: none; color: #aaa; cursor: pointer;
  font-size: 1.1em; padding: 4px 8px; border-radius: 4px;
 }
 .training-review .btn-icon:hover { background: #ffebee; color: #c62828; }
 .training-review .review-fields {
  display: grid; grid-template-columns: 1fr 160px; gap: 14px; margin-bottom: 12px;
 }
 .training-review .review-fields label {
  display: flex; flex-direction: column; gap: 4px;
  font-size: 0.8em; color: #666; font-weight: 500;
 }
 .training-review .review-fields input {
  padding: 7px 10px; border: 1px solid #ddd; border-radius: 6px;
  font-size: 0.88em; font-family: inherit;
 }
 .training-review .review-fields input:focus {
  outline: none; border-color: #e94560;
 }
 .training-review .review-cats { margin-bottom: 10px; }
 .training-review .review-cats-label {
  font-size: 0.8em; color: #666; font-weight: 500; margin-bottom: 6px;
 }
 .training-review .review-cats-list { display: flex; flex-wrap: wrap; gap: 6px; }
 .cat-chip {
  display: inline-flex; align-items: center; gap: 5px;
  padding: 4px 10px; border: 1px solid #ddd; border-radius: 14px;
  font-size: 0.78em; cursor: pointer; background: #fff;
  transition: background 0.12s;
 }
 .cat-chip:hover { background: #f0f0f0; }
 .cat-chip input[type="checkbox"] { margin: 0; cursor: pointer; }
 .cat-chip:has(input:checked) { background: #ffe4ea; border-color: #e94560; color: #c62828; }
 .review-preview {
  margin-top: 6px; border: 1px solid #eee; border-radius: 6px;
  background: #fff; padding: 8px 12px;
 }
 .review-preview summary {
  cursor: pointer; font-size: 0.78em; color: #888; font-weight: 500;
 }
 .review-preview pre {
  margin-top: 10px; font-size: 0.78em; color: #333; direction: rtl;
  white-space: pre-wrap; font-family: inherit; line-height: 1.5;
  max-height: 250px; overflow-y: auto;
 }
 .training-task {
  padding: 10px 14px; margin-bottom: 6px; border-radius: 6px;
  background: #f7f7f7; font-size: 0.85em;
  display: flex; align-items: center; gap: 10px;
 }
 .training-task:last-child { margin-bottom: 0; }
 .corpus-table { width: 100%; border-collapse: collapse; font-size: 0.82em; }
 .corpus-table th, .corpus-table td {
  text-align: right; padding: 8px 10px; border-bottom: 1px solid #eee;
 }
 .corpus-table th {
  background: #f7f7f7; font-weight: 600; color: #555;
  font-size: 0.78em; text-transform: uppercase;
 }
 .corpus-table tr:hover td { background: #fafafa; }
 .cat-tag {
  display: inline-block; padding: 2px 8px; margin: 0 2px;
  background: #e3f2fd; color: #1565c0; border-radius: 10px;
  font-size: 0.72em; font-weight: 500;
 }
 /* Pattern groups */
 .pattern-group {
  border: 1px solid #eee; border-radius: 8px; margin-bottom: 10px;
  background: #fff;
 }
 .pattern-group[open] { background: #fafafa; }
 .pattern-group summary {
  padding: 12px 16px; cursor: pointer; font-size: 0.9em;
  display: flex; align-items: center; gap: 10px; list-style: none;
 }
 .pattern-group summary::-webkit-details-marker { display: none; }
 .pattern-group summary::before {
  content: '▸'; transition: transform 0.15s; font-size: 0.9em; color: #888;
 }
 .pattern-group[open] summary::before { transform: rotate(90deg); }
 .pattern-count {
  margin-right: auto; background: #e3f2fd; color: #1565c0;
  padding: 2px 10px; border-radius: 10px; font-size: 0.76em; font-weight: 500;
 }
 .pattern-list {
  padding: 4px 16px 14px 16px; display: flex; flex-direction: column; gap: 8px;
 }
 .pattern-item {
  padding: 10px 14px; background: #fff; border: 1px solid #eee;
  border-radius: 6px; font-size: 0.84em;
 }
 .pattern-text { color: #1a1a2e; font-weight: 500; }
 .pattern-context { color: #666; font-size: 0.88em; margin-top: 4px; }
 .pattern-meta {
  color: #999; font-size: 0.78em; margin-top: 6px;
  display: flex; gap: 10px;
 }
@media (max-width: 800px) {
  .main { padding: 16px; }
  header { padding: 14px 16px; }
@@ -302,6 +416,7 @@ header nav a:hover, header nav a.active { color: #fff; background: rgba(255,255,
    <a href="#/" id="navHome">תיקים</a>
    <a href="#/new" id="navNew">+ תיק חדש</a>
    <a href="#/upload" id="navUpload">העלאה</a>
    <a href="#/training" id="navTraining">אימון סגנון</a>
    <a href="#/skills" id="navSkills">Skills</a>
  </nav>
 </header>
@@ -552,6 +667,75 @@ header nav a:hover, header nav a.active { color: #fff; background: rgba(255,255,
      <div class="card-body" id="legacyTasksList"></div>
    </div>
  </div>
  <!-- ══ Page: Training Corpus Upload ══ -->
  <div class="page" id="page-training">
    <div class="page-header">
      <h2>אימון סגנון — העלאת החלטות לקורפוס</h2>
    </div>
    <div class="card">
      <div class="card-body">
        <p style="margin-bottom:12px;color:#555;line-height:1.6">
          העלה החלטות קודמות של דפנה כדי ללמד את המערכת את סגנון הכתיבה שלה.
          הקבצים יעברו <strong>הגהה אוטומטית</strong> (הסרת תוספות נבו, כותרות, סימני מים)
          ו<strong>חילוץ מטא-דאטה</strong> (מספר החלטה, תאריך, קטגוריות) לסקירה לפני ההעלאה.
        </p>
        <div class="upload-zone" id="trainingDropZone">
          <div style="font-size:3em;color:#ccc;margin-bottom:16px">&#128218;</div>
          <h3>גרור קבצי החלטה לכאן או לחץ לבחירה</h3>
          <p>PDF, DOCX, MD — עד 50MB. ניתן להעלות מספר קבצים בבת אחת.</p>
          <input type="file" id="trainingFileInput" multiple accept=".pdf,.docx,.md,.txt">
        </div>
      </div>
    </div>
    <div class="card" id="trainingAnalysisCard" style="display:none">
      <div class="card-header">
        <span>סקירת מטא-דאטה לפני ההעלאה</span>
        <span id="trainingAnalysisStatus" style="float:left;font-weight:400;color:#888;font-size:0.9em"></span>
      </div>
      <div class="card-body">
        <div id="trainingReviewList"></div>
        <div style="display:flex;gap:10px;margin-top:16px;justify-content:flex-end">
          <button class="btn btn-ghost" onclick="cancelTrainingReview()">בטל</button>
          <button class="btn btn-primary" id="trainingUploadBtn" onclick="uploadAllTraining()">
            העלה הכל לקורפוס
          </button>
        </div>
      </div>
    </div>
    <div class="card" id="trainingTasksCard" style="display:none">
      <div class="card-header">עיבוד והעלאה</div>
      <div class="card-body" id="trainingTasksList"></div>
    </div>
    <div class="card">
      <div class="card-header">
        <span>קורפוס הסגנון</span>
        <span id="corpusCount" style="float:left;font-weight:400;color:#888;font-size:0.9em"></span>
      </div>
      <div class="card-body" id="corpusList">
        <div class="empty">טוען...</div>
      </div>
    </div>
    <div class="card">
      <div class="card-header">
        <span>דוח סגנון — דפוסים שחולצו</span>
        <span style="float:left;display:flex;gap:10px;align-items:center">
          <span id="patternsCount" style="font-weight:400;color:#888;font-size:0.9em"></span>
          <button class="btn btn-primary" id="analyzeStyleBtn" onclick="runStyleAnalysis()">
            נתח קורפוס
          </button>
        </span>
      </div>
      <div class="card-body" id="patternsList">
        <div class="empty">טוען...</div>
      </div>
    </div>
  </div>
 </div>
 <!-- Status Bar -->
@@ -614,6 +798,11 @@ function handleRoute() {
    document.getElementById('navUpload').classList.add('active');
    subtitle = 'העלאת מסמכים';
    loadLegacyPending();
  } else if (hash === '#/training') {
    document.getElementById('page-training').classList.add('active');
    document.getElementById('navTraining').classList.add('active');
    subtitle = 'אימון סגנון';
    initTrainingPage();
  }
  document.getElementById('pageSubtitle').textContent = subtitle;
@@ -1559,6 +1748,377 @@ async function restartPaperclip() {
 // Init legacy upload listeners
 setupLegacyUpload();
 // ── Training Corpus Upload ─────────────────────────────────────────
 // Fixed list of subject categories. renderReviewRow() renders one checkbox
 // per entry, pre-checked when the analyzed file's subject_categories
 // contains it.
 const ALL_CATEGORIES = [
  'בנייה', 'שימוש חורג', 'תכנית', 'היתר', 'הקלה',
  'חלוקה', 'תמ"א 38', 'היטל השבחה', 'פיצויים 197',
 ];
 // Review queue: one entry per analyzed file. handleTrainingFiles() pushes the
 // /training/analyze response plus _pendingName (server-side pending file),
 // _originalName (the user's file name) and _status. Inline handlers in the
 // review rows mutate these entries in place.
 let _trainingReviews = [];  // in-progress metadata awaiting user approval
 // Entry point for the #/training route: wires the drop zone, starts loading
 // the corpus table, pattern report and analysis status, then clears any
 // review/task UI left over from a previous visit.
 function initTrainingPage() {
  setupTrainingDropZone();
  loadCorpusList();
  loadStylePatterns();
  pollStyleAnalysisStatus();

  // Forget anything that was awaiting approval on the last visit
  _trainingReviews = [];
  const byId = (id) => document.getElementById(id);
  for (const cardId of ['trainingAnalysisCard', 'trainingTasksCard']) {
    byId(cardId).style.display = 'none';
  }
  for (const listId of ['trainingReviewList', 'trainingTasksList']) {
    byId(listId).innerHTML = '';
  }
 }
 // Attach click / drag-and-drop / file-picker handlers to the training drop
 // zone. Safe to call repeatedly: a `_wired` flag stored on the zone element
 // makes every call after the first a no-op, so listeners are never doubled.
 function setupTrainingDropZone() {
  const zone = document.getElementById('trainingDropZone');
  const input = document.getElementById('trainingFileInput');
  if (zone._wired) return;
  zone._wired = true;
  // Clicking the zone opens the hidden file picker
  zone.addEventListener('click', () => input.click());
  // preventDefault on dragover is what marks the zone as a valid drop target
  zone.addEventListener('dragover', (e) => { e.preventDefault(); zone.classList.add('dragging'); });
  zone.addEventListener('dragleave', () => zone.classList.remove('dragging'));
  zone.addEventListener('drop', (e) => {
    e.preventDefault();
    zone.classList.remove('dragging');
    handleTrainingFiles(e.dataTransfer.files);
  });
  input.addEventListener('change', () => {
    // Copy the selection first, then clear the input. Without the reset the
    // browser fires no `change` event when the user picks the same file
    // again (e.g. after removing it from the review list).
    const picked = Array.from(input.files || []);
    input.value = '';
    handleTrainingFiles(picked);
  });
 }
 // Upload each selected file to the pending area, run /training/analyze on
 // it, and queue the result in _trainingReviews for user approval.
 //
 // fileList: a FileList or array of File objects (null/empty is a no-op).
 // Files are processed one at a time; a failure on one file is reported via
 // toast and does not stop the remaining files.
 async function handleTrainingFiles(fileList) {
  const files = Array.from(fileList || []);
  if (!files.length) return;
  const card = document.getElementById('trainingAnalysisCard');
  const status = document.getElementById('trainingAnalysisStatus');
  card.style.display = '';
  status.textContent = `מעלה ומנתח ${files.length} קבצים...`;
  for (const file of files) {
    // Server-side name of the pending upload while this function still owns
    // it; cleared once the file is handed over to the review queue.
    let pendingName = null;
    try {
      // 1. Upload to pending dir
      status.textContent = `מעלה: ${file.name}...`;
      const fd = new FormData();
      fd.append('file', file);
      const upRes = await fetch(API + '/upload', { method: 'POST', body: fd });
      if (!upRes.ok) throw new Error(`Upload failed: ${await upRes.text()}`);
      const uploadInfo = await upRes.json();
      pendingName = uploadInfo.filename;
      // 2. Analyze (proofread + extract metadata)
      status.textContent = `מנתח: ${file.name}...`;
      const analyzeFd = new FormData();
      analyzeFd.append('filename', uploadInfo.filename);
      const anRes = await fetch(API + '/training/analyze', { method: 'POST', body: analyzeFd });
      if (!anRes.ok) throw new Error(`Analyze failed: ${await anRes.text()}`);
      const analysis = await anRes.json();
      _trainingReviews.push({
        ...analysis,
        _pendingName: uploadInfo.filename,
        _originalName: file.name,
        _status: 'ready',
      });
      // The review entry now owns the pending file (remove/cancel delete it)
      pendingName = null;
    } catch (e) {
      // Upload succeeded but analysis did not: the file would otherwise stay
      // in the pending area with no review row to remove it from.
      // Best-effort cleanup, same call removeTrainingReview() makes.
      if (pendingName) {
        fetch(API + '/uploads/' + encodeURIComponent(pendingName), { method: 'DELETE' })
          .catch(() => {});
      }
      toast(`שגיאה בעיבוד ${file.name}: ${e.message}`, 'error');
    }
  }
  status.textContent = '';
  renderTrainingReview();
 }
 // Re-render the review list from _trainingReviews. With nothing queued it
 // shows the empty placeholder and hides the analysis card.
 function renderTrainingReview() {
  const list = document.getElementById('trainingReviewList');
  if (_trainingReviews.length > 0) {
    const rowsHtml = [];
    _trainingReviews.forEach((review, pos) => {
      rowsHtml.push(renderReviewRow(review, pos));
    });
    list.innerHTML = rowsHtml.join('');
    return;
  }
  list.innerHTML = '<div class="empty">אין קבצים לסקירה</div>';
  document.getElementById('trainingAnalysisCard').style.display = 'none';
 }
 // Build the HTML for one review row.
 //
 // r:   an entry of _trainingReviews (analysis result + _originalName etc.)
 // idx: its index in _trainingReviews; baked into the inline handlers so
 //      they can write edits straight back into the queue entry.
 // Returns an HTML string.
 function renderReviewRow(r, idx) {
  const selected = r.subject_categories || [];
  const catsHtml = ALL_CATEGORIES.map((c, ci) => {
    const checked = selected.includes(c) ? 'checked' : '';
    // Pass the category by index, not as a quoted literal: names such as
    // 'תמ"א 38' contain a double quote, which would end the onchange="…"
    // attribute early and leave that checkbox with a broken handler.
    return `<label class="cat-chip"><input type="checkbox" ${checked} onchange="toggleCat(${idx}, ALL_CATEGORIES[${ci}])"> ${esc(c)}</label>`;
  }).join('');
  // Tolerate analysis responses that omit the optional stats fields
  const chars = Number(r.chars || 0).toLocaleString('he-IL');
  const sourceType = (r.stats && r.stats.source_type) || '';
  return `
    <div class="training-review" data-idx="${idx}">
      <div class="review-header">
        <strong>${esc(r._originalName)}</strong>
        <span class="review-meta">${chars} תווים · ${esc(sourceType)}</span>
        <button class="btn-icon" onclick="removeTrainingReview(${idx})" title="הסר">✕</button>
      </div>
      <div class="review-fields">
        <label>מספר החלטה
          <input type="text" value="${esc(r.decision_number || '')}"
                 onchange="_trainingReviews[${idx}].decision_number=this.value"
                 placeholder="NNNN/YY">
        </label>
        <label>תאריך
          <input type="date" value="${esc(r.decision_date || '')}"
                 onchange="_trainingReviews[${idx}].decision_date=this.value">
        </label>
      </div>
      <div class="review-cats">
        <div class="review-cats-label">קטגוריות:</div>
        <div class="review-cats-list">${catsHtml}</div>
      </div>
      <details class="review-preview">
        <summary>תצוגה מקדימה של טקסט מנוקה (500 תווים ראשונים)</summary>
        <pre>${esc(r.preview)}</pre>
      </details>
    </div>
  `;
 }
 // Flip membership of `cat` in the subject_categories of review entry `idx`:
 // add it when absent, remove it when present.
 function toggleCat(idx, cat) {
  const cats = _trainingReviews[idx].subject_categories;
  const pos = cats.indexOf(cat);
  if (pos < 0) {
    cats.push(cat);
  } else {
    cats.splice(pos, 1);
  }
 }
 // Drop review entry `idx` from the queue and re-render. The matching pending
 // upload is deleted on the server as fire-and-forget; failures are ignored.
 function removeTrainingReview(idx) {
  const { _pendingName: pendingName } = _trainingReviews[idx];
  if (pendingName) {
    const url = API + '/uploads/' + encodeURIComponent(pendingName);
    fetch(url, { method: 'DELETE' }).catch(() => {});
  }
  _trainingReviews.splice(idx, 1);
  renderTrainingReview();
 }
 // Abandon the whole review: request deletion of every pending upload
 // (best effort, errors ignored), empty the queue and re-render.
 function cancelTrainingReview() {
  _trainingReviews
    .map((review) => review._pendingName)
    .filter(Boolean)
    .forEach((pendingName) => {
      fetch(API + '/uploads/' + encodeURIComponent(pendingName), { method: 'DELETE' })
        .catch(() => {});
    });
  _trainingReviews = [];
  renderTrainingReview();
 }
 // Commit every queued review to the corpus, one file at a time.
 //
 // For each entry: POST its (possibly user-edited) metadata to
 // /training/upload, then follow the returned task via pollTrainingProgress()
 // and show a ✓/✗ line in the tasks card. Entries that uploaded successfully
 // leave the review queue; entries that failed stay there so the user can
 // fix the metadata and retry, or remove them.
 // NOTE(review): a retry assumes the pending file still exists on the server
 // after a failed run — confirm against the backend.
 async function uploadAllTraining() {
  const btn = document.getElementById('trainingUploadBtn');
  // Work on a snapshot: the queue can change while we await (rows removed,
  // new files analyzed), and indexing the live array would skip or repeat
  // entries.
  const batch = _trainingReviews.slice();
  if (!batch.length) return;
  btn.disabled = true;
  const tasksCard = document.getElementById('trainingTasksCard');
  const tasksList = document.getElementById('trainingTasksList');
  tasksCard.style.display = '';
  tasksList.innerHTML = '';
  const uploaded = new Set();
  try {
    for (const r of batch) {
      const row = document.createElement('div');
      row.className = 'training-task';
      row.innerHTML = `<span class="mini-spinner"></span> ${esc(r._originalName)} — ${esc(r.decision_number || '—')}`;
      tasksList.appendChild(row);
      try {
        const res = await fetch(API + '/training/upload', {
          method: 'POST',
          headers: { 'Content-Type': 'application/json' },
          body: JSON.stringify({
            filename: r._pendingName,
            decision_number: r.decision_number,
            decision_date: r.decision_date,
            subject_categories: r.subject_categories,
            // Title defaults to the original file name without its extension
            title: r._originalName.replace(/\.[^.]+$/, ''),
          }),
        });
        if (!res.ok) {
          const err = await res.text();
          throw new Error(err);
        }
        const { task_id } = await res.json();
        const result = await pollTrainingProgress(task_id, row, r._originalName);
        // Mark success before formatting, so a sparse result payload cannot
        // turn a finished upload into a reported failure.
        uploaded.add(r);
        const chars = Number((result && result.chars) || 0).toLocaleString('he-IL');
        const chunks = result && result.chunks != null ? result.chunks : 0;
        row.innerHTML = `<span style="color:#0a0">✓</span> ${esc(r._originalName)} — ${chars} תווים, ${esc(String(chunks))} קטעים`;
      } catch (e) {
        const msg = String((e && e.message) || e).substring(0, 200);
        row.innerHTML = `<span style="color:#c00">✗</span> ${esc(r._originalName)} — ${esc(msg)}`;
      }
    }
  } finally {
    // Never leave the button stuck disabled, whatever happened above
    btn.disabled = false;
  }
  const failedCount = batch.length - uploaded.size;
  // Keep failed entries (and anything queued meanwhile) for another attempt
  _trainingReviews = _trainingReviews.filter((r) => !uploaded.has(r));
  renderTrainingReview();
  loadCorpusList();
  if (failedCount) {
    toast(`ההעלאה הושלמה עם ${failedCount} שגיאות`, 'warn');
  } else {
    toast('ההעלאה הושלמה', 'success');
  }
 }
 // Hebrew display labels for progress events. pollTrainingProgress() looks up
 // the event's `step` first, then its `status`, and falls back to the raw
 // status string when neither key is present here.
 const TRAINING_STEP_LABELS = {
  queued: 'בתור',
  proofreading: 'הגהה',
  saving: 'שמירה',
  corpus: 'קליטה לקורפוס',
  chunking: 'פיצול לקטעים',
  embedding: 'יצירת embeddings',
  completed: 'הושלם',
  failed: 'נכשל',
 };
 // Follow a background task over the /progress/<taskId> event stream,
 // mirroring each update into `row` as "<name> — <step label>...".
 // Returns a Promise that resolves with the event's `result` once the status
 // is 'completed', and rejects on status 'failed' or on a stream error.
 function pollTrainingProgress(taskId, row, name) {
  return new Promise((resolve, reject) => {
    const source = new EventSource(API + '/progress/' + taskId);
    // Close the stream, then settle the promise either way
    const finish = (settle, value) => {
      source.close();
      settle(value);
    };
    source.onmessage = (event) => {
      const payload = JSON.parse(event.data);
      const stepLabel = TRAINING_STEP_LABELS[payload.step]
        || TRAINING_STEP_LABELS[payload.status]
        || payload.status;
      row.innerHTML = `<span class="mini-spinner"></span> ${esc(name)} — ${esc(stepLabel)}...`;
      switch (payload.status) {
        case 'completed':
          finish(resolve, payload.result);
          break;
        case 'failed':
          finish(reject, new Error(payload.error || 'Processing failed'));
          break;
        default:
          break;
      }
    };
    source.onerror = () => finish(reject, new Error('connection lost'));
  });
 }
 // ── Style Analysis (patterns) ────────────────────────────
 // Hebrew group headings for the pattern-type keys returned by
 // /training/patterns. loadStylePatterns() shows the raw key for any type
 // that has no entry here.
 const PATTERN_TYPE_LABELS = {
  opening_formula: 'נוסחאות פתיחה',
  closing_formula: 'נוסחאות סיום',
  transition: 'ביטויי מעבר',
  characteristic_phrase: 'ביטויים אופייניים',
  argument_flow: 'זרימת טיעון',
  analysis_structure: 'מבנה ניתוח',
  evidence_handling: 'טיפול בראיות',
  citation_style: 'סגנון ציטוט',
 };
 // Fetch /training/patterns and render the style report: one collapsible
 // group per pattern type, known types in a fixed reading order followed by
 // any types the server returned that are not in that order.
 // Any failure (network, HTTP error status, malformed body) is shown inline
 // in the patterns card instead of being thrown.
 async function loadStylePatterns() {
  const container = document.getElementById('patternsList');
  const count = document.getElementById('patternsCount');
  try {
    const res = await fetch(API + '/training/patterns');
    // fetch() resolves on 4xx/5xx too; without this an error body would be
    // rendered as "undefined patterns" or crash on by_type.
    if (!res.ok) throw new Error(await res.text());
    const data = await res.json();
    const total = data.total || 0;
    const byType = data.by_type || {};
    count.textContent = `${total} דפוסים`;
    if (!total) {
      container.innerHTML = '<div class="empty">אין דפוסים עדיין. לחץ "נתח קורפוס" כדי לחלץ דפוסים מההחלטות הקיימות.</div>';
      return;
    }
    const typeOrder = [
      'opening_formula', 'transition', 'characteristic_phrase',
      'argument_flow', 'analysis_structure', 'evidence_handling',
      'citation_style', 'closing_formula',
    ];
    const types = typeOrder.filter(t => byType[t]);
    // Append types outside the preferred order so nothing is hidden
    Object.keys(byType).forEach(t => { if (!types.includes(t)) types.push(t); });
    container.innerHTML = types.map(type => `
      <details class="pattern-group" open>
        <summary>
          <strong>${esc(PATTERN_TYPE_LABELS[type] || type)}</strong>
          <span class="pattern-count">${byType[type].length}</span>
        </summary>
        <div class="pattern-list">
          ${byType[type].map(p => `
            <div class="pattern-item">
              <div class="pattern-text">${esc(p.pattern_text)}</div>
              ${p.context ? `<div class="pattern-context">${esc(p.context)}</div>` : ''}
              <div class="pattern-meta">
                <span>תדירות: ${esc(String(p.frequency))}</span>
                ${p.examples && p.examples.length ? `<span>· ${p.examples.length} דוגמאות</span>` : ''}
              </div>
            </div>
          `).join('')}
        </div>
      </details>
    `).join('');
  } catch (e) {
    container.innerHTML = `<div class="empty">שגיאה בטעינה: ${esc(e.message)}</div>`;
  }
 }
 // Ask the server to start a style analysis, then hand over to
 // pollStyleAnalysisStatus(). HTTP 409 means a run is already in progress and
 // is treated as a warning, not an error. On any other failure the button is
 // re-enabled so the user can retry.
 async function runStyleAnalysis() {
  const analyzeBtn = document.getElementById('analyzeStyleBtn');
  analyzeBtn.disabled = true;
  try {
    const response = await fetch(API + '/training/analyze-style', { method: 'POST' });
    const alreadyRunning = response.status === 409;
    if (!alreadyRunning && !response.ok) {
      throw new Error(await response.text());
    }
    if (alreadyRunning) {
      toast('ניתוח כבר רץ ברקע', 'warn');
    } else {
      toast('ניתוח סגנון התחיל — 2-5 דקות', 'success');
    }
    pollStyleAnalysisStatus();
  } catch (err) {
    toast('שגיאה: ' + err.message, 'error');
    analyzeBtn.disabled = false;
  }
 }
 // Handle of the scheduled re-poll, or null when none is pending. Lets a new
 // call replace the pending one instead of starting a second polling chain.
 let _styleAnalysisPollTimer = null;
 // True once a poll in this page session has seen the analysis running; used
 // to announce completion only on the running -> finished transition.
 let _styleAnalysisSeenRunning = false;
 // Sync the "analyze corpus" button with /training/analyze-style/status.
 //
 // While a run is in progress the button is disabled, shows the elapsed
 // seconds, and the status is polled again after 3 seconds. When the run is
 // over the button is restored; the outcome toast (and pattern reload) is
 // shown only if this session saw the run in progress, so merely opening the
 // page does not re-announce an old result or error.
 // Called on every visit to the training page and after starting a run.
 async function pollStyleAnalysisStatus() {
  const btn = document.getElementById('analyzeStyleBtn');
  // A re-poll may already be scheduled (page re-entered mid-run): this call
  // supersedes it, so chains never pile up.
  if (_styleAnalysisPollTimer !== null) {
    clearTimeout(_styleAnalysisPollTimer);
    _styleAnalysisPollTimer = null;
  }
  try {
    const res = await fetch(API + '/training/analyze-style/status');
    if (!res.ok) throw new Error(`status ${res.status}`);
    const state = await res.json();
    if (state.running) {
      _styleAnalysisSeenRunning = true;
      btn.disabled = true;
      btn.innerHTML = `<span class="mini-spinner"></span> מנתח... ${Number(state.elapsed) || 0}s`;
      // Another overlapping call may have scheduled while we awaited
      if (_styleAnalysisPollTimer !== null) clearTimeout(_styleAnalysisPollTimer);
      _styleAnalysisPollTimer = setTimeout(pollStyleAnalysisStatus, 3000);
      return;
    }
    btn.disabled = false;
    btn.textContent = 'נתח קורפוס';
    const justFinished = _styleAnalysisSeenRunning;
    _styleAnalysisSeenRunning = false;
    if (!justFinished) return;
    if (state.error) {
      toast('ניתוח נכשל: ' + String(state.error).substring(0, 150), 'error');
    } else if (state.result) {
      toast('הניתוח הושלם — הדפוסים עודכנו', 'success');
      loadStylePatterns();
    }
  } catch (e) {
    // Status unavailable: stop polling and give the user a usable button,
    // including its normal label (it may still be showing the spinner).
    btn.disabled = false;
    btn.textContent = 'נתח קורפוס';
  }
 }
 // Fetch /training/corpus and render it as a table (number, date,
 // categories, character count, creation date), updating the count badge.
 // Any failure (network, HTTP error status, unexpected body) is shown inline
 // in the corpus card instead of being thrown.
 async function loadCorpusList() {
  const container = document.getElementById('corpusList');
  const count = document.getElementById('corpusCount');
  try {
    const res = await fetch(API + '/training/corpus');
    // fetch() resolves on 4xx/5xx too; an error object has no length and
    // would otherwise be reported as an empty corpus.
    if (!res.ok) throw new Error(await res.text());
    const rows = await res.json();
    if (!Array.isArray(rows)) throw new Error('unexpected response');
    count.textContent = `${rows.length} החלטות`;
    if (!rows.length) {
      container.innerHTML = '<div class="empty">הקורפוס ריק</div>';
      return;
    }
    container.innerHTML = `
      <table class="corpus-table">
        <thead>
          <tr><th>מספר</th><th>תאריך</th><th>קטגוריות</th><th>תווים</th><th>נוצר</th></tr>
        </thead>
        <tbody>
          ${rows.map(r => `
            <tr>
              <td>${esc(r.decision_number || '—')}</td>
              <td>${esc(r.decision_date || '—')}</td>
              <td>${(r.subject_categories || []).map(c => `<span class="cat-tag">${esc(c)}</span>`).join('')}</td>
              <td>${Number(r.chars || 0).toLocaleString('he-IL')}</td>
              <td>${esc(r.created_at ? String(r.created_at).substring(0, 10) : '—')}</td>
            </tr>
          `).join('')}
        </tbody>
      </table>
    `;
  } catch (e) {
    container.innerHTML = `<div class="empty">שגיאה בטעינה: ${esc(e.message)}</div>`;
  }
 }
 </script>
 </body>
 </html>