# legal-ai/mcp-server/src/legal_mcp/services/analysis_docx_exporter.py

"""Export the legal analysis (analysis-and-research.md + precedents) to a
DOCX file that uses דפנה's decision template styles.

The template lives at `skills/docx/decision_template.docx` (converted once
from `טיוטת החלטה.dotx` via `scripts/convert_decision_template.py`).
We open it, wipe the sample body paragraphs, and write new content by
applying style names only — never by hand-setting font/size/RTL/margins,
because the template's styles.xml already carries those.

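Style mapping:
    "Heading 1"        → the document title (includes the case number)
    "Normal"           → the date line under the title
    "Heading 2"        → top-level section headers
                         (טענות סף / סוגיות להכרעה / מסקנות), background
                         section headers, and labels of multi-line fields
    "Normal" + bold    → subsection headers (individual claim/issue)
    "Normal"           → single-line field: label (bold run) + value
    "List Paragraph"   → numbered lines and (א)/(ב) enumerated lines
    "Quote"            → precedent quote text
    "Normal" (italic)  → precedent citation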

Output: data/cases/{case_number}/exports/ניתוח-משפטי-v{N}.docx
"""

from __future__ import annotations

import re
from pathlib import Path
from typing import Any
from uuid import UUID

from docx import Document
from docx.document import Document as DocumentT
from docx.oxml.ns import qn
from docx.oxml import OxmlElement
from docx.text.paragraph import Paragraph
from docx.text.run import Run

from legal_mcp import config
from legal_mcp.services import db, research_md


def _mark_run_rtl(run: Run) -> None:
    """Ensure the run's properties contain a `<w:rtl/>` element.

    The flag is added at most once: if the run's `rPr` already holds a
    `<w:rtl/>` child, nothing changes. The stated purpose is to make Word
    pick the complex-script (`cs`) font of the paragraph style for Hebrew
    text added programmatically, instead of the `ascii` font slot.
    """
    run_props = run._r.get_or_add_rPr()
    already_flagged = run_props.find(qn("w:rtl")) is not None
    if already_flagged:
        return
    run_props.append(OxmlElement("w:rtl"))


def _mark_paragraph_rtl(paragraph: Paragraph) -> None:
    """Ensure the paragraph-mark run properties contain `<w:rtl/>`.

    Looks for an `<w:rPr>` child inside the paragraph's `<w:pPr>`, creating
    and appending one when it is missing, and then adds a `<w:rtl/>` to it
    unless one is already there. This flags the paragraph mark itself
    (relevant for empty paragraphs and trailing marks) as right-to-left.
    """
    para_props = paragraph._p.get_or_add_pPr()

    mark_props = para_props.find(qn("w:rPr"))
    if mark_props is None:
        mark_props = OxmlElement("w:rPr")
        para_props.append(mark_props)

    if mark_props.find(qn("w:rtl")) is not None:
        return
    mark_props.append(OxmlElement("w:rtl"))

# Path to the converted decision template. The file is static and is
# produced by scripts/convert_decision_template.py. `parents[4]` is the
# directory four levels above this module's own directory; the template is
# expected in that directory's `skills/docx/` subfolder.
TEMPLATE_PATH = (
    Path(__file__).resolve().parents[4]
    / "skills"
    / "docx"
    / "decision_template.docx"
)

# Label of the chair-position field written for every subsection, and the
# placeholder value rendered (in italics) when that field has no content.
CHAIR_POSITION_LABEL = "עמדת ועדת הערר"
CHAIR_POSITION_PLACEHOLDER = "[טרם מולאה עמדת ועדת הערר]"

# "1. text" or "1) text" — group 1 is the number, group 2 the text.
NUMBERED_LINE_RE = re.compile(r"^\s*(\d+)[.)]\s+(.+)$")
# "- text", "• text", "* text", "● text", "◦ text" — group 1 is the text.
BULLET_LINE_RE = re.compile(r"^\s*[\-\u2022\*\u25CF\u25E6]\s+(.+)$")
# (א) (ב) (ג) ... — Hebrew-letter enumeration typed by the authors.
# The marker stays in the text (the author wrote it); the paragraph is
# rendered as "List Paragraph" without automatic numbering, so the
# indentation follows the template's list style and no extra "1." prefix
# is shown next to the author's own marker.
HEB_LETTER_LINE_RE = re.compile(r"^\s*\([א-ת]\)\s+")

# A line consisting solely of **LABEL:** (the whole trimmed line is wrapped
# in ** **).
STANDALONE_LABEL_RE = re.compile(r"^\s*\*\*([^\n*]+?):\*\*\s*$")
# A short bare "XYZ:" line (no ** **) that serves as a sub-heading for the
# paragraphs after it. Capped at 40 characters so ordinary sentences that
# merely end with a colon are not swallowed.
PLAIN_LABEL_RE = re.compile(r"^\s*([^\n:]{2,40}):\s*$")
# "**LABEL:** value" — a bold label followed by prose on the same line.
INLINE_LABEL_RE = re.compile(r"^\s*\*\*([^\n*]+?):\*\*\s+(.+)$")


def _classify_line(line: str) -> tuple[str, str]:
    """Classify one content line and return `(kind, clean_text)`.

    `kind` is one of: numbered, bullet, heb_letter, label_heading,
    inline_label, plain. Checks run in a fixed order: starred labels
    first, then numbered lines, bullets, Hebrew-letter enumeration,
    bare "XYZ:" labels, and finally plain text.

    What `clean_text` holds per kind:
      - numbered / bullet — the text with the list marker removed
      - heb_letter — the stripped line, marker included
      - label_heading — the label without the ** wrapper or trailing colon
      - inline_label — "LABEL\x00VALUE" (NUL-joined; the consumer splits it)
      - plain — the stripped line
    """

    def _starred_label(candidate: str):
        """Classify `**LABEL:**` / `**LABEL:** value` forms; None otherwise."""
        standalone = STANDALONE_LABEL_RE.match(candidate)
        if standalone is not None:
            return "label_heading", standalone.group(1).strip()
        inline = INLINE_LABEL_RE.match(candidate)
        if inline is not None:
            joined = f"{inline.group(1).strip()}\x00{inline.group(2).strip()}"
            return "inline_label", joined
        return None

    starred = _starred_label(line)
    if starred is not None:
        return starred

    numbered = NUMBERED_LINE_RE.match(line)
    if numbered is not None:
        return "numbered", numbered.group(2).strip()

    bulleted = BULLET_LINE_RE.match(line)
    if bulleted is not None:
        body = bulleted.group(1).strip()
        # A bullet that only wraps a starred label ("- **LABEL:**" or
        # "- **LABEL:** value") is treated as that label, not a list item.
        nested = _starred_label(body)
        if nested is not None:
            return nested
        return "bullet", body

    if HEB_LETTER_LINE_RE.match(line) is not None:
        return "heb_letter", line.strip()

    bare_label = PLAIN_LABEL_RE.match(line)
    if bare_label is not None:
        return "label_heading", bare_label.group(1).strip()

    return "plain", line.strip()


def _strip_numpr(paragraph: Paragraph) -> None:
    """Suppress automatic numbering on `paragraph`.

    Used when we want the visual styling of `List Paragraph` without
    Word's auto-decimal "1." prefix — e.g. for Hebrew-letter enumeration
    where the author wrote (א) (ב) (ג) manually.

    Removing the paragraph's own `<w:numPr>` is not enough: a paragraph
    freshly created with only a style has no direct `<w:numPr>`, so any
    numbering it shows is inherited from the style. After dropping direct
    `<w:numPr>` children we therefore add an explicit override with
    `numId` 0, which WordprocessingML defines as "no numbering".

    NOTE(review): indentation that comes from the numbering level (rather
    than from the style's own `<w:ind>`) is dropped together with the
    number — confirm the rendered indent against the template.
    """
    pPr = paragraph._p.get_or_add_pPr()
    for numPr in pPr.findall(qn("w:numPr")):
        pPr.remove(numPr)

    # get_or_add_numPr() places the new element at its schema position
    # inside pPr (after pStyle, before any rPr).
    override = pPr.get_or_add_numPr()
    num_id = OxmlElement("w:numId")
    num_id.set(qn("w:val"), "0")
    override.append(num_id)


# En dash (U+2013) and em dash (U+2014): characters the exported document
# should not contain (user instruction: "no dashes").
# NOTE(review): within this file `_no_dash` is applied to labels/titles and
# also to every string routed through `_add_runs_with_inline_bold`, which
# includes content parsed from the analysis file — confirm that stripping
# dashes from author content is intended.
_CODE_DASH_RE = re.compile(r"[\u2013\u2014]")

# Markdown inline bold — `**...**`; group 1 is the text between the markers.
_INLINE_BOLD_RE = re.compile(r"\*\*([^\n*]+?)\*\*")


def _no_dash(text: str) -> str:
    """Return `text` with every en dash and em dash removed.

    The dashes are deleted outright (nothing is substituted for them);
    the ASCII hyphen-minus is left untouched.
    """
    dash_free_pieces = _CODE_DASH_RE.split(text)
    return "".join(dash_free_pieces)


def _add_runs_with_inline_bold(paragraph: Paragraph, text: str) -> None:
    """Append `text` to `paragraph`, turning `**...**` spans into bold runs.

    The text is first passed through `_no_dash`, then split on the inline
    bold markers: text outside markers becomes plain runs, text inside
    becomes bold runs (markers removed). Every run is flagged RTL via
    `_mark_run_rtl`. Empty text adds no runs.

    This keeps `**טענה חשובה**` rendering as bold (as the author intended)
    instead of leaving the literal asterisks in the output.
    """
    text = _no_dash(text)
    pos = 0
    for m in _INLINE_BOLD_RE.finditer(text):
        if m.start() > pos:
            plain = paragraph.add_run(text[pos : m.start()])
            _mark_run_rtl(plain)
        bold = paragraph.add_run(m.group(1))
        bold.bold = True
        # Runs flagged <w:rtl/> are formatted with the complex-script
        # toggles, so <w:b/> alone does not embolden Hebrew text; set
        # <w:bCs/> as well.
        bold.font.cs_bold = True
        _mark_run_rtl(bold)
        pos = m.end()
    if pos < len(text):
        tail = paragraph.add_run(text[pos:])
        _mark_run_rtl(tail)


def _clear_body(doc: DocumentT) -> None:
    """Delete every `<w:p>` that is a direct child of the document body.

    The template ships with sample paragraphs ("רקע", "דיון והכרעה"…)
    that must not appear in the export. Only paragraph elements are
    removed; the body's other children, including the trailing `sectPr`
    that carries page size, margins, bidi and footer settings, are left
    untouched.
    """
    body = doc.element.body
    # Snapshot first so removal does not disturb the iteration.
    stale_paragraphs = tuple(body.findall(qn("w:p")))
    for paragraph_el in stale_paragraphs:
        body.remove(paragraph_el)


def _add_paragraph(doc: DocumentT, text: str, style: str) -> Paragraph:
    """Append a paragraph with the given template style and return it.

    The paragraph mark is flagged RTL. Non-empty `text` is added through
    `_add_runs_with_inline_bold`; empty text yields a paragraph without
    runs.
    """
    new_para = doc.add_paragraph(style=style)
    _mark_paragraph_rtl(new_para)
    if not text:
        return new_para
    _add_runs_with_inline_bold(new_para, text)
    return new_para


def _add_label_value(
    doc: DocumentT, label: str, value: str, *, value_italic: bool = False
) -> Paragraph:
    """Add a "Normal" paragraph with a bold label and an inline value.

    Example rendering:   **עמדת המבקשת:** The party argues that…

    The label run is written as "<label>: " with en/em dashes removed.
    An empty `value` produces the label alone. With `value_italic=True`
    the value is written as one italic run (used for placeholder text,
    no inline-bold parsing); otherwise it goes through
    `_add_runs_with_inline_bold`.

    Returns the paragraph that was added.
    """
    p = doc.add_paragraph(style="Normal")
    _mark_paragraph_rtl(p)

    run_label = p.add_run(f"{_no_dash(label)}: ")
    run_label.bold = True
    # Runs flagged <w:rtl/> take bold/italic from the complex-script
    # toggles (<w:bCs/>, <w:iCs/>), so set those alongside <w:b/>/<w:i/>.
    run_label.font.cs_bold = True
    _mark_run_rtl(run_label)

    if not value:
        return p

    if value_italic:
        # Placeholder text — italic, no inline-bold handling.
        run_value = p.add_run(_no_dash(value))
        run_value.italic = True
        run_value.font.cs_italic = True
        _mark_run_rtl(run_value)
    else:
        _add_runs_with_inline_bold(p, value)
    return p


def _add_multiline_value(
    doc: DocumentT, label: str, value: str
) -> None:
    """Render a field (label + value).

    Blank lines in `value` are ignored. Then:

    - no remaining lines — the label alone, via `_add_label_value`;
    - exactly one line — bold label and the line's cleaned text inline
      (a Heading 2 for a one-liner would look inflated);
    - several lines — the label as its own "Heading 2" paragraph, then
      each line routed through `_emit_content_line`.
    """
    lines = [ln for ln in (value or "").splitlines() if ln.strip()]
    if not lines:
        _add_label_value(doc, label, "")
        return

    if len(lines) == 1:
        kind, text = _classify_line(lines[0])
        if kind == "inline_label":
            # `_classify_line` packs this kind as "LABEL\x00VALUE". A NUL
            # must never reach a run (it is not legal in XML), so rebuild
            # the "**LABEL:** value" form and let the inline-bold handling
            # render the nested label in bold.
            inner_label, inner_value = text.split("\x00", 1)
            text = f"**{inner_label}:** {inner_value}"
        # Single-line — inline with label regardless of kind.
        _add_label_value(doc, label, text)
        return

    # Multi-line: label as Heading 2, then each line via _emit_content_line.
    _add_paragraph(doc, label, "Heading 2")
    for line in lines:
        _emit_content_line(doc, line)


def _emit_content_line(doc: DocumentT, line: str) -> None:
    """Write one content line as a paragraph in the matching template style.

    The line is classified with `_classify_line` and rendered as follows:

    - `label_heading` ("**נקודות פתוחות:**" alone) → "Heading 2"
    - `inline_label`  ("**LABEL:** value")         → bold label + value
                                                     via `_add_label_value`
    - `numbered`      ("1. ...")                   → "List Paragraph"
    - `heb_letter`    ("(א) ...")                  → "List Paragraph" passed
                                                     through `_strip_numpr`
                                                     (the author supplied
                                                     the marker)
    - `bullet`        ("- ...")                    → "Normal", marker
                                                     stripped
    - `plain`                                      → "Normal"
    """
    kind, text = _classify_line(line)

    if kind == "label_heading":
        _add_paragraph(doc, text, "Heading 2")
    elif kind == "inline_label":
        # The classifier joins label and value with a NUL separator.
        label, _, value = text.partition("\x00")
        _add_label_value(doc, label, value)
    else:
        is_list_item = kind in ("numbered", "heb_letter")
        style_name = "List Paragraph" if is_list_item else "Normal"
        para = doc.add_paragraph(style=style_name)
        if kind == "heb_letter":
            _strip_numpr(para)
        _mark_paragraph_rtl(para)
        _add_runs_with_inline_bold(para, text)


def _format_subsection_title(item: dict[str, Any], kind_label: str) -> str:
    """Build the header text of a claim/issue subsection.

    Returns:
      - "{kind_label} {number}: {title}" when both number and title are
        present, e.g. 'טענת סף 1: חוסר סמכות';
      - the title alone when there is no number;
      - "{kind_label} {number}" (trimmed) when there is no title.

    A missing, `None` or whitespace-only title counts as "no title"; a
    missing or falsy number counts as "no number".
    """
    number = item.get("number") or ""
    # `or ""` also covers a "title" key that is present but set to None.
    title = (item.get("title") or "").strip()
    if number and title:
        return f"{kind_label} {number}: {title}"
    if title:
        return title
    return f"{kind_label} {number}".strip()


def _write_subsection(
    doc: DocumentT,
    item: dict[str, Any],
    precedents_for_item: list[dict[str, Any]],
    kind_label: str,
) -> None:
    """Render one claim/issue subsection into `doc`.

    Written in order:
      1. the subsection header (bold "Normal" paragraph built by
         `_format_subsection_title`);
      2. every entry of `item["fields"]` that has a non-empty label, via
         `_add_multiline_value`;
      3. the chair position — always present; an italic placeholder is
         used when `item["chair_position"]` is empty;
      4. if `precedents_for_item` is non-empty, a bold "פסיקה רלוונטית:"
         line followed by each precedent's quote ("Quote" style) and
         citation (italic "Normal").

    Missing or `None` values for `fields`, `label`, `content`, `quote`
    and `citation` are treated as empty.
    """
    # Subsection header — bolded Normal paragraph, not a Heading,
    # so it visually sits under the section's Heading 2.
    header_text = _format_subsection_title(item, kind_label)
    p = doc.add_paragraph(style="Normal")
    _mark_paragraph_rtl(p)
    run = p.add_run(_no_dash(header_text))
    run.bold = True
    # Runs flagged <w:rtl/> take bold/italic from the complex-script
    # toggles (<w:bCs/>, <w:iCs/>), so set those alongside <w:b/>/<w:i/>.
    run.font.cs_bold = True
    _mark_run_rtl(run)

    # Regular fields (party positions, legal questions, etc.)
    for field in item.get("fields") or []:
        label = (field.get("label") or "").strip()
        content = (field.get("content") or "").strip()
        if not label:
            continue
        _add_multiline_value(doc, label, content)

    # Chair position — special handling: always render, use placeholder if empty.
    chair_position = (item.get("chair_position") or "").strip()
    if chair_position:
        _add_multiline_value(doc, CHAIR_POSITION_LABEL, chair_position)
    else:
        _add_label_value(
            doc, CHAIR_POSITION_LABEL, CHAIR_POSITION_PLACEHOLDER,
            value_italic=True,
        )

    # Precedents attached to this subsection
    if not precedents_for_item:
        return

    p = doc.add_paragraph(style="Normal")
    _mark_paragraph_rtl(p)
    run = p.add_run("פסיקה רלוונטית:")
    run.bold = True
    run.font.cs_bold = True
    _mark_run_rtl(run)

    for prec in precedents_for_item:
        quote = (prec.get("quote") or "").strip()
        citation = (prec.get("citation") or "").strip()
        if quote:
            _add_paragraph(doc, quote, "Quote")
        if citation:
            cite_p = doc.add_paragraph(style="Normal")
            _mark_paragraph_rtl(cite_p)
            cite_run = cite_p.add_run(_no_dash(citation))
            cite_run.italic = True
            cite_run.font.cs_italic = True
            _mark_run_rtl(cite_run)


def _add_background_section(
    doc: DocumentT, title: str, body: str | None
) -> None:
    """Write a background section: a "Heading 2" title plus its prose body.

    Nothing is written when `body` is missing, empty or whitespace-only.
    Otherwise each non-blank line of `body` goes through
    `_emit_content_line`, so bullets, `**labels:**` and (א) enumerations
    pick up the corresponding template styles.
    """
    has_content = bool(body) and bool(body.strip())
    if not has_content:
        return

    _add_paragraph(doc, title, "Heading 2")
    content_lines = [ln for ln in body.splitlines() if ln.strip()]
    for content_line in content_lines:
        _emit_content_line(doc, content_line)


def _group_precedents(
    precedents: list[dict[str, Any]],
) -> tuple[list[dict], dict[str, list[dict]]]:
    """Partition precedents by whether they are attached to a section.

    A precedent whose "section_id" is missing or `None` is case-level;
    every other precedent is filed under its "section_id". Input order is
    preserved within each group.

    Returns `(case_level_precedents, {section_id: [precedents]})`.
    """
    unattached: list[dict] = []
    attached: dict[str, list[dict]] = {}
    for precedent in precedents:
        section_key = precedent.get("section_id")
        if section_key is None:
            bucket = unattached
        else:
            bucket = attached.setdefault(section_key, [])
        bucket.append(precedent)
    return unattached, attached


def _next_version(export_dir: Path) -> int:
    """Return the version number to use for the next ניתוח-משפטי-v{N}.docx.

    Scans `export_dir` for files matching that name pattern and returns
    one more than the highest version found, never less than 1. Files
    whose version part is not an integer are ignored.
    """
    upcoming = 1
    for candidate in export_dir.glob("ניתוח-משפטי-v*.docx"):
        pieces = candidate.stem.split("-v")
        if len(pieces) < 2:
            continue
        try:
            found = int(pieces[1])
        except ValueError:
            continue
        if found + 1 > upcoming:
            upcoming = found + 1
    return upcoming


async def build_analysis_docx(case_number: str) -> Path:
    """Build a DOCX of the legal analysis for a case using the template
    styles, and save a versioned copy under the case's exports folder.

    Steps: parse the case's `analysis-and-research.md`, fetch the case's
    precedents (skipped when the case is not found in the DB), open the
    template, clear its sample paragraphs, then write — in order — the
    title and date, the background sections, case-level precedents,
    threshold claims, issues and conclusions. Sections with no content
    are omitted.

    Args:
        case_number: The case number used to locate the case directory
            and the DB record; it also appears in the document title.

    Returns:
        Path of the saved file, `<case_dir>/exports/ניתוח-משפטי-v{N}.docx`,
        where N comes from `_next_version`.

    Raises:
        FileNotFoundError: If the template or the analysis file is missing.
    """
    if not TEMPLATE_PATH.exists():
        raise FileNotFoundError(
            f"Template not found at {TEMPLATE_PATH}. "
            "Run: python scripts/convert_decision_template.py"
        )

    case_dir = config.find_case_dir(case_number)
    analysis_path = case_dir / "documents" / "research" / "analysis-and-research.md"
    if not analysis_path.exists():
        raise FileNotFoundError(
            f"Analysis file not found for case {case_number}"
        )

    parsed = research_md.parse(analysis_path)

    # Resolve case_id so we can fetch precedents. Missing case → proceed
    # without precedents rather than failing the export.
    case_level_precedents: list[dict] = []
    precedents_by_section: dict[str, list[dict]] = {}
    case = await db.get_case_by_number(case_number)
    if case:
        # str() first: UUID() only accepts a string, so this works whether
        # the DB layer returns the id as text or as a UUID object.
        case_id = UUID(str(case["id"]))
        precedents = await db.list_case_precedents(case_id)
        case_level_precedents, precedents_by_section = _group_precedents(precedents)

    doc = Document(str(TEMPLATE_PATH))
    _clear_body(doc)

    # Document title
    header = parsed.get("header") or {}
    date = str(header.get("date") or "").strip()
    title_text = f"ניתוח משפטי וכתיבת עמדה בערר {case_number}"
    _add_paragraph(doc, title_text, "Heading 1")
    if date:
        p_date = doc.add_paragraph(style="Normal")
        _mark_paragraph_rtl(p_date)
        run_date = p_date.add_run(f"תאריך: {date}")
        _mark_run_rtl(run_date)

    # Background sections — printed first so the reader gets context
    # before any claims/precedents. These come only in the exported DOCX,
    # not in the web UI (the UI renders them elsewhere).
    _add_background_section(doc, "רקע לניתוח", parsed.get("represented_party"))
    _add_background_section(doc, "רקע דיוני", parsed.get("procedural_background"))
    _add_background_section(doc, "עובדות מוסכמות", parsed.get("agreed_facts"))
    _add_background_section(
        doc, "עובדות שנויות במחלוקת", parsed.get("disputed_facts")
    )

    # Case-level precedents appear at the top (they cut across claims/issues)
    if case_level_precedents:
        _add_paragraph(doc, "פסיקה כללית", "Heading 2")
        for prec in case_level_precedents:
            quote = (prec.get("quote") or "").strip()
            citation = (prec.get("citation") or "").strip()
            if quote:
                _add_paragraph(doc, quote, "Quote")
            if citation:
                cp = doc.add_paragraph(style="Normal")
                _mark_paragraph_rtl(cp)
                cr = cp.add_run(_no_dash(citation))
                cr.italic = True
                # Runs flagged <w:rtl/> take italics from the
                # complex-script toggle <w:iCs/>, so set it as well.
                cr.font.cs_italic = True
                _mark_run_rtl(cr)

    # Threshold claims
    threshold_claims = parsed.get("threshold_claims") or []
    if threshold_claims:
        _add_paragraph(doc, "טענות סף", "Heading 2")
        for tc in threshold_claims:
            _write_subsection(
                doc, tc, precedents_by_section.get(tc.get("id"), []), "טענת סף"
            )

    # Issues
    issues = parsed.get("issues") or []
    if issues:
        _add_paragraph(doc, "סוגיות להכרעה", "Heading 2")
        for iss in issues:
            _write_subsection(
                doc, iss, precedents_by_section.get(iss.get("id"), []), "סוגיה"
            )

    # Conclusions
    conclusions = (parsed.get("conclusions") or "").strip()
    if conclusions:
        _add_paragraph(doc, "מסקנות", "Heading 2")
        for raw in conclusions.splitlines():
            if not raw.strip():
                continue
            _emit_content_line(doc, raw)

    # Save versioned
    export_dir = case_dir / "exports"
    export_dir.mkdir(parents=True, exist_ok=True)
    version = _next_version(export_dir)
    out_path = export_dir / f"ניתוח-משפטי-v{version}.docx"
    doc.save(str(out_path))
    return out_path