Pre-existing agent updates + analysis DOCX export

Updates accumulated from prior sessions:
- HEARTBEAT: company-based filtering (CMP/CMPA) rules
- legal-qa, legal-researcher: routine updates
- analysis_docx_exporter: new service for analysis DOCX export
- compose page: "הורד כ-DOCX" button for analysis
- decision_template.docx: template for exporter

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-16 18:49:10 +00:00
parent 3da4d73498
commit 28daff58be
7 changed files with 665 additions and 3 deletions
--- a/mcp-server/src/legal_mcp/services/analysis_docx_exporter.py
+++ b/mcp-server/src/legal_mcp/services/analysis_docx_exporter.py
@@ -0,0 +1,503 @@
+"""Export the legal analysis (analysis-and-research.md + precedents) to a
+DOCX file that uses דפנה's decision template styles.
+
+The template lives at `skills/docx/decision_template.docx` (converted once
+from `טיוטת החלטה.dotx` via `scripts/convert_decision_template.py`).
+We open it, wipe the sample body paragraphs, and write new content by
+applying style names only — never by hand-setting font/size/RTL/margins,
+because the template's styles.xml already carries those.
+
+Style mapping:
+    "Title"            → the document title (case number, date)
+    "Heading 2"        → top-level section headers
+                         (טענות סף / סוגיות להכרעה / מסקנות)
+    "Normal" + bold    → subsection headers (individual claim/issue)
+    "Normal"           → field label (bold run) + value
+    "Quote"            → precedent quote text
+    "Normal" (italic)  → precedent citation
+
+Output: data/cases/{case_number}/exports/ניתוח-משפטי-v{N}.docx
+"""
+
+from __future__ import annotations
+
+import re
+from pathlib import Path
+from typing import Any
+from uuid import UUID
+
+from docx import Document
+from docx.document import Document as DocumentT
+from docx.oxml.ns import qn
+from docx.oxml import OxmlElement
+from docx.text.paragraph import Paragraph
+from docx.text.run import Run
+
+from legal_mcp import config
+from legal_mcp.services import db, research_md
+
+
+def _mark_run_rtl(run: Run) -> None:
+    """Ensure the run's properties (`rPr`) contain a `<w:rtl/>` element.
+
+    Per the module author's notes, this marks the run as complex-script
+    (Hebrew/Arabic) so Word picks the `cs` font slot defined by the style
+    instead of the `ascii` one; runs added programmatically otherwise
+    render Hebrew in the ascii font.
+
+    The call is idempotent: an existing `<w:rtl/>` is left untouched.
+    """
+    run_props = run._r.get_or_add_rPr()
+    already_flagged = run_props.find(qn("w:rtl")) is not None
+    if already_flagged:
+        return
+    run_props.append(OxmlElement("w:rtl"))
+
+
+def _mark_paragraph_rtl(paragraph: Paragraph) -> None:
+    """Ensure the paragraph-mark run properties carry a `<w:rtl/>` flag.
+
+    Looks up (or creates) the `<w:rPr>` child of the paragraph's `pPr`
+    and appends `<w:rtl/>` to it when missing. According to the module
+    author, the paragraph style already sets the bidi direction, but
+    empty paragraphs and trailing paragraph marks still need this flag.
+    """
+    para_props = paragraph._p.get_or_add_pPr()
+    mark_props = para_props.find(qn("w:rPr"))
+    if mark_props is None:
+        # No run properties for the paragraph mark yet: create the holder.
+        mark_props = OxmlElement("w:rPr")
+        para_props.append(mark_props)
+    if mark_props.find(qn("w:rtl")) is not None:
+        return
+    mark_props.append(OxmlElement("w:rtl"))
+
+# Path to the converted template. Static — populated by
+# scripts/convert_decision_template.py.
+# `parents[4]` climbs from this file's directory (services/) four more
+# levels; for mcp-server/src/legal_mcp/services/<this file> that is the
+# directory that contains mcp-server/.
+TEMPLATE_PATH = (
+    Path(__file__).resolve().parents[4]
+    / "skills"
+    / "docx"
+    / "decision_template.docx"
+)
+
+# Label of the chair-position field, and the placeholder text rendered
+# (in italics, by _write_subsection) when a subsection has no chair position.
+CHAIR_POSITION_LABEL = "עמדת ועדת הערר"
+CHAIR_POSITION_PLACEHOLDER = "[טרם מולאה עמדת ועדת הערר]"
+
+# "1. text" or "1) text": group 1 is the number, group 2 the text.
+NUMBERED_LINE_RE = re.compile(r"^\s*(\d+)[.)]\s+(.+)$")
+# "- text", "* text" or a bullet glyph (U+2022, U+25CF, U+25E6) followed by
+# whitespace: group 1 is the text after the marker.
+BULLET_LINE_RE = re.compile(r"^\s*[\-\u2022\*\u25CF\u25E6]\s+(.+)$")
+# (א) (ב) (ג) ... — Hebrew-letter enumeration used by the authors.
+# We keep the marker inside the text (the author wrote it), but render the
+# paragraph as "List Paragraph" without the numPr so the visual indentation
+# matches the template's list style without adding a double "1." prefix.
+# Only the marker prefix is matched; there are no capture groups.
+HEB_LETTER_LINE_RE = re.compile(r"^\s*\([א-ת]\)\s+")
+
+# A standalone **LABEL:** line (the whole trimmed line is wrapped in ** **).
+# Group 1 is the label without the asterisks and without the trailing colon.
+STANDALONE_LABEL_RE = re.compile(r"^\s*\*\*([^\n*]+?):\*\*\s*$")
+# A short standalone "XYZ:" line (no ** **) — acts as a sub-heading for the
+# paragraphs that follow. Limit to short phrases (2-40 characters, no inner
+# colon) to avoid eating real sentences that happen to end with a colon.
+PLAIN_LABEL_RE = re.compile(r"^\s*([^\n:]{2,40}):\s*$")
+# "**LABEL:** value" inline — bold label followed by prose on the same line.
+# Group 1 is the label, group 2 the value.
+INLINE_LABEL_RE = re.compile(r"^\s*\*\*([^\n*]+?):\*\*\s+(.+)$")
+
+
+def _classify_line(line: str) -> tuple[str, str]:
+    """Classify one content line and return `(kind, clean_text)`.
+
+    `kind` is one of: numbered, bullet, heb_letter, label_heading,
+    inline_label, plain. The checks run in a fixed priority order:
+    standalone label, inline label, numbered, bullet, Hebrew-letter
+    marker, plain "XYZ:" label, and finally plain text.
+
+    `clean_text` per kind:
+      - numbered / bullet — the list marker is removed
+      - heb_letter — the whole stripped line, marker included
+      - label_heading — label only (no `**`, no trailing colon)
+      - inline_label — "LABEL\x00VALUE"; the consumer splits on the NUL
+      - plain — the stripped line
+
+    A bullet whose content is itself a `**LABEL:**` or `**LABEL:** value`
+    form is reported as label_heading / inline_label, not as a bullet.
+    """
+
+    def label_form(candidate: str):
+        # Shared by the raw line and by bullet content: recognise the two
+        # bold-label shapes, or return None when neither applies.
+        standalone = STANDALONE_LABEL_RE.match(candidate)
+        if standalone:
+            return "label_heading", standalone.group(1).strip()
+        inline = INLINE_LABEL_RE.match(candidate)
+        if inline:
+            packed = "\x00".join(
+                (inline.group(1).strip(), inline.group(2).strip())
+            )
+            return "inline_label", packed
+        return None
+
+    labelled = label_form(line)
+    if labelled is not None:
+        return labelled
+
+    numbered = NUMBERED_LINE_RE.match(line)
+    if numbered:
+        return "numbered", numbered.group(2).strip()
+
+    bullet = BULLET_LINE_RE.match(line)
+    if bullet:
+        item = bullet.group(1).strip()
+        return label_form(item) or ("bullet", item)
+
+    if HEB_LETTER_LINE_RE.match(line):
+        return "heb_letter", line.strip()
+
+    plain_label = PLAIN_LABEL_RE.match(line)
+    if plain_label:
+        return "label_heading", plain_label.group(1).strip()
+
+    return "plain", line.strip()
+
+
+def _strip_numpr(paragraph: Paragraph) -> None:
+    """Delete every direct `<w:numPr>` child of the paragraph's `pPr`.
+
+    The intent is to keep the look of `List Paragraph` (indent, font)
+    while avoiding Word's automatic decimal prefix, for lines where the
+    author typed the enumeration marker by hand.
+
+    NOTE(review): only a `numPr` set directly on the paragraph is removed.
+    If the template's `List Paragraph` style itself defines numbering,
+    this call does not override it — verify against the template.
+    """
+    para_props = paragraph._p.get_or_add_pPr()
+    stale_nodes = para_props.findall(qn("w:numPr"))
+    for node in stale_nodes:
+        para_props.remove(node)
+
+
+# Characters that the code should never emit (user instruction: "no dashes").
+# Matches the en dash (U+2013) and the em dash (U+2014) only; the ASCII
+# hyphen-minus is not affected.
+# Stated scope: applied only to code-generated text, not to user content
+# from the md file.
+# NOTE(review): _add_runs_with_inline_bold passes every text it receives
+# through _no_dash, and _emit_content_line feeds it lines that come from the
+# md file, so user content is currently stripped as well. Because the dash is
+# deleted rather than replaced, a range such as "5–7" becomes "57" — confirm
+# which behavior is wanted.
+_CODE_DASH_RE = re.compile(r"[\u2013\u2014]")
+
+# Markdown inline bold — `**...**`. The captured inner text is matched
+# non-greedily and may not contain `*` or a newline.
+_INLINE_BOLD_RE = re.compile(r"\*\*([^\n*]+?)\*\*")
+
+
+def _no_dash(text: str) -> str:
+    """Return `text` with every character matched by `_CODE_DASH_RE`
+    (en dash and em dash) deleted; nothing is inserted in their place.
+    """
+    without_dashes = _CODE_DASH_RE.sub("", text)
+    return without_dashes
+
+
+def _add_runs_with_inline_bold(paragraph: Paragraph, text: str) -> None:
+    """Append `text` to `paragraph` as runs, honouring `**bold**` markers.
+
+    The text is first passed through `_no_dash`, then cut on the inline
+    bold pattern. Text outside the markers becomes plain runs, text inside
+    becomes bold runs (the asterisks themselves are dropped), and every
+    run is marked RTL. Empty plain segments produce no run.
+    """
+    # Splitting on a pattern with one capture group yields alternating
+    # pieces: even indexes are plain text, odd indexes are the bold content.
+    segments = _INLINE_BOLD_RE.split(_no_dash(text))
+    for index, segment in enumerate(segments):
+        is_bold = index % 2 == 1
+        if not is_bold and not segment:
+            continue
+        run = paragraph.add_run(segment)
+        if is_bold:
+            run.bold = True
+        _mark_run_rtl(run)
+
+
+def _clear_body(doc: DocumentT) -> None:
+    """Drop the template's sample paragraphs from the document body.
+
+    Only `<w:p>` elements that are direct children of the body are
+    removed. Everything else stays, notably the section properties
+    (`sectPr`), which the module author notes carry the page size,
+    margins, RTL setting and footer.
+    """
+    body = doc.element.body
+    sample_paragraphs = body.findall(qn("w:p"))
+    for paragraph_el in sample_paragraphs:
+        body.remove(paragraph_el)
+
+
+def _add_paragraph(doc: DocumentT, text: str, style: str) -> Paragraph:
+    """Append a paragraph with the given template style and return it.
+
+    The paragraph mark is flagged RTL. Non-empty `text` is written through
+    `_add_runs_with_inline_bold`; an empty `text` yields a paragraph with
+    no runs.
+    """
+    paragraph = doc.add_paragraph(style=style)
+    _mark_paragraph_rtl(paragraph)
+    if not text:
+        return paragraph
+    _add_runs_with_inline_bold(paragraph, text)
+    return paragraph
+
+
+def _add_label_value(
+    doc: DocumentT, label: str, value: str, *, value_italic: bool = False
+) -> Paragraph:
+    """Append a "Normal" paragraph of the form `LABEL: value`.
+
+    The label (dashes stripped, followed by ": ") is a bold run. The value
+    follows on the same line:
+      - empty value: nothing is added after the label
+      - `value_italic=True`: one italic run, used for placeholder text;
+        `**` markers are not interpreted
+      - otherwise: runs with inline-bold handling
+
+    Returns the new paragraph.
+    """
+    paragraph = doc.add_paragraph(style="Normal")
+    _mark_paragraph_rtl(paragraph)
+
+    label_run = paragraph.add_run(_no_dash(label) + ": ")
+    label_run.bold = True
+    _mark_run_rtl(label_run)
+
+    if not value:
+        return paragraph
+    if not value_italic:
+        _add_runs_with_inline_bold(paragraph, value)
+        return paragraph
+
+    placeholder_run = paragraph.add_run(_no_dash(value))
+    placeholder_run.italic = True
+    _mark_run_rtl(placeholder_run)
+    return paragraph
+
+
+def _add_multiline_value(
+    doc: DocumentT, label: str, value: str
+) -> None:
+    """Render one field (label + value) into `doc`.
+
+    Blank lines in `value` are ignored. Depending on what remains:
+      - no lines: the bold label alone
+      - one line: label and text inline in a single paragraph (a
+        Heading 2 for a one-liner would look inflated)
+      - several lines: the label as its own "Heading 2" paragraph, then
+        each line rendered through `_emit_content_line`
+
+    Args:
+        doc: Document being built.
+        label: Field label.
+        value: Field body, possibly multi-line markdown-like text.
+    """
+    lines = [ln for ln in value.splitlines() if ln.strip()]
+    if not lines:
+        _add_label_value(doc, label, "")
+        return
+    if len(lines) == 1:
+        kind, text = _classify_line(lines[0])
+        if kind == "inline_label":
+            # For this kind _classify_line packs the result as
+            # "LABEL\x00VALUE". The NUL separator must never reach a run:
+            # it is not a legal XML character. Rebuild the author's
+            # "**LABEL:** value" form so the inner label still renders bold.
+            inner_label, inner_value = text.split("\x00", 1)
+            text = f"**{inner_label}:** {inner_value}"
+        # Single-line — inline with the field label for every kind.
+        _add_label_value(doc, label, text)
+        return
+    # Multi-line: label as Heading 2, then each line via _emit_content_line
+    _add_paragraph(doc, label, "Heading 2")
+    for line in lines:
+        _emit_content_line(doc, line)
+
+
+def _emit_content_line(doc: DocumentT, line: str) -> None:
+    """Write one content line to `doc` with the style matching its kind.
+
+    The line is classified by `_classify_line`, then:
+      - inline_label  → bold label + value in one "Normal" paragraph
+      - label_heading → "Heading 2"
+      - numbered      → "List Paragraph" (marker stripped)
+      - heb_letter    → "List Paragraph" with any direct numPr removed;
+                        the author's marker stays in the text
+      - bullet, plain → "Normal" (bullet marker stripped)
+    """
+    kind, text = _classify_line(line)
+
+    if kind == "inline_label":
+        label, value = text.split("\x00", 1)
+        _add_label_value(doc, label, value)
+    elif kind == "label_heading":
+        _add_paragraph(doc, text, "Heading 2")
+    else:
+        uses_list_style = kind in ("numbered", "heb_letter")
+        style_name = "List Paragraph" if uses_list_style else "Normal"
+        para = doc.add_paragraph(style=style_name)
+        if kind == "heb_letter":
+            _strip_numpr(para)
+        _mark_paragraph_rtl(para)
+        _add_runs_with_inline_bold(para, text)
+
+
+def _format_subsection_title(item: dict[str, Any], kind_label: str) -> str:
+    """Build the header text for a claim/issue subsection.
+
+    Args:
+        item: Parsed subsection; the optional "number" and "title" keys
+            are read. A missing or None value is treated as empty.
+        kind_label: Prefix naming the kind of subsection.
+
+    Returns:
+        "{kind_label} {number}: {title}" when both are present, the bare
+        title when only the title is present, otherwise
+        "{kind_label} {number}" with surrounding whitespace stripped.
+    """
+    number = item.get("number") or ""
+    # `or ""` instead of a .get default: a key that is present with the
+    # value None would otherwise raise AttributeError on .strip().
+    title = (item.get("title") or "").strip()
+    if not title:
+        return f"{kind_label} {number}".strip()
+    if number:
+        return f"{kind_label} {number}: {title}"
+    return title
+
+
+def _write_subsection(
+    doc: DocumentT,
+    item: dict[str, Any],
+    precedents_for_item: list[dict[str, Any]],
+    kind_label: str,
+) -> None:
+    """Write one claim/issue subsection to `doc`.
+
+    Output order:
+      1. a bold "Normal" header built by `_format_subsection_title`
+         (kept below Heading level so it sits under the section heading)
+      2. every entry of `item["fields"]` that has a non-empty label
+      3. the chair position, or an italic placeholder when it is empty
+      4. when `precedents_for_item` is non-empty, a bold caption followed
+         by each precedent's quote ("Quote" style) and italic citation
+    """
+    header_text = _format_subsection_title(item, kind_label)
+    header_par = doc.add_paragraph(style="Normal")
+    _mark_paragraph_rtl(header_par)
+    header_run = header_par.add_run(_no_dash(header_text))
+    header_run.bold = True
+    _mark_run_rtl(header_run)
+
+    # Regular fields; entries without a label are skipped.
+    for entry in item.get("fields", []):
+        field_label = entry.get("label", "").strip()
+        field_body = entry.get("content", "").strip()
+        if field_label:
+            _add_multiline_value(doc, field_label, field_body)
+
+    # The chair position is always rendered, with a placeholder if needed.
+    chair_text = (item.get("chair_position") or "").strip()
+    if not chair_text:
+        _add_label_value(
+            doc, CHAIR_POSITION_LABEL, CHAIR_POSITION_PLACEHOLDER,
+            value_italic=True,
+        )
+    else:
+        _add_multiline_value(doc, CHAIR_POSITION_LABEL, chair_text)
+
+    if not precedents_for_item:
+        return
+
+    caption_par = doc.add_paragraph(style="Normal")
+    _mark_paragraph_rtl(caption_par)
+    caption_run = caption_par.add_run("פסיקה רלוונטית:")
+    caption_run.bold = True
+    _mark_run_rtl(caption_run)
+
+    for precedent in precedents_for_item:
+        quote_text = (precedent.get("quote") or "").strip()
+        citation_text = (precedent.get("citation") or "").strip()
+        if quote_text:
+            _add_paragraph(doc, quote_text, "Quote")
+        if not citation_text:
+            continue
+        citation_par = doc.add_paragraph(style="Normal")
+        _mark_paragraph_rtl(citation_par)
+        citation_run = citation_par.add_run(_no_dash(citation_text))
+        citation_run.italic = True
+        _mark_run_rtl(citation_run)
+
+
+def _add_background_section(
+    doc: DocumentT, title: str, body: str | None
+) -> None:
+    """Write a background section: a "Heading 2" title followed by the
+    non-blank lines of `body`, each rendered through `_emit_content_line`
+    so bullets, bold labels and letter enumerations get template styles.
+
+    Nothing is written (not even the title) when `body` is None, empty,
+    or whitespace only.
+    """
+    if not (body and body.strip()):
+        return
+    _add_paragraph(doc, title, "Heading 2")
+    content_lines = (ln for ln in body.splitlines() if ln.strip())
+    for content_line in content_lines:
+        _emit_content_line(doc, content_line)
+
+
+def _group_precedents(
+    precedents: list[dict[str, Any]],
+) -> tuple[list[dict], dict[str, list[dict]]]:
+    """Partition a flat precedent list by its "section_id" value.
+
+    Returns a pair `(case_level, by_section)`:
+      - `case_level`: precedents whose "section_id" is missing or None
+      - `by_section`: maps each section id to its precedents
+
+    Input order is preserved within each group.
+    """
+    unattached: list[dict] = []
+    attached: dict[str, list[dict]] = {}
+    for precedent in precedents:
+        section_key = precedent.get("section_id")
+        if section_key is not None:
+            attached.setdefault(section_key, []).append(precedent)
+            continue
+        unattached.append(precedent)
+    return unattached, attached
+
+
+def _next_version(export_dir: Path) -> int:
+    """Return the version number to use for the next export file.
+
+    Scans `export_dir` for files matching ניתוח-משפטי-v*.docx, parses the
+    integer that follows the first "-v" in each file stem, and returns
+    the highest value plus one. Files whose version part is not an
+    integer are ignored. The result is never lower than 1.
+    """
+    candidates = [1]
+    for docx_path in export_dir.glob("ניתוח-משפטי-v*.docx"):
+        stem_parts = docx_path.stem.split("-v")
+        try:
+            candidates.append(int(stem_parts[1]) + 1)
+        except (IndexError, ValueError):
+            continue
+    return max(candidates)
+
+
+async def build_analysis_docx(case_number: str) -> Path:
+    """Build a DOCX of the legal analysis for a case using the template
+    styles, and save a versioned copy under the case's exports folder.
+
+    Document order: title ("Heading 1") and optional date line, the four
+    background sections, case-level precedents, threshold claims, issues,
+    conclusions. A section whose parsed value is empty is omitted.
+
+    Args:
+        case_number: Case identifier; used to locate the case directory,
+            to look the case up in the database, and in the title text.
+
+    Returns:
+        Path of the saved file, `<case_dir>/exports/ניתוח-משפטי-v{N}.docx`.
+
+    Raises:
+        FileNotFoundError: if no analysis file or template exists.
+    """
+    if not TEMPLATE_PATH.exists():
+        raise FileNotFoundError(
+            f"Template not found at {TEMPLATE_PATH}. "
+            "Run: python scripts/convert_decision_template.py"
+        )
+
+    case_dir = config.find_case_dir(case_number)
+    analysis_path = case_dir / "documents" / "research" / "analysis-and-research.md"
+    if not analysis_path.exists():
+        raise FileNotFoundError(
+            f"Analysis file not found for case {case_number}"
+        )
+
+    # NOTE(review): `parsed` is used below as a dict with the keys "header",
+    # the four background keys, "threshold_claims", "issues" and
+    # "conclusions" — confirm against research_md.parse.
+    parsed = research_md.parse(analysis_path)
+
+    # Resolve case_id so we can fetch precedents. Missing case → proceed
+    # without precedents rather than failing the export.
+    case_level_precedents: list[dict] = []
+    precedents_by_section: dict[str, list[dict]] = {}
+    case = await db.get_case_by_number(case_number)
+    if case:
+        # presumably case["id"] is a UUID string — verify against the db layer
+        precedents = await db.list_case_precedents(UUID(case["id"]))
+        case_level_precedents, precedents_by_section = _group_precedents(precedents)
+
+    # Start from the template so its styles and page setup are inherited,
+    # then drop the sample paragraphs it ships with.
+    doc = Document(str(TEMPLATE_PATH))
+    _clear_body(doc)
+
+    # Document title (written with the "Heading 1" style), followed by an
+    # optional plain date line.
+    header = parsed.get("header", {})
+    date = header.get("date", "").strip()
+    title_text = f"ניתוח משפטי וכתיבת עמדה בערר {case_number}"
+    _add_paragraph(doc, title_text, "Heading 1")
+    if date:
+        p_date = doc.add_paragraph(style="Normal")
+        _mark_paragraph_rtl(p_date)
+        run_date = p_date.add_run(f"תאריך: {date}")
+        _mark_run_rtl(run_date)
+
+    # Background sections — printed first so the reader gets context
+    # before any claims/precedents. These come only in the exported DOCX,
+    # not in the web UI (the UI renders them elsewhere).
+    # Each call is a no-op when the parsed value is missing or blank.
+    _add_background_section(doc, "רקע לניתוח", parsed.get("represented_party"))
+    _add_background_section(doc, "רקע דיוני", parsed.get("procedural_background"))
+    _add_background_section(doc, "עובדות מוסכמות", parsed.get("agreed_facts"))
+    _add_background_section(
+        doc, "עובדות שנויות במחלוקת", parsed.get("disputed_facts")
+    )
+
+    # Case-level precedents appear at the top (they cut across claims/issues)
+    if case_level_precedents:
+        _add_paragraph(doc, "פסיקה כללית", "Heading 2")
+        for prec in case_level_precedents:
+            quote = (prec.get("quote") or "").strip()
+            citation = (prec.get("citation") or "").strip()
+            if quote:
+                _add_paragraph(doc, quote, "Quote")
+            if citation:
+                # Citation: italic "Normal" paragraph, dashes stripped.
+                cp = doc.add_paragraph(style="Normal")
+                _mark_paragraph_rtl(cp)
+                cr = cp.add_run(_no_dash(citation))
+                cr.italic = True
+                _mark_run_rtl(cr)
+
+    # Threshold claims — each item's "id" selects its precedents.
+    threshold_claims = parsed.get("threshold_claims", [])
+    if threshold_claims:
+        _add_paragraph(doc, "טענות סף", "Heading 2")
+        for tc in threshold_claims:
+            _write_subsection(
+                doc, tc, precedents_by_section.get(tc["id"], []), "טענת סף"
+            )
+
+    # Issues — same layout as threshold claims, with a different prefix.
+    issues = parsed.get("issues", [])
+    if issues:
+        _add_paragraph(doc, "סוגיות להכרעה", "Heading 2")
+        for iss in issues:
+            _write_subsection(
+                doc, iss, precedents_by_section.get(iss["id"], []), "סוגיה"
+            )
+
+    # Conclusions — non-blank lines only, one paragraph per line.
+    conclusions = (parsed.get("conclusions") or "").strip()
+    if conclusions:
+        _add_paragraph(doc, "מסקנות", "Heading 2")
+        for raw in conclusions.splitlines():
+            if not raw.strip():
+                continue
+            _emit_content_line(doc, raw)
+
+    # Save versioned
+    # NOTE(review): the version is chosen by scanning the directory and the
+    # file is written afterwards, so two concurrent exports of the same case
+    # could pick the same N — confirm whether that can happen in practice.
+    export_dir = case_dir / "exports"
+    export_dir.mkdir(parents=True, exist_ok=True)
+    version = _next_version(export_dir)
+    out_path = export_dir / f"ניתוח-משפטי-v{version}.docx"
+    doc.save(str(out_path))
+    return out_path