Pre-existing agent updates + analysis DOCX export

Updates accumulated from prior sessions:
- HEARTBEAT: company-based filtering (CMP/CMPA) rules
- legal-qa, legal-researcher: routine updates
- analysis_docx_exporter: new service for analysis DOCX export
- compose page: "הורד כ-DOCX" button for analysis
- decision_template.docx: template for exporter

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-04-16 18:49:10 +00:00
parent 3da4d73498
commit 28daff58be
7 changed files with 665 additions and 3 deletions

View File

@@ -0,0 +1,503 @@
"""Export the legal analysis (analysis-and-research.md + precedents) to a
DOCX file that uses דפנה's decision template styles.
The template lives at `skills/docx/decision_template.docx` (converted once
from `טיוטת החלטה.dotx` via `scripts/convert_decision_template.py`).
We open it, wipe the sample body paragraphs, and write new content by
applying style names only — never by hand-setting font/size/RTL/margins,
because the template's styles.xml already carries those.
Style mapping:
"Heading 1" → the document title (case number); the date, when present,
follows in its own "Normal" paragraph
"Heading 2" → top-level section headers
(טענות סף / סוגיות להכרעה / מסקנות)
"Normal" + bold → subsection headers (individual claim/issue)
"Normal" → field label (bold run) + value
"Quote" → precedent quote text
"Normal" (italic) → precedent citation
Output: data/cases/{case_number}/exports/ניתוח-משפטי-v{N}.docx
"""
from __future__ import annotations
import re
from pathlib import Path
from typing import Any
from uuid import UUID
from docx import Document
from docx.document import Document as DocumentT
from docx.oxml.ns import qn
from docx.oxml import OxmlElement
from docx.text.paragraph import Paragraph
from docx.text.run import Run
from legal_mcp import config
from legal_mcp.services import db, research_md
def _mark_run_rtl(run: Run) -> None:
    """Flag *run* as right-to-left by ensuring its ``<w:rPr>`` has ``<w:rtl/>``.

    The run properties element is created when missing, and the ``<w:rtl/>``
    child is appended only if no such child exists yet, so calling this
    repeatedly on the same run is harmless.

    Rationale (from the original author): runs added programmatically
    otherwise render Hebrew in the style's ``ascii`` font (Times New Roman)
    instead of its ``cs`` font (David), even though the paragraph style
    declares ``<w:rFonts cs="David"/>``.
    """
    run_props = run._r.get_or_add_rPr()
    already_flagged = run_props.find(qn("w:rtl")) is not None
    if already_flagged:
        return
    run_props.append(OxmlElement("w:rtl"))
def _mark_paragraph_rtl(paragraph: Paragraph) -> None:
    """Flag the paragraph mark of *paragraph* as right-to-left.

    Ensures ``<w:pPr>`` contains a ``<w:rPr>`` (appending one when absent)
    and that this ``<w:rPr>`` contains a ``<w:rtl/>`` child.

    Rationale (from the original author): the paragraph style already sets
    the bidi direction, but empty paragraphs and trailing paragraph marks
    still need this flag.
    """
    para_props = paragraph._p.get_or_add_pPr()

    mark_props = para_props.find(qn("w:rPr"))
    if mark_props is None:
        mark_props = OxmlElement("w:rPr")
        para_props.append(mark_props)

    if mark_props.find(qn("w:rtl")) is not None:
        return
    mark_props.append(OxmlElement("w:rtl"))
# Path to the converted template. Static — populated by
# scripts/convert_decision_template.py.
# Resolved relative to this module: four levels above the directory that
# contains this file, then skills/docx/decision_template.docx.
TEMPLATE_PATH = (
Path(__file__).resolve().parents[4]
/ "skills"
/ "docx"
/ "decision_template.docx"
)
# Label of the chair-position field, and the placeholder value that
# _write_subsection renders (in italics) when that field is empty.
CHAIR_POSITION_LABEL = "עמדת ועדת הערר"
CHAIR_POSITION_PLACEHOLDER = "[טרם מולאה עמדת ועדת הערר]"
# Decimal list item: optional indent, digits, "." or ")", whitespace, text.
# group(1) is the number, group(2) the text after the marker.
NUMBERED_LINE_RE = re.compile(r"^\s*(\d+)[.)]\s+(.+)$")
# Bullet item: optional indent, one of - • * ● ◦, whitespace, text.
# group(1) is the text after the marker.
BULLET_LINE_RE = re.compile(r"^\s*[\-\u2022\*\u25CF\u25E6]\s+(.+)$")
# (א) (ב) (ג) ... — Hebrew-letter enumeration used by the authors.
# We keep the marker inside the text (the author wrote it), but render the
# paragraph as "List Paragraph" without the numPr so the visual indentation
# matches the template's list style without adding a double "1." prefix.
# Only the marker prefix is matched; there is no capture group.
HEB_LETTER_LINE_RE = re.compile(r"^\s*\([א-ת]\)\s+")
# A standalone **LABEL:** line (the whole trimmed line is wrapped in ** **)
# group(1) is the label without the asterisks and without the trailing colon.
STANDALONE_LABEL_RE = re.compile(r"^\s*\*\*([^\n*]+?):\*\*\s*$")
# A short standalone "XYZ:" line (no ** **) — acts as a sub-heading for the
# paragraphs that follow. Limit to short phrases to avoid eating real
# sentences that happen to end with a colon.
# The label (group(1)) must be 2-40 characters and contain no colon itself.
PLAIN_LABEL_RE = re.compile(r"^\s*([^\n:]{2,40}):\s*$")
# "**LABEL:** value" inline — bold label followed by prose on the same line.
# group(1) is the label, group(2) the value after the closing asterisks.
INLINE_LABEL_RE = re.compile(r"^\s*\*\*([^\n*]+?):\*\*\s+(.+)$")
def _classify_line(line: str) -> tuple[str, str]:
    """Classify one content line and return ``(kind, clean_text)``.

    ``kind`` is one of ``numbered``, ``bullet``, ``heb_letter``,
    ``label_heading``, ``inline_label`` or ``plain``. Patterns are tried in
    this order: standalone ``**LABEL:**``, inline ``**LABEL:** value``,
    numbered item, bullet item, Hebrew-letter item, plain ``LABEL:``; the
    first match wins and anything else is ``plain``.

    ``clean_text`` per kind:
      - numbered / bullet: the text with the list marker stripped
      - heb_letter: the stripped line, marker kept (the author wrote it)
      - label_heading: the label without ``**`` and without the trailing ``:``
      - inline_label: ``"LABEL\\x00VALUE"``; the caller splits on the NUL
      - plain: the stripped line

    A bullet whose content is itself ``**LABEL:**`` or ``**LABEL:** value``
    (e.g. "- **נקודות פתוחות:**") is reported as the corresponding label
    kind rather than as a bullet.
    """

    def _as_label(candidate: str) -> tuple[str, str] | None:
        # Shared by the top-level check and the bullet-content check.
        standalone = STANDALONE_LABEL_RE.match(candidate)
        if standalone is not None:
            return "label_heading", standalone.group(1).strip()
        inline = INLINE_LABEL_RE.match(candidate)
        if inline is not None:
            name = inline.group(1).strip()
            rest = inline.group(2).strip()
            return "inline_label", f"{name}\x00{rest}"
        return None

    labelled = _as_label(line)
    if labelled is not None:
        return labelled

    numbered = NUMBERED_LINE_RE.match(line)
    if numbered is not None:
        return "numbered", numbered.group(2).strip()

    bulleted = BULLET_LINE_RE.match(line)
    if bulleted is not None:
        content = bulleted.group(1).strip()
        return _as_label(content) or ("bullet", content)

    if HEB_LETTER_LINE_RE.match(line) is not None:
        return "heb_letter", line.strip()

    plain_label = PLAIN_LABEL_RE.match(line)
    if plain_label is not None:
        return "label_heading", plain_label.group(1).strip()

    return "plain", line.strip()
def _strip_numpr(paragraph: Paragraph) -> None:
    """Suppress automatic list numbering on *paragraph*.

    Used when we want the visual styling of `List Paragraph` (indent,
    font) without Word's auto-decimal "1." prefix — e.g. for Hebrew-
    letter enumeration where the author wrote (א) (ب) style markers
    such as (א) (ב) (ג) manually.

    Any ``<w:numPr>`` set directly on the paragraph is removed. That alone
    does nothing for a freshly created paragraph (it has no direct numPr),
    and it cannot cancel numbering that the paragraph *style* carries, so an
    explicit ``<w:numPr><w:numId w:val="0"/></w:numPr>`` override is written
    as well. In WordprocessingML a numId of 0 designates "numbering removed"
    at this level of the style hierarchy; it is harmless when the style has
    no numbering to begin with.
    """
    pPr = paragraph._p.get_or_add_pPr()
    for stale in pPr.findall(qn("w:numPr")):
        pPr.remove(stale)

    override = OxmlElement("w:numPr")
    num_id = OxmlElement("w:numId")
    num_id.set(qn("w:val"), "0")
    override.append(num_id)

    # <w:pPr> children form an ordered sequence; numPr has to come after
    # these elements when they are present and before everything else.
    predecessors = {
        qn(tag)
        for tag in (
            "w:pStyle",
            "w:keepNext",
            "w:keepLines",
            "w:pageBreakBefore",
            "w:framePr",
            "w:widowControl",
        )
    }
    insert_at = 0
    for index, child in enumerate(pPr):
        if child.tag in predecessors:
            insert_at = index + 1
    pPr.insert(insert_at, override)
# Characters that the code should never emit (user instruction: "no dashes").
# Applied only to code-generated text, not to user content from the md file.
# Matches the en dash (U+2013) and the em dash (U+2014); the ASCII hyphen is
# not affected.
# NOTE(review): _add_runs_with_inline_bold passes every string it renders
# through _no_dash, and that includes lines taken from the md file — so the
# "not to user content" statement above does not hold as written. Confirm
# which behavior is intended.
_CODE_DASH_RE = re.compile(r"[\u2013\u2014]")
# Markdown inline bold — `**...**`
# group(1) is the text between the asterisk pairs; it may not contain a
# newline or an asterisk.
_INLINE_BOLD_RE = re.compile(r"\*\*([^\n*]+?)\*\*")
def _no_dash(text: str) -> str:
    """Return *text* with every character matched by ``_CODE_DASH_RE`` removed.

    The matched characters are deleted outright, not replaced, so the text
    on either side of a dash ends up adjacent.
    """
    without_dashes = _CODE_DASH_RE.sub("", text)
    return without_dashes
def _add_runs_with_inline_bold(paragraph: Paragraph, text: str) -> None:
    """Append *text* to *paragraph*, rendering ``**...**`` spans as bold runs.

    The text is first passed through `_no_dash`, then split on the
    ``**...**`` markers: text outside the markers becomes plain runs, text
    inside becomes bold runs, and the asterisks themselves are dropped.
    Every run is marked RTL. Nothing is added for an empty string.

    This keeps `**טענה חשובה**` rendering as bold (as the author intended)
    instead of leaving the literal asterisks in the output.
    """
    text = _no_dash(text)
    pos = 0
    for m in _INLINE_BOLD_RE.finditer(text):
        if m.start() > pos:
            plain = paragraph.add_run(text[pos : m.start()])
            _mark_run_rtl(plain)
        bold = paragraph.add_run(m.group(1))
        bold.bold = True
        # A run flagged <w:rtl/> is formatted as complex script, and complex
        # script takes its bold from <w:bCs/>, not <w:b/>. Set both so the
        # Hebrew text actually shows bold.
        bold.font.cs_bold = True
        _mark_run_rtl(bold)
        pos = m.end()
    if pos < len(text):
        tail = paragraph.add_run(text[pos:])
        _mark_run_rtl(tail)
def _clear_body(doc: DocumentT) -> None:
    """Drop every top-level paragraph from the body of *doc*.

    The template ships with sample paragraphs ("רקע", "דיון והכרעה"…) that
    must not leak into the export. Only ``<w:p>`` elements that are direct
    children of the body are removed; every other body child, including the
    section properties (sectPr) that carry page size / margins / RTL /
    footer, is left in place.
    """
    body = doc.element.body
    sample_paragraphs = body.findall(qn("w:p"))
    for paragraph_el in sample_paragraphs:
        body.remove(paragraph_el)
def _add_paragraph(doc: DocumentT, text: str, style: str) -> Paragraph:
    """Append a paragraph with the named template *style* and return it.

    The paragraph mark is flagged RTL. Non-empty *text* is added through
    `_add_runs_with_inline_bold`; an empty *text* yields a paragraph with
    no runs.
    """
    paragraph = doc.add_paragraph(style=style)
    _mark_paragraph_rtl(paragraph)
    if not text:
        return paragraph
    _add_runs_with_inline_bold(paragraph, text)
    return paragraph
def _add_label_value(
    doc: DocumentT, label: str, value: str, *, value_italic: bool = False
) -> Paragraph:
    """Add a "Normal" paragraph with a bold label and an inline value.

    Example rendering: **עמדת המבקשת:** The party argues that…

    Args:
        doc: Document to append to.
        label: Label text; rendered bold and followed by ": ".
        value: Text after the label. When empty, only the label is written.
        value_italic: When true the value is written as one italic run with
            no inline-bold handling (used for placeholder text); otherwise
            the value goes through `_add_runs_with_inline_bold`.

    Returns:
        The newly added paragraph.
    """
    p = doc.add_paragraph(style="Normal")
    _mark_paragraph_rtl(p)
    run_label = p.add_run(f"{_no_dash(label)}: ")
    run_label.bold = True
    # Runs flagged <w:rtl/> are formatted as complex script, which reads
    # bold/italic from <w:bCs/> / <w:iCs/> rather than <w:b/> / <w:i/>.
    # Set the complex-script toggles too so the Hebrew text is affected.
    run_label.font.cs_bold = True
    _mark_run_rtl(run_label)
    if value:
        if value_italic:
            # Placeholder text — italic, no inline-bold handling.
            run_value = p.add_run(_no_dash(value))
            run_value.italic = True
            run_value.font.cs_italic = True
            _mark_run_rtl(run_value)
        else:
            _add_runs_with_inline_bold(p, value)
    return p
def _add_multiline_value(
    doc: DocumentT, label: str, value: str
) -> None:
    """Render a field (label + value).

    Blank lines in *value* are ignored. Then:

    - no lines left: the label alone is written via `_add_label_value`.
    - exactly one line: it stays inline (bold label + text) — a Heading 2
      for a one-liner would look inflated. The line is classified first, so
      list markers and a trailing label colon are stripped as
      `_classify_line` defines.
    - several lines: the label becomes its own Heading 2 paragraph (so the
      structure visually breaks between fields) and each line is routed
      through `_emit_content_line`.
    """
    lines = [ln for ln in value.splitlines() if ln.strip()]
    if not lines:
        _add_label_value(doc, label, "")
        return
    if len(lines) == 1:
        kind, text = _classify_line(lines[0])
        if kind == "inline_label":
            # _classify_line packs this kind as "LABEL\x00VALUE". The NUL
            # must never reach a run: it is not a legal XML character and
            # the XML layer rejects it. Re-expand the pair into markdown
            # bold so the nested label still renders bold inline.
            nested_label, nested_value = text.split("\x00", 1)
            text = f"**{nested_label}:** {nested_value}"
        # Single-line — inline with label regardless of kind
        _add_label_value(doc, label, text)
        return
    # Multi-line: label as Heading 2, then each line via _emit_content_line
    _add_paragraph(doc, label, "Heading 2")
    for line in lines:
        _emit_content_line(doc, line)
def _emit_content_line(doc: DocumentT, line: str) -> None:
    """Append one content line to *doc* in the style its kind calls for.

    The line is classified with `_classify_line` and rendered as follows:

    - ``label_heading`` (e.g. "**נקודות פתוחות:**" alone): "Heading 2"
    - ``inline_label`` ("**LABEL:** value"): bold label + value via
      `_add_label_value`
    - ``numbered`` ("1. ..."): "List Paragraph"
    - ``heb_letter`` ("(א) ..."): "List Paragraph" followed by
      `_strip_numpr`, because the author supplied the marker
    - ``bullet`` ("- ...", marker already stripped) and ``plain``: "Normal"
    """
    kind, text = _classify_line(line)

    if kind == "label_heading":
        _add_paragraph(doc, text, "Heading 2")
        return

    if kind == "inline_label":
        label, value = text.split("\x00", 1)
        _add_label_value(doc, label, value)
        return

    uses_list_style = kind in ("numbered", "heb_letter")
    style_name = "List Paragraph" if uses_list_style else "Normal"
    paragraph = doc.add_paragraph(style=style_name)
    if kind == "heb_letter":
        _strip_numpr(paragraph)
    _mark_paragraph_rtl(paragraph)
    _add_runs_with_inline_bold(paragraph, text)
def _format_subsection_title(item: dict[str, Any], kind_label: str) -> str:
    """Build the header text of a subsection.

    Returns '{kind_label} {number}: {title}', e.g. 'טענת סף 1: חוסר סמכות',
    when both number and title are present; the bare title when there is no
    number; and '{kind_label} {number}' (stripped) when there is no title.

    A missing, ``None`` or whitespace-only ``title`` counts as "no title",
    and a missing or falsy ``number`` counts as "no number".
    """
    number = item.get("number") or ""
    # `or ""` rather than a .get() default: the key may exist with value None.
    title = (item.get("title") or "").strip()
    if title:
        return f"{kind_label} {number}: {title}" if number else title
    return f"{kind_label} {number}".strip()
def _write_subsection(
    doc: DocumentT,
    item: dict[str, Any],
    precedents_for_item: list[dict[str, Any]],
    kind_label: str,
) -> None:
    """Write one threshold claim / issue: header, fields, chair position,
    and the precedents attached to it.

    Args:
        doc: Document to append to.
        item: Parsed subsection. Keys read here: ``fields`` (a list of
            dicts with ``label`` and ``content``) and ``chair_position``;
            `_format_subsection_title` reads the title-related keys.
        precedents_for_item: Precedents for this subsection; ``quote`` and
            ``citation`` are read from each.
        kind_label: Prefix for the header, e.g. "טענת סף" or "סוגיה".

    Missing or ``None`` values for ``fields``, ``label``, ``content``,
    ``chair_position``, ``quote`` and ``citation`` are treated as empty.
    Fields without a label are skipped. The chair position is always
    rendered, with an italic placeholder when it is empty.
    """
    # Subsection header — bolded Normal paragraph, not a Heading,
    # so it visually sits under the section's Heading 2.
    header_text = _format_subsection_title(item, kind_label)
    p = doc.add_paragraph(style="Normal")
    _mark_paragraph_rtl(p)
    run = p.add_run(_no_dash(header_text))
    run.bold = True
    # Runs flagged <w:rtl/> are formatted as complex script, which reads
    # bold/italic from <w:bCs/> / <w:iCs/>; set those alongside b / i.
    run.font.cs_bold = True
    _mark_run_rtl(run)

    # Regular fields (party positions, legal questions, etc.)
    for field in item.get("fields") or []:
        label = (field.get("label") or "").strip()
        content = (field.get("content") or "").strip()
        if not label:
            continue
        _add_multiline_value(doc, label, content)

    # Chair position — special handling: always render, use placeholder if empty.
    chair_position = (item.get("chair_position") or "").strip()
    if chair_position:
        _add_multiline_value(doc, CHAIR_POSITION_LABEL, chair_position)
    else:
        _add_label_value(
            doc, CHAIR_POSITION_LABEL, CHAIR_POSITION_PLACEHOLDER,
            value_italic=True,
        )

    # Precedents attached to this subsection
    if precedents_for_item:
        p = doc.add_paragraph(style="Normal")
        _mark_paragraph_rtl(p)
        run = p.add_run("פסיקה רלוונטית:")
        run.bold = True
        run.font.cs_bold = True
        _mark_run_rtl(run)
        for prec in precedents_for_item:
            quote = (prec.get("quote") or "").strip()
            citation = (prec.get("citation") or "").strip()
            if quote:
                _add_paragraph(doc, quote, "Quote")
            if citation:
                cite_p = doc.add_paragraph(style="Normal")
                _mark_paragraph_rtl(cite_p)
                cite_run = cite_p.add_run(_no_dash(citation))
                cite_run.italic = True
                cite_run.font.cs_italic = True
                _mark_run_rtl(cite_run)
def _add_background_section(
    doc: DocumentT, title: str, body: str | None
) -> None:
    """Write a background section (e.g. "רקע דיוני") from a prose body.

    Nothing is written when *body* is ``None``, empty or whitespace-only.
    Otherwise *title* becomes a "Heading 2" paragraph and every non-blank
    line of *body* goes through `_emit_content_line`, so bullets,
    `**labels:**` and (א) enumerations all get the template styles.
    """
    if not (body and body.strip()):
        return
    _add_paragraph(doc, title, "Heading 2")
    content_lines = (ln for ln in body.splitlines() if ln.strip())
    for content_line in content_lines:
        _emit_content_line(doc, content_line)
def _group_precedents(
    precedents: list[dict[str, Any]],
) -> tuple[list[dict], dict[str, list[dict]]]:
    """Partition a flat precedent list by its ``section_id``.

    Returns ``(case_level, by_section)``: precedents whose ``section_id`` is
    missing or ``None`` go to the first list; the rest are grouped in a dict
    keyed by their ``section_id``. Input order is preserved within each
    group.
    """
    unattached: list[dict] = []
    attached: dict[str, list[dict]] = {}
    for entry in precedents:
        section = entry.get("section_id")
        if section is None:
            unattached.append(entry)
            continue
        if section not in attached:
            attached[section] = []
        attached[section].append(entry)
    return unattached, attached
def _next_version(export_dir: Path) -> int:
    """Return the next free N for ``ניתוח-משפטי-v{N}.docx`` in *export_dir*.

    The result is one more than the highest version found among the
    matching files, and never less than 1. Files whose version part is not
    an integer are ignored.
    """
    highest = 0
    for candidate in export_dir.glob("ניתוח-משפטי-v*.docx"):
        pieces = candidate.stem.split("-v")
        if len(pieces) < 2:
            continue
        try:
            found = int(pieces[1])
        except ValueError:
            continue
        if found > highest:
            highest = found
    return highest + 1
async def build_analysis_docx(case_number: str) -> Path:
    """Build a DOCX of the legal analysis for a case using the template
    styles, and save a versioned copy under the case's exports folder.

    Content order: title, optional date, background sections, case-level
    precedents, threshold claims, issues, conclusions. Sections with no
    content are omitted.

    Args:
        case_number: Case identifier; used to locate the case directory, to
            look the case up in the database, and in the document title.

    Returns:
        Path of the saved file, ``<case_dir>/exports/ניתוח-משפטי-v{N}.docx``.

    Raises:
        FileNotFoundError: If the template or the case's
            ``analysis-and-research.md`` file does not exist.
    """
    if not TEMPLATE_PATH.exists():
        raise FileNotFoundError(
            f"Template not found at {TEMPLATE_PATH}. "
            "Run: python scripts/convert_decision_template.py"
        )
    case_dir = config.find_case_dir(case_number)
    analysis_path = case_dir / "documents" / "research" / "analysis-and-research.md"
    if not analysis_path.exists():
        raise FileNotFoundError(
            f"Analysis file not found for case {case_number}"
        )
    parsed = research_md.parse(analysis_path)

    # Resolve case_id so we can fetch precedents. Missing case → proceed
    # without precedents rather than failing the export.
    case_level_precedents: list[dict] = []
    precedents_by_section: dict[str, list[dict]] = {}
    case = await db.get_case_by_number(case_number)
    if case:
        raw_case_id = case["id"]
        # UUID() only accepts a string form; tolerate a driver that already
        # hands back a UUID object.
        case_id = raw_case_id if isinstance(raw_case_id, UUID) else UUID(str(raw_case_id))
        precedents = await db.list_case_precedents(case_id)
        case_level_precedents, precedents_by_section = _group_precedents(precedents)

    doc = Document(str(TEMPLATE_PATH))
    _clear_body(doc)

    # Document title
    header = parsed.get("header") or {}
    date = str(header.get("date") or "").strip()
    title_text = f"ניתוח משפטי וכתיבת עמדה בערר {case_number}"
    _add_paragraph(doc, title_text, "Heading 1")
    if date:
        p_date = doc.add_paragraph(style="Normal")
        _mark_paragraph_rtl(p_date)
        run_date = p_date.add_run(f"תאריך: {date}")
        _mark_run_rtl(run_date)

    # Background sections — printed first so the reader gets context
    # before any claims/precedents. These come only in the exported DOCX,
    # not in the web UI (the UI renders them elsewhere).
    _add_background_section(doc, "רקע לניתוח", parsed.get("represented_party"))
    _add_background_section(doc, "רקע דיוני", parsed.get("procedural_background"))
    _add_background_section(doc, "עובדות מוסכמות", parsed.get("agreed_facts"))
    _add_background_section(
        doc, "עובדות שנויות במחלוקת", parsed.get("disputed_facts")
    )

    # Case-level precedents appear at the top (they cut across claims/issues)
    if case_level_precedents:
        _add_paragraph(doc, "פסיקה כללית", "Heading 2")
        for prec in case_level_precedents:
            quote = (prec.get("quote") or "").strip()
            citation = (prec.get("citation") or "").strip()
            if quote:
                _add_paragraph(doc, quote, "Quote")
            if citation:
                cp = doc.add_paragraph(style="Normal")
                _mark_paragraph_rtl(cp)
                cr = cp.add_run(_no_dash(citation))
                cr.italic = True
                # Runs flagged <w:rtl/> are formatted as complex script,
                # which reads italic from <w:iCs/>, not <w:i/>.
                cr.font.cs_italic = True
                _mark_run_rtl(cr)

    # Threshold claims
    threshold_claims = parsed.get("threshold_claims") or []
    if threshold_claims:
        _add_paragraph(doc, "טענות סף", "Heading 2")
        for tc in threshold_claims:
            # An item without an id simply has no precedents attached.
            _write_subsection(
                doc, tc, precedents_by_section.get(tc.get("id"), []), "טענת סף"
            )

    # Issues
    issues = parsed.get("issues") or []
    if issues:
        _add_paragraph(doc, "סוגיות להכרעה", "Heading 2")
        for iss in issues:
            _write_subsection(
                doc, iss, precedents_by_section.get(iss.get("id"), []), "סוגיה"
            )

    # Conclusions
    conclusions = (parsed.get("conclusions") or "").strip()
    if conclusions:
        _add_paragraph(doc, "מסקנות", "Heading 2")
        for raw in conclusions.splitlines():
            if not raw.strip():
                continue
            _emit_content_line(doc, raw)

    # Save versioned
    export_dir = case_dir / "exports"
    export_dir.mkdir(parents=True, exist_ok=True)
    version = _next_version(export_dir)
    out_path = export_dir / f"ניתוח-משפטי-v{version}.docx"
    doc.save(str(out_path))
    return out_path