Updates accumulated from prior sessions: - HEARTBEAT: company-based filtering (CMP/CMPA) rules - legal-qa, legal-researcher: routine updates - analysis_docx_exporter: new service for analysis DOCX export - compose page: "הורד כ-DOCX" button for analysis - decision_template.docx: template for exporter Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
504 lines
18 KiB
Python
504 lines
18 KiB
Python
"""Export the legal analysis (analysis-and-research.md + precedents) to a
|
|
DOCX file that uses דפנה's decision template styles.
|
|
|
|
The template lives at `skills/docx/decision_template.docx` (converted once
|
|
from `טיוטת החלטה.dotx` via `scripts/convert_decision_template.py`).
|
|
We open it, wipe the sample body paragraphs, and write new content by
|
|
applying style names only — never by hand-setting font/size/RTL/margins,
|
|
because the template's styles.xml already carries those.
|
|
|
|
Style mapping:
"Heading 1" → the document title (case number); the date follows as a "Normal" paragraph
|
|
"Heading 2" → top-level section headers
|
|
(טענות סף / סוגיות להכרעה / מסקנות)
|
|
"Normal" + bold → subsection headers (individual claim/issue)
|
|
"Normal" → field label (bold run) + value
|
|
"Quote" → precedent quote text
|
|
"Normal" (italic) → precedent citation
|
|
|
|
Output: data/cases/{case_number}/exports/ניתוח-משפטי-v{N}.docx
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import re
|
|
from pathlib import Path
|
|
from typing import Any
|
|
from uuid import UUID
|
|
|
|
from docx import Document
|
|
from docx.document import Document as DocumentT
|
|
from docx.oxml.ns import qn
|
|
from docx.oxml import OxmlElement
|
|
from docx.text.paragraph import Paragraph
|
|
from docx.text.run import Run
|
|
|
|
from legal_mcp import config
|
|
from legal_mcp.services import db, research_md
|
|
|
|
|
|
def _mark_run_rtl(run: Run) -> None:
    """Flag `run` as complex-script text by ensuring its rPr holds `<w:rtl/>`.

    Per the original author's notes, runs added programmatically otherwise
    render Hebrew in the `ascii` font slot (Times New Roman) instead of the
    `cs` slot (David) that the paragraph style declares via
    `<w:rFonts cs="David"/>`.

    The call is idempotent: an existing `<w:rtl/>` is left as it is.
    """
    run_props = run._r.get_or_add_rPr()
    already_flagged = run_props.find(qn("w:rtl")) is not None
    if already_flagged:
        return
    run_props.append(OxmlElement("w:rtl"))
|
|
|
|
|
|
def _mark_paragraph_rtl(paragraph: Paragraph) -> None:
    """Ensure the paragraph-mark properties (`pPr/rPr`) contain `<w:rtl/>`.

    Per the original author's notes, the paragraph style already sets the
    bidi direction, but empty paragraphs and trailing paragraph marks still
    need this flag to be treated as RTL.

    Both the `<w:rPr>` container and the `<w:rtl/>` flag are created only
    when missing, so repeated calls do not duplicate elements.
    """
    para_props = paragraph._p.get_or_add_pPr()

    mark_props = para_props.find(qn("w:rPr"))
    if mark_props is None:
        # No run-properties container for the paragraph mark yet: create one.
        mark_props = OxmlElement("w:rPr")
        para_props.append(mark_props)

    if mark_props.find(qn("w:rtl")) is not None:
        return
    mark_props.append(OxmlElement("w:rtl"))
|
|
|
|
# Location of the converted decision template. The file is static; it is
# produced by scripts/convert_decision_template.py.
# parents[4] is the directory four levels above this module's own directory
# (presumably the repository root; verify against the project layout).
TEMPLATE_PATH = (
    Path(__file__).resolve().parents[4]
    / "skills"
    / "docx"
    / "decision_template.docx"
)

# Label of the chair-position field rendered for every subsection, and the
# placeholder text written (in italics) when that field is empty.
CHAIR_POSITION_LABEL = "עמדת ועדת הערר"
CHAIR_POSITION_PLACEHOLDER = "[טרם מולאה עמדת ועדת הערר]"
|
|
|
|
# "1. text" or "1) text": group 1 is the number, group 2 the item text.
NUMBERED_LINE_RE = re.compile(r"^\s*(\d+)[.)]\s+(.+)$")
# "- text", "* text" and the common bullet glyphs: group 1 is the item text.
BULLET_LINE_RE = re.compile(r"^\s*[\-\u2022\*\u25CF\u25E6]\s+(.+)$")
# Hebrew-letter enumeration such as "(א) ..." written by the authors.
# The marker stays inside the text (the author typed it); the paragraph is
# rendered as "List Paragraph" with numPr removed, so the indentation matches
# the template's list style without an extra automatic "1." prefix.
HEB_LETTER_LINE_RE = re.compile(r"^\s*\([א-ת]\)\s+")

# A line consisting only of "**LABEL:**" (the whole trimmed line is bold).
STANDALONE_LABEL_RE = re.compile(r"^\s*\*\*([^\n*]+?):\*\*\s*$")
# A short bare "XYZ:" line (no asterisks) acting as a sub-heading for the
# paragraphs below it. The 2-40 character limit keeps ordinary sentences
# that merely end with a colon from being treated as headings.
PLAIN_LABEL_RE = re.compile(r"^\s*([^\n:]{2,40}):\s*$")
# "**LABEL:** value": a bold label followed by prose on the same line.
INLINE_LABEL_RE = re.compile(r"^\s*\*\*([^\n*]+?):\*\*\s+(.+)$")


def _match_bold_label(text: str) -> tuple[str, str] | None:
    """Classify `text` as a bold label form, or return None if it is neither.

    A standalone "**LABEL:**" wins over the inline "**LABEL:** value" form.
    """
    standalone = STANDALONE_LABEL_RE.match(text)
    if standalone is not None:
        return "label_heading", standalone.group(1).strip()

    inline = INLINE_LABEL_RE.match(text)
    if inline is not None:
        label_part = inline.group(1).strip()
        value_part = inline.group(2).strip()
        return "inline_label", f"{label_part}\x00{value_part}"

    return None


def _classify_line(line: str) -> tuple[str, str]:
    """Classify one content line and return `(kind, clean_text)`.

    `kind` is one of: numbered, bullet, heb_letter, label_heading,
    inline_label, plain.

    What `clean_text` holds for each kind:
      - numbered / bullet: the item text with its marker removed
      - heb_letter: the stripped line, marker included (author supplied it)
      - label_heading: the label without the surrounding ** and trailing ":"
      - inline_label: "LABEL\\x00VALUE", NUL-separated; the caller splits it
      - plain: the stripped line

    Checks run in a fixed order: bold label forms, numbered item, bullet
    (whose content is itself re-checked for bold label forms), Hebrew-letter
    item, short bare "XYZ:" label, and finally plain text.
    """
    labelled = _match_bold_label(line)
    if labelled is not None:
        return labelled

    numbered = NUMBERED_LINE_RE.match(line)
    if numbered is not None:
        return "numbered", numbered.group(2).strip()

    bulleted = BULLET_LINE_RE.match(line)
    if bulleted is not None:
        item = bulleted.group(1).strip()
        # "- **LABEL:**" is really a heading and "- **LABEL:** value" an
        # inline label; anything else is an ordinary bullet item.
        return _match_bold_label(item) or ("bullet", item)

    if HEB_LETTER_LINE_RE.match(line) is not None:
        return "heb_letter", line.strip()

    bare_label = PLAIN_LABEL_RE.match(line)
    if bare_label is not None:
        return "label_heading", bare_label.group(1).strip()

    return "plain", line.strip()
|
|
|
|
|
|
def _strip_numpr(paragraph: Paragraph) -> None:
    """Delete every `<w:numPr>` child found directly in the paragraph's pPr.

    Intended for paragraphs that should look like `List Paragraph` (indent,
    font) but must not get Word's automatic decimal "1." prefix, e.g. the
    Hebrew-letter enumeration where the author typed the markers by hand.

    NOTE(review): only numbering set directly on the paragraph is removed
    here. If the `List Paragraph` style itself carries numbering, presumably
    it would still apply; confirm against the template.
    """
    props = paragraph._p.get_or_add_pPr()
    numbering_nodes = props.findall(qn("w:numPr"))
    for node in numbering_nodes:
        props.remove(node)
|
|
|
|
|
|
# En dash and em dash: characters the exporter should never emit (user
# instruction: "no dashes"). The original note says this targets
# code-generated text rather than user content from the md file.
_CODE_DASH_RE = re.compile(r"[\u2013\u2014]")

# Markdown inline bold: `**...**` with no newline or asterisk inside.
_INLINE_BOLD_RE = re.compile(r"\*\*([^\n*]+?)\*\*")


def _no_dash(text: str) -> str:
    """Return `text` with every en dash and em dash deleted.

    The dashes are removed outright (nothing is substituted for them);
    ordinary hyphens are left alone.
    """
    without_dashes = _CODE_DASH_RE.sub("", text)
    return without_dashes
|
|
|
|
|
|
def _add_runs_with_inline_bold(paragraph: Paragraph, text: str) -> None:
    """Append `text` to `paragraph`, turning `**...**` spans into bold runs.

    The text is first passed through `_no_dash`, then cut at every inline
    bold marker: the pieces between markers become plain runs and the marked
    pieces become bold runs (without the asterisks). Every run added is
    marked RTL. Empty pieces produce no run.
    """
    cleaned = _no_dash(text)
    cursor = 0
    for match in _INLINE_BOLD_RE.finditer(cleaned):
        span_start, span_end = match.span()
        if cursor < span_start:
            # Plain text sitting between the previous marker and this one.
            _mark_run_rtl(paragraph.add_run(cleaned[cursor:span_start]))
        emphasized = paragraph.add_run(match.group(1))
        emphasized.bold = True
        _mark_run_rtl(emphasized)
        cursor = span_end

    remainder = cleaned[cursor:]
    if remainder:
        _mark_run_rtl(paragraph.add_run(remainder))
|
|
|
|
|
|
def _clear_body(doc: DocumentT) -> None:
    """Strip the template's sample paragraphs out of the document body.

    Only the `<w:p>` elements that are direct children of the body are
    removed. Everything else, notably the section properties (sectPr) that
    carry page size, margins, RTL and footer setup, is left in place.
    """
    body_el = doc.element.body
    # Take a snapshot before mutating so removal cannot disturb the iteration.
    sample_paragraphs = list(body_el.findall(qn("w:p")))
    for paragraph_el in sample_paragraphs:
        body_el.remove(paragraph_el)
|
|
|
|
|
|
def _add_paragraph(doc: DocumentT, text: str, style: str) -> Paragraph:
    """Append an RTL-marked paragraph in `style` and return it.

    A non-empty `text` is written through `_add_runs_with_inline_bold`, so
    `**...**` spans come out bold. An empty `text` yields a paragraph with
    no runs.
    """
    paragraph = doc.add_paragraph(style=style)
    _mark_paragraph_rtl(paragraph)
    if not text:
        return paragraph
    _add_runs_with_inline_bold(paragraph, text)
    return paragraph
|
|
|
|
|
|
def _add_label_value(
    doc: DocumentT, label: str, value: str, *, value_italic: bool = False
) -> Paragraph:
    """Append a "Normal" paragraph holding a bold label and an inline value.

    The label run is always written as "<label>: " in bold, with en/em
    dashes stripped. The value is handled in one of three ways:
      - empty value: nothing is added after the label
      - `value_italic=True`: one italic run with dashes stripped and no
        inline-bold parsing (used for placeholder text)
      - otherwise: the value goes through `_add_runs_with_inline_bold`

    Returns the new paragraph.
    """
    paragraph = doc.add_paragraph(style="Normal")
    _mark_paragraph_rtl(paragraph)

    label_run = paragraph.add_run(f"{_no_dash(label)}: ")
    label_run.bold = True
    _mark_run_rtl(label_run)

    if not value:
        return paragraph

    if not value_italic:
        _add_runs_with_inline_bold(paragraph, value)
        return paragraph

    # Placeholder text: italic, written verbatim apart from dash stripping.
    placeholder_run = paragraph.add_run(_no_dash(value))
    placeholder_run.italic = True
    _mark_run_rtl(placeholder_run)
    return paragraph
|
|
|
|
|
|
def _add_multiline_value(
    doc: DocumentT, label: str, value: str
) -> None:
    """Render one field (label + value) into `doc`.

    Blank lines in `value` are ignored. What remains decides the layout:
      - no lines: the bold label alone
      - one line: bold label and the text inline in a single paragraph
        (a Heading 2 for a one-liner would look inflated)
      - several lines: the label as its own "Heading 2" paragraph, then
        each line as its own paragraph via `_emit_content_line`

    For the single-line case the line is classified first so list markers
    and label markup are cleaned off before it is placed next to the label.
    """
    lines = [ln for ln in value.splitlines() if ln.strip()]
    if not lines:
        _add_label_value(doc, label, "")
        return

    if len(lines) == 1:
        kind, text = _classify_line(lines[0])
        if kind == "inline_label":
            # _classify_line encodes this kind as "LABEL\x00VALUE". The NUL
            # must never reach the document: it is not valid XML text, so
            # rebuild the "**LABEL:** VALUE" markup and let the inline-bold
            # pass in _add_label_value render the inner label in bold.
            inner_label, inner_value = text.split("\x00", 1)
            text = f"**{inner_label}:** {inner_value}"
        _add_label_value(doc, label, text)
        return

    # Multi-line: label as Heading 2, then each line on its own paragraph.
    _add_paragraph(doc, label, "Heading 2")
    for line in lines:
        _emit_content_line(doc, line)
|
|
|
|
|
|
def _emit_content_line(doc: DocumentT, line: str) -> None:
    """Append one content line to `doc` using the matching template style.

    The line is classified with `_classify_line`, then rendered as:
      - label_heading (a lone "**LABEL:**" or short "XYZ:" line): "Heading 2"
      - inline_label ("**LABEL:** value"): bold label + value, one paragraph
      - numbered ("1. ..."): "List Paragraph"
      - heb_letter ("(א) ..."): "List Paragraph" with numPr stripped, since
        the author supplied the marker in the text
      - bullet ("- ...") and plain: "Normal" (bullet marker already removed)
    """
    kind, text = _classify_line(line)

    if kind == "label_heading":
        _add_paragraph(doc, text, "Heading 2")
        return

    if kind == "inline_label":
        # The classifier packs label and value into one NUL-separated string.
        label, value = text.split("\x00", 1)
        _add_label_value(doc, label, value)
        return

    if kind in ("numbered", "heb_letter"):
        paragraph = doc.add_paragraph(style="List Paragraph")
        if kind == "heb_letter":
            _strip_numpr(paragraph)
    else:
        paragraph = doc.add_paragraph(style="Normal")

    _mark_paragraph_rtl(paragraph)
    _add_runs_with_inline_bold(paragraph, text)
|
|
|
|
|
|
def _format_subsection_title(item: dict[str, Any], kind_label: str) -> str:
    """Build the subsection header text for a claim or issue.

    Result by what `item` provides:
      - number and title: "{kind_label} {number}: {title}"
      - title only: the title alone
      - otherwise: "{kind_label} {number}" stripped (just the kind label
        when the number is missing too)

    A missing, empty or None "number" or "title" counts as absent. The
    title is stripped of surrounding whitespace.
    """
    number = item.get("number") or ""
    # `or ""` also covers a key that is present with a None value, which
    # `item.get("title", "")` would pass through and crash on `.strip()`.
    title = (item.get("title") or "").strip()
    if number and title:
        return f"{kind_label} {number}: {title}"
    if title:
        return title
    return f"{kind_label} {number}".strip()
|
|
|
|
|
|
def _write_subsection(
    doc: DocumentT,
    item: dict[str, Any],
    precedents_for_item: list[dict[str, Any]],
    kind_label: str,
) -> None:
    """Write one claim/issue subsection: header, fields, chair position,
    and the precedents attached to it.

    - The header is a bold "Normal" paragraph (not a Heading) so that it
      sits visually under the section's Heading 2.
    - Each entry of `item["fields"]` with a non-empty label is rendered via
      `_add_multiline_value`.
    - The chair position is always rendered; when empty, an italic
      placeholder is written instead.
    - Precedents, if any, follow a bold caption: each quote as a "Quote"
      paragraph and each citation as an italic "Normal" paragraph.
    """
    header_text = _format_subsection_title(item, kind_label)
    header_par = doc.add_paragraph(style="Normal")
    _mark_paragraph_rtl(header_par)
    header_run = header_par.add_run(_no_dash(header_text))
    header_run.bold = True
    _mark_run_rtl(header_run)

    # Regular fields (party positions, legal questions, etc.)
    for field in item.get("fields", []):
        field_label = field.get("label", "").strip()
        field_content = field.get("content", "").strip()
        if field_label:
            _add_multiline_value(doc, field_label, field_content)

    # Chair position: rendered unconditionally, with a placeholder if empty.
    chair_text = (item.get("chair_position") or "").strip()
    if not chair_text:
        _add_label_value(
            doc,
            CHAIR_POSITION_LABEL,
            CHAIR_POSITION_PLACEHOLDER,
            value_italic=True,
        )
    else:
        _add_multiline_value(doc, CHAIR_POSITION_LABEL, chair_text)

    # Precedents attached to this subsection
    if not precedents_for_item:
        return

    caption_par = doc.add_paragraph(style="Normal")
    _mark_paragraph_rtl(caption_par)
    caption_run = caption_par.add_run("פסיקה רלוונטית:")
    caption_run.bold = True
    _mark_run_rtl(caption_run)

    for precedent in precedents_for_item:
        quote_text = (precedent.get("quote") or "").strip()
        citation_text = (precedent.get("citation") or "").strip()
        if quote_text:
            _add_paragraph(doc, quote_text, "Quote")
        if not citation_text:
            continue
        citation_par = doc.add_paragraph(style="Normal")
        _mark_paragraph_rtl(citation_par)
        citation_run = citation_par.add_run(_no_dash(citation_text))
        citation_run.italic = True
        _mark_run_rtl(citation_run)
|
|
|
|
|
|
def _add_background_section(
    doc: DocumentT, title: str, body: str | None
) -> None:
    """Write a background section: a "Heading 2" title plus its prose body.

    Nothing is written when `body` is None, empty or whitespace only.
    Otherwise every non-blank line of `body` goes through
    `_emit_content_line`, so bullets, `**labels:**` and Hebrew-letter
    enumerations pick up the template styles.
    """
    has_content = bool(body and body.strip())
    if not has_content:
        return

    _add_paragraph(doc, title, "Heading 2")
    content_lines = (ln for ln in body.splitlines() if ln.strip())
    for content_line in content_lines:
        _emit_content_line(doc, content_line)
|
|
|
|
|
|
def _group_precedents(
    precedents: list[dict[str, Any]],
) -> tuple[list[dict], dict[str, list[dict]]]:
    """Partition a flat precedent list by its "section_id" value.

    Precedents whose "section_id" is missing or None are case-level; all
    others are bucketed under their section id. Input order is preserved
    within each bucket.

    Returns `(case_level_precedents, {section_id: [precedents]})`.
    """
    general: list[dict] = []
    per_section: dict[str, list[dict]] = {}
    for precedent in precedents:
        section_id = precedent.get("section_id")
        bucket = (
            general
            if section_id is None
            else per_section.setdefault(section_id, [])
        )
        bucket.append(precedent)
    return general, per_section
|
|
|
|
|
|
def _next_version(export_dir: Path) -> int:
    """Return the version number to use for the next ניתוח-משפטי-v{N}.docx.

    Looks at the existing exports in `export_dir` and returns one more than
    the highest version found, or 1 when there is none. Files whose version
    part is not an integer are ignored.
    """
    highest = 0
    for export_file in export_dir.glob("ניתוח-משפטי-v*.docx"):
        pieces = export_file.stem.split("-v")
        try:
            found = int(pieces[1])
        except (IndexError, ValueError):
            # Not a parsable version suffix: skip this file.
            continue
        if found > highest:
            highest = found
    return highest + 1
|
|
|
|
|
|
async def build_analysis_docx(case_number: str) -> Path:
    """Export the legal analysis of a case to a versioned DOCX file.

    The analysis markdown is parsed, the case's precedents are fetched, and
    the content is written into a copy of the decision template using the
    template's style names. The file is saved in the case's `exports`
    folder under the next free version number.

    Document order: title (and date if present), background sections,
    case-level precedents, threshold claims, issues, conclusions.

    Returns the path of the saved file.

    Raises FileNotFoundError if the template or the analysis file is missing.
    """
    if not TEMPLATE_PATH.exists():
        raise FileNotFoundError(
            f"Template not found at {TEMPLATE_PATH}. "
            "Run: python scripts/convert_decision_template.py"
        )

    case_dir = config.find_case_dir(case_number)
    analysis_path = case_dir / "documents" / "research" / "analysis-and-research.md"
    if not analysis_path.exists():
        raise FileNotFoundError(
            f"Analysis file not found for case {case_number}"
        )

    parsed = research_md.parse(analysis_path)

    # Precedents need the case id. A case that is missing from the database
    # simply exports without precedents instead of failing.
    case_row = await db.get_case_by_number(case_number)
    if case_row:
        all_precedents = await db.list_case_precedents(UUID(case_row["id"]))
        general_precedents, section_precedents = _group_precedents(all_precedents)
    else:
        general_precedents, section_precedents = [], {}

    doc = Document(str(TEMPLATE_PATH))
    _clear_body(doc)

    # Document title, followed by the date line when the header has one.
    header = parsed.get("header", {})
    date = header.get("date", "").strip()
    _add_paragraph(doc, f"ניתוח משפטי וכתיבת עמדה בערר {case_number}", "Heading 1")
    if date:
        date_par = doc.add_paragraph(style="Normal")
        _mark_paragraph_rtl(date_par)
        _mark_run_rtl(date_par.add_run(f"תאריך: {date}"))

    # Background sections come first so the reader has context before any
    # claims or precedents. They appear only in the exported DOCX; the web
    # UI renders them elsewhere.
    background_sections = (
        ("רקע לניתוח", "represented_party"),
        ("רקע דיוני", "procedural_background"),
        ("עובדות מוסכמות", "agreed_facts"),
        ("עובדות שנויות במחלוקת", "disputed_facts"),
    )
    for section_title, parsed_key in background_sections:
        _add_background_section(doc, section_title, parsed.get(parsed_key))

    # Case-level precedents go near the top: they cut across claims/issues.
    if general_precedents:
        _add_paragraph(doc, "פסיקה כללית", "Heading 2")
        for precedent in general_precedents:
            quote_text = (precedent.get("quote") or "").strip()
            citation_text = (precedent.get("citation") or "").strip()
            if quote_text:
                _add_paragraph(doc, quote_text, "Quote")
            if not citation_text:
                continue
            citation_par = doc.add_paragraph(style="Normal")
            _mark_paragraph_rtl(citation_par)
            citation_run = citation_par.add_run(_no_dash(citation_text))
            citation_run.italic = True
            _mark_run_rtl(citation_run)

    # Threshold claims, then issues: same layout, different headings.
    subsection_groups = (
        ("threshold_claims", "טענות סף", "טענת סף"),
        ("issues", "סוגיות להכרעה", "סוגיה"),
    )
    for parsed_key, group_heading, kind_label in subsection_groups:
        entries = parsed.get(parsed_key, [])
        if not entries:
            continue
        _add_paragraph(doc, group_heading, "Heading 2")
        for entry in entries:
            _write_subsection(
                doc, entry, section_precedents.get(entry["id"], []), kind_label
            )

    # Conclusions
    conclusions = (parsed.get("conclusions") or "").strip()
    if conclusions:
        _add_paragraph(doc, "מסקנות", "Heading 2")
        for conclusion_line in conclusions.splitlines():
            if conclusion_line.strip():
                _emit_content_line(doc, conclusion_line)

    # Save under the next free version number.
    export_dir = case_dir / "exports"
    export_dir.mkdir(parents=True, exist_ok=True)
    version = _next_version(export_dir)
    out_path = export_dir / f"ניתוח-משפטי-v{version}.docx"
    doc.save(str(out_path))
    return out_path
|