diff --git a/mcp-server/src/legal_mcp/services/docx_exporter.py b/mcp-server/src/legal_mcp/services/docx_exporter.py index be50a8f..6dc746c 100644 --- a/mcp-server/src/legal_mcp/services/docx_exporter.py +++ b/mcp-server/src/legal_mcp/services/docx_exporter.py @@ -15,47 +15,112 @@ from docx import Document from docx.enum.text import WD_ALIGN_PARAGRAPH from docx.oxml import OxmlElement from docx.oxml.ns import qn -from docx.shared import Cm, Pt, RGBColor from legal_mcp import config from legal_mcp.services import db logger = logging.getLogger(__name__) -# ── Constants ───────────────────────────────────────────────────── - -FONT_NAME = "David" -FONT_SIZE_BODY = Pt(12) -FONT_SIZE_TITLE = Pt(16) -FONT_SIZE_HEADING = Pt(14) -LINE_SPACING = 1.5 -PAGE_MARGIN = Cm(2.5) +# Path to the converted decision template. Carries David font, RTL, margins, +# and styles (Title / Heading 1-2 / Normal / Quote / List Paragraph). +# Populated once by `scripts/convert_decision_template.py` from `.dotx`. +TEMPLATE_PATH = ( + Path(__file__).resolve().parents[4] + / "skills" / "docx" / "decision_template.docx" +) # ── RTL helpers ─────────────────────────────────────────────────── +# Three layers of RTL are required (per skills/docx/SKILL.md): +# 1. Section: <w:bidi/> in sectPr (inherited from template) +# 2. Paragraph: <w:bidi/> directly in pPr — paragraph direction +# 3. Run: <w:rtl/> in rPr — tells Word to use cs (complex-script) font +# Without explicit font on run, Hebrew can render in the ascii slot +# (Times New Roman) — so we also force David on all four font slots. 
-def _set_rtl_paragraph(paragraph) -> None: - """Set paragraph-level RTL properties.""" - pPr = paragraph._element.get_or_add_pPr() - bidi = OxmlElement("w:bidi") - bidi.set(qn("w:val"), "1") - pPr.append(bidi) +HEBREW_FONT = "David" -def _set_rtl_run(run) -> None: - """Set run-level RTL properties.""" - rPr = run._element.get_or_add_rPr() - rtl = OxmlElement("w:rtl") - rtl.set(qn("w:val"), "1") - rPr.append(rtl) +def _mark_run_rtl(run) -> None: + """Force David font on all four slots, then add <w:rtl/>.""" + rPr = run._r.get_or_add_rPr() + if rPr.find(qn("w:rFonts")) is None: + fonts = OxmlElement("w:rFonts") + fonts.set(qn("w:ascii"), HEBREW_FONT) + fonts.set(qn("w:hAnsi"), HEBREW_FONT) + fonts.set(qn("w:cs"), HEBREW_FONT) + fonts.set(qn("w:eastAsia"), HEBREW_FONT) + rPr.insert(0, fonts) + if rPr.find(qn("w:rtl")) is None: + rPr.append(OxmlElement("w:rtl")) -def _set_rtl_section(section) -> None: - """Set section-level RTL (bidi).""" - sectPr = section._sectPr - bidi = OxmlElement("w:bidi") - bidi.set(qn("w:val"), "1") - sectPr.append(bidi) +def _mark_paragraph_rtl(paragraph) -> None: + """Add <w:bidi/> directly to pPr (paragraph direction) and + <w:rtl/> to the paragraph-mark rPr (affects trailing ¶ glyph).""" + pPr = paragraph._p.get_or_add_pPr() + # (2) <w:bidi/> directly in pPr — paragraph direction + if pPr.find(qn("w:bidi")) is None: + bidi = OxmlElement("w:bidi") + pstyle = pPr.find(qn("w:pStyle")) + if pstyle is not None: + pstyle.addnext(bidi) + else: + pPr.insert(0, bidi) + # paragraph-mark rPr gets <w:rtl/> so ¶ inherits RTL too + rPr = pPr.find(qn("w:rPr")) + if rPr is None: + rPr = OxmlElement("w:rPr") + pPr.append(rPr) + if rPr.find(qn("w:rtl")) is None: + rPr.append(OxmlElement("w:rtl")) + + +def _set_paragraph_jc(paragraph, value: str) -> None: + """Force <w:jc w:val="..."/> on a paragraph, overriding style-inherited jc. + + Needed because Heading 3 in the template ships with jc=center — we want + body headings justified right (jc=both) like Normal. 
+ """ + pPr = paragraph._p.get_or_add_pPr() + existing = pPr.find(qn("w:jc")) + if existing is not None: + pPr.remove(existing) + jc = OxmlElement("w:jc") + jc.set(qn("w:val"), value) + pPr.append(jc) + + +def _suppress_paragraph_numbering(paragraph) -> None: + """Kill any style-inherited auto-numbering on this paragraph. + + Heading styles linked to outline lists can auto-inject א./ב./ג. markers + in some Word versions even when the style we read doesn't show numPr. + Setting numId=0 explicitly removes the paragraph from any list. + """ + pPr = paragraph._p.get_or_add_pPr() + existing = pPr.find(qn("w:numPr")) + if existing is not None: + pPr.remove(existing) + numPr = OxmlElement("w:numPr") + ilvl = OxmlElement("w:ilvl") + ilvl.set(qn("w:val"), "0") + numId = OxmlElement("w:numId") + numId.set(qn("w:val"), "0") + numPr.append(ilvl) + numPr.append(numId) + pPr.append(numPr) + + +def _clear_body(doc) -> None: + """Remove all paragraphs in the document body while keeping sectPr. + + The template ships with sample paragraphs we don't want. Section + properties (page size, margins, bidi) stay intact. + """ + body = doc.element.body + for p in list(body.findall(qn("w:p"))): + body.remove(p) # ── Bookmark helpers ────────────────────────────────────────────── @@ -109,61 +174,109 @@ def _wrap_block_with_bookmarks(doc, block_name: str, _insert_bookmark_end(last_new, bm_id) -def _add_paragraph(doc, text: str, style: str = "Normal", - bold: bool = False, font_size=None, - alignment=None, space_after: Pt | None = None) -> None: - """Add an RTL paragraph with David font.""" - para = doc.add_paragraph() - _set_rtl_paragraph(para) +# ── Content cleanup ────────────────────────────────────────────── - if alignment: +# Em-dash (—, U+2014) and en-dash (–, U+2013) — per chair's no-dash policy, +# strip from body text. Surrounding spaces collapse. 
+_DASH_RE = re.compile(r"\s*[—–]\s*") +_MULTI_SPACE_RE = re.compile(r" {2,}") + + +def _strip_dashes(text: str) -> str: + """Remove em/en-dashes and collapse surrounding whitespace.""" + text = _DASH_RE.sub(" ", text) + return _MULTI_SPACE_RE.sub(" ", text).strip() + + +# Numbered paragraph: "1. content", "23. content" — auto-numbered via +# List Paragraph style so order reflects emission, not literal prefix. +_NUM_PREFIX_RE = re.compile(r"^(\d+)\.\s+(.*)$", re.DOTALL) + + +# Markdown inline bold — `**...**` +_INLINE_BOLD_RE = re.compile(r"\*\*([^\n*]+?)\*\*") + + +def _add_runs_with_inline_bold(paragraph, text: str, *, bold_all: bool = False) -> None: + """Split text on `**...**` markers, alternating plain and bold runs. + + Keeps `**טענה חשובה**` rendering as bold instead of leaving literal + asterisks. When bold_all is True, every run is bold (used for headings + that still carry inline-bold markup). + """ + pos = 0 + for m in _INLINE_BOLD_RE.finditer(text): + if m.start() > pos: + plain = paragraph.add_run(text[pos:m.start()]) + if bold_all: + plain.bold = True + _mark_run_rtl(plain) + run_bold = paragraph.add_run(m.group(1)) + run_bold.bold = True + _mark_run_rtl(run_bold) + pos = m.end() + if pos < len(text): + tail = paragraph.add_run(text[pos:]) + if bold_all: + tail.bold = True + _mark_run_rtl(tail) + + +def _add_styled_paragraph(doc, text: str, style: str = "Normal", + bold: bool = False, + alignment=None): + """Add a paragraph using a template style. + + Font, size, RTL direction and spacing all come from the style + definition in the template — we only pick the style by name. + Renders `**...**` markdown as inline bold runs. + + Returns the paragraph so callers can apply further overrides. 
+ """ + para = doc.add_paragraph(style=style) + _mark_paragraph_rtl(para) + + if alignment is not None: para.alignment = alignment - else: - para.alignment = WD_ALIGN_PARAGRAPH.RIGHT - run = para.add_run(text) - run.font.name = FONT_NAME - run.font.size = font_size or FONT_SIZE_BODY - run.bold = bold - _set_rtl_run(run) + if text: + _add_runs_with_inline_bold(para, text, bold_all=bold) - # Line spacing - pf = para.paragraph_format - pf.line_spacing = LINE_SPACING - if space_after is not None: - pf.space_after = space_after + return para -def _add_centered_paragraph(doc, text: str, bold: bool = True, - font_size=None) -> None: - """Add centered RTL paragraph.""" - _add_paragraph(doc, text, bold=bold, font_size=font_size, - alignment=WD_ALIGN_PARAGRAPH.CENTER) +def _add_centered_paragraph(doc, text: str, *, bold: bool = True, + style: str = "Normal") -> None: + _add_styled_paragraph(doc, text, style=style, bold=bold, + alignment=WD_ALIGN_PARAGRAPH.CENTER) + + +def _add_heading(doc, text: str, *, style: str) -> None: + """Heading with overrides: jc=both (overrides style-center / style-left) + and suppressed auto-numbering (so style-linked outline lists don't inject + א./ב./ג. 
— chair manages markers manually in content).""" + para = doc.add_paragraph(style=style) + _mark_paragraph_rtl(para) + _set_paragraph_jc(para, "both") + _suppress_paragraph_numbering(para) + if text: + _add_runs_with_inline_bold(para, text) def _add_blockquote(doc, text: str) -> None: - """Add indented blockquote paragraph.""" - para = doc.add_paragraph() - _set_rtl_paragraph(para) - para.alignment = WD_ALIGN_PARAGRAPH.RIGHT - - run = para.add_run(text) - run.font.name = FONT_NAME - run.font.size = Pt(11) - run.italic = True - _set_rtl_run(run) - - pf = para.paragraph_format - pf.left_indent = Cm(1.5) - pf.right_indent = Cm(1.5) - pf.line_spacing = LINE_SPACING + """Indented quote using the template's Quote style.""" + _add_styled_paragraph(doc, text, style="Quote") def _add_image_placeholder(doc, description: str) -> None: - """Add image placeholder box.""" - _add_paragraph(doc, f"[{description}]", - alignment=WD_ALIGN_PARAGRAPH.CENTER, - font_size=Pt(10)) + _add_styled_paragraph(doc, f"[{description}]", style="Normal", + alignment=WD_ALIGN_PARAGRAPH.CENTER) + + +def _add_spacer(doc) -> None: + """Add an empty paragraph as a visual spacer.""" + para = doc.add_paragraph(style="Normal") + _mark_paragraph_rtl(para) # ── Main export ─────────────────────────────────────────────────── @@ -241,16 +354,14 @@ async def export_decision( else: ordered_blocks = list(rows) - # Create document - doc = Document() + if not TEMPLATE_PATH.exists(): + raise FileNotFoundError( + f"Template not found at {TEMPLATE_PATH}. " + "Run scripts/convert_decision_template.py first." 
+ ) - # Set page margins - for section in doc.sections: - section.top_margin = PAGE_MARGIN - section.bottom_margin = PAGE_MARGIN - section.left_margin = PAGE_MARGIN - section.right_margin = PAGE_MARGIN - _set_rtl_section(section) + doc = Document(str(TEMPLATE_PATH)) + _clear_body(doc) # Write blocks with bookmarks wrapping each block (anchors for revisions) bm_counter = [_BOOKMARK_ID_START] @@ -291,93 +402,132 @@ async def export_decision( def _write_block_to_docx(doc, block_id: str, title: str, content: str) -> None: - """Write a single block to the DOCX document.""" + """Write a single block to the DOCX document using template styles.""" # Header blocks (א-ד) if block_id == "block-alef": for line in content.split("\n"): if line.strip(): - _add_centered_paragraph(doc, line.strip(), bold=True, font_size=FONT_SIZE_HEADING) + _add_styled_paragraph(doc, line.strip(), style="Heading 1", + alignment=WD_ALIGN_PARAGRAPH.CENTER) return if block_id == "block-bet": - _add_paragraph(doc, "", space_after=Pt(6)) # spacer + _add_spacer(doc) for line in content.split("\n"): if line.strip(): - _add_centered_paragraph(doc, line.strip(), bold=False, font_size=FONT_SIZE_BODY) + _add_centered_paragraph(doc, line.strip(), bold=False) return if block_id == "block-gimel": - _add_paragraph(doc, "", space_after=Pt(6)) - lines = content.split("\n") - for line in lines: + _add_spacer(doc) + for line in content.split("\n"): stripped = line.strip() if not stripped: continue if stripped == "נגד": - _add_centered_paragraph(doc, "— נגד —", bold=True, font_size=FONT_SIZE_BODY) + _add_centered_paragraph(doc, "— נגד —", bold=True) else: - _add_centered_paragraph(doc, stripped, bold=False, font_size=FONT_SIZE_BODY) + _add_centered_paragraph(doc, stripped, bold=False) return if block_id == "block-dalet": - _add_paragraph(doc, "", space_after=Pt(12)) # spacer - _add_centered_paragraph(doc, "החלטה", bold=True, font_size=FONT_SIZE_TITLE) - _add_paragraph(doc, "", space_after=Pt(12)) + _add_spacer(doc) + 
# Avoid style=Title: its rFonts use theme fonts (majorHAnsi / majorBidi) + # and 28pt size — renders Hebrew oversized and in the wrong face. + # Heading 1 carries David and proper RTL, bold + center gives the + # same visual weight. + para = _add_styled_paragraph(doc, "החלטה", style="Heading 1", + alignment=WD_ALIGN_PARAGRAPH.CENTER, + bold=True) + _suppress_paragraph_numbering(para) + _add_spacer(doc) return if block_id == "block-yod-bet": - _add_paragraph(doc, "", space_after=Pt(24)) # spacer + _add_spacer(doc) for line in content.split("\n"): if line.strip(): - _add_centered_paragraph(doc, line.strip(), bold=False, font_size=FONT_SIZE_BODY) + _add_centered_paragraph(doc, line.strip(), bold=False) return # Content blocks (ה-יא) — parse paragraphs - paragraphs = content.split("\n") - for para_text in paragraphs: - stripped = para_text.strip() + for para_text in content.split("\n"): + stripped = _strip_dashes(para_text.strip()) if not stripped: continue - # Section headings (e.g., "תמצית טענות הצדדים", "טענות העוררים") - if _is_section_heading(stripped): - _add_paragraph(doc, stripped, bold=True, font_size=FONT_SIZE_HEADING, - space_after=Pt(6)) + # Markdown H1/H2/H3 → template heading styles + md_heading = re.match(r"^(#{1,6})\s+(.*)$", stripped) + if md_heading: + level = len(md_heading.group(1)) + heading_text = md_heading.group(2).strip() + style = "Heading 1" if level == 1 else f"Heading {min(level, 3)}" + _add_heading(doc, heading_text, style=style) + continue + + # Standalone `**...**` line — treat as a sub-heading (Heading 3) + stand_bold = re.match(r"^\*\*([^\n*]+?)\*\*$", stripped) + if stand_bold: + _add_heading(doc, stand_bold.group(1).strip(), style="Heading 3") + continue + + if _is_section_heading(stripped): + _add_heading(doc, stripped, style="Heading 2") continue - # Blockquotes (indented quotes from protocols/rulings) if stripped.startswith('"') or stripped.startswith("״") or stripped.startswith(">"): clean = 
stripped.lstrip(">").strip().strip('"').strip("״").strip('"') _add_blockquote(doc, clean) continue - # Image placeholders - if "📷" in stripped or stripped.startswith("[") and "תמונה" in stripped: + if "📷" in stripped or (stripped.startswith("[") and "תמונה" in stripped): _add_image_placeholder(doc, stripped.strip("[]📷 ")) continue - # Regular numbered paragraph or plain text - _add_paragraph(doc, stripped) + # Numbered body paragraph ("1. text") → List Paragraph with auto-num. + # The literal prefix is dropped; Word renders "1. 2. 3. ..." via numId. + num_match = _NUM_PREFIX_RE.match(stripped) + if num_match: + body_text = num_match.group(2).strip() + _add_styled_paragraph(doc, body_text, style="List Paragraph") + continue + + _add_styled_paragraph(doc, stripped, style="Normal") -def _is_section_heading(text: str) -> bool: - """Detect section headings in decision text.""" - heading_patterns = [ +_SECTION_HEADING_PATTERNS = [ + re.compile(p) for p in ( + # Block-level titles + r"^פתח\s+דבר", + r"^רקע\s+עובדתי", r"^תמצית\s+טענות", + r"^טענות\s+הצדדים", r"^טענות\s+העוררי", + r"^טענות\s+המשיב", r"^עמדת\s+הוועדה", r"^עמדת\s+מבקשי", r"^ההליכים\s+בפני", + r"^הליכים\s+בפני", r"^דיון\s+והכרעה", r"^סוף\s+דבר", r"^סיכום", - r"^פתח\s+דבר", + # Subsection titles produced by legal-writer inside block-vav/block-tet + r"^המצב\s+התכנוני", + r"^הליכי\s+הרישוי", + r"^שומת\s+ההשבחה", + r"^הליך\s+השומה", + r"^הגשת\s+הערר", + r"^תכניות\s+מתאר", + r"^תכניות\s+מפורטות", r"^תכניות\s+חלות", - ] - for pattern in heading_patterns: - if re.search(pattern, text): - return True - # Short bold-like lines (under 60 chars, not numbered) - if len(text) < 60 and not re.match(r"^\d+\.", text): - return False - return False + r"^תכניות\s+החלות", + r"^מדיניות\s+מהנדס", + r"^היתרי\s+בני", + r"^היתר\s+בני", + ) +] + + +def _is_section_heading(text: str) -> bool: + """Detect legal-decision section headings — mapped to Heading 2 style.""" + return any(p.search(text) for p in _SECTION_HEADING_PATTERNS) diff 
--git a/mcp-server/tests/test_docx_exporter_bookmarks.py b/mcp-server/tests/test_docx_exporter_bookmarks.py index cd1ed7e..7340aa6 100644 --- a/mcp-server/tests/test_docx_exporter_bookmarks.py +++ b/mcp-server/tests/test_docx_exporter_bookmarks.py @@ -13,12 +13,20 @@ from lxml import etree from legal_mcp.services.docx_exporter import ( _BOOKMARK_ID_START, + HEBREW_FONT, + _add_styled_paragraph, _insert_bookmark_end, _insert_bookmark_start, + _mark_paragraph_rtl, + _mark_run_rtl, + _strip_dashes, _wrap_block_with_bookmarks, + _write_block_to_docx, ) from legal_mcp.services.docx_reviser import NSMAP, _w, list_bookmarks + +from docx.oxml.ns import qn + def test_insert_bookmark_helpers_create_valid_xml(tmp_path: Path) -> None: doc = Document() @@ -101,3 +109,119 @@ def test_multiple_blocks_get_unique_bookmark_ids(tmp_path: Path) -> None: names = list_bookmarks(out) assert set(names) == {"block-alef", "block-bet", "block-gimel"} + + +# ── RTL / David-font invariants ─────────────────────────────────── +# These guard against regressions where Hebrew renders LTR or in the wrong +# font slot (Times New Roman instead of David). See plan file for context. + + +def test_mark_paragraph_rtl_adds_bidi_directly_in_pPr() -> None: + doc = Document() + p = doc.add_paragraph("טקסט בעברית") + _mark_paragraph_rtl(p) + pPr = p._p.find(qn("w:pPr")) + assert pPr is not None + # <w:bidi/> must be a direct child of pPr (paragraph direction), + # NOT nested inside <w:rPr>. 
+ assert pPr.find(qn("w:bidi")) is not None + # paragraph-mark rPr still gets <w:rtl/> + rPr = pPr.find(qn("w:rPr")) + assert rPr is not None and rPr.find(qn("w:rtl")) is not None + + +def test_mark_run_rtl_forces_david_on_all_font_slots() -> None: + doc = Document() + p = doc.add_paragraph() + run = p.add_run("טקסט") + _mark_run_rtl(run) + rPr = run._r.find(qn("w:rPr")) + assert rPr is not None + fonts = rPr.find(qn("w:rFonts")) + assert fonts is not None + for slot in ("w:ascii", "w:hAnsi", "w:cs", "w:eastAsia"): + assert fonts.get(qn(slot)) == HEBREW_FONT, f"{slot} not {HEBREW_FONT}" + assert rPr.find(qn("w:rtl")) is not None + + +def test_styled_paragraph_applies_bidi_and_david() -> None: + """End-to-end: _add_styled_paragraph produces pPr/bidi + rFonts/cs=David.""" + doc = Document() + _add_styled_paragraph(doc, "פסקה עברית", style="Normal") + p = doc.paragraphs[-1] + assert p._p.find(qn("w:pPr")).find(qn("w:bidi")) is not None + run = p.runs[0] + fonts = run._r.find(qn("w:rPr")).find(qn("w:rFonts")) + assert fonts.get(qn("w:cs")) == HEBREW_FONT + + +def test_block_dalet_does_not_use_title_style() -> None: + """Title style uses theme fonts and 28pt — avoid for Hebrew.""" + doc = Document() + _write_block_to_docx(doc, "block-dalet", title="", content="") + styles_used = {p.style.name for p in doc.paragraphs} + assert "Title" not in styles_used, ( + f"block-dalet should not produce a Title-styled paragraph, got {styles_used}" + ) + # The 'החלטה' text must still appear somewhere + texts = [p.text for p in doc.paragraphs] + assert any("החלטה" in t for t in texts) + + +# ── Heading overrides, numbered-list, dash strip ────────────────── + + +def test_strip_dashes_removes_em_and_en_dashes() -> None: + assert _strip_dashes("תכנית 1454198 — אושרה ביום") == "תכנית 1454198 אושרה ביום" + assert _strip_dashes("א – ב") == "א ב" + assert _strip_dashes("no dash") == "no dash" + # Collapsed whitespace + assert _strip_dashes("רקע — עובדתי") == "רקע עובדתי" + + +def 
test_heading2_gets_justified_and_no_numbering() -> None: + """Section heading → Heading 2 with jc=both and numId=0.""" + doc = Document() + _write_block_to_docx(doc, "block-vav", title="", content="דיון והכרעה") + heading = next(p for p in doc.paragraphs if p.style.name == "Heading 2") + pPr = heading._p.find(qn("w:pPr")) + jc = pPr.find(qn("w:jc")) + assert jc is not None and jc.get(qn("w:val")) == "both" + numPr = pPr.find(qn("w:numPr")) + assert numPr is not None + numId = numPr.find(qn("w:numId")) + assert numId is not None and numId.get(qn("w:val")) == "0" + + +def test_heading3_gets_justified_not_centered() -> None: + """Heading 3 in template has jc=center — override to jc=both.""" + doc = Document() + _write_block_to_docx(doc, "block-vav", title="", content="**המצב התכנוני**") + heading = next(p for p in doc.paragraphs if p.style.name == "Heading 3") + jc = heading._p.find(qn("w:pPr")).find(qn("w:jc")) + assert jc is not None and jc.get(qn("w:val")) == "both" + + +def test_numbered_paragraph_uses_list_paragraph_and_strips_prefix() -> None: + """'1. text' → List Paragraph style, literal '1. ' removed.""" + doc = Document() + _write_block_to_docx( + doc, "block-vav", title="", + content="1. עניינו של ערר זה.\n2. שכונת נווה יעקב.", + ) + lp = [p for p in doc.paragraphs if p.style.name == "List Paragraph"] + assert len(lp) == 2 + assert lp[0].text.startswith("עניינו") + assert not lp[0].text.startswith("1.") + assert lp[1].text.startswith("שכונת") + + +def test_body_content_has_no_em_dashes() -> None: + """Content with em-dashes is rendered without them.""" + doc = Document() + _write_block_to_docx( + doc, "block-vav", title="", + content="3. תכנית 5924 — קובעת את שטחי הבנייה.", + ) + texts = "\n".join(p.text for p in doc.paragraphs) + assert "—" not in texts