# legal-ai/mcp-server/tests/test_docx_exporter_bookmarks.py

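"""Tests for private helpers in docx_exporter.

Covers the bookmark helpers as well as the RTL/font, heading-override,
numbered-list and dash-stripping helpers. Only helper functions are exercised,
not the full export flow, which requires a DB.
"""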

from __future__ import annotations

import zipfile
from pathlib import Path

from docx import Document
from lxml import etree

from legal_mcp.services.docx_exporter import (
    _BOOKMARK_ID_START,
    HEBREW_FONT,
    _add_styled_paragraph,
    _insert_bookmark_end,
    _insert_bookmark_start,
    _mark_paragraph_rtl,
    _mark_run_rtl,
    _strip_dashes,
    _wrap_block_with_bookmarks,
    _write_block_to_docx,
)
from legal_mcp.services.docx_reviser import NSMAP, _w, list_bookmarks

from docx.oxml.ns import qn


def test_insert_bookmark_helpers_create_valid_xml(tmp_path: Path) -> None:
    """A start/end bookmark pair added to one paragraph is readable after saving.

    The document is saved under ``tmp_path`` and read back with
    ``list_bookmarks``, which must report exactly the inserted bookmark name.
    """
    bookmark_id = 10001
    document = Document()
    paragraph = document.add_paragraph("תוכן בלוק י")

    # Start and end markers go into the same paragraph and share one id.
    _insert_bookmark_start(paragraph, "block-yod", bookmark_id)
    _insert_bookmark_end(paragraph, bookmark_id)

    saved_path = tmp_path / "out.docx"
    document.save(str(saved_path))

    # Read the names back from the saved file rather than the in-memory tree.
    found_names = list_bookmarks(saved_path)
    assert found_names == ["block-yod"]


def test_wrap_block_with_bookmarks_wraps_multiple_paragraphs(tmp_path: Path) -> None:
    """A block written as three paragraphs is bracketed by one bookmark.

    One unrelated paragraph is added before the block and one after it. In
    the saved XML, the paragraph holding ``w:bookmarkStart`` must be the
    block's first paragraph and the one holding ``w:bookmarkEnd`` its last.
    """
    block_lines = (
        "בלוק — פסקה 1",
        "בלוק — פסקה 2",
        "בלוק — פסקה 3",
    )

    document = Document()
    document.add_paragraph("ראשון — לפני")  # unrelated paragraph ahead of the block

    def write_block() -> None:
        for line in block_lines:
            document.add_paragraph(line)

    id_counter = [_BOOKMARK_ID_START]
    _wrap_block_with_bookmarks(document, "block-yod", write_block, id_counter)
    document.add_paragraph("אחרי — אחרון")  # unrelated paragraph after the block

    saved_path = tmp_path / "out.docx"
    document.save(str(saved_path))

    # Inspect the raw document part instead of going through python-docx.
    with zipfile.ZipFile(saved_path, "r") as archive:
        document_xml = archive.read("word/document.xml")
    root = etree.fromstring(document_xml)
    all_paragraphs = root.findall(".//w:p", NSMAP)

    def indices_containing(xpath: str) -> list[int]:
        """Return the indices of the paragraphs that contain a match for *xpath*."""
        return [
            index
            for index, para in enumerate(all_paragraphs)
            if para.find(xpath, NSMAP) is not None
        ]

    start_hits = indices_containing(".//w:bookmarkStart")
    end_hits = indices_containing(".//w:bookmarkEnd")
    assert start_hits
    assert end_hits

    # When several paragraphs match, the last one is the one that is checked.
    start_text = "".join(all_paragraphs[start_hits[-1]].itertext())
    end_text = "".join(all_paragraphs[end_hits[-1]].itertext())
    assert "פסקה 1" in start_text
    assert "פסקה 3" in end_text


def test_wrap_block_skipped_when_writer_adds_nothing(tmp_path: Path) -> None:
    """A writer that adds no paragraphs leaves the saved document without bookmarks."""

    def write_nothing() -> None:
        return None

    document = Document()
    id_counter = [_BOOKMARK_ID_START]
    _wrap_block_with_bookmarks(document, "block-empty", write_nothing, id_counter)

    saved_path = tmp_path / "out.docx"
    document.save(str(saved_path))

    found_names = list_bookmarks(saved_path)
    assert found_names == []


def test_multiple_blocks_get_unique_bookmark_ids(tmp_path: Path) -> None:
    """Three blocks wrapped with one shared counter get three distinct ids.

    The ids are read from the ``w:bookmarkStart`` elements of the saved
    document; the bookmark names are read back with ``list_bookmarks``.
    """
    block_names = ("block-alef", "block-bet", "block-gimel")
    document = Document()
    id_counter = [_BOOKMARK_ID_START]

    for block_name in block_names:
        # Bind the current name as a default so each writer keeps its own value.
        def write_one(label=block_name):
            return document.add_paragraph(f"תוכן של {label}")

        _wrap_block_with_bookmarks(document, block_name, write_one, id_counter)

    saved_path = tmp_path / "out.docx"
    document.save(str(saved_path))

    with zipfile.ZipFile(saved_path, "r") as archive:
        document_xml = archive.read("word/document.xml")
    root = etree.fromstring(document_xml)

    id_attribute = _w("id")
    bookmark_ids = [
        start.get(id_attribute)
        for start in root.findall(".//w:bookmarkStart", NSMAP)
    ]
    assert len(bookmark_ids) == 3
    assert len(set(bookmark_ids)) == 3

    found_names = list_bookmarks(saved_path)
    assert set(found_names) == set(block_names)


# ── RTL / David-font invariants ───────────────────────────────────
# These guard against regressions where Hebrew renders LTR or in the wrong
# font slot (Times New Roman instead of David). See plan file for context.


def test_mark_paragraph_rtl_adds_bidi_directly_in_pPr() -> None:
    """``_mark_paragraph_rtl`` puts ``w:bidi`` on pPr and ``w:rtl`` on its rPr."""
    document = Document()
    paragraph = document.add_paragraph("טקסט בעברית")
    _mark_paragraph_rtl(paragraph)

    paragraph_props = paragraph._p.find(qn("w:pPr"))
    assert paragraph_props is not None

    # <w:bidi/> sets the paragraph direction, so it has to sit directly under
    # pPr and not inside the nested <w:rPr>.
    bidi = paragraph_props.find(qn("w:bidi"))
    assert bidi is not None

    # The paragraph-mark run properties must still carry <w:rtl/>.
    mark_props = paragraph_props.find(qn("w:rPr"))
    assert mark_props is not None
    assert mark_props.find(qn("w:rtl")) is not None


def test_mark_run_rtl_forces_david_on_all_font_slots() -> None:
    """``_mark_run_rtl`` sets every rFonts slot to ``HEBREW_FONT`` and adds ``w:rtl``."""
    font_slots = ("w:ascii", "w:hAnsi", "w:cs", "w:eastAsia")

    document = Document()
    run = document.add_paragraph().add_run("טקסט")
    _mark_run_rtl(run)

    run_props = run._r.find(qn("w:rPr"))
    assert run_props is not None

    font_element = run_props.find(qn("w:rFonts"))
    assert font_element is not None

    # Check each slot separately so a failure names the slot that is wrong.
    for slot in font_slots:
        actual_font = font_element.get(qn(slot))
        assert actual_font == HEBREW_FONT, f"{slot} not {HEBREW_FONT}"

    assert run_props.find(qn("w:rtl")) is not None


def test_styled_paragraph_applies_bidi_and_david() -> None:
    """End-to-end: _add_styled_paragraph produces pPr/bidi + rFonts/cs=HEBREW_FONT.

    Each intermediate element is asserted to exist before it is dereferenced,
    so a missing ``w:pPr``, run, ``w:rPr`` or ``w:rFonts`` fails with a
    readable assertion instead of an ``AttributeError``/``IndexError``.
    """
    doc = Document()
    _add_styled_paragraph(doc, "פסקה עברית", style="Normal")
    p = doc.paragraphs[-1]

    # Paragraph level: <w:bidi/> must be present under <w:pPr>.
    pPr = p._p.find(qn("w:pPr"))
    assert pPr is not None, "styled paragraph has no <w:pPr>"
    assert pPr.find(qn("w:bidi")) is not None

    # Run level: the complex-script font slot must be the Hebrew font.
    runs = p.runs
    assert runs, "styled paragraph has no runs"
    rPr = runs[0]._r.find(qn("w:rPr"))
    assert rPr is not None, "first run has no <w:rPr>"
    fonts = rPr.find(qn("w:rFonts"))
    assert fonts is not None, "first run has no <w:rFonts>"
    assert fonts.get(qn("w:cs")) == HEBREW_FONT


def test_block_dalet_does_not_use_title_style() -> None:
    """block-dalet must not emit a Title-styled paragraph.

    Title style uses theme fonts and 28pt — avoid for Hebrew. The 'החלטה'
    text is still expected to show up in some paragraph.
    """
    document = Document()
    _write_block_to_docx(document, "block-dalet", title="", content="")
    written = document.paragraphs

    styles_used = {para.style.name for para in written}
    assert "Title" not in styles_used, (
        f"block-dalet should not produce a Title-styled paragraph, got {styles_used}"
    )

    # Dropping the Title style must not drop the heading text itself.
    assert any("החלטה" in para.text for para in written)


# ── Heading overrides, numbered-list, dash strip ──────────────────


def test_strip_dashes_removes_em_and_en_dashes() -> None:
    """``_strip_dashes`` drops em/en dashes and leaves single spaces behind."""
    cases = (
        # em dash
        ("תכנית 1454198 — אושרה ביום", "תכנית 1454198 אושרה ביום"),
        # en dash
        ("א – ב", "א ב"),
        # input without any dash comes back unchanged
        ("no dash", "no dash"),
        # whitespace around a removed dash is collapsed
        ("רקע  —  עובדתי", "רקע עובדתי"),
    )
    for raw, expected in cases:
        assert _strip_dashes(raw) == expected


def test_heading2_gets_justified_and_no_numbering() -> None:
    """Section heading → Heading 2 with jc=both and numId=0.

    The heading lookup uses ``next(..., None)`` followed by an assertion, so a
    missing Heading 2 paragraph is reported as a normal test failure rather
    than a bare ``StopIteration``.
    """
    doc = Document()
    _write_block_to_docx(doc, "block-vav", title="", content="דיון והכרעה")

    heading = next(
        (p for p in doc.paragraphs if p.style.name == "Heading 2"), None
    )
    assert heading is not None, "no Heading 2 paragraph was produced"

    pPr = heading._p.find(qn("w:pPr"))
    assert pPr is not None, "Heading 2 paragraph has no <w:pPr>"

    # Justification override: jc must be "both".
    jc = pPr.find(qn("w:jc"))
    assert jc is not None and jc.get(qn("w:val")) == "both"

    # Numbering override: numPr must be present with numId="0".
    numPr = pPr.find(qn("w:numPr"))
    assert numPr is not None
    numId = numPr.find(qn("w:numId"))
    assert numId is not None and numId.get(qn("w:val")) == "0"


def test_heading3_gets_justified_not_centered() -> None:
    """Heading 3 in template has jc=center — override to jc=both.

    The heading and its ``w:pPr`` are asserted to exist before use, so a
    missing element fails with a readable assertion instead of
    ``StopIteration`` or ``AttributeError``.
    """
    doc = Document()
    _write_block_to_docx(doc, "block-vav", title="", content="**המצב התכנוני**")

    heading = next(
        (p for p in doc.paragraphs if p.style.name == "Heading 3"), None
    )
    assert heading is not None, "no Heading 3 paragraph was produced"

    pPr = heading._p.find(qn("w:pPr"))
    assert pPr is not None, "Heading 3 paragraph has no <w:pPr>"

    jc = pPr.find(qn("w:jc"))
    assert jc is not None and jc.get(qn("w:val")) == "both"


def test_numbered_paragraph_uses_list_paragraph_and_strips_prefix() -> None:
    """'1. text' lines become List Paragraph entries without the literal number."""
    document = Document()
    _write_block_to_docx(
        document,
        "block-vav",
        title="",
        content="1. עניינו של ערר זה.\n2. שכונת נווה יעקב.",
    )

    numbered = [
        para for para in document.paragraphs if para.style.name == "List Paragraph"
    ]
    assert len(numbered) == 2

    first, second = numbered
    # The text must start with the sentence itself, not with the "1." prefix.
    assert first.text.startswith("עניינו")
    assert not first.text.startswith("1.")
    assert second.text.startswith("שכונת")


def test_body_content_has_no_em_dashes() -> None:
    """Content written with an em-dash produces no paragraph containing one."""
    document = Document()
    _write_block_to_docx(
        document,
        "block-vav",
        title="",
        content="3. תכנית 5924 — קובעת את שטחי הבנייה.",
    )

    # Check every produced paragraph, not only the body paragraph.
    paragraph_texts = [para.text for para in document.paragraphs]
    assert all("—" not in text for text in paragraph_texts)