# legal-ai/mcp-server/tests/test_docx_reviser.py

"""בדיקות docx_reviser — Track Changes XML surgery.

הבדיקות יוצרות DOCX בסיסי עם bookmarks, מפעילות revisions, ובודקות:
1. שה-XML שנוצר תקף ונטען חזרה כ-Document
2. שה-<w:ins> / <w:del> קיימים בפורמט הנכון
3. שה-bookmarks נשמרים אחרי עריכה
4. שגופן David ו-RTL נשמרים
5. שכשלונות מטופלים אלגנטית (bookmark חסר → failed, לא crash)
"""

from __future__ import annotations

import zipfile
from datetime import datetime, timezone
from io import BytesIO
from pathlib import Path

import pytest
from docx import Document
from docx.oxml import OxmlElement
from docx.oxml.ns import qn
from lxml import etree

from legal_mcp.services import docx_reviser
from legal_mcp.services.docx_reviser import (
    NSMAP,
    Revision,
    _w,
    apply_tracked_revisions,
    list_bookmarks,
)


# ── Test fixtures ──────────────────────────────────────────────────


def _insert_bookmark(paragraph, name: str, bm_id: int) -> None:
    """Surround a paragraph's current children with a named bookmark.

    A ``<w:bookmarkStart>`` carrying ``w:id`` and ``w:name`` becomes the first
    child of the paragraph element, and a ``<w:bookmarkEnd>`` with the same
    ``w:id`` becomes its last child.
    """
    id_text = str(bm_id)
    para_xml = paragraph._p

    # Build both markers up front, then attach them at the two ends.
    opening = OxmlElement("w:bookmarkStart")
    opening.set(qn("w:id"), id_text)
    opening.set(qn("w:name"), name)

    closing = OxmlElement("w:bookmarkEnd")
    closing.set(qn("w:id"), id_text)

    para_xml.insert(0, opening)
    para_xml.append(closing)


def _make_sample_docx(path: Path) -> None:
    """Write a three-paragraph DOCX to ``path``.

    Each paragraph holds a single run set to the "David" font and is wrapped
    in its own bookmark; bookmark ids are assigned 1, 2, 3 in order.
    """
    bookmark_names = ("block-alef", "block-yod", "block-yod-bet")
    document = Document()
    for bm_id, name in enumerate(bookmark_names, start=1):
        para = document.add_paragraph()
        para.add_run(f"תוכן פסקה של {name}").font.name = "David"
        _insert_bookmark(para, name, bm_id)
    document.save(str(path))


@pytest.fixture
def sample_docx(tmp_path: Path) -> Path:
    """Create the bookmarked sample document under ``tmp_path`` and return its path."""
    docx_path = tmp_path.joinpath("source.docx")
    _make_sample_docx(docx_path)
    return docx_path


# ── list_bookmarks ────────────────────────────────────────────────


def test_list_bookmarks_returns_all_named(sample_docx: Path) -> None:
    """All three bookmarks of the sample document are reported (order ignored)."""
    expected = {"block-alef", "block-yod", "block-yod-bet"}
    found = set(list_bookmarks(sample_docx))
    assert found == expected


def test_list_bookmarks_excludes_internal(tmp_path: Path) -> None:
    """A bookmark whose name starts with '_' (e.g. _GoBack) must not be listed."""
    # (paragraph text, bookmark name) pairs; ids are assigned 1, 2 in order.
    entries = (("visible", "block-real"), ("hidden", "_GoBack"))

    document = Document()
    for bm_id, (text, bm_name) in enumerate(entries, start=1):
        _insert_bookmark(document.add_paragraph(text), bm_name, bm_id)

    target = tmp_path / "internal.docx"
    document.save(str(target))

    assert list_bookmarks(target) == ["block-real"]


# ── apply_tracked_revisions: insert_after ─────────────────────────


def test_insert_after_adds_tracked_paragraph(sample_docx: Path, tmp_path: Path) -> None:
    """An ``insert_after`` revision yields a ``<w:ins>`` and keeps the anchor text."""
    out = tmp_path / "out.docx"
    stamp = datetime(2026, 4, 16, 14, 0, tzinfo=timezone.utc)
    revision = Revision(
        id="r1",
        type="insert_after",
        anchor_bookmark="block-yod",
        content="פסקה חדשה שהמערכת מוסיפה.",
    )

    result = apply_tracked_revisions(
        sample_docx, out, [revision],
        author="מערכת AI",
        date=stamp,
    )

    assert (result.applied, result.failed) == (1, 0)
    assert out.exists()

    with zipfile.ZipFile(out, "r") as archive:
        tree = etree.fromstring(archive.read("word/document.xml"))

    # At least one tracked insertion must exist in document.xml.
    assert tree.findall(".//w:ins", NSMAP)

    body_text = "".join(tree.itertext())
    # The inserted content is present ...
    assert "פסקה חדשה שהמערכת מוסיפה." in body_text
    # ... and the anchor paragraph's original content is still there.
    assert "תוכן פסקה של block-yod" in body_text


def _find_ins_with_runs(tree: etree._Element) -> etree._Element | None:
    """Return the first ``<w:ins>`` in document order that contains a ``<w:r>``.

    Insertion elements without any run descendant (such as a paragraph-mark
    marker) are skipped. Returns ``None`` when no ``<w:ins>`` wraps a run.
    """
    candidates = (
        ins
        for ins in tree.iterfind(".//w:ins", NSMAP)
        if ins.find(".//w:r", NSMAP) is not None
    )
    return next(candidates, None)


def test_insert_after_ins_has_author_and_date(sample_docx: Path, tmp_path: Path) -> None:
    """The run-bearing ``<w:ins>`` carries the given author and a 'Z'-suffixed date."""
    out = tmp_path / "out.docx"
    revision = Revision(
        id="r1",
        type="insert_after",
        anchor_bookmark="block-alef",
        content="test",
    )
    apply_tracked_revisions(sample_docx, out, [revision], author="דפנה")

    with zipfile.ZipFile(out, "r") as archive:
        tree = etree.fromstring(archive.read("word/document.xml"))

    ins = _find_ins_with_runs(tree)
    assert ins is not None
    assert ins.get(_w("author")) == "דפנה"

    stamp = ins.get(_w("date"))
    assert stamp is not None
    # A trailing "Z" marks the timestamp as ISO 8601 UTC.
    assert stamp.endswith("Z")


def test_insert_after_uses_rtl_and_david(sample_docx: Path, tmp_path: Path) -> None:
    """The inserted run has ``<w:rtl>`` and an ``rFonts`` whose ascii font is David."""
    out = tmp_path / "out.docx"
    revision = Revision(
        id="r1",
        type="insert_after",
        anchor_bookmark="block-alef",
        content="מוסף",
    )
    apply_tracked_revisions(sample_docx, out, [revision])

    with zipfile.ZipFile(out, "r") as archive:
        tree = etree.fromstring(archive.read("word/document.xml"))

    ins = _find_ins_with_runs(tree)
    assert ins is not None

    inserted_run = ins.find(".//w:r", NSMAP)
    assert inserted_run is not None

    run_props = inserted_run.find(_w("rPr"))
    assert run_props is not None
    assert run_props.find(_w("rtl")) is not None

    fonts = run_props.find(_w("rFonts"))
    assert fonts is not None
    assert fonts.get(_w("ascii")) == "David"


# ── apply_tracked_revisions: insert_before ────────────────────────


def test_insert_before_places_above_anchor(sample_docx: Path, tmp_path: Path) -> None:
    """With ``insert_before``, the new paragraph precedes the anchor paragraph."""
    out = tmp_path / "out.docx"
    revision = Revision(
        id="r1",
        type="insert_before",
        anchor_bookmark="block-yod",
        content="לפני י.",
    )
    outcome = apply_tracked_revisions(sample_docx, out, [revision])
    assert outcome.applied == 1

    with zipfile.ZipFile(out, "r") as archive:
        tree = etree.fromstring(archive.read("word/document.xml"))

    paragraph_texts = [
        "".join(par.itertext()) for par in tree.findall(".//w:p", NSMAP)
    ]

    def first_index(needle: str) -> int:
        # Position of the first paragraph whose text contains ``needle``.
        return next(
            pos for pos, text in enumerate(paragraph_texts) if needle in text
        )

    # Order check: the new paragraph's text must come before "block-yod".
    idx_new = first_index("לפני י.")
    idx_yod = first_index("תוכן פסקה של block-yod")
    assert idx_new < idx_yod


# ── apply_tracked_revisions: delete ───────────────────────────────


def test_delete_wraps_runs_in_w_del(sample_docx: Path, tmp_path: Path) -> None:
    """A ``delete`` revision produces ``<w:del>`` with the text held in ``<w:delText>``."""
    out = tmp_path / "out.docx"
    revision = Revision(
        id="r1",
        type="delete",
        anchor_bookmark="block-yod",
        content="",
    )
    outcome = apply_tracked_revisions(sample_docx, out, [revision])
    assert outcome.applied == 1

    with zipfile.ZipFile(out, "r") as archive:
        tree = etree.fromstring(archive.read("word/document.xml"))

    deletions = tree.findall(".//w:del", NSMAP)
    assert len(deletions) >= 1

    # Inside the first w:del, the removed text must live in w:delText nodes.
    deleted_strings = [
        node.text or "" for node in deletions[0].findall(".//w:delText", NSMAP)
    ]
    assert any("block-yod" in text for text in deleted_strings)


# ── apply_tracked_revisions: replace ─────────────────────────────


def test_replace_creates_both_ins_and_del(sample_docx: Path, tmp_path: Path) -> None:
    """A ``replace`` revision leaves at least one ``<w:ins>`` and one ``<w:del>``."""
    out = tmp_path / "out.docx"
    revision = Revision(
        id="r1",
        type="replace",
        anchor_bookmark="block-yod",
        content="תוכן חדש לחלוטין",
    )
    outcome = apply_tracked_revisions(sample_docx, out, [revision])
    assert outcome.applied == 1

    with zipfile.ZipFile(out, "r") as archive:
        tree = etree.fromstring(archive.read("word/document.xml"))

    # Insertions are checked first, then deletions.
    for xpath in (".//w:ins", ".//w:del"):
        assert len(tree.findall(xpath, NSMAP)) >= 1


# ── Failure modes ─────────────────────────────────────────────────


def test_missing_bookmark_returns_failed_not_crash(
    sample_docx: Path, tmp_path: Path,
) -> None:
    """An unknown anchor bookmark is reported as a failed revision, not raised."""
    out = tmp_path / "out.docx"
    revision = Revision(
        id="r1",
        type="insert_after",
        anchor_bookmark="does-not-exist",
        content="x",
    )

    outcome = apply_tracked_revisions(sample_docx, out, [revision])

    assert (outcome.applied, outcome.failed) == (0, 1)

    first = outcome.results[0]
    assert first.status == "failed"
    assert "not found" in (first.error or "")

    # An output file is still written even though nothing was applied.
    assert out.exists()


def test_empty_revisions_list_produces_copy(sample_docx: Path, tmp_path: Path) -> None:
    """With no revisions, an output file is written and keeps all bookmarks."""
    out = tmp_path / "out.docx"

    outcome = apply_tracked_revisions(sample_docx, out, [])

    assert (outcome.applied, outcome.failed) == (0, 0)
    assert out.exists()

    # The bookmarks must survive the round trip.
    surviving = set(list_bookmarks(out))
    assert surviving == {"block-alef", "block-yod", "block-yod-bet"}


# ── Track revisions flag in settings ──────────────────────────────


def test_track_revisions_flag_is_enabled(sample_docx: Path, tmp_path: Path) -> None:
    """After a revision, ``word/settings.xml`` has a ``<w:trackRevisions>`` child."""
    out = tmp_path / "out.docx"
    revision = Revision(
        id="r1",
        type="insert_after",
        anchor_bookmark="block-alef",
        content="x",
    )
    apply_tracked_revisions(sample_docx, out, [revision])

    with zipfile.ZipFile(out, "r") as archive:
        settings_root = etree.fromstring(archive.read("word/settings.xml"))

    # Looked up as a direct child of the settings root element.
    flag = settings_root.find(_w("trackRevisions"))
    assert flag is not None


# ── Multiple revisions with unique IDs ────────────────────────────


def test_multiple_revisions_get_unique_ids(sample_docx: Path, tmp_path: Path) -> None:
    """No two ``<w:ins>``/``<w:del>`` elements share a ``w:id`` after three revisions."""
    out = tmp_path / "out.docx"
    first = Revision(id="r1", type="insert_after",
                     anchor_bookmark="block-alef", content="ראשון")
    second = Revision(id="r2", type="insert_after",
                      anchor_bookmark="block-yod", content="שני")
    third = Revision(id="r3", type="delete", anchor_bookmark="block-yod-bet")

    outcome = apply_tracked_revisions(sample_docx, out, [first, second, third])
    assert outcome.applied == 3

    with zipfile.ZipFile(out, "r") as archive:
        tree = etree.fromstring(archive.read("word/document.xml"))

    # Gather w:id from every insertion, then every deletion; elements with a
    # missing or empty id are ignored.
    raw_ids = (
        element.get(_w("id"))
        for xpath in (".//w:ins", ".//w:del")
        for element in tree.iterfind(xpath, NSMAP)
    )
    all_ids: list[str] = [wid for wid in raw_ids if wid]

    assert len(all_ids) == len(set(all_ids)), f"duplicate IDs: {all_ids}"


# ── DOCX remains openable as Document ─────────────────────────────


def test_output_docx_is_openable_by_python_docx(
    sample_docx: Path, tmp_path: Path,
) -> None:
    """The revised file loads via ``Document`` and holds both old and new text."""
    out = tmp_path / "out.docx"
    revision = Revision(
        id="r1",
        type="insert_after",
        anchor_bookmark="block-yod",
        content="תוכן חדש",
    )
    apply_tracked_revisions(sample_docx, out, [revision])

    # Loading must not raise; the original text is read through python-docx.
    reopened = Document(str(out))
    paragraph_text = "\n".join(par.text for par in reopened.paragraphs)
    assert "block-yod" in paragraph_text

    # The tracked (inserted) text is checked against the raw XML instead.
    with zipfile.ZipFile(out, "r") as archive:
        tree = etree.fromstring(archive.read("word/document.xml"))
    assert "תוכן חדש" in "".join(tree.itertext())


# ── Bookmarks preserved through revisions ─────────────────────────


def test_bookmarks_preserved_after_insert(sample_docx: Path, tmp_path: Path) -> None:
    """An ``insert_after`` revision leaves the three original bookmarks in place."""
    out = tmp_path / "out.docx"
    revision = Revision(
        id="r1",
        type="insert_after",
        anchor_bookmark="block-yod",
        content="x",
    )
    apply_tracked_revisions(sample_docx, out, [revision])

    expected = {"block-alef", "block-yod", "block-yod-bet"}
    assert set(list_bookmarks(out)) == expected


# ── Idempotency of loading/saving without changes ────────────────


def test_save_without_revisions_preserves_content(
    sample_docx: Path, tmp_path: Path,
) -> None:
    """With an empty revision list, paragraph texts match the source exactly."""
    out = tmp_path / "out.docx"
    apply_tracked_revisions(sample_docx, out, [])

    def paragraph_texts(docx_path: Path) -> list[str]:
        # Text of every paragraph, in document order, as python-docx sees it.
        return [par.text for par in Document(str(docx_path)).paragraphs]

    orig_text = paragraph_texts(sample_docx)
    new_text = paragraph_texts(out)
    assert orig_text == new_text