# legal-ai/mcp-server/tests/test_docx_retrofit.py

"""Tests for docx_retrofit — retroactive bookmark injection."""

from __future__ import annotations

from pathlib import Path

from docx import Document

from legal_mcp.services.docx_retrofit import (
    BLOCK_ORDER,
    retrofit_bookmarks,
)
from legal_mcp.services.docx_reviser import list_bookmarks


def _make_docx_with_hebrew_blocks(path: Path, markers: list[str]) -> None:
    """Write a fixture DOCX to ``path`` containing two paragraphs per marker.

    For each entry in ``markers`` the document receives an opening paragraph
    that begins with ``"<marker>. "`` and then a follow-up paragraph that
    mentions the marker only at its end.
    """
    document = Document()
    for marker in markers:
        opening_text = f"{marker}. תוכן הבלוק שמתחיל ב-{marker}"
        followup_text = f"עוד פסקה בבלוק {marker}"
        # Opening paragraph first, then the follow-up, for every marker.
        for paragraph_text in (opening_text, followup_text):
            document.add_paragraph(paragraph_text)
    document.save(str(path))


def test_retrofit_detects_all_standard_blocks(tmp_path: Path) -> None:
    """A document with all twelve markers gets twelve bookmarks, none missing."""
    docx_path = tmp_path / "src.docx"
    all_markers = ["א", "ב", "ג", "ד", "ה", "ו", "ז", "ח", "ט", "י", "יא", "יב"]
    _make_docx_with_hebrew_blocks(docx_path, all_markers)

    report = retrofit_bookmarks(docx_path, backup=False)
    added = report["bookmarks_added"]
    assert len(added) == 12
    assert report["missing_blocks"] == []

    # The bookmarks stored in the file must be exactly the BLOCK_ORDER names.
    stored_names = list_bookmarks(docx_path)
    block_names = {block_name for block_name, _ in BLOCK_ORDER}
    assert set(stored_names) == block_names


def test_retrofit_reports_missing_blocks(tmp_path: Path) -> None:
    """Blocks whose marker is absent are reported under ``missing_blocks``."""
    docx_path = tmp_path / "src.docx"
    present_markers = ["א", "ב", "ג", "ד"]  # only the first four blocks exist
    _make_docx_with_hebrew_blocks(docx_path, present_markers)

    report = retrofit_bookmarks(docx_path, backup=False)

    expected_added = [
        "block-alef", "block-bet", "block-gimel", "block-dalet",
    ]
    assert report["bookmarks_added"] == expected_added

    # Spot-check one block right after the present ones and the last one.
    missing = report["missing_blocks"]
    for absent_name in ("block-heh", "block-yod-bet"):
        assert absent_name in missing


def test_retrofit_distinguishes_yod_from_yod_alef_yod_bet(tmp_path: Path) -> None:
    """The one-letter yod marker and the two-letter markers that start with
    yod must each map to their own bookmark — longer markers win."""
    docx_path = tmp_path / "src.docx"
    _make_docx_with_hebrew_blocks(docx_path, ["ט", "י", "יא", "יב"])

    report = retrofit_bookmarks(docx_path, backup=False)

    added_names = set(report["bookmarks_added"])
    expected_names = {
        "block-tet", "block-yod", "block-yod-alef", "block-yod-bet",
    }
    assert added_names == expected_names


def test_retrofit_skips_existing_bookmarks(tmp_path: Path) -> None:
    """A second retrofit pass over the same file adds no further bookmarks."""
    docx_path = tmp_path / "src.docx"
    _make_docx_with_hebrew_blocks(docx_path, ["א", "ב"])
    both_blocks = {"block-alef", "block-bet"}

    # First pass: both bookmarks are created, in block order.
    first_pass = retrofit_bookmarks(docx_path, backup=False)
    assert first_pass["bookmarks_added"] == ["block-alef", "block-bet"]

    # Second pass: nothing new, and the earlier bookmarks are reported as existing.
    second_pass = retrofit_bookmarks(docx_path, backup=False)
    assert second_pass["bookmarks_added"] == []
    assert set(second_pass["existing_bookmarks"]) == both_blocks

    # The file on disk still carries just those two bookmark names.
    assert set(list_bookmarks(docx_path)) == both_blocks


def test_retrofit_creates_backup(tmp_path: Path) -> None:
    """Calling retrofit without ``backup=False`` leaves a ``.pre-retrofit.docx`` file."""
    docx_path = tmp_path / "file.docx"
    _make_docx_with_hebrew_blocks(docx_path, ["א", "ב"])

    # No backup argument is passed, so the function's default applies.
    retrofit_bookmarks(docx_path)

    # with_suffix swaps ".docx" for ".pre-retrofit.docx" next to the source file.
    expected_backup = docx_path.with_suffix(".pre-retrofit.docx")
    assert expected_backup.exists()


def test_retrofit_to_different_output_path_no_backup(tmp_path: Path) -> None:
    """Writing to a separate ``output_path`` leaves the source file without bookmarks.

    NOTE(review): only the bookmarks of the source and output files are
    checked; the absence of a backup file is not asserted here.
    """
    source_docx = tmp_path / "src.docx"
    target_docx = tmp_path / "out.docx"
    _make_docx_with_hebrew_blocks(source_docx, ["א", "ב"])

    retrofit_bookmarks(source_docx, output_path=target_docx)

    # The source must remain free of bookmarks.
    assert list_bookmarks(source_docx) == []

    # The bookmarks land in the output file instead.
    written_names = set(list_bookmarks(target_docx))
    assert written_names == {"block-alef", "block-bet"}


def test_retrofit_ignores_marker_in_middle_of_text(tmp_path: Path) -> None:
    """A marker letter inside body text, not at paragraph start, is not a block."""
    docx_path = tmp_path / "src.docx"
    paragraph_texts = (
        "א. תחילת הבלוק",
        "טקסט עם האות י לא בתחילת שורה, זה לא בלוק.",
        "ב. בלוק שני",
    )
    document = Document()
    for paragraph_text in paragraph_texts:
        document.add_paragraph(paragraph_text)
    document.save(str(docx_path))

    report = retrofit_bookmarks(docx_path, backup=False)
    added = report["bookmarks_added"]

    # The two real block openers are picked up...
    for real_block in ("block-alef", "block-bet"):
        assert real_block in added
    # ...while the yod embedded in the middle paragraph is not.
    assert "block-yod" not in added


def test_retrofit_out_of_order_markers_picks_forward_only(tmp_path: Path) -> None:
    """A marker that shows up after a later-ordered one is reported as missing.

    The expectation encoded here is that the scanner only moves forward
    through BLOCK_ORDER: once it has matched a later marker it does not go
    back to claim an earlier one.
    """
    docx_path = tmp_path / "src.docx"
    out_of_order_texts = (
        "ב. מופיע ראשון",
        "א. מופיע אחרי — יידחה כי 'א' לפני 'ב'",
        "ג. בלוק גימל",
    )
    document = Document()
    for paragraph_text in out_of_order_texts:
        document.add_paragraph(paragraph_text)
    document.save(str(docx_path))

    report = retrofit_bookmarks(docx_path, backup=False)
    added = report["bookmarks_added"]

    assert "block-bet" in added
    assert "block-gimel" in added
    # The first block's marker only appears after the second block's, so it is skipped.
    assert "block-alef" in report["missing_blocks"]


def test_retrofit_empty_document_reports_all_missing(tmp_path: Path) -> None:
    """A document with no paragraphs yields no bookmarks and twelve missing blocks."""
    docx_path = tmp_path / "empty.docx"
    Document().save(str(docx_path))

    report = retrofit_bookmarks(docx_path, backup=False)

    assert report["bookmarks_added"] == []
    missing_count = len(report["missing_blocks"])
    assert missing_count == 12