Files
legal-ai/mcp-server/tests/test_docx_reviser.py
Chaim 726498126d
All checks were successful
Build & Deploy / build-and-deploy (push) Successful in 1m29s
Add Track Changes architecture for draft revisions (CMP + CMPA)
Fixes critical bug in 1033-25: user-uploaded עריכה-*.docx files were
orphaned on disk while exports kept rebuilding from stale DB blocks.

New architecture:
- User-uploaded DOCX becomes the source of truth (cases.active_draft_path)
- System edits via XML surgery with real Word <w:ins>/<w:del> revisions
- User can Accept/Reject each change from within Word

Components:
- docx_reviser.py: XML surgery for Track Changes (15 tests)
- docx_retrofit.py: retroactive bookmark injection with Hebrew marker
  detection + heading heuristic (9 tests)
- docx_exporter.py: emits bookmarks around each of the 12 blocks
- 3 new MCP tools: apply_user_edit, list_bookmarks, revise_draft
- 4 new/updated endpoints: upload (auto-registers active draft),
  /exports/revise, /exports/bookmarks, /exports/{filename}/retrofit,
  /active-draft
- DB migration: cases.active_draft_path column
- UI: correct banner using real v-numbers, "מקור האמת" badge,
  detailed upload toast with bookmarks_added/missing_blocks
- agents: legal-exporter (3 export modes), legal-ceo (stage G for
  revision handling), legal-writer (revision mode)

Multi-tenancy:
- Works for both CMP (1xxx cases) and CMPA (8xxx/9xxx cases)
- New revise-draft skill added to both companies
- deploy-track-changes.sh syncs skills CMP ↔ CMPA
- retrofit_case.py: one-off retrofit of existing files

Tests: 34 passing (15 reviser + 9 retrofit + 4 exporter bookmarks + 6 e2e)

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-16 18:49:30 +00:00

343 lines
13 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""בדיקות docx_reviser — Track Changes XML surgery.
הבדיקות יוצרות DOCX בסיסי עם bookmarks, מפעילות revisions, ובודקות:
1. שה-XML שנוצר תקף ונטען חזרה כ-Document
2. שה-<w:ins> / <w:del> קיימים בפורמט הנכון
3. שה-bookmarks נשמרים אחרי עריכה
4. שגופן David ו-RTL נשמרים
5. שכשלונות מטופלים אלגנטית (bookmark חסר → failed, לא crash)
"""
from __future__ import annotations
import zipfile
from datetime import datetime, timezone
from io import BytesIO
from pathlib import Path
import pytest
from docx import Document
from docx.oxml import OxmlElement
from docx.oxml.ns import qn
from lxml import etree
from legal_mcp.services import docx_reviser
from legal_mcp.services.docx_reviser import (
NSMAP,
Revision,
_w,
apply_tracked_revisions,
list_bookmarks,
)
# ── Test fixtures ──────────────────────────────────────────────────
def _insert_bookmark(paragraph, name: str, bm_id: int) -> None:
    """Wrap the content of a paragraph in a named bookmark.

    A ``<w:bookmarkStart>`` is placed ahead of the paragraph content and a
    matching ``<w:bookmarkEnd>`` is appended as the last child of ``<w:p>``.

    Args:
        paragraph: python-docx paragraph; its underlying ``<w:p>`` element
            is modified in place.
        name: Value written to the ``w:name`` attribute of the start marker.
        bm_id: Numeric id written (as a string) to ``w:id`` on both markers.
    """
    p_elem = paragraph._p

    start = OxmlElement("w:bookmarkStart")
    start.set(qn("w:id"), str(bm_id))
    start.set(qn("w:name"), name)

    # When the paragraph carries <w:pPr>, that element has to remain the
    # first child of <w:p>; the bookmark start therefore goes right after it
    # instead of unconditionally at index 0.
    p_pr = p_elem.find(qn("w:pPr"))
    if p_pr is not None:
        p_pr.addnext(start)
    else:
        p_elem.insert(0, start)

    end = OxmlElement("w:bookmarkEnd")
    end.set(qn("w:id"), str(bm_id))
    p_elem.append(end)
def _make_sample_docx(path: Path) -> None:
    """Write a three-paragraph DOCX to *path*, one bookmark per paragraph.

    The bookmarks are named ``block-alef``, ``block-yod`` and
    ``block-yod-bet`` and get the ids 1, 2 and 3 in that order. Every
    paragraph holds a single run whose font name is set to "David".
    """
    bookmark_names = ("block-alef", "block-yod", "block-yod-bet")
    document = Document()
    for bookmark_id, name in enumerate(bookmark_names, start=1):
        paragraph = document.add_paragraph()
        paragraph.add_run(f"תוכן פסקה של {name}").font.name = "David"
        _insert_bookmark(paragraph, name, bookmark_id)
    document.save(str(path))
@pytest.fixture
def sample_docx(tmp_path: Path) -> Path:
    """Create the bookmarked sample DOCX in *tmp_path* and return its path."""
    docx_path = tmp_path.joinpath("source.docx")
    _make_sample_docx(docx_path)
    return docx_path
# ── list_bookmarks ────────────────────────────────────────────────
def test_list_bookmarks_returns_all_named(sample_docx: Path) -> None:
    """Every bookmark of the sample document is listed (order-insensitive)."""
    expected = {"block-alef", "block-yod", "block-yod-bet"}
    found = list_bookmarks(sample_docx)
    assert set(found) == expected
def test_list_bookmarks_excludes_internal(tmp_path: Path) -> None:
    """Bookmarks whose name starts with '_' (e.g. ``_GoBack``) are not listed."""
    docx_path = tmp_path / "internal.docx"
    paragraphs = (("visible", "block-real"), ("hidden", "_GoBack"))

    document = Document()
    for bookmark_id, (text, bookmark_name) in enumerate(paragraphs, start=1):
        _insert_bookmark(document.add_paragraph(text), bookmark_name, bookmark_id)
    document.save(str(docx_path))

    assert list_bookmarks(docx_path) == ["block-real"]
# ── apply_tracked_revisions: insert_after ─────────────────────────
def test_insert_after_adds_tracked_paragraph(
    sample_docx: Path, tmp_path: Path,
) -> None:
    """An ``insert_after`` revision adds a ``<w:ins>`` and keeps the anchor text."""
    inserted_text = "פסקה חדשה שהמערכת מוסיפה."
    target = tmp_path / "out.docx"
    revision = Revision(
        id="r1",
        type="insert_after",
        anchor_bookmark="block-yod",
        content=inserted_text,
    )

    outcome = apply_tracked_revisions(
        sample_docx,
        target,
        [revision],
        author="מערכת AI",
        date=datetime(2026, 4, 16, 14, 0, tzinfo=timezone.utc),
    )

    assert outcome.applied == 1
    assert outcome.failed == 0
    assert target.exists()

    with zipfile.ZipFile(target, "r") as archive:
        root = etree.fromstring(archive.read("word/document.xml"))

    # At least one tracked insertion must exist in document.xml.
    assert len(root.findall(".//w:ins", NSMAP)) >= 1

    document_text = "".join(root.itertext())
    # The new content is present ...
    assert inserted_text in document_text
    # ... and the anchor paragraph's original content is still there.
    assert "תוכן פסקה של block-yod" in document_text
def _find_ins_with_runs(tree: etree._Element) -> etree._Element | None:
    """Return the first ``<w:ins>`` under *tree* that contains a ``<w:r>``.

    Insertion elements without any run descendant (the pilcrow / paragraph
    marker kind) are skipped. ``None`` is returned when nothing matches.
    """
    return next(
        (
            candidate
            for candidate in tree.iterfind(".//w:ins", NSMAP)
            if candidate.find(".//w:r", NSMAP) is not None
        ),
        None,
    )
def test_insert_after_ins_has_author_and_date(
    sample_docx: Path, tmp_path: Path,
) -> None:
    """The run-bearing ``<w:ins>`` has the given author and a 'Z'-suffixed date."""
    target = tmp_path / "out.docx"
    revision = Revision(
        id="r1",
        type="insert_after",
        anchor_bookmark="block-alef",
        content="test",
    )
    apply_tracked_revisions(sample_docx, target, [revision], author="דפנה")

    with zipfile.ZipFile(target, "r") as archive:
        root = etree.fromstring(archive.read("word/document.xml"))

    insertion = _find_ins_with_runs(root)
    assert insertion is not None
    assert insertion.get(_w("author")) == "דפנה"

    stamp = insertion.get(_w("date"))
    assert stamp is not None
    assert stamp.endswith("Z")  # ISO 8601 UTC
def test_insert_after_uses_rtl_and_david(
    sample_docx: Path, tmp_path: Path,
) -> None:
    """The inserted run has an ``rtl`` property and "David" as its ASCII font."""
    target = tmp_path / "out.docx"
    revision = Revision(
        id="r1",
        type="insert_after",
        anchor_bookmark="block-alef",
        content="מוסף",
    )
    apply_tracked_revisions(sample_docx, target, [revision])

    with zipfile.ZipFile(target, "r") as archive:
        root = etree.fromstring(archive.read("word/document.xml"))

    insertion = _find_ins_with_runs(root)
    assert insertion is not None

    first_run = insertion.find(".//w:r", NSMAP)
    assert first_run is not None

    run_props = first_run.find(_w("rPr"))
    assert run_props is not None
    assert run_props.find(_w("rtl")) is not None

    fonts = run_props.find(_w("rFonts"))
    assert fonts is not None
    assert fonts.get(_w("ascii")) == "David"
# ── apply_tracked_revisions: insert_before ────────────────────────
def test_insert_before_places_above_anchor(
    sample_docx: Path, tmp_path: Path,
) -> None:
    """An ``insert_before`` paragraph precedes its anchor in document order."""
    target = tmp_path / "out.docx"
    revision = Revision(
        id="r1",
        type="insert_before",
        anchor_bookmark="block-yod",
        content="לפני י.",
    )
    outcome = apply_tracked_revisions(sample_docx, target, [revision])
    assert outcome.applied == 1

    with zipfile.ZipFile(target, "r") as archive:
        root = etree.fromstring(archive.read("word/document.xml"))

    paragraph_texts = [
        "".join(paragraph.itertext())
        for paragraph in root.findall(".//w:p", NSMAP)
    ]

    def first_index_containing(needle: str) -> int:
        # Index of the first paragraph whose text contains *needle*.
        return next(
            position
            for position, text in enumerate(paragraph_texts)
            if needle in text
        )

    # Order check: the new paragraph must come before the "block-yod" one.
    new_position = first_index_containing("לפני י.")
    anchor_position = first_index_containing("תוכן פסקה של block-yod")
    assert new_position < anchor_position
# ── apply_tracked_revisions: delete ───────────────────────────────
def test_delete_wraps_runs_in_w_del(sample_docx: Path, tmp_path: Path) -> None:
    """A ``delete`` revision puts the anchored text into ``<w:del>`` as ``<w:delText>``.

    Checks that the revision is reported as applied, that at least one
    ``<w:del>`` exists in ``word/document.xml``, and that the text of the
    ``block-yod`` paragraph shows up in a ``<w:delText>`` inside a ``<w:del>``.
    """
    out = tmp_path / "out.docx"
    rev = Revision(id="r1", type="delete", anchor_bookmark="block-yod", content="")
    result = apply_tracked_revisions(sample_docx, out, [rev])
    assert result.applied == 1

    with zipfile.ZipFile(out, "r") as zf:
        tree = etree.fromstring(zf.read("word/document.xml"))

    dels = tree.findall(".//w:del", NSMAP)
    assert len(dels) >= 1

    # Inside w:del, text elements must become w:delText. Every <w:del> is
    # searched rather than only the first one in document order, so the check
    # does not depend on a text-less <w:del> (compare the run-less <w:ins>
    # that _find_ins_with_runs skips) happening to come later.
    del_texts = [
        text_el
        for del_el in dels
        for text_el in del_el.findall(".//w:delText", NSMAP)
    ]
    assert any("block-yod" in (t.text or "") for t in del_texts)
# ── apply_tracked_revisions: replace ─────────────────────────────
def test_replace_creates_both_ins_and_del(
    sample_docx: Path, tmp_path: Path,
) -> None:
    """A ``replace`` revision leaves both a ``<w:ins>`` and a ``<w:del>`` behind."""
    target = tmp_path / "out.docx"
    revision = Revision(
        id="r1",
        type="replace",
        anchor_bookmark="block-yod",
        content="תוכן חדש לחלוטין",
    )
    outcome = apply_tracked_revisions(sample_docx, target, [revision])
    assert outcome.applied == 1

    with zipfile.ZipFile(target, "r") as archive:
        root = etree.fromstring(archive.read("word/document.xml"))

    for revision_xpath in (".//w:ins", ".//w:del"):
        assert len(root.findall(revision_xpath, NSMAP)) >= 1
# ── Failure modes ─────────────────────────────────────────────────
def test_missing_bookmark_returns_failed_not_crash(
    sample_docx: Path, tmp_path: Path,
) -> None:
    """An unknown anchor bookmark is reported as a failed revision, not raised."""
    target = tmp_path / "out.docx"
    revision = Revision(
        id="r1",
        type="insert_after",
        anchor_bookmark="does-not-exist",
        content="x",
    )

    outcome = apply_tracked_revisions(sample_docx, target, [revision])

    assert outcome.applied == 0
    assert outcome.failed == 1

    first_result = outcome.results[0]
    assert first_result.status == "failed"
    assert "not found" in (first_result.error or "")

    # The output file is written even though nothing was applied.
    assert target.exists()
def test_empty_revisions_list_produces_copy(
    sample_docx: Path, tmp_path: Path,
) -> None:
    """With no revisions the output exists and still lists all bookmarks."""
    target = tmp_path / "out.docx"

    outcome = apply_tracked_revisions(sample_docx, target, [])

    assert outcome.applied == 0
    assert outcome.failed == 0
    assert target.exists()

    # The bookmarks of the source document must survive the round trip.
    expected_bookmarks = {"block-alef", "block-yod", "block-yod-bet"}
    assert set(list_bookmarks(target)) == expected_bookmarks
# ── Track revisions flag in settings ──────────────────────────────
def test_track_revisions_flag_is_enabled(
    sample_docx: Path, tmp_path: Path,
) -> None:
    """After a revision, ``word/settings.xml`` has a ``trackRevisions`` child."""
    target = tmp_path / "out.docx"
    revision = Revision(
        id="r1",
        type="insert_after",
        anchor_bookmark="block-alef",
        content="x",
    )
    apply_tracked_revisions(sample_docx, target, [revision])

    with zipfile.ZipFile(target, "r") as archive:
        settings_root = etree.fromstring(archive.read("word/settings.xml"))

    # The flag is looked up as a direct child of the settings root element.
    assert settings_root.find(_w("trackRevisions")) is not None
# ── Multiple revisions with unique IDs ────────────────────────────
def test_multiple_revisions_get_unique_ids(
    sample_docx: Path, tmp_path: Path,
) -> None:
    """No two ``<w:ins>`` / ``<w:del>`` elements share the same ``id`` value."""
    target = tmp_path / "out.docx"
    revisions = [
        Revision(
            id="r1",
            type="insert_after",
            anchor_bookmark="block-alef",
            content="ראשון",
        ),
        Revision(
            id="r2",
            type="insert_after",
            anchor_bookmark="block-yod",
            content="שני",
        ),
        Revision(id="r3", type="delete", anchor_bookmark="block-yod-bet"),
    ]

    outcome = apply_tracked_revisions(sample_docx, target, revisions)
    assert outcome.applied == 3

    with zipfile.ZipFile(target, "r") as archive:
        root = etree.fromstring(archive.read("word/document.xml"))

    # Gather the non-empty id attribute of every insertion, then of every
    # deletion; elements without an id are left out.
    all_ids: list[str] = [
        element_id
        for xpath in (".//w:ins", ".//w:del")
        for element in root.iterfind(xpath, NSMAP)
        if (element_id := element.get(_w("id")))
    ]
    assert len(all_ids) == len(set(all_ids)), f"duplicate IDs: {all_ids}"
# ── DOCX remains openable as Document ─────────────────────────────
def test_output_docx_is_openable_by_python_docx(
    sample_docx: Path, tmp_path: Path,
) -> None:
    """The revised file loads with python-docx and holds old and new text."""
    target = tmp_path / "out.docx"
    revision = Revision(
        id="r1",
        type="insert_after",
        anchor_bookmark="block-yod",
        content="תוכן חדש",
    )
    apply_tracked_revisions(sample_docx, target, [revision])

    # Loading must not raise: the output has to be a valid DOCX for python-docx.
    reopened = Document(str(target))

    # The original text is reachable through the python-docx paragraph API.
    paragraph_text = "\n".join(
        paragraph.text for paragraph in reopened.paragraphs
    )
    assert "block-yod" in paragraph_text

    # The inserted (tracked) text is looked up in the raw XML via itertext.
    with zipfile.ZipFile(target, "r") as archive:
        root = etree.fromstring(archive.read("word/document.xml"))
    assert "תוכן חדש" in "".join(root.itertext())
# ── Bookmarks preserved through revisions ─────────────────────────
def test_bookmarks_preserved_after_insert(
    sample_docx: Path, tmp_path: Path,
) -> None:
    """All three bookmarks are still listed after an ``insert_after`` revision."""
    target = tmp_path / "out.docx"
    revision = Revision(
        id="r1",
        type="insert_after",
        anchor_bookmark="block-yod",
        content="x",
    )
    apply_tracked_revisions(sample_docx, target, [revision])

    expected_bookmarks = {"block-alef", "block-yod", "block-yod-bet"}
    assert set(list_bookmarks(target)) == expected_bookmarks
# ── Idempotency of loading/saving without changes ────────────────
def test_save_without_revisions_preserves_content(
    sample_docx: Path, tmp_path: Path,
) -> None:
    """Applying zero revisions leaves the per-paragraph text unchanged."""
    target = tmp_path / "out.docx"
    apply_tracked_revisions(sample_docx, target, [])

    def paragraph_texts(docx_path: Path) -> list[str]:
        # Text of each paragraph as python-docx reports it, in order.
        return [paragraph.text for paragraph in Document(str(docx_path)).paragraphs]

    original_texts = paragraph_texts(sample_docx)
    copied_texts = paragraph_texts(target)
    assert original_texts == copied_texts