Add Track Changes architecture for draft revisions (CMP + CMPA)

Fixes critical bug in 1033-25: user-uploaded עריכה-*.docx files were orphaned on disk while exports kept rebuilding from stale DB blocks.

New architecture:
- User-uploaded DOCX becomes the source of truth (cases.active_draft_path)
- System edits via XML surgery with real Word <w:ins>/<w:del> revisions
- User can Accept/Reject each change from within Word

Components:
- docx_reviser.py: XML surgery for Track Changes (15 tests)
- docx_retrofit.py: retroactive bookmark injection with Hebrew marker detection + heading heuristic (9 tests)
- docx_exporter.py: emits bookmarks around each of the 12 blocks
- 3 new MCP tools: apply_user_edit, list_bookmarks, revise_draft
- 4 new/updated endpoints: upload (auto-registers active draft), /exports/revise, /exports/bookmarks, /exports/{filename}/retrofit, /active-draft
- DB migration: cases.active_draft_path column
- UI: correct banner using real v-numbers, "מקור האמת" badge, detailed upload toast with bookmarks_added/missing_blocks
- agents: legal-exporter (3 export modes), legal-ceo (stage G for revision handling), legal-writer (revision mode)

Multi-tenancy:
- Works for both CMP (1xxx cases) and CMPA (8xxx/9xxx cases)
- New revise-draft skill added to both companies
- deploy-track-changes.sh syncs skills CMP ↔ CMPA
- retrofit_case.py: one-off retrofit of existing files

Tests: 34 passing (15 reviser + 9 retrofit + 4 exporter bookmarks + 6 e2e)

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-16 18:49:30 +00:00
parent 28daff58be
commit 726498126d
20 changed files with 2419 additions and 23 deletions
--- a/mcp-server/src/legal_mcp/services/docx_reviser.py
+++ b/mcp-server/src/legal_mcp/services/docx_reviser.py
@@ -0,0 +1,514 @@
+"""עריכת DOCX עם Track Changes אמיתיים של Word.
+
+השירות מיועד לקבל DOCX קיים (עם bookmarks שזיהו אנקורים) ולהחיל עליו
+עריכות מסומנות כ-w:ins / w:del, שבאים לידי ביטוי ב-Word כ-Track Changes
+שהמשתמש יכול Accept/Reject.
+
+אסטרטגיית אנקורים: bookmarks בשמות כגון 'block-yod', 'block-yod-para-3'
+שמוכנסים בזמן הייצוא הראשוני (docx_exporter.py) או רטרואקטיבית
+(docx_retrofit.py).
+"""
+
+from __future__ import annotations
+
+import logging
+import shutil
+import zipfile
+from dataclasses import dataclass, field
+from datetime import datetime, timezone
+from io import BytesIO
+from pathlib import Path
+from typing import Literal
+
+from lxml import etree
+
+logger = logging.getLogger(__name__)
+
+# ── XML namespaces ─────────────────────────────────────────────────
+
+# WordprocessingML main namespace URI; _w() uses it to qualify every tag and
+# attribute name built in this module.
+W_NS = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
+# Prefix map passed to lxml lookups so path strings can use the "w:" prefix;
+# also used as the nsmap when a settings root has to be created from scratch.
+NSMAP = {"w": W_NS}
+
+
+def _w(tag: str) -> str:
+    """Build a fully qualified tag name in the w: namespace."""
+    return f"{{{W_NS}}}{tag}"
+
+
+# ── Data models ────────────────────────────────────────────────────
+
+
+# Operations a Revision may request; apply_tracked_revisions dispatches on these.
+RevisionType = Literal["insert_after", "insert_before", "replace", "delete"]
+# Formatting presets for newly inserted paragraphs; interpreted by _build_paragraph.
+StyleType = Literal["body", "quote", "heading", "bold"]
+
+
+@dataclass
+class Revision:
+    """A single tracked change to apply to the DOCX."""
+
+    # Caller-chosen identifier; copied onto the matching RevisionResult.
+    id: str
+    # Operation to perform (see RevisionType).
+    type: RevisionType
+    # Name of the bookmark whose enclosing paragraph anchors the change.
+    anchor_bookmark: str
+    # Text of the new paragraph for insert_*/replace; not read for delete.
+    content: str = ""
+    # Formatting preset applied to the new paragraph (see StyleType).
+    style: StyleType = "body"
+    # Free-text rationale; not read by any function in this module.
+    reason: str = ""
+    # NOTE(review): not read by any function in this module — the insert
+    # position is derived from `type` alone. Confirm whether callers rely on it.
+    anchor_position: Literal["start", "end"] = "end"
+
+
+@dataclass
+class RevisionResult:
+    """Result of applying a single revision."""
+
+    # Identifier copied from the originating Revision.
+    id: str
+    # "applied" when the change was written into the tree, "failed" otherwise.
+    status: Literal["applied", "failed"]
+    # Human-readable failure description; left as None for applied revisions.
+    error: str | None = None
+    # w:id of the tracked change: the run-level <w:ins> id for insert/replace,
+    # and (despite the name) the <w:del> id for delete. None on failure.
+    ins_id: int | None = None
+
+
+@dataclass
+class RevisionBatchResult:
+    """Aggregate result of applying a revision batch."""
+
+    # Number of revisions whose status is "applied".
+    applied: int = 0
+    # Number of revisions whose status is "failed".
+    failed: int = 0
+    # Per-revision outcomes, in the order the revisions were processed.
+    results: list[RevisionResult] = field(default_factory=list)
+    # Path of the written DOCX as a string; filled in after the file is saved.
+    output_path: str = ""
+
+
+# ── XML helpers ────────────────────────────────────────────────────
+
+
+def _load_docx_xml(docx_path: Path) -> tuple[dict[str, bytes], etree._Element, etree._Element]:
+    """Load a DOCX as a dict of zip members + parsed document/settings trees."""
+    members: dict[str, bytes] = {}
+    with zipfile.ZipFile(docx_path, "r") as zf:
+        for name in zf.namelist():
+            members[name] = zf.read(name)
+
+    if "word/document.xml" not in members:
+        raise ValueError(f"{docx_path}: missing word/document.xml")
+
+    document_tree = etree.fromstring(members["word/document.xml"])
+    settings_bytes = members.get("word/settings.xml")
+    if settings_bytes:
+        settings_tree = etree.fromstring(settings_bytes)
+    else:
+        settings_tree = etree.Element(_w("settings"), nsmap=NSMAP)
+
+    return members, document_tree, settings_tree
+
+
+def _save_docx_xml(
+    members: dict[str, bytes],
+    document_tree: etree._Element,
+    settings_tree: etree._Element,
+    output_path: Path,
+) -> None:
+    """Write a DOCX back to disk with updated document/settings XML."""
+    members = dict(members)
+    members["word/document.xml"] = etree.tostring(
+        document_tree, xml_declaration=True, encoding="UTF-8", standalone=True
+    )
+    members["word/settings.xml"] = etree.tostring(
+        settings_tree, xml_declaration=True, encoding="UTF-8", standalone=True
+    )
+
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+    buffer = BytesIO()
+    with zipfile.ZipFile(buffer, "w", zipfile.ZIP_DEFLATED) as zf:
+        for name, data in members.items():
+            zf.writestr(name, data)
+    output_path.write_bytes(buffer.getvalue())
+
+
+def _ensure_track_revisions(settings_tree: etree._Element) -> None:
+    """Ensure <w:trackRevisions/> is present in settings.xml.
+
+    Note: This enables *display* of track changes — actual w:ins/w:del nodes
+    are rendered as tracked regardless. Word respects trackRevisions for
+    recording further user edits too.
+    """
+    existing = settings_tree.find(_w("trackRevisions"))
+    if existing is None:
+        el = etree.SubElement(settings_tree, _w("trackRevisions"))
+        el.set(_w("val"), "true")
+
+
+def _next_revision_id(document_tree: etree._Element) -> int:
+    """Find max existing w:id on w:ins/w:del/w:bookmarkStart and return next."""
+    max_id = 0
+    for xpath in (
+        ".//w:ins", ".//w:del", ".//w:bookmarkStart", ".//w:bookmarkEnd",
+        ".//w:commentRangeStart", ".//w:comment",
+    ):
+        for el in document_tree.iterfind(xpath, NSMAP):
+            val = el.get(_w("id"))
+            if val:
+                try:
+                    max_id = max(max_id, int(val))
+                except ValueError:
+                    pass
+    return max_id + 1
+
+
+def _find_bookmark(
+    document_tree: etree._Element, name: str
+) -> tuple[etree._Element | None, etree._Element | None]:
+    """Find w:bookmarkStart and w:bookmarkEnd elements by bookmark name."""
+    start = None
+    end = None
+    for el in document_tree.iterfind(".//w:bookmarkStart", NSMAP):
+        if el.get(_w("name")) == name:
+            start = el
+            break
+    if start is None:
+        return None, None
+    bm_id = start.get(_w("id"))
+    for el in document_tree.iterfind(".//w:bookmarkEnd", NSMAP):
+        if el.get(_w("id")) == bm_id:
+            end = el
+            break
+    return start, end
+
+
+def _find_enclosing_paragraph(element: etree._Element) -> etree._Element | None:
+    """Walk up from an element to find its enclosing w:p."""
+    cur = element
+    while cur is not None:
+        if cur.tag == _w("p"):
+            return cur
+        cur = cur.getparent()
+    return None
+
+
+# ── Paragraph builders ─────────────────────────────────────────────
+
+
+def _build_run(text: str, *, bold: bool = False, italic: bool = False,
+               font: str = "David", size_half_pt: int | None = None) -> etree._Element:
+    """Build a w:r (run) element with RTL/David defaults and given text."""
+    r = etree.Element(_w("r"))
+    rPr = etree.SubElement(r, _w("rPr"))
+
+    rFonts = etree.SubElement(rPr, _w("rFonts"))
+    rFonts.set(_w("ascii"), font)
+    rFonts.set(_w("hAnsi"), font)
+    rFonts.set(_w("cs"), font)
+    rFonts.set(_w("hint"), "cs")
+
+    if size_half_pt is not None:
+        sz = etree.SubElement(rPr, _w("sz"))
+        sz.set(_w("val"), str(size_half_pt))
+        szCs = etree.SubElement(rPr, _w("szCs"))
+        szCs.set(_w("val"), str(size_half_pt))
+
+    if bold:
+        etree.SubElement(rPr, _w("b"))
+        etree.SubElement(rPr, _w("bCs"))
+    if italic:
+        etree.SubElement(rPr, _w("i"))
+        etree.SubElement(rPr, _w("iCs"))
+
+    etree.SubElement(rPr, _w("rtl"))
+
+    t = etree.SubElement(r, _w("t"))
+    t.set("{http://www.w3.org/XML/1998/namespace}space", "preserve")
+    t.text = text
+    return r
+
+
+def _build_paragraph(text: str, *, style: StyleType = "body") -> etree._Element:
+    """Build a w:p (paragraph) with RTL + David + given text."""
+    p = etree.Element(_w("p"))
+    pPr = etree.SubElement(p, _w("pPr"))
+    bidi = etree.SubElement(pPr, _w("bidi"))
+    bidi.set(_w("val"), "1")
+
+    # Right alignment for body/RTL
+    jc = etree.SubElement(pPr, _w("jc"))
+    jc.set(_w("val"), "right")
+
+    rPr_p = etree.SubElement(pPr, _w("rPr"))
+    etree.SubElement(rPr_p, _w("rtl"))
+
+    bold = style in ("heading", "bold")
+    italic = style == "quote"
+    size = None
+    if style == "heading":
+        size = 28  # 14pt
+    elif style == "quote":
+        size = 22  # 11pt
+    run = _build_run(text, bold=bold, italic=italic, size_half_pt=size)
+    p.append(run)
+    return p
+
+
+def _wrap_in_ins(elements: list[etree._Element], *, ins_id: int,
+                 author: str, date_iso: str) -> etree._Element:
+    """Wrap a list of *run-level* elements in a single <w:ins>."""
+    ins = etree.Element(_w("ins"))
+    ins.set(_w("id"), str(ins_id))
+    ins.set(_w("author"), author)
+    ins.set(_w("date"), date_iso)
+    for el in elements:
+        ins.append(el)
+    return ins
+
+
+def _make_tracked_paragraph_insert(
+    text: str, *, style: StyleType, ins_id: int, author: str, date_iso: str,
+    mark_id: int | None = None,
+) -> etree._Element:
+    """Build a whole tracked-inserted paragraph.
+
+    DOCX convention for a fully-inserted paragraph:
+      1. All <w:r> runs are wrapped in a single <w:ins> (own id).
+      2. The paragraph's pPr/rPr gets an <w:ins> marker for the paragraph
+         mark itself (pilcrow) — this uses its *own* id.
+    """
+    if mark_id is None:
+        mark_id = ins_id
+    p = _build_paragraph(text, style=style)
+    pPr = p.find(_w("pPr"))
+    assert pPr is not None
+    rPr = pPr.find(_w("rPr"))
+    if rPr is None:
+        rPr = etree.SubElement(pPr, _w("rPr"))
+    ins_mark = etree.SubElement(rPr, _w("ins"))
+    ins_mark.set(_w("id"), str(mark_id))
+    ins_mark.set(_w("author"), author)
+    ins_mark.set(_w("date"), date_iso)
+
+    runs = [child for child in list(p) if child.tag == _w("r")]
+    if runs:
+        for r in runs:
+            p.remove(r)
+        ins = _wrap_in_ins(runs, ins_id=ins_id, author=author, date_iso=date_iso)
+        p.append(ins)
+    return p
+
+
+def _mark_runs_as_deleted(paragraph: etree._Element, *, del_id: int,
+                          author: str, date_iso: str) -> None:
+    """Convert all <w:r> in a paragraph to <w:del>-wrapped runs.
+
+    Within a <w:del>, <w:t> must become <w:delText>.
+    """
+    runs = [child for child in list(paragraph) if child.tag == _w("r")]
+    if not runs:
+        return
+    # Convert <w:t> → <w:delText> inside each run
+    for r in runs:
+        for t in r.findall(_w("t")):
+            t.tag = _w("delText")
+        paragraph.remove(r)
+    wrapper = etree.Element(_w("del"))
+    wrapper.set(_w("id"), str(del_id))
+    wrapper.set(_w("author"), author)
+    wrapper.set(_w("date"), date_iso)
+    for r in runs:
+        wrapper.append(r)
+    paragraph.append(wrapper)
+
+
+# ── Revision application ───────────────────────────────────────────
+
+
+def _apply_insert(
+    document_tree: etree._Element,
+    revision: Revision,
+    *,
+    ins_id: int,
+    author: str,
+    date_iso: str,
+) -> RevisionResult:
+    """Apply insert_after / insert_before relative to a bookmark."""
+    start, end = _find_bookmark(document_tree, revision.anchor_bookmark)
+    if start is None:
+        return RevisionResult(id=revision.id, status="failed",
+                              error=f"bookmark '{revision.anchor_bookmark}' not found")
+
+    # Pick anchor element based on position
+    if revision.type == "insert_before":
+        anchor = start
+    else:  # insert_after — default
+        anchor = end if end is not None else start
+
+    enclosing_p = _find_enclosing_paragraph(anchor)
+    if enclosing_p is None:
+        return RevisionResult(id=revision.id, status="failed",
+                              error="anchor has no enclosing paragraph")
+
+    # Build new tracked paragraph. ins_id for run wrapper, ins_id+1 for mark.
+    new_p = _make_tracked_paragraph_insert(
+        revision.content, style=revision.style,
+        ins_id=ins_id, mark_id=ins_id + 1,
+        author=author, date_iso=date_iso,
+    )
+
+    parent = enclosing_p.getparent()
+    if parent is None:
+        return RevisionResult(id=revision.id, status="failed",
+                              error="enclosing paragraph has no parent")
+    idx = list(parent).index(enclosing_p)
+    insert_idx = idx if revision.type == "insert_before" else idx + 1
+    parent.insert(insert_idx, new_p)
+
+    return RevisionResult(id=revision.id, status="applied", ins_id=ins_id)
+
+
+def _apply_delete(
+    document_tree: etree._Element,
+    revision: Revision,
+    *,
+    del_id: int,
+    author: str,
+    date_iso: str,
+) -> RevisionResult:
+    """Mark the paragraph enclosed by a bookmark as deleted."""
+    start, end = _find_bookmark(document_tree, revision.anchor_bookmark)
+    if start is None:
+        return RevisionResult(id=revision.id, status="failed",
+                              error=f"bookmark '{revision.anchor_bookmark}' not found")
+
+    enclosing_p = _find_enclosing_paragraph(start)
+    if enclosing_p is None:
+        return RevisionResult(id=revision.id, status="failed",
+                              error="anchor has no enclosing paragraph")
+
+    _mark_runs_as_deleted(enclosing_p, del_id=del_id,
+                          author=author, date_iso=date_iso)
+    return RevisionResult(id=revision.id, status="applied", ins_id=del_id)
+
+
+def _apply_replace(
+    document_tree: etree._Element,
+    revision: Revision,
+    *,
+    ins_id: int,
+    del_id: int,
+    author: str,
+    date_iso: str,
+) -> RevisionResult:
+    """Replace = delete the existing paragraph + insert new one after it."""
+    start, end = _find_bookmark(document_tree, revision.anchor_bookmark)
+    if start is None:
+        return RevisionResult(id=revision.id, status="failed",
+                              error=f"bookmark '{revision.anchor_bookmark}' not found")
+
+    enclosing_p = _find_enclosing_paragraph(start)
+    if enclosing_p is None:
+        return RevisionResult(id=revision.id, status="failed",
+                              error="anchor has no enclosing paragraph")
+
+    parent = enclosing_p.getparent()
+    if parent is None:
+        return RevisionResult(id=revision.id, status="failed",
+                              error="enclosing paragraph has no parent")
+
+    new_p = _make_tracked_paragraph_insert(
+        revision.content, style=revision.style,
+        ins_id=ins_id, mark_id=ins_id + 1,
+        author=author, date_iso=date_iso,
+    )
+    idx = list(parent).index(enclosing_p)
+    parent.insert(idx + 1, new_p)
+
+    _mark_runs_as_deleted(enclosing_p, del_id=del_id,
+                          author=author, date_iso=date_iso)
+    return RevisionResult(id=revision.id, status="applied", ins_id=ins_id)
+
+
+# ── Public API ─────────────────────────────────────────────────────
+
+
+def apply_tracked_revisions(
+    source_path: str | Path,
+    output_path: str | Path,
+    revisions: list[Revision],
+    *,
+    author: str = "מערכת AI",
+    date: datetime | None = None,
+) -> RevisionBatchResult:
+    """Apply a batch of tracked revisions to a DOCX, producing a new DOCX.
+
+    The source file is never mutated. Output is a new DOCX with <w:ins> /
+    <w:del> markers that Word renders as Track Changes (Accept/Reject).
+
+    Args:
+        source_path: existing DOCX (e.g. עריכה-v1.docx) — retains user edits.
+        output_path: where to write the revised DOCX (e.g. טיוטה-v6.docx).
+        revisions: list of Revision objects. Anchors are bookmark names.
+        author: displayed as the revision author in Word.
+        date: revision timestamp (defaults to now, UTC).
+
+    Returns:
+        RevisionBatchResult with per-revision status.
+    """
+    source_path = Path(source_path)
+    output_path = Path(output_path)
+
+    if date is None:
+        date = datetime.now(timezone.utc)
+    date_iso = date.strftime("%Y-%m-%dT%H:%M:%SZ")
+
+    members, doc_tree, settings_tree = _load_docx_xml(source_path)
+    _ensure_track_revisions(settings_tree)
+
+    next_id = _next_revision_id(doc_tree)
+
+    batch = RevisionBatchResult()
+    for rev in revisions:
+        try:
+            if rev.type in ("insert_after", "insert_before"):
+                result = _apply_insert(doc_tree, rev, ins_id=next_id,
+                                       author=author, date_iso=date_iso)
+                # insert consumes 2 IDs: run-wrapper + paragraph-mark
+                next_id += 2
+            elif rev.type == "delete":
+                result = _apply_delete(doc_tree, rev, del_id=next_id,
+                                       author=author, date_iso=date_iso)
+                next_id += 1
+            elif rev.type == "replace":
+                result = _apply_replace(doc_tree, rev,
+                                        ins_id=next_id, del_id=next_id + 2,
+                                        author=author, date_iso=date_iso)
+                # replace consumes 3 IDs: ins-run, ins-mark, del
+                next_id += 3
+            else:
+                result = RevisionResult(id=rev.id, status="failed",
+                                        error=f"unknown type: {rev.type}")
+        except Exception as e:  # pragma: no cover - defensive
+            logger.exception("revision %s failed", rev.id)
+            result = RevisionResult(id=rev.id, status="failed", error=str(e))
+
+        batch.results.append(result)
+        if result.status == "applied":
+            batch.applied += 1
+        else:
+            batch.failed += 1
+
+    _save_docx_xml(members, doc_tree, settings_tree, output_path)
+    batch.output_path = str(output_path)
+    logger.info("applied %d revisions (failed %d) → %s",
+                batch.applied, batch.failed, output_path)
+    return batch
+
+
+def list_bookmarks(docx_path: str | Path) -> list[str]:
+    """Return bookmark names present in the DOCX (excluding '_' internal ones)."""
+    docx_path = Path(docx_path)
+    members, doc_tree, _ = _load_docx_xml(docx_path)
+    names: list[str] = []
+    for el in doc_tree.iterfind(".//w:bookmarkStart", NSMAP):
+        name = el.get(_w("name"))
+        if name and not name.startswith("_"):
+            names.append(name)
+    return names
+
+
+def copy_with_revisions(
+    source_path: str | Path, output_path: str | Path,
+) -> None:
+    """Copy source → output unchanged (used when revisions list is empty)."""
+    shutil.copy2(str(source_path), str(output_path))