Add Track Changes architecture for draft revisions (CMP + CMPA)

Fixes critical bug in 1033-25: user-uploaded עריכה-*.docx files were orphaned on disk while exports kept rebuilding from stale DB blocks. New architecture: - User-uploaded DOCX becomes the source of truth (cases.active_draft_path) - System edits via XML surgery with real Word <w:ins>/<w:del> revisions - User can Accept/Reject each change from within Word Components: - docx_reviser.py: XML surgery for Track Changes (15 tests) - docx_retrofit.py: retroactive bookmark injection with Hebrew marker detection + heading heuristic (9 tests) - docx_exporter.py: emits bookmarks around each of the 12 blocks - 3 new MCP tools: apply_user_edit, list_bookmarks, revise_draft - 4 new/updated endpoints: upload (auto-registers active draft), /exports/revise, /exports/bookmarks, /exports/{filename}/retrofit, /active-draft - DB migration: cases.active_draft_path column - UI: correct banner using real v-numbers, "מקור האמת" badge, detailed upload toast with bookmarks_added/missing_blocks - agents: legal-exporter (3 export modes), legal-ceo (stage G for revision handling), legal-writer (revision mode) Multi-tenancy: - Works for both CMP (1xxx cases) and CMPA (8xxx/9xxx cases) - New revise-draft skill added to both companies - deploy-track-changes.sh syncs skills CMP ↔ CMPA - retrofit_case.py: one-off retrofit of existing files Tests: 34 passing (15 reviser + 9 retrofit + 4 exporter bookmarks + 6 e2e) Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-16 18:49:30 +00:00
parent 28daff58be
commit 726498126d
20 changed files with 2419 additions and 23 deletions
--- a/mcp-server/src/legal_mcp/services/docx_retrofit.py
+++ b/mcp-server/src/legal_mcp/services/docx_retrofit.py
@@ -0,0 +1,290 @@
+"""הזרקת bookmarks רטרואקטיבית ל-DOCX שלא נוצרו ע"י ה-exporter.
+
+כאשר משתמש מעלה `עריכה-v*.docx` שנערך ב-Word מחוץ למערכת, אין בו את ה-
+bookmarks שאנו מצפים להם (block-alef ... block-yod-bet). השירות כאן
+מזהה את תחילת כל בלוק לפי סימני הפתיחה העבריים (א., ב., ... יב.) ב-
+הפסקאות הראשונות שלו, ומזריק bookmarkStart/bookmarkEnd בהתאם.
+
+נעשה בצורה defensive — אם לא מצליחים לזהות בלוק, הוא פשוט לא יקבל
+bookmark (`missing_blocks` בתוצאה). השרת אמור להתריע למשתמש.
+"""
+
+from __future__ import annotations
+
+import logging
+import re
+import shutil
+import zipfile
+from io import BytesIO
+from pathlib import Path
+
+from lxml import etree
+
+from legal_mcp.services.docx_reviser import (
+    NSMAP,
+    _load_docx_xml,
+    _save_docx_xml,
+    _w,
+)
+
+logger = logging.getLogger(__name__)
+
+# ── Block identification ──────────────────────────────────────────
+
+# The 12 blocks in document order, each paired with its Hebrew letter marker.
+# The position in this list defines the order enforced by the forward scan.
+BLOCK_ORDER = [
+    ("block-alef",     "א"),
+    ("block-bet",      "ב"),
+    ("block-gimel",    "ג"),
+    ("block-dalet",    "ד"),
+    ("block-heh",      "ה"),
+    ("block-vav",      "ו"),
+    ("block-zayin",    "ז"),
+    ("block-chet",     "ח"),
+    ("block-tet",      "ט"),
+    ("block-yod",      "י"),
+    ("block-yod-alef", "יא"),
+    ("block-yod-bet",  "יב"),
+]
+
+# Reverse lookup: Hebrew letter marker → block name.
+_BLOCK_MARKERS_BY_LETTER: dict[str, str] = {letter: name for name, letter in BLOCK_ORDER}
+
+# Longer markers (יא, יב) first so regex matches them before falling back to 'י'
+_MARKER_ALTERNATION = "|".join(
+    re.escape(letter)
+    for letter in sorted(_BLOCK_MARKERS_BY_LETTER, key=len, reverse=True)
+)
+# Matches text that begins (after optional whitespace) with a Hebrew block
+# marker followed — optional whitespace allowed in between — by one of
+# '.', ')' or '-'. The punctuation is mandatory: a marker followed only by
+# a space or by end-of-text does NOT match, which avoids matching ordinary
+# words that merely start with one of these letters. Group 1 is the marker.
+_BLOCK_MARKER_RE = re.compile(
+    rf"^\s*({_MARKER_ALTERNATION})\s*[\.\)\-]\s*"
+)
+
+# Secondary heuristic: Hebrew section headings that reliably mark the
+# start of each block in the Daphna Tamir style (used when markers
+# "א.", "ב." etc. are missing — common in user-edited Word files).
+#
+# Key observations from the 12-block schema (a summary only — the pattern
+# lists below are what is actually matched):
+#   block-alef:   "בפני: דפנה תמיר" or decision number page
+#   block-bet:    "ערר מספר" line
+#   block-gimel:  appellants vs respondents (parties)
+#   block-dalet:  bold "החלטה" centered
+#   block-heh:    "רקע" / "רקע עובדתי" / "פתח דבר"
+#   block-vav:    "תכניות חלות" / "ההליך שבפנינו" / "ההליכים בפני"
+#   block-zayin:  "תמצית טענות" / "טענות הצדדים"
+#   block-chet:   "תגובת המשיבה" / "עמדת הוועדה"
+#   block-tet:    "ההליכים בפני ועדת הערר" / "הדיון בפנינו"
+#   block-yod:    "דיון והכרעה" / "דיון"
+#   block-yod-alef: "סוף דבר" / "סיכום"
+#   block-yod-bet: "ההחלטה" (signature / closing block)
+#
+# Patterns are applied with `search`, so only those starting with '^' are
+# anchored to the beginning of the paragraph text; the block-alef patterns
+# are unanchored and match anywhere in the paragraph.
+_BLOCK_HEADING_PATTERNS: list[tuple[str, list[str]]] = [
+    ("block-alef", [r"בפני[:\s]", r"ועדת הערר"]),
+    ("block-bet",  [r"^ערר\s+מספר", r"^ערר\s+\d"]),
+    ("block-gimel", [r"^נגד\s*$", r"^—\s*נגד\s*—"]),
+    ("block-dalet", [r"^החלטה\s*$"]),
+    ("block-heh",  [r"^רקע\s*$", r"^רקע\s+עובדתי", r"^פתח\s+דבר"]),
+    ("block-vav",  [r"^תכניות\s+חלות", r"^ההליכים?\s+שבפנינו", r"^ההליכים?\s+בפני\s+הוועדה\s+המקומית"]),
+    ("block-zayin", [r"^תמצית\s+טענות", r"^טענות\s+הצדדים", r"^טענות\s+העוררי"]),
+    ("block-chet", [r"^תגובת\s+המשיב", r"^עמדת\s+הוועדה\s+המקומית", r"^תשובת"]),
+    ("block-tet",  [r"^ההליכים?\s+בפני\s+ועדת\s+הערר", r"^הדיון\s+בפנינו"]),
+    ("block-yod",  [r"^דיון\s+והכרעה", r"^דיון\s*$", r"^ההכרעה"]),
+    ("block-yod-alef", [r"^סוף\s+דבר", r"^סיכום\s*$"]),
+    ("block-yod-bet", [r"^ההחלטה\s*$", r"^על\s+כן[,\.]?"]),
+]
+
+# Same table with every pattern compiled once at import time, so the
+# per-paragraph scan does not have to compile anything.
+_COMPILED_HEADING_PATTERNS: list[tuple[str, list[re.Pattern[str]]]] = [
+    (name, [re.compile(p) for p in patterns])
+    for name, patterns in _BLOCK_HEADING_PATTERNS
+]
+
+
+def _paragraph_text(p: etree._Element) -> str:
+    """Return the full text of a paragraph, joining all w:t nodes."""
+    return "".join(p.itertext()).strip()
+
+
+def _detect_block_starts(
+    paragraphs: list[etree._Element],
+) -> dict[str, int]:
+    """Return a mapping of block_name → paragraph index (start of that block).
+
+    Uses a greedy scan: for each paragraph, if its text starts with an
+    expected block marker and the block hasn't been assigned yet, assign
+    this paragraph as the block's start.
+    """
+    found: dict[str, int] = {}
+    expected_order = [name for name, _ in BLOCK_ORDER]
+    pointer = 0  # index into expected_order — next expected block
+
+    for i, p in enumerate(paragraphs):
+        text = _paragraph_text(p)
+        if not text:
+            continue
+
+        matched_name: str | None = None
+
+        # Try marker-based (א., ב., ...) first
+        m = _BLOCK_MARKER_RE.match(text)
+        if m:
+            letter = m.group(1)
+            matched_name = _BLOCK_MARKERS_BY_LETTER.get(letter)
+
+        # Fall back to heading-keyword heuristic (Daphna style)
+        if matched_name is None:
+            for name, patterns in _COMPILED_HEADING_PATTERNS:
+                if name in found:
+                    continue
+                # Only check patterns for blocks we haven't assigned yet
+                # AND that come at/after the current pointer — to keep the
+                # greedy forward-scan semantics consistent with markers.
+                if expected_order.index(name) < pointer:
+                    continue
+                if any(pat.search(text) for pat in patterns):
+                    matched_name = name
+                    break
+
+        if matched_name is None:
+            continue
+        if matched_name in found:
+            continue
+        if pointer >= len(expected_order):
+            continue
+        name_idx_in_order = expected_order.index(matched_name)
+        if name_idx_in_order >= pointer:
+            found[matched_name] = i
+            pointer = name_idx_in_order + 1
+    return found
+
+
+def _insert_bookmark_around_range(
+    body: etree._Element,
+    paragraphs: list[etree._Element],
+    start_idx: int,
+    end_idx: int,
+    name: str,
+    bm_id: int,
+) -> None:
+    """Insert bookmarkStart at the start of paragraph start_idx and
+    bookmarkEnd at the end of paragraph end_idx."""
+    start_el = etree.Element(_w("bookmarkStart"))
+    start_el.set(_w("id"), str(bm_id))
+    start_el.set(_w("name"), name)
+
+    end_el = etree.Element(_w("bookmarkEnd"))
+    end_el.set(_w("id"), str(bm_id))
+
+    start_p = paragraphs[start_idx]
+    end_p = paragraphs[end_idx]
+    start_p.insert(0, start_el)
+    end_p.append(end_el)
+
+
+def _next_bookmark_id(doc_tree: etree._Element) -> int:
+    """Find max existing bookmark id and return next unused."""
+    max_id = 9999
+    for el in doc_tree.iterfind(".//w:bookmarkStart", NSMAP):
+        wid = el.get(_w("id"))
+        if wid:
+            try:
+                max_id = max(max_id, int(wid))
+            except ValueError:
+                pass
+    return max_id + 1
+
+
+# ── Public API ────────────────────────────────────────────────────
+
+
+def retrofit_bookmarks(
+    docx_path: str | Path,
+    *,
+    output_path: str | Path | None = None,
+    backup: bool = True,
+) -> dict:
+    """Inject block-* bookmarks into an existing DOCX via heuristic detection.
+
+    Args:
+        docx_path: path to DOCX file (modified in place unless output_path set).
+        output_path: if given, write to this path instead of overwriting.
+        backup: if True and writing in place, save the original as
+                `<path>.pre-retrofit.docx` first.
+
+    Returns:
+        {
+          'bookmarks_added': ['block-alef', ...],
+          'missing_blocks':  ['block-dalet', ...],
+          'existing_bookmarks': [...]   # bookmarks already on the doc
+        }
+    """
+    docx_path = Path(docx_path)
+    if not docx_path.exists():
+        raise FileNotFoundError(str(docx_path))
+
+    if output_path is None:
+        output_path = docx_path
+    output_path = Path(output_path)
+
+    members, doc_tree, settings_tree = _load_docx_xml(docx_path)
+
+    # Existing bookmarks
+    existing_names: list[str] = []
+    for el in doc_tree.iterfind(".//w:bookmarkStart", NSMAP):
+        name = el.get(_w("name"))
+        if name:
+            existing_names.append(name)
+
+    # Collect *top-level* body paragraphs (don't descend into tables etc.
+    # for now — MVP). The XPath ".//w:p" would include table cells too;
+    # for retrofitting we only care about the main flow.
+    body = doc_tree.find(f".//{_w('body')}")
+    if body is None:
+        raise ValueError("document has no <w:body>")
+    paragraphs = [p for p in body if p.tag == _w("p")]
+
+    if not paragraphs:
+        return {
+            "bookmarks_added": [],
+            "missing_blocks": [n for n, _ in BLOCK_ORDER],
+            "existing_bookmarks": existing_names,
+        }
+
+    block_starts = _detect_block_starts(paragraphs)
+
+    # Calculate end_idx for each block = paragraph before the next block's start,
+    # or last paragraph if this is the last block found.
+    ordered_found = sorted(block_starts.items(), key=lambda kv: kv[1])
+    ranges: list[tuple[str, int, int]] = []
+    for i, (name, start_idx) in enumerate(ordered_found):
+        if i + 1 < len(ordered_found):
+            end_idx = ordered_found[i + 1][1] - 1
+        else:
+            end_idx = len(paragraphs) - 1
+        ranges.append((name, start_idx, max(start_idx, end_idx)))
+
+    # Backup if overwriting in place
+    if backup and output_path.resolve() == docx_path.resolve():
+        backup_path = docx_path.with_suffix(".pre-retrofit.docx")
+        shutil.copy2(str(docx_path), str(backup_path))
+
+    # Inject bookmarks, skipping any that already exist
+    next_id = _next_bookmark_id(doc_tree)
+    added: list[str] = []
+    for name, s, e in ranges:
+        if name in existing_names:
+            continue
+        _insert_bookmark_around_range(body, paragraphs, s, e, name, next_id)
+        added.append(name)
+        next_id += 1
+
+    _save_docx_xml(members, doc_tree, settings_tree, output_path)
+
+    missing = [n for n, _ in BLOCK_ORDER if n not in block_starts and n not in existing_names]
+    logger.info("retrofit %s: added=%s missing=%s",
+                docx_path.name, added, missing)
+    return {
+        "bookmarks_added": added,
+        "missing_blocks": missing,
+        "existing_bookmarks": existing_names,
+    }