Add Track Changes architecture for draft revisions (CMP + CMPA)
All checks were successful
Build & Deploy / build-and-deploy (push) Successful in 1m29s

Fixes critical bug in 1033-25: user-uploaded עריכה-*.docx files were
orphaned on disk while exports kept rebuilding from stale DB blocks.

New architecture:
- User-uploaded DOCX becomes the source of truth (cases.active_draft_path)
- System edits via XML surgery with real Word <w:ins>/<w:del> revisions
- User can Accept/Reject each change from within Word

Components:
- docx_reviser.py: XML surgery for Track Changes (15 tests)
- docx_retrofit.py: retroactive bookmark injection with Hebrew marker
  detection + heading heuristic (9 tests)
- docx_exporter.py: emits bookmarks around each of the 12 blocks
- 3 new MCP tools: apply_user_edit, list_bookmarks, revise_draft
- 5 new/updated endpoints: upload (auto-registers active draft),
  /exports/revise, /exports/bookmarks, /exports/{filename}/retrofit,
  /active-draft
- DB migration: cases.active_draft_path column
- UI: correct banner using real v-numbers, "מקור האמת" badge,
  detailed upload toast with bookmarks_added/missing_blocks
- agents: legal-exporter (3 export modes), legal-ceo (stage G for
  revision handling), legal-writer (revision mode)

Multi-tenancy:
- Works for both CMP (1xxx cases) and CMPA (8xxx/9xxx cases)
- New revise-draft skill added to both companies
- deploy-track-changes.sh syncs skills CMP ↔ CMPA
- retrofit_case.py: one-off retrofit of existing files

Tests: 34 passing (15 reviser + 9 retrofit + 4 exporter bookmarks + 6 e2e)

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-04-16 18:49:30 +00:00
parent 28daff58be
commit 726498126d
20 changed files with 2419 additions and 23 deletions

View File

@@ -0,0 +1,514 @@
"""עריכת DOCX עם Track Changes אמיתיים של Word.
השירות מיועד לקבל DOCX קיים (עם bookmarks שזיהו אנקורים) ולהחיל עליו
עריכות מסומנות כ-w:ins / w:del, שבאים לידי ביטוי ב-Word כ-Track Changes
שהמשתמש יכול Accept/Reject.
אסטרטגיית אנקורים: bookmarks בשמות כגון 'block-yod', 'block-yod-para-3'
שמוכנסים בזמן הייצוא הראשוני (docx_exporter.py) או רטרואקטיבית
(docx_retrofit.py).
"""
from __future__ import annotations
import logging
import shutil
import zipfile
from dataclasses import dataclass, field
from datetime import datetime, timezone
from io import BytesIO
from pathlib import Path
from typing import Literal
from lxml import etree
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
# ── XML namespaces ─────────────────────────────────────────────────
# URI of the WordprocessingML main namespace (the one bound to the "w" prefix below).
W_NS = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
# Prefix → URI map handed to lxml for "w:"-prefixed path lookups and as the
# nsmap of newly created root elements.
NSMAP = {"w": W_NS}
def _w(tag: str) -> str:
    """Qualify *tag* with the w: namespace.

    Returns the ``{namespace-uri}local-name`` form that lxml uses for
    element tags and attribute names.
    """
    return "{" + W_NS + "}" + tag
# ── Data models ────────────────────────────────────────────────────
# Kinds of edit a Revision may request; apply_tracked_revisions dispatches on this value.
RevisionType = Literal["insert_after", "insert_before", "replace", "delete"]
# Formatting presets for inserted text; _build_paragraph renders "heading"/"bold"
# as bold and "quote" as italic.
StyleType = Literal["body", "quote", "heading", "bold"]
@dataclass
class Revision:
    """One tracked change requested against the DOCX.

    A revision names the bookmark it is anchored to, the kind of edit to
    perform and, for inserts and replacements, the text and formatting
    preset of the new paragraph.
    """

    # Caller-chosen identifier, echoed back in the matching RevisionResult.
    id: str
    # Which edit to perform (see RevisionType).
    type: RevisionType
    # Name of the w:bookmarkStart that locates the edit.
    anchor_bookmark: str
    # Text of the new paragraph (inserts and replacements).
    content: str = ""
    # Formatting preset applied to the new paragraph.
    style: StyleType = "body"
    # Free-text justification supplied by the caller.
    reason: str = ""
    # NOTE(review): not referenced by the revision-application functions in
    # this file — confirm whether it is consumed elsewhere.
    anchor_position: Literal["start", "end"] = "end"
@dataclass
class RevisionResult:
    """Outcome of attempting a single Revision."""

    # Identifier copied from the Revision this result describes.
    id: str
    # "applied" when the document was changed, "failed" otherwise.
    status: Literal["applied", "failed"]
    # Human-readable failure description; None on success.
    error: str | None = None
    # w:id given to the change. _apply_delete stores the w:del id here too.
    ins_id: int | None = None
@dataclass
class RevisionBatchResult:
    """Summary of one apply_tracked_revisions run."""

    # Number of revisions whose status is "applied".
    applied: int = 0
    # Number of revisions whose status is "failed".
    failed: int = 0
    # Per-revision outcomes, in the order the revisions were processed.
    results: list[RevisionResult] = field(default_factory=list)
    # Path of the written DOCX, as a string.
    output_path: str = ""
# ── XML helpers ────────────────────────────────────────────────────
def _load_docx_xml(docx_path: Path) -> tuple[dict[str, bytes], etree._Element, etree._Element]:
    """Read a DOCX into memory and parse its document and settings parts.

    Args:
        docx_path: path of the DOCX (zip) file to read.

    Returns:
        A ``(members, document_tree, settings_tree)`` tuple: every zip member
        as raw bytes keyed by name, the parsed ``word/document.xml`` root, and
        the parsed ``word/settings.xml`` root. When the settings part is
        missing or empty, an empty ``w:settings`` element is synthesized.

    Raises:
        ValueError: the archive has no ``word/document.xml``.
    """
    with zipfile.ZipFile(docx_path, "r") as archive:
        members: dict[str, bytes] = {
            name: archive.read(name) for name in archive.namelist()
        }
    if "word/document.xml" not in members:
        raise ValueError(f"{docx_path}: missing word/document.xml")

    # The DOCX may come from a user upload: never expand DTD-declared entities
    # and never fetch anything over the network while parsing it.
    parser = etree.XMLParser(resolve_entities=False, no_network=True)
    document_tree = etree.fromstring(members["word/document.xml"], parser)

    settings_bytes = members.get("word/settings.xml")
    if settings_bytes:
        settings_tree = etree.fromstring(settings_bytes, parser)
    else:
        settings_tree = etree.Element(_w("settings"), nsmap=NSMAP)
    return members, document_tree, settings_tree
def _save_docx_xml(
    members: dict[str, bytes],
    document_tree: etree._Element,
    settings_tree: etree._Element,
    output_path: Path,
) -> None:
    """Serialize the trees back into a DOCX archive at *output_path*.

    All original zip members are carried over; ``word/document.xml`` and
    ``word/settings.xml`` are replaced by (or added from) the given trees.
    The caller's *members* dict is not modified. Missing parent directories
    of *output_path* are created.
    """

    def _serialize(tree: etree._Element) -> bytes:
        return etree.tostring(
            tree, xml_declaration=True, encoding="UTF-8", standalone=True
        )

    # Work on a copy so the dict passed in stays untouched.
    parts = {
        **members,
        "word/document.xml": _serialize(document_tree),
        "word/settings.xml": _serialize(settings_tree),
    }

    output_path.parent.mkdir(parents=True, exist_ok=True)

    # Build the whole archive in memory, then write it out in one call.
    payload = BytesIO()
    with zipfile.ZipFile(payload, "w", zipfile.ZIP_DEFLATED) as archive:
        for part_name, part_bytes in parts.items():
            archive.writestr(part_name, part_bytes)
    output_path.write_bytes(payload.getvalue())
def _ensure_track_revisions(settings_tree: etree._Element) -> None:
    """Make sure revision tracking is switched on in settings.xml.

    Adds ``<w:trackRevisions w:val="true"/>`` when the element is absent, and
    re-enables it when an existing element carries an explicit "off" value
    (``false`` / ``0`` / ``off``). An existing element that is already on is
    left untouched.

    Note: this flag governs whether further edits are recorded as revisions.
    NOTE(review): w:ins / w:del nodes already in the document are expected to
    show as tracked changes regardless of this flag — confirm against Word.
    """
    val_attr = _w("val")
    flag = settings_tree.find(_w("trackRevisions"))
    if flag is None:
        flag = etree.SubElement(settings_tree, _w("trackRevisions"))
        flag.set(val_attr, "true")
        return
    # A bare element (no w:val) already means "on"; only flip explicit offs.
    if flag.get(val_attr, "true").strip().lower() in ("false", "0", "off"):
        flag.set(val_attr, "true")
def _next_revision_id(document_tree: etree._Element) -> int:
    """Return one more than the largest numeric w:id found in the document.

    Only w:ins, w:del, bookmark start/end, w:commentRangeStart and w:comment
    descendants are inspected. Missing, empty or non-integer ids are ignored,
    so a document with no usable ids yields 1.
    """
    id_attr = _w("id")
    id_bearing_paths = (
        ".//w:ins", ".//w:del", ".//w:bookmarkStart", ".//w:bookmarkEnd",
        ".//w:commentRangeStart", ".//w:comment",
    )
    highest = 0
    for path in id_bearing_paths:
        for node in document_tree.iterfind(path, NSMAP):
            raw = node.get(id_attr)
            if not raw:
                continue
            try:
                candidate = int(raw)
            except ValueError:
                # Not a number — cannot collide with the ids we hand out.
                continue
            if candidate > highest:
                highest = candidate
    return highest + 1
def _find_bookmark(
    document_tree: etree._Element, name: str
) -> tuple[etree._Element | None, etree._Element | None]:
    """Locate the start and end markers of the bookmark called *name*.

    Returns ``(None, None)`` when no w:bookmarkStart has that name. Otherwise
    returns the first matching start marker and the first w:bookmarkEnd that
    carries the same w:id (``None`` if there is no such end marker).
    """
    name_attr = _w("name")
    id_attr = _w("id")

    start = next(
        (marker for marker in document_tree.iterfind(".//w:bookmarkStart", NSMAP)
         if marker.get(name_attr) == name),
        None,
    )
    if start is None:
        return None, None

    # Start and end markers are paired through their shared w:id.
    bookmark_id = start.get(id_attr)
    end = next(
        (marker for marker in document_tree.iterfind(".//w:bookmarkEnd", NSMAP)
         if marker.get(id_attr) == bookmark_id),
        None,
    )
    return start, end
def _find_enclosing_paragraph(element: etree._Element) -> etree._Element | None:
    """Return the nearest w:p at or above *element*, or None if there is none."""
    paragraph_tag = _w("p")
    node = element
    # Climb until a paragraph is reached or the root's parent (None) is hit.
    while node is not None and node.tag != paragraph_tag:
        node = node.getparent()
    return node
# ── Paragraph builders ─────────────────────────────────────────────
def _build_run(text: str, *, bold: bool = False, italic: bool = False,
               font: str = "David", size_half_pt: int | None = None) -> etree._Element:
    """Create a w:r holding *text* with right-to-left run properties.

    Args:
        text: the literal text placed in the run's w:t.
        bold: add w:b / w:bCs.
        italic: add w:i / w:iCs.
        font: font name used for the ascii, hAnsi and cs font slots.
        size_half_pt: font size in half-points for w:sz / w:szCs; omitted
            when None.

    Returns:
        The new, detached w:r element.
    """
    run = etree.Element(_w("r"))
    props = etree.SubElement(run, _w("rPr"))

    fonts = etree.SubElement(props, _w("rFonts"))
    for slot in ("ascii", "hAnsi", "cs"):
        fonts.set(_w(slot), font)
    fonts.set(_w("hint"), "cs")

    if size_half_pt is not None:
        for size_tag in ("sz", "szCs"):
            etree.SubElement(props, _w(size_tag)).set(_w("val"), str(size_half_pt))

    # Each toggle is emitted together with its complex-script twin.
    toggles: list[str] = []
    if bold:
        toggles += ["b", "bCs"]
    if italic:
        toggles += ["i", "iCs"]
    toggles.append("rtl")
    for toggle in toggles:
        etree.SubElement(props, _w(toggle))

    text_node = etree.SubElement(run, _w("t"))
    # xml:space="preserve" keeps leading/trailing whitespace in the text.
    text_node.set("{http://www.w3.org/XML/1998/namespace}space", "preserve")
    text_node.text = text
    return run
def _build_paragraph(text: str, *, style: StyleType = "body") -> etree._Element:
    """Create a bidi w:p containing a single run with *text*.

    The style preset selects the run formatting: "heading" is bold at 28
    half-points, "bold" is bold at the default size, "quote" is italic at 22
    half-points, and anything else is plain at the default size.
    """
    paragraph = etree.Element(_w("p"))
    para_props = etree.SubElement(paragraph, _w("pPr"))

    etree.SubElement(para_props, _w("bidi")).set(_w("val"), "1")
    # NOTE(review): Word may mirror jc left/right inside bidi paragraphs —
    # confirm that "right" produces the intended visual alignment.
    etree.SubElement(para_props, _w("jc")).set(_w("val"), "right")

    # Run properties of the paragraph mark itself.
    mark_props = etree.SubElement(para_props, _w("rPr"))
    etree.SubElement(mark_props, _w("rtl"))

    # Sizes are in half-points: 28 → 14pt, 22 → 11pt.
    size_by_style = {"heading": 28, "quote": 22}
    paragraph.append(
        _build_run(
            text,
            bold=style in ("heading", "bold"),
            italic=style == "quote",
            size_half_pt=size_by_style.get(style),
        )
    )
    return paragraph
def _wrap_in_ins(elements: list[etree._Element], *, ins_id: int,
                 author: str, date_iso: str) -> etree._Element:
    """Return a new <w:ins> that contains the given run-level *elements*.

    The elements are appended in order; lxml moves each one out of its
    previous parent, if it has one. The wrapper carries the revision id,
    author and date.
    """
    wrapper = etree.Element(_w("ins"))
    for attr_name, attr_value in (
        ("id", str(ins_id)),
        ("author", author),
        ("date", date_iso),
    ):
        wrapper.set(_w(attr_name), attr_value)
    for child in elements:
        wrapper.append(child)
    return wrapper
def _make_tracked_paragraph_insert(
    text: str, *, style: StyleType, ins_id: int, author: str, date_iso: str,
    mark_id: int | None = None,
) -> etree._Element:
    """Build a paragraph that is entirely a tracked insertion.

    Two markers are needed for a fully inserted paragraph:
      1. all <w:r> runs are wrapped in a single <w:ins> (id *ins_id*);
      2. the paragraph mark's run properties (pPr/rPr) get their own <w:ins>
         marker (id *mark_id*).

    Args:
        text: paragraph text.
        style: formatting preset passed on to _build_paragraph.
        ins_id: revision id for the run wrapper.
        author: revision author recorded on both markers.
        date_iso: revision timestamp recorded on both markers.
        mark_id: revision id for the paragraph-mark marker. When omitted it
            falls back to *ins_id*, so both markers then share one id.

    Returns:
        The new, detached w:p element.
    """
    if mark_id is None:
        mark_id = ins_id

    p = _build_paragraph(text, style=style)

    # Create pPr / rPr if the builder did not, rather than relying on an
    # assert that disappears under ``python -O``.
    pPr = p.find(_w("pPr"))
    if pPr is None:
        pPr = etree.Element(_w("pPr"))
        p.insert(0, pPr)
    rPr = pPr.find(_w("rPr"))
    if rPr is None:
        rPr = etree.SubElement(pPr, _w("rPr"))

    ins_mark = etree.Element(_w("ins"))
    ins_mark.set(_w("id"), str(mark_id))
    ins_mark.set(_w("author"), author)
    ins_mark.set(_w("date"), date_iso)
    # In the paragraph-mark rPr the schema (CT_ParaRPr) places track-change
    # markers before the regular run properties, so this goes first — ahead
    # of the <w:rtl/> the paragraph builder already added.
    rPr.insert(0, ins_mark)

    runs = [child for child in p if child.tag == _w("r")]
    if runs:
        # Appending to the wrapper moves each run out of the paragraph.
        p.append(_wrap_in_ins(runs, ins_id=ins_id, author=author, date_iso=date_iso))
    return p
def _mark_runs_as_deleted(paragraph: etree._Element, *, del_id: int,
                          author: str, date_iso: str) -> None:
    """Wrap the paragraph's direct <w:r> children in one <w:del>.

    Inside a <w:del>, text must be stored as <w:delText> and field codes as
    <w:delInstrText>, so <w:t> / <w:instrText> children of each run are
    renamed accordingly.

    The wrapper is inserted where the first run was, not at the end of the
    paragraph. That way markers surrounding the text (such as the
    bookmarkStart / bookmarkEnd used as anchors) keep enclosing it.

    Only runs that are direct children of *paragraph* are handled; a
    paragraph without any is left unchanged.
    NOTE(review): the paragraph mark itself is not flagged as deleted, so
    accepting the change presumably leaves an empty paragraph — confirm.
    """
    runs = [child for child in paragraph if child.tag == _w("r")]
    if not runs:
        return

    wrapper = etree.Element(_w("del"))
    wrapper.set(_w("id"), str(del_id))
    wrapper.set(_w("author"), author)
    wrapper.set(_w("date"), date_iso)

    # Put the wrapper in the first run's slot before moving anything.
    paragraph.insert(list(paragraph).index(runs[0]), wrapper)

    renames = (("t", "delText"), ("instrText", "delInstrText"))
    for run in runs:
        for live_tag, deleted_tag in renames:
            for node in run.findall(_w(live_tag)):
                node.tag = _w(deleted_tag)
        # append() moves the run from the paragraph into the wrapper.
        wrapper.append(run)
# ── Revision application ───────────────────────────────────────────
def _apply_insert(
    document_tree: etree._Element,
    revision: Revision,
    *,
    ins_id: int,
    author: str,
    date_iso: str,
) -> RevisionResult:
    """Insert a tracked paragraph before or after a bookmark.

    "insert_before" anchors on the bookmark's start marker; any other type is
    treated as "insert_after" and anchors on the end marker (or on the start
    marker when the bookmark has no end). The new paragraph becomes a sibling
    of the paragraph enclosing the anchor.

    Bookmark markers may also sit directly under w:body or w:tc, between
    paragraphs. In that case the marker itself is the reference sibling, so
    the insert still succeeds.

    Two revision ids are used: *ins_id* for the run wrapper and
    ``ins_id + 1`` for the paragraph mark.

    Returns:
        A RevisionResult with status "applied" (ins_id set) or "failed"
        (error set) when the bookmark or a usable insertion point is missing.
    """
    def _failed(message: str) -> RevisionResult:
        return RevisionResult(id=revision.id, status="failed", error=message)

    start, end = _find_bookmark(document_tree, revision.anchor_bookmark)
    if start is None:
        return _failed(f"bookmark '{revision.anchor_bookmark}' not found")

    before = revision.type == "insert_before"
    anchor = start if before or end is None else end

    reference = _find_enclosing_paragraph(anchor)
    if reference is None:
        # Marker lives between block-level elements: use it as the reference,
        # but only where a paragraph is a valid sibling.
        container = anchor.getparent()
        if container is not None and container.tag in (_w("body"), _w("tc")):
            reference = anchor
    if reference is None:
        return _failed("anchor has no enclosing paragraph")

    # Build new tracked paragraph. ins_id for run wrapper, ins_id+1 for mark.
    new_p = _make_tracked_paragraph_insert(
        revision.content, style=revision.style,
        ins_id=ins_id, mark_id=ins_id + 1,
        author=author, date_iso=date_iso,
    )

    parent = reference.getparent()
    if parent is None:
        return _failed("enclosing paragraph has no parent")

    idx = list(parent).index(reference)
    parent.insert(idx if before else idx + 1, new_p)
    return RevisionResult(id=revision.id, status="applied", ins_id=ins_id)
def _apply_delete(
    document_tree: etree._Element,
    revision: Revision,
    *,
    del_id: int,
    author: str,
    date_iso: str,
) -> RevisionResult:
    """Flag the paragraph holding the bookmark's start marker as deleted.

    Returns a "failed" result when the bookmark is missing or its start
    marker is not inside a paragraph. On success, the result's ``ins_id``
    carries the id given to the <w:del> wrapper.
    """
    def _failed(message: str) -> RevisionResult:
        return RevisionResult(id=revision.id, status="failed", error=message)

    bookmark_start, _ = _find_bookmark(document_tree, revision.anchor_bookmark)
    if bookmark_start is None:
        return _failed(f"bookmark '{revision.anchor_bookmark}' not found")

    target = _find_enclosing_paragraph(bookmark_start)
    if target is None:
        return _failed("anchor has no enclosing paragraph")

    _mark_runs_as_deleted(target, del_id=del_id, author=author, date_iso=date_iso)
    return RevisionResult(id=revision.id, status="applied", ins_id=del_id)
def _apply_replace(
    document_tree: etree._Element,
    revision: Revision,
    *,
    ins_id: int,
    del_id: int,
    author: str,
    date_iso: str,
) -> RevisionResult:
    """Replace a paragraph: tracked insert of the new text plus tracked delete.

    The paragraph holding the bookmark's start marker is the target. The
    replacement paragraph is inserted right after it (ids *ins_id* and
    ``ins_id + 1``), then the target's runs are marked deleted (*del_id*).
    Returns a "failed" result when the bookmark, its enclosing paragraph or
    that paragraph's parent cannot be found.
    """
    def _failed(message: str) -> RevisionResult:
        return RevisionResult(id=revision.id, status="failed", error=message)

    bookmark_start, _ = _find_bookmark(document_tree, revision.anchor_bookmark)
    if bookmark_start is None:
        return _failed(f"bookmark '{revision.anchor_bookmark}' not found")

    old_paragraph = _find_enclosing_paragraph(bookmark_start)
    if old_paragraph is None:
        return _failed("anchor has no enclosing paragraph")

    container = old_paragraph.getparent()
    if container is None:
        return _failed("enclosing paragraph has no parent")

    # Build the replacement first so a failure here leaves the tree untouched.
    replacement = _make_tracked_paragraph_insert(
        revision.content,
        style=revision.style,
        ins_id=ins_id,
        mark_id=ins_id + 1,
        author=author,
        date_iso=date_iso,
    )
    position = list(container).index(old_paragraph)
    container.insert(position + 1, replacement)

    _mark_runs_as_deleted(
        old_paragraph, del_id=del_id, author=author, date_iso=date_iso
    )
    return RevisionResult(id=revision.id, status="applied", ins_id=ins_id)
# ── Public API ─────────────────────────────────────────────────────
def apply_tracked_revisions(
    source_path: str | Path,
    output_path: str | Path,
    revisions: list[Revision],
    *,
    author: str = "מערכת AI",
    date: datetime | None = None,
) -> RevisionBatchResult:
    """Apply a batch of tracked revisions to a DOCX, producing a new DOCX.

    The source is read fully into memory and the result is written to
    *output_path*, so the source file is left untouched unless both paths
    name the same file. The output carries <w:ins> / <w:del> markers intended
    to be shown by Word as Track Changes (Accept/Reject).

    Args:
        source_path: existing DOCX to revise.
        output_path: where to write the revised DOCX.
        revisions: Revision objects; anchors are bookmark names.
        author: recorded as the revision author.
        date: revision timestamp. Defaults to the current time in UTC. A
            timezone-aware value is converted to UTC before formatting; a
            naive value is written as-is and labelled UTC.

    Returns:
        RevisionBatchResult with one RevisionResult per revision, the
        applied/failed counts and the output path. A revision that raises is
        logged and reported as failed; the rest of the batch still runs.

    Raises:
        ValueError: the source archive has no word/document.xml.
    """
    source_path = Path(source_path)
    output_path = Path(output_path)

    if date is None:
        date = datetime.now(timezone.utc)
    elif date.utcoffset() is not None:
        # The timestamp is written with a literal "Z", so it must be in UTC.
        date = date.astimezone(timezone.utc)
    date_iso = date.strftime("%Y-%m-%dT%H:%M:%SZ")

    members, doc_tree, settings_tree = _load_docx_xml(source_path)
    _ensure_track_revisions(settings_tree)
    next_id = _next_revision_id(doc_tree)

    batch = RevisionBatchResult()
    for rev in revisions:
        # Ids are reserved even when a revision fails; gaps are harmless.
        try:
            if rev.type in ("insert_after", "insert_before"):
                result = _apply_insert(doc_tree, rev, ins_id=next_id,
                                       author=author, date_iso=date_iso)
                # insert consumes 2 IDs: run-wrapper + paragraph-mark
                next_id += 2
            elif rev.type == "delete":
                result = _apply_delete(doc_tree, rev, del_id=next_id,
                                       author=author, date_iso=date_iso)
                next_id += 1
            elif rev.type == "replace":
                result = _apply_replace(doc_tree, rev,
                                        ins_id=next_id, del_id=next_id + 2,
                                        author=author, date_iso=date_iso)
                # replace consumes 3 IDs: ins-run, ins-mark, del
                next_id += 3
            else:
                result = RevisionResult(id=rev.id, status="failed",
                                        error=f"unknown type: {rev.type}")
        except Exception as e:  # pragma: no cover - defensive
            # Best-effort batch: report the failure and continue.
            logger.exception("revision %s failed", rev.id)
            result = RevisionResult(id=rev.id, status="failed", error=str(e))

        batch.results.append(result)
        if result.status == "applied":
            batch.applied += 1
        else:
            batch.failed += 1

    _save_docx_xml(members, doc_tree, settings_tree, output_path)
    batch.output_path = str(output_path)
    logger.info("applied %d revisions (failed %d) → %s",
                batch.applied, batch.failed, output_path)
    return batch
def list_bookmarks(docx_path: str | Path) -> list[str]:
    """List the bookmark names in a DOCX, in document order.

    Names beginning with an underscore are treated as internal and left out,
    as are bookmarks with a missing or empty name.
    """
    _members, doc_tree, _settings = _load_docx_xml(Path(docx_path))
    name_attr = _w("name")
    candidates = (
        marker.get(name_attr)
        for marker in doc_tree.iterfind(".//w:bookmarkStart", NSMAP)
    )
    return [
        candidate for candidate in candidates
        if candidate and not candidate.startswith("_")
    ]
def copy_with_revisions(
    source_path: str | Path, output_path: str | Path,
) -> None:
    """Copy source → output unchanged (used when revisions list is empty).

    Missing parent directories of *output_path* are created first, matching
    what the regular save path does, so both ways of producing an output file
    accept the same destinations.

    Args:
        source_path: DOCX to copy.
        output_path: destination file path.
    """
    destination = Path(output_path)
    destination.parent.mkdir(parents=True, exist_ok=True)
    shutil.copy2(str(source_path), str(destination))