Add Track Changes architecture for draft revisions (CMP + CMPA)
All checks were successful
Build & Deploy / build-and-deploy (push) Successful in 1m29s
All checks were successful
Build & Deploy / build-and-deploy (push) Successful in 1m29s
Fixes critical bug in 1033-25: user-uploaded עריכה-*.docx files were
orphaned on disk while exports kept rebuilding from stale DB blocks.
New architecture:
- User-uploaded DOCX becomes the source of truth (cases.active_draft_path)
- System edits via XML surgery with real Word <w:ins>/<w:del> revisions
- User can Accept/Reject each change from within Word
Components:
- docx_reviser.py: XML surgery for Track Changes (15 tests)
- docx_retrofit.py: retroactive bookmark injection with Hebrew marker
detection + heading heuristic (9 tests)
- docx_exporter.py: emits bookmarks around each of the 12 blocks
- 3 new MCP tools: apply_user_edit, list_bookmarks, revise_draft
- 4 new/updated endpoints: upload (auto-registers active draft),
/exports/revise, /exports/bookmarks, /exports/{filename}/retrofit,
/active-draft
- DB migration: cases.active_draft_path column
- UI: correct banner using real v-numbers, "מקור האמת" badge,
detailed upload toast with bookmarks_added/missing_blocks
- agents: legal-exporter (3 export modes), legal-ceo (stage G for
revision handling), legal-writer (revision mode)
Multi-tenancy:
- Works for both CMP (1xxx cases) and CMPA (8xxx/9xxx cases)
- New revise-draft skill added to both companies
- deploy-track-changes.sh syncs skills CMP ↔ CMPA
- retrofit_case.py: one-off retrofit of existing files
Tests: 34 passing (15 reviser + 9 retrofit + 4 exporter bookmarks + 6 e2e)
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
514
mcp-server/src/legal_mcp/services/docx_reviser.py
Normal file
514
mcp-server/src/legal_mcp/services/docx_reviser.py
Normal file
@@ -0,0 +1,514 @@
|
||||
"""עריכת DOCX עם Track Changes אמיתיים של Word.
|
||||
|
||||
השירות מיועד לקבל DOCX קיים (עם bookmarks שזיהו אנקורים) ולהחיל עליו
|
||||
עריכות מסומנות כ-w:ins / w:del, שבאים לידי ביטוי ב-Word כ-Track Changes
|
||||
שהמשתמש יכול Accept/Reject.
|
||||
|
||||
אסטרטגיית אנקורים: bookmarks בשמות כגון 'block-yod', 'block-yod-para-3'
|
||||
שמוכנסים בזמן הייצוא הראשוני (docx_exporter.py) או רטרואקטיבית
|
||||
(docx_retrofit.py).
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import shutil
|
||||
import zipfile
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import datetime, timezone
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
from typing import Literal
|
||||
|
||||
from lxml import etree
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# ── XML namespaces ─────────────────────────────────────────────────
|
||||
|
||||
# URI of the main WordprocessingML namespace (conventionally bound to "w:").
W_NS = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
# Prefix → URI map passed to lxml path lookups so they can use the "w:" prefix.
NSMAP = dict(w=W_NS)
|
||||
|
||||
|
||||
def _w(tag: str) -> str:
    """Return *tag* qualified with the w: namespace in ``{uri}tag`` form."""
    return "{" + W_NS + "}" + tag
|
||||
|
||||
|
||||
# ── Data models ────────────────────────────────────────────────────
|
||||
|
||||
|
||||
# Operations a Revision may request, relative to its anchor bookmark.
RevisionType = Literal["insert_after", "insert_before", "replace", "delete"]
# Formatting presets understood by the paragraph builder.
StyleType = Literal["body", "quote", "heading", "bold"]
|
||||
|
||||
|
||||
@dataclass
class Revision:
    """One tracked change requested against the DOCX.

    The change is located through a bookmark name and carried out according
    to ``type``.
    """

    # Caller-chosen identifier; echoed back in the matching RevisionResult.
    id: str
    # Which operation to perform (see RevisionType).
    type: RevisionType
    # Name of the bookmark that anchors the change.
    anchor_bookmark: str
    # Text of the new paragraph; used by the insert and replace operations.
    content: str = ""
    # Formatting preset applied to the new paragraph.
    style: StyleType = "body"
    # Free-text rationale; not read by the code visible in this module.
    reason: str = ""
    # Not read by the code visible in this module.
    anchor_position: Literal["start", "end"] = "end"
|
||||
|
||||
|
||||
@dataclass
class RevisionResult:
    """Outcome of attempting one Revision."""

    # Identifier copied from the Revision this result belongs to.
    id: str
    # "applied" on success, "failed" otherwise.
    status: Literal["applied", "failed"]
    # Human-readable failure description; None when the revision was applied.
    error: str | None = None
    # Revision id written into the document; the delete path stores its
    # w:del id here as well.
    ins_id: int | None = None
|
||||
|
||||
|
||||
@dataclass
class RevisionBatchResult:
    """Summary of a whole batch of revisions."""

    # Number of revisions whose status is "applied".
    applied: int = 0
    # Number of revisions whose status is "failed".
    failed: int = 0
    # Per-revision outcomes, in the order the revisions were processed.
    results: list[RevisionResult] = field(default_factory=list)
    # Path of the DOCX that was written, as a string.
    output_path: str = ""
|
||||
|
||||
|
||||
# ── XML helpers ────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def _load_docx_xml(docx_path: Path) -> tuple[dict[str, bytes], etree._Element, etree._Element]:
    """Read every member of a DOCX zip and parse its document/settings XML.

    Args:
        docx_path: path of the DOCX file to read.

    Returns:
        ``(members, document_tree, settings_tree)`` where ``members`` maps
        each zip member name to its raw bytes. When the package has no
        (or an empty) ``word/settings.xml`` an empty ``w:settings`` element
        is returned instead.

    Raises:
        ValueError: if the package has no ``word/document.xml``.
    """
    with zipfile.ZipFile(docx_path, "r") as zf:
        members = {name: zf.read(name) for name in zf.namelist()}

    if "word/document.xml" not in members:
        raise ValueError(f"{docx_path}: missing word/document.xml")

    # The DOCX may be user-uploaded: never expand DTD-declared entities or
    # touch the network while parsing it. Predefined entities (&amp; etc.)
    # are unaffected.
    parser = etree.XMLParser(resolve_entities=False, no_network=True)

    document_tree = etree.fromstring(members["word/document.xml"], parser)
    settings_bytes = members.get("word/settings.xml")
    if settings_bytes:
        settings_tree = etree.fromstring(settings_bytes, parser)
    else:
        settings_tree = etree.Element(_w("settings"), nsmap=NSMAP)

    return members, document_tree, settings_tree
|
||||
|
||||
|
||||
def _save_docx_xml(
    members: dict[str, bytes],
    document_tree: etree._Element,
    settings_tree: etree._Element,
    output_path: Path,
) -> None:
    """Serialize the two XML trees and write the full package to *output_path*.

    The caller's ``members`` mapping is not modified; a copy receives the
    re-serialized ``word/document.xml`` and ``word/settings.xml``. Missing
    parent directories of *output_path* are created.
    """

    def _to_bytes(tree: etree._Element) -> bytes:
        return etree.tostring(
            tree, xml_declaration=True, encoding="UTF-8", standalone=True
        )

    package = {
        **members,
        "word/document.xml": _to_bytes(document_tree),
        "word/settings.xml": _to_bytes(settings_tree),
    }

    output_path.parent.mkdir(parents=True, exist_ok=True)

    # Assemble the archive in memory, then write it out in one call.
    archive = BytesIO()
    with zipfile.ZipFile(archive, "w", zipfile.ZIP_DEFLATED) as zf:
        for member_name, blob in package.items():
            zf.writestr(member_name, blob)
    output_path.write_bytes(archive.getvalue())
|
||||
|
||||
|
||||
def _ensure_track_revisions(settings_tree: etree._Element) -> None:
    """Make sure ``<w:trackRevisions>`` is present and switched on.

    ``w:trackRevisions`` asks the application to record *further* edits as
    revisions. Existing ``w:ins`` / ``w:del`` nodes are tracked changes
    regardless of this flag.

    Args:
        settings_tree: root ``w:settings`` element; modified in place.
    """
    val_attr = _w("val")
    existing = settings_tree.find(_w("trackRevisions"))
    if existing is not None:
        # An explicit "off" value would leave tracking disabled even though
        # the element exists; a missing w:val already means "on".
        if (existing.get(val_attr) or "").strip().lower() in ("false", "0", "off"):
            existing.set(val_attr, "true")
        return

    # w:settings children form an ordered sequence in the schema. These are
    # the elements that come before w:trackRevisions; the new element goes
    # right after the last one of them that is present.
    predecessors = {
        _w(name)
        for name in (
            "writeProtection", "view", "zoom", "removePersonalInformation",
            "removeDateAndTime", "doNotDisplayPageBoundaries",
            "displayBackgroundShape", "printPostScriptOverText",
            "printFractionalCharacterWidth", "printFormsData",
            "embedTrueTypeFonts", "embedSystemFonts", "saveSubsetFonts",
            "saveFormsData", "mirrorMargins", "alignBordersAndEdges",
            "bordersDoNotSurroundHeader", "bordersDoNotSurroundFooter",
            "gutterAtTop", "hideSpellingErrors", "hideGrammaticalErrors",
            "activeWritingStyle", "proofState", "formsDesign",
            "attachedTemplate", "linkStyles", "stylePaneFormatFilter",
            "stylePaneSortMethod", "documentType", "mailMerge",
            "revisionView",
        )
    }
    insert_at = 0
    for index, child in enumerate(settings_tree):
        if child.tag in predecessors:
            insert_at = index + 1

    el = etree.Element(_w("trackRevisions"))
    el.set(val_attr, "true")
    settings_tree.insert(insert_at, el)
|
||||
|
||||
|
||||
def _next_revision_id(document_tree: etree._Element) -> int:
    """Return one more than the largest numeric ``w:id`` in the document tree.

    Every ``w:id`` attribute is considered, whichever element carries it
    (insertions, deletions, bookmarks, comments, property changes, moves, …),
    so the returned id cannot collide with any existing annotation in this
    tree. Non-numeric values are ignored. Returns 1 when no numeric ``w:id``
    exists (or when all are negative).

    Only the tree passed in is inspected.
    """
    max_id = 0
    for raw in document_tree.xpath(".//@w:id", namespaces=NSMAP):
        try:
            max_id = max(max_id, int(raw))
        except ValueError:
            # Ids that are not plain integers cannot clash with ours.
            continue
    return max_id + 1
|
||||
|
||||
|
||||
def _find_bookmark(
    document_tree: etree._Element, name: str
) -> tuple[etree._Element | None, etree._Element | None]:
    """Locate the start/end markers of the bookmark called *name*.

    Returns ``(start, end)``. Both are None when no ``w:bookmarkStart`` has
    that name; ``end`` alone is None when no ``w:bookmarkEnd`` shares the
    start marker's ``w:id``. The first match in document order wins.
    """
    name_attr = _w("name")
    id_attr = _w("id")

    start = next(
        (
            marker
            for marker in document_tree.iterfind(".//w:bookmarkStart", NSMAP)
            if marker.get(name_attr) == name
        ),
        None,
    )
    if start is None:
        return None, None

    wanted_id = start.get(id_attr)
    end = next(
        (
            marker
            for marker in document_tree.iterfind(".//w:bookmarkEnd", NSMAP)
            if marker.get(id_attr) == wanted_id
        ),
        None,
    )
    return start, end
|
||||
|
||||
|
||||
def _find_enclosing_paragraph(element: etree._Element) -> etree._Element | None:
    """Return the nearest ``w:p`` at or above *element*, or None if there is none."""
    paragraph_tag = _w("p")
    node = element
    # Climb until a paragraph is reached or the top of the tree is passed.
    while node is not None and node.tag != paragraph_tag:
        node = node.getparent()
    return node
|
||||
|
||||
|
||||
# ── Paragraph builders ─────────────────────────────────────────────
|
||||
|
||||
|
||||
def _build_run(text: str, *, bold: bool = False, italic: bool = False,
               font: str = "David", size_half_pt: int | None = None) -> etree._Element:
    """Build a ``w:r`` (run) carrying *text* with RTL formatting.

    Args:
        text: run text. Line breaks become ``<w:br/>`` and tab characters
            become ``<w:tab/>``; everything else is written to ``<w:t>``
            elements with ``xml:space="preserve"``.
        bold: add ``w:b`` / ``w:bCs``.
        italic: add ``w:i`` / ``w:iCs``.
        font: font name set for the ascii, hAnsi and cs slots.
        size_half_pt: font size in half-points (``w:sz`` / ``w:szCs``);
            omitted when None.

    Returns:
        The new, unattached ``w:r`` element.
    """
    r = etree.Element(_w("r"))
    rPr = etree.SubElement(r, _w("rPr"))

    rFonts = etree.SubElement(rPr, _w("rFonts"))
    rFonts.set(_w("ascii"), font)
    rFonts.set(_w("hAnsi"), font)
    rFonts.set(_w("cs"), font)
    rFonts.set(_w("hint"), "cs")

    if size_half_pt is not None:
        sz = etree.SubElement(rPr, _w("sz"))
        sz.set(_w("val"), str(size_half_pt))
        szCs = etree.SubElement(rPr, _w("szCs"))
        szCs.set(_w("val"), str(size_half_pt))

    if bold:
        etree.SubElement(rPr, _w("b"))
        etree.SubElement(rPr, _w("bCs"))
    if italic:
        etree.SubElement(rPr, _w("i"))
        etree.SubElement(rPr, _w("iCs"))

    etree.SubElement(rPr, _w("rtl"))

    # A raw newline or tab inside <w:t> is rendered as ordinary whitespace,
    # so they are emitted as explicit break / tab elements instead.
    lines = text.replace("\r\n", "\n").replace("\r", "\n").split("\n")
    for line_no, line in enumerate(lines):
        if line_no:
            etree.SubElement(r, _w("br"))
        cells = line.split("\t")
        for cell_no, cell in enumerate(cells):
            if cell_no:
                etree.SubElement(r, _w("tab"))
            # Skip empty fragments, but keep a single (possibly empty) <w:t>
            # for plain text that has no breaks or tabs at all.
            if cell or (len(lines) == 1 and len(cells) == 1):
                t = etree.SubElement(r, _w("t"))
                t.set("{http://www.w3.org/XML/1998/namespace}space", "preserve")
                t.text = cell
    return r
|
||||
|
||||
|
||||
def _build_paragraph(text: str, *, style: StyleType = "body") -> etree._Element:
    """Build a ``w:p`` holding one run of *text*, formatted per *style*.

    The paragraph is flagged bidi, justified "right", and its paragraph-mark
    run properties carry ``w:rtl``. Styles map to run formatting as follows:
    heading → bold 14pt, bold → bold, quote → italic 11pt, anything else →
    plain.
    """
    # style → (bold, italic, size in half-points)
    presets = {
        "heading": (True, False, 28),   # 14pt
        "bold": (True, False, None),
        "quote": (False, True, 22),     # 11pt
    }
    is_bold, is_italic, half_points = presets.get(style, (False, False, None))

    p = etree.Element(_w("p"))
    pPr = etree.SubElement(p, _w("pPr"))

    etree.SubElement(pPr, _w("bidi")).set(_w("val"), "1")
    etree.SubElement(pPr, _w("jc")).set(_w("val"), "right")

    mark_props = etree.SubElement(pPr, _w("rPr"))
    etree.SubElement(mark_props, _w("rtl"))

    p.append(
        _build_run(text, bold=is_bold, italic=is_italic, size_half_pt=half_points)
    )
    return p
|
||||
|
||||
|
||||
def _wrap_in_ins(elements: list[etree._Element], *, ins_id: int,
                 author: str, date_iso: str) -> etree._Element:
    """Return a new ``<w:ins>`` containing *elements* (run-level nodes), in order.

    The wrapper carries ``w:id``, ``w:author`` and ``w:date``. Elements that
    already have a parent are moved into the wrapper.
    """
    ins = etree.Element(_w("ins"))
    for attr_name, attr_value in (
        ("id", str(ins_id)),
        ("author", author),
        ("date", date_iso),
    ):
        ins.set(_w(attr_name), attr_value)
    ins.extend(elements)
    return ins
|
||||
|
||||
|
||||
def _make_tracked_paragraph_insert(
    text: str, *, style: StyleType, ins_id: int, author: str, date_iso: str,
    mark_id: int | None = None,
) -> etree._Element:
    """Build a paragraph that is entirely a tracked insertion.

    Two markers are produced:

    1. every ``<w:r>`` of the paragraph is wrapped in one ``<w:ins>``
       carrying ``ins_id``;
    2. the paragraph mark is flagged as inserted through an ``<w:ins>``
       inside ``pPr/rPr``, carrying ``mark_id``.

    Args:
        text: paragraph text.
        style: formatting preset passed to the paragraph builder.
        ins_id: revision id for the run wrapper.
        author: revision author written on both markers.
        date_iso: revision timestamp written on both markers.
        mark_id: revision id for the paragraph-mark marker. When omitted it
            falls back to ``ins_id``, i.e. both markers share one id; pass a
            distinct value to keep ids unique.

    Returns:
        The new, unattached ``w:p`` element.
    """
    if mark_id is None:
        mark_id = ins_id

    p = _build_paragraph(text, style=style)
    pPr = p.find(_w("pPr"))
    if pPr is None:
        # Paragraph properties must be the first child of the paragraph.
        pPr = etree.Element(_w("pPr"))
        p.insert(0, pPr)
    rPr = pPr.find(_w("rPr"))
    if rPr is None:
        rPr = etree.SubElement(pPr, _w("rPr"))

    ins_mark = etree.Element(_w("ins"))
    ins_mark.set(_w("id"), str(mark_id))
    ins_mark.set(_w("author"), author)
    ins_mark.set(_w("date"), date_iso)
    # In paragraph-mark run properties the track-change marker precedes the
    # ordinary run properties (such as w:rtl), so it goes first.
    rPr.insert(0, ins_mark)

    runs = [child for child in list(p) if child.tag == _w("r")]
    if runs:
        for r in runs:
            p.remove(r)
        ins = _wrap_in_ins(runs, ins_id=ins_id, author=author, date_iso=date_iso)
        p.append(ins)
    return p
|
||||
|
||||
|
||||
def _mark_runs_as_deleted(paragraph: etree._Element, *, del_id: int,
                          author: str, date_iso: str) -> None:
    """Wrap the paragraph's direct ``<w:r>`` children in ``<w:del>``, in place.

    Each contiguous group of runs gets its own ``<w:del>`` at the position
    the group occupied, so the order of runs relative to the paragraph's
    other children (bookmark markers, hyperlinks, …) is preserved. All
    wrappers created by one call carry the same ``del_id``, author and date.

    Inside the deleted runs ``<w:t>`` is renamed to ``<w:delText>`` and
    ``<w:instrText>`` to ``<w:delInstrText>``.

    Only runs that are direct children of *paragraph* are touched; runs
    nested in other containers and the paragraph mark itself are left as
    they are. A paragraph without direct runs is not modified.
    """
    run_tag = _w("r")
    renames = (
        (_w("t"), _w("delText")),
        (_w("instrText"), _w("delInstrText")),
    )

    wrapper = None
    for child in list(paragraph):
        if child.tag != run_tag:
            # Any non-run child ends the current contiguous group of runs.
            wrapper = None
            continue

        for old_tag, new_tag in renames:
            for node in child.findall(old_tag):
                node.tag = new_tag

        if wrapper is None:
            wrapper = etree.Element(_w("del"))
            wrapper.set(_w("id"), str(del_id))
            wrapper.set(_w("author"), author)
            wrapper.set(_w("date"), date_iso)
            paragraph.insert(paragraph.index(child), wrapper)
        # Appending moves the run out of the paragraph into the wrapper.
        wrapper.append(child)
|
||||
|
||||
|
||||
# ── Revision application ───────────────────────────────────────────
|
||||
|
||||
|
||||
def _apply_insert(
    document_tree: etree._Element,
    revision: Revision,
    *,
    ins_id: int,
    author: str,
    date_iso: str,
) -> RevisionResult:
    """Insert a tracked paragraph before or after the revision's bookmark.

    ``insert_before`` places the new paragraph ahead of the paragraph that
    holds the bookmark start. Any other type is treated as ``insert_after``
    and places it behind the paragraph that holds the bookmark end (or the
    start when the bookmark has no end marker). When the chosen marker is
    not inside a paragraph (for example it sits directly between
    paragraphs) the new paragraph is placed next to the marker itself.

    Two revision ids are used: ``ins_id`` for the run wrapper and
    ``ins_id + 1`` for the paragraph mark.

    Returns:
        A RevisionResult: "applied" with ``ins_id`` set, or "failed" with
        an error message when the bookmark is missing or the insertion
        point has no parent.
    """
    start, end = _find_bookmark(document_tree, revision.anchor_bookmark)
    if start is None:
        return RevisionResult(id=revision.id, status="failed",
                              error=f"bookmark '{revision.anchor_bookmark}' not found")

    before = revision.type == "insert_before"
    if before:
        anchor = start
    else:  # insert_after (default)
        anchor = end if end is not None else start

    enclosing_p = _find_enclosing_paragraph(anchor)
    # A marker placed directly between paragraphs has no enclosing w:p;
    # position the new paragraph relative to the marker in that case.
    reference = enclosing_p if enclosing_p is not None else anchor

    parent = reference.getparent()
    if parent is None:
        message = (
            "enclosing paragraph has no parent"
            if enclosing_p is not None
            else "anchor has no enclosing paragraph"
        )
        return RevisionResult(id=revision.id, status="failed", error=message)

    # ins_id marks the run wrapper, ins_id + 1 the paragraph mark.
    new_p = _make_tracked_paragraph_insert(
        revision.content, style=revision.style,
        ins_id=ins_id, mark_id=ins_id + 1,
        author=author, date_iso=date_iso,
    )

    position = parent.index(reference)
    parent.insert(position if before else position + 1, new_p)

    return RevisionResult(id=revision.id, status="applied", ins_id=ins_id)
|
||||
|
||||
|
||||
def _apply_delete(
    document_tree: etree._Element,
    revision: Revision,
    *,
    del_id: int,
    author: str,
    date_iso: str,
) -> RevisionResult:
    """Mark the runs of the paragraph holding the bookmark start as deleted.

    Only the paragraph that contains the bookmark's start marker is
    affected; the end marker is not consulted.

    Returns:
        A RevisionResult: "applied" (with ``ins_id`` holding ``del_id``) or
        "failed" when the bookmark is missing or its start marker is not
        inside a paragraph.
    """

    def _failed(message: str) -> RevisionResult:
        return RevisionResult(id=revision.id, status="failed", error=message)

    start, _ = _find_bookmark(document_tree, revision.anchor_bookmark)
    if start is None:
        return _failed(f"bookmark '{revision.anchor_bookmark}' not found")

    target = _find_enclosing_paragraph(start)
    if target is None:
        return _failed("anchor has no enclosing paragraph")

    _mark_runs_as_deleted(target, del_id=del_id, author=author, date_iso=date_iso)
    return RevisionResult(id=revision.id, status="applied", ins_id=del_id)
|
||||
|
||||
|
||||
def _apply_replace(
    document_tree: etree._Element,
    revision: Revision,
    *,
    ins_id: int,
    del_id: int,
    author: str,
    date_iso: str,
) -> RevisionResult:
    """Replace a paragraph: tracked insert of the new text + tracked delete of the old.

    The new paragraph is inserted directly after the paragraph holding the
    bookmark start (ids ``ins_id`` and ``ins_id + 1``), then the runs of the
    old paragraph are marked deleted with ``del_id``.

    Returns:
        A RevisionResult: "applied" with ``ins_id`` set, or "failed" when
        the bookmark is missing, its start marker is not inside a paragraph,
        or that paragraph has no parent.
    """

    def _failed(message: str) -> RevisionResult:
        return RevisionResult(id=revision.id, status="failed", error=message)

    start, _ = _find_bookmark(document_tree, revision.anchor_bookmark)
    if start is None:
        return _failed(f"bookmark '{revision.anchor_bookmark}' not found")

    old_p = _find_enclosing_paragraph(start)
    if old_p is None:
        return _failed("anchor has no enclosing paragraph")

    container = old_p.getparent()
    if container is None:
        return _failed("enclosing paragraph has no parent")

    replacement = _make_tracked_paragraph_insert(
        revision.content, style=revision.style,
        ins_id=ins_id, mark_id=ins_id + 1,
        author=author, date_iso=date_iso,
    )
    container.insert(list(container).index(old_p) + 1, replacement)

    _mark_runs_as_deleted(old_p, del_id=del_id, author=author, date_iso=date_iso)
    return RevisionResult(id=revision.id, status="applied", ins_id=ins_id)
|
||||
|
||||
|
||||
# ── Public API ─────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def apply_tracked_revisions(
    source_path: str | Path,
    output_path: str | Path,
    revisions: list[Revision],
    *,
    author: str = "מערכת AI",
    date: datetime | None = None,
) -> RevisionBatchResult:
    """Apply a batch of tracked revisions to a DOCX and write the result.

    The source is read fully into memory and the result is written to
    ``output_path``; the source file is left untouched as long as the two
    paths differ. The output contains ``<w:ins>`` / ``<w:del>`` markers that
    Word shows as Track Changes the user can accept or reject.

    Revisions are applied in list order. A revision that fails (or raises)
    is recorded as "failed" and does not stop the batch; the output file is
    written in every case.

    Args:
        source_path: existing DOCX to read.
        output_path: where the revised DOCX is written.
        revisions: revisions to apply; anchors are bookmark names.
        author: revision author shown in Word.
        date: revision timestamp. Defaults to the current UTC time. A
            timezone-aware value is converted to UTC; a naive value is
            written as given and labelled UTC.

    Returns:
        RevisionBatchResult with counters, per-revision results and the
        output path.

    Raises:
        ValueError: if the source package has no ``word/document.xml``.
    """
    source_path = Path(source_path)
    output_path = Path(output_path)

    if date is None:
        date = datetime.now(timezone.utc)
    elif date.utcoffset() is not None:
        # The timestamp is written with a literal "Z", so an aware datetime
        # in another zone must be converted to UTC first.
        date = date.astimezone(timezone.utc)
    date_iso = date.strftime("%Y-%m-%dT%H:%M:%SZ")

    members, doc_tree, settings_tree = _load_docx_xml(source_path)
    _ensure_track_revisions(settings_tree)

    next_id = _next_revision_id(doc_tree)

    batch = RevisionBatchResult()
    for rev in revisions:
        try:
            if rev.type in ("insert_after", "insert_before"):
                result = _apply_insert(doc_tree, rev, ins_id=next_id,
                                       author=author, date_iso=date_iso)
                # insert consumes 2 ids: run wrapper + paragraph mark
                next_id += 2
            elif rev.type == "delete":
                result = _apply_delete(doc_tree, rev, del_id=next_id,
                                       author=author, date_iso=date_iso)
                next_id += 1
            elif rev.type == "replace":
                result = _apply_replace(doc_tree, rev,
                                        ins_id=next_id, del_id=next_id + 2,
                                        author=author, date_iso=date_iso)
                # replace consumes 3 ids: ins run, ins mark, del
                next_id += 3
            else:
                result = RevisionResult(id=rev.id, status="failed",
                                        error=f"unknown type: {rev.type}")
        except Exception as e:  # pragma: no cover - defensive
            # Batch boundary: one bad revision must not abort the others.
            logger.exception("revision %s failed", rev.id)
            result = RevisionResult(id=rev.id, status="failed", error=str(e))

        batch.results.append(result)
        if result.status == "applied":
            batch.applied += 1
        else:
            batch.failed += 1

    _save_docx_xml(members, doc_tree, settings_tree, output_path)
    batch.output_path = str(output_path)
    logger.info("applied %d revisions (failed %d) → %s",
                batch.applied, batch.failed, output_path)
    return batch
|
||||
|
||||
|
||||
def list_bookmarks(docx_path: str | Path) -> list[str]:
    """List the bookmark names found in the DOCX, in document order.

    Bookmarks without a name and those whose name starts with ``_`` are
    left out.
    """
    _, doc_tree, _ = _load_docx_xml(Path(docx_path))
    name_attr = _w("name")
    candidates = (
        marker.get(name_attr)
        for marker in doc_tree.iterfind(".//w:bookmarkStart", NSMAP)
    )
    return [
        candidate
        for candidate in candidates
        if candidate and not candidate.startswith("_")
    ]
|
||||
|
||||
|
||||
def copy_with_revisions(
    source_path: str | Path, output_path: str | Path,
) -> None:
    """Copy source → output unchanged (used when the revisions list is empty).

    Missing parent directories of ``output_path`` are created first, the
    same way the revision-writing path does, so both ways of producing an
    output file accept a not-yet-existing target directory.

    Args:
        source_path: file to copy.
        output_path: destination file path.
    """
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)
    shutil.copy2(str(source_path), str(output_path))
|
||||
Reference in New Issue
Block a user