Add Track Changes architecture for draft revisions (CMP + CMPA)
All checks were successful
Build & Deploy / build-and-deploy (push) Successful in 1m29s
All checks were successful
Build & Deploy / build-and-deploy (push) Successful in 1m29s
Fixes critical bug in 1033-25: user-uploaded עריכה-*.docx files were
orphaned on disk while exports kept rebuilding from stale DB blocks.
New architecture:
- User-uploaded DOCX becomes the source of truth (cases.active_draft_path)
- System edits via XML surgery with real Word <w:ins>/<w:del> revisions
- User can Accept/Reject each change from within Word
Components:
- docx_reviser.py: XML surgery for Track Changes (15 tests)
- docx_retrofit.py: retroactive bookmark injection with Hebrew marker
detection + heading heuristic (9 tests)
- docx_exporter.py: emits bookmarks around each of the 12 blocks
- 3 new MCP tools: apply_user_edit, list_bookmarks, revise_draft
- 4 new/updated endpoints: upload (auto-registers active draft),
/exports/revise, /exports/bookmarks, /exports/{filename}/retrofit,
/active-draft
- DB migration: cases.active_draft_path column
- UI: correct banner using real v-numbers, "מקור האמת" badge,
detailed upload toast with bookmarks_added/missing_blocks
- agents: legal-exporter (3 export modes), legal-ceo (stage G for
revision handling), legal-writer (revision mode)
Multi-tenancy:
- Works for both CMP (1xxx cases) and CMPA (8xxx/9xxx cases)
- New revise-draft skill added to both companies
- deploy-track-changes.sh syncs skills CMP ↔ CMPA
- retrofit_case.py: one-off retrofit of existing files
Tests: 34 passing (15 reviser + 9 retrofit + 4 exporter bookmarks + 6 e2e)
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
290
mcp-server/src/legal_mcp/services/docx_retrofit.py
Normal file
290
mcp-server/src/legal_mcp/services/docx_retrofit.py
Normal file
@@ -0,0 +1,290 @@
|
||||
"""הזרקת bookmarks רטרואקטיבית ל-DOCX שלא נוצרו ע"י ה-exporter.
|
||||
|
||||
כאשר משתמש מעלה `עריכה-v*.docx` שנערך ב-Word מחוץ למערכת, אין בו את ה-
|
||||
bookmarks שאנו מצפים להם (block-alef ... block-yod-bet). השירות כאן
|
||||
מזהה את תחילת כל בלוק לפי סימני הפתיחה העבריים (א., ב., ... יב.) ב-
|
||||
הפסקאות הראשונות שלו, ומזריק bookmarkStart/bookmarkEnd בהתאם.
|
||||
|
||||
נעשה בצורה defensive — אם לא מצליחים לזהות בלוק, הוא פשוט לא יקבל
|
||||
bookmark (`missing_blocks` בתוצאה). השרת אמור להתריע למשתמש.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import re
|
||||
import shutil
|
||||
import zipfile
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
|
||||
from lxml import etree
|
||||
|
||||
from legal_mcp.services.docx_reviser import (
|
||||
NSMAP,
|
||||
_load_docx_xml,
|
||||
_save_docx_xml,
|
||||
_w,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# ── Block identification ──────────────────────────────────────────
|
||||
|
||||
# The 12 blocks in order, with their Hebrew letter marker
|
||||
BLOCK_ORDER = [
|
||||
("block-alef", "א"),
|
||||
("block-bet", "ב"),
|
||||
("block-gimel", "ג"),
|
||||
("block-dalet", "ד"),
|
||||
("block-heh", "ה"),
|
||||
("block-vav", "ו"),
|
||||
("block-zayin", "ז"),
|
||||
("block-chet", "ח"),
|
||||
("block-tet", "ט"),
|
||||
("block-yod", "י"),
|
||||
("block-yod-alef", "יא"),
|
||||
("block-yod-bet", "יב"),
|
||||
]
|
||||
|
||||
# Regex matching a paragraph that begins with a Hebrew block marker
|
||||
# followed by '.', ')', ' ', or end-of-string. The marker must be followed
|
||||
# either by whitespace/punctuation or end of text to avoid matching longer
|
||||
# words that happen to start with these letters.
|
||||
_BLOCK_MARKERS_BY_LETTER: dict[str, str] = {letter: name for name, letter in BLOCK_ORDER}
|
||||
|
||||
# Longer markers (יא, יב) first so regex matches them before falling back to 'י'
|
||||
_MARKER_ALTERNATION = "|".join(
|
||||
re.escape(letter)
|
||||
for letter in sorted(_BLOCK_MARKERS_BY_LETTER, key=len, reverse=True)
|
||||
)
|
||||
_BLOCK_MARKER_RE = re.compile(
|
||||
rf"^\s*({_MARKER_ALTERNATION})\s*[\.\)\-]\s*"
|
||||
)
|
||||
|
||||
# Secondary heuristic: Hebrew section headings that reliably mark the
|
||||
# start of each block in the Daphna Tamir style (used when markers
|
||||
# "א.", "ב." etc. are missing — common in user-edited Word files).
|
||||
#
|
||||
# Key observations from the 12-block schema:
|
||||
# block-alef: "בפני: דפנה תמיר" or decision number page
|
||||
# block-bet: "ערר מספר" line
|
||||
# block-gimel: appellants vs respondents (parties)
|
||||
# block-dalet: bold "החלטה" centered
|
||||
# block-heh: "רקע" / "רקע עובדתי" / "פתח דבר"
|
||||
# block-vav: "תכניות חלות" / "ההליך שבפנינו" / "ההליכים בפני"
|
||||
# block-zayin: "תמצית טענות" / "טענות הצדדים"
|
||||
# block-chet: "תגובת המשיבה" / "עמדת הוועדה"
|
||||
# block-tet: "ההליכים בפני ועדת הערר" / "הדיון בפנינו"
|
||||
# block-yod: "דיון והכרעה" / "דיון"
|
||||
# block-yod-alef: "סוף דבר" / "סיכום"
|
||||
# block-yod-bet: "ההחלטה" (signature / closing block)
|
||||
_BLOCK_HEADING_PATTERNS: list[tuple[str, list[str]]] = [
|
||||
("block-alef", [r"בפני[:\s]", r"ועדת הערר"]),
|
||||
("block-bet", [r"^ערר\s+מספר", r"^ערר\s+\d"]),
|
||||
("block-gimel", [r"^נגד\s*$", r"^—\s*נגד\s*—"]),
|
||||
("block-dalet", [r"^החלטה\s*$"]),
|
||||
("block-heh", [r"^רקע\s*$", r"^רקע\s+עובדתי", r"^פתח\s+דבר"]),
|
||||
("block-vav", [r"^תכניות\s+חלות", r"^ההליכים?\s+שבפנינו", r"^ההליכים?\s+בפני\s+הוועדה\s+המקומית"]),
|
||||
("block-zayin", [r"^תמצית\s+טענות", r"^טענות\s+הצדדים", r"^טענות\s+העוררי"]),
|
||||
("block-chet", [r"^תגובת\s+המשיב", r"^עמדת\s+הוועדה\s+המקומית", r"^תשובת"]),
|
||||
("block-tet", [r"^ההליכים?\s+בפני\s+ועדת\s+הערר", r"^הדיון\s+בפנינו"]),
|
||||
("block-yod", [r"^דיון\s+והכרעה", r"^דיון\s*$", r"^ההכרעה"]),
|
||||
("block-yod-alef", [r"^סוף\s+דבר", r"^סיכום\s*$"]),
|
||||
("block-yod-bet", [r"^ההחלטה\s*$", r"^על\s+כן[,\.]?"]),
|
||||
]
|
||||
|
||||
_COMPILED_HEADING_PATTERNS: list[tuple[str, list[re.Pattern[str]]]] = [
|
||||
(name, [re.compile(p) for p in patterns])
|
||||
for name, patterns in _BLOCK_HEADING_PATTERNS
|
||||
]
|
||||
|
||||
|
||||
def _paragraph_text(p: etree._Element) -> str:
|
||||
"""Return the full text of a paragraph, joining all w:t nodes."""
|
||||
return "".join(p.itertext()).strip()
|
||||
|
||||
|
||||
def _detect_block_starts(
|
||||
paragraphs: list[etree._Element],
|
||||
) -> dict[str, int]:
|
||||
"""Return a mapping of block_name → paragraph index (start of that block).
|
||||
|
||||
Uses a greedy scan: for each paragraph, if its text starts with an
|
||||
expected block marker and the block hasn't been assigned yet, assign
|
||||
this paragraph as the block's start.
|
||||
"""
|
||||
found: dict[str, int] = {}
|
||||
expected_order = [name for name, _ in BLOCK_ORDER]
|
||||
pointer = 0 # index into expected_order — next expected block
|
||||
|
||||
for i, p in enumerate(paragraphs):
|
||||
text = _paragraph_text(p)
|
||||
if not text:
|
||||
continue
|
||||
|
||||
matched_name: str | None = None
|
||||
|
||||
# Try marker-based (א., ב., ...) first
|
||||
m = _BLOCK_MARKER_RE.match(text)
|
||||
if m:
|
||||
letter = m.group(1)
|
||||
matched_name = _BLOCK_MARKERS_BY_LETTER.get(letter)
|
||||
|
||||
# Fall back to heading-keyword heuristic (Daphna style)
|
||||
if matched_name is None:
|
||||
for name, patterns in _COMPILED_HEADING_PATTERNS:
|
||||
if name in found:
|
||||
continue
|
||||
# Only check patterns for blocks we haven't assigned yet
|
||||
# AND that come at/after the current pointer — to keep the
|
||||
# greedy forward-scan semantics consistent with markers.
|
||||
if expected_order.index(name) < pointer:
|
||||
continue
|
||||
if any(pat.search(text) for pat in patterns):
|
||||
matched_name = name
|
||||
break
|
||||
|
||||
if matched_name is None:
|
||||
continue
|
||||
if matched_name in found:
|
||||
continue
|
||||
if pointer >= len(expected_order):
|
||||
continue
|
||||
name_idx_in_order = expected_order.index(matched_name)
|
||||
if name_idx_in_order >= pointer:
|
||||
found[matched_name] = i
|
||||
pointer = name_idx_in_order + 1
|
||||
return found
|
||||
|
||||
|
||||
def _insert_bookmark_around_range(
|
||||
body: etree._Element,
|
||||
paragraphs: list[etree._Element],
|
||||
start_idx: int,
|
||||
end_idx: int,
|
||||
name: str,
|
||||
bm_id: int,
|
||||
) -> None:
|
||||
"""Insert bookmarkStart at the start of paragraph start_idx and
|
||||
bookmarkEnd at the end of paragraph end_idx."""
|
||||
start_el = etree.Element(_w("bookmarkStart"))
|
||||
start_el.set(_w("id"), str(bm_id))
|
||||
start_el.set(_w("name"), name)
|
||||
|
||||
end_el = etree.Element(_w("bookmarkEnd"))
|
||||
end_el.set(_w("id"), str(bm_id))
|
||||
|
||||
start_p = paragraphs[start_idx]
|
||||
end_p = paragraphs[end_idx]
|
||||
start_p.insert(0, start_el)
|
||||
end_p.append(end_el)
|
||||
|
||||
|
||||
def _next_bookmark_id(doc_tree: etree._Element) -> int:
|
||||
"""Find max existing bookmark id and return next unused."""
|
||||
max_id = 9999
|
||||
for el in doc_tree.iterfind(".//w:bookmarkStart", NSMAP):
|
||||
wid = el.get(_w("id"))
|
||||
if wid:
|
||||
try:
|
||||
max_id = max(max_id, int(wid))
|
||||
except ValueError:
|
||||
pass
|
||||
return max_id + 1
|
||||
|
||||
|
||||
# ── Public API ────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def retrofit_bookmarks(
|
||||
docx_path: str | Path,
|
||||
*,
|
||||
output_path: str | Path | None = None,
|
||||
backup: bool = True,
|
||||
) -> dict:
|
||||
"""Inject block-* bookmarks into an existing DOCX via heuristic detection.
|
||||
|
||||
Args:
|
||||
docx_path: path to DOCX file (modified in place unless output_path set).
|
||||
output_path: if given, write to this path instead of overwriting.
|
||||
backup: if True and writing in place, save the original as
|
||||
`<path>.pre-retrofit.docx` first.
|
||||
|
||||
Returns:
|
||||
{
|
||||
'bookmarks_added': ['block-alef', ...],
|
||||
'missing_blocks': ['block-dalet', ...],
|
||||
'existing_bookmarks': [...] # bookmarks already on the doc
|
||||
}
|
||||
"""
|
||||
docx_path = Path(docx_path)
|
||||
if not docx_path.exists():
|
||||
raise FileNotFoundError(str(docx_path))
|
||||
|
||||
if output_path is None:
|
||||
output_path = docx_path
|
||||
output_path = Path(output_path)
|
||||
|
||||
members, doc_tree, settings_tree = _load_docx_xml(docx_path)
|
||||
|
||||
# Existing bookmarks
|
||||
existing_names: list[str] = []
|
||||
for el in doc_tree.iterfind(".//w:bookmarkStart", NSMAP):
|
||||
name = el.get(_w("name"))
|
||||
if name:
|
||||
existing_names.append(name)
|
||||
|
||||
# Collect *top-level* body paragraphs (don't descend into tables etc.
|
||||
# for now — MVP). The XPath ".//w:p" would include table cells too;
|
||||
# for retrofitting we only care about the main flow.
|
||||
body = doc_tree.find(f".//{_w('body')}")
|
||||
if body is None:
|
||||
raise ValueError("document has no <w:body>")
|
||||
paragraphs = [p for p in body if p.tag == _w("p")]
|
||||
|
||||
if not paragraphs:
|
||||
return {
|
||||
"bookmarks_added": [],
|
||||
"missing_blocks": [n for n, _ in BLOCK_ORDER],
|
||||
"existing_bookmarks": existing_names,
|
||||
}
|
||||
|
||||
block_starts = _detect_block_starts(paragraphs)
|
||||
|
||||
# Calculate end_idx for each block = paragraph before the next block's start,
|
||||
# or last paragraph if this is the last block found.
|
||||
ordered_found = sorted(block_starts.items(), key=lambda kv: kv[1])
|
||||
ranges: list[tuple[str, int, int]] = []
|
||||
for i, (name, start_idx) in enumerate(ordered_found):
|
||||
if i + 1 < len(ordered_found):
|
||||
end_idx = ordered_found[i + 1][1] - 1
|
||||
else:
|
||||
end_idx = len(paragraphs) - 1
|
||||
ranges.append((name, start_idx, max(start_idx, end_idx)))
|
||||
|
||||
# Backup if overwriting in place
|
||||
if backup and output_path.resolve() == docx_path.resolve():
|
||||
backup_path = docx_path.with_suffix(".pre-retrofit.docx")
|
||||
shutil.copy2(str(docx_path), str(backup_path))
|
||||
|
||||
# Inject bookmarks, skipping any that already exist
|
||||
next_id = _next_bookmark_id(doc_tree)
|
||||
added: list[str] = []
|
||||
for name, s, e in ranges:
|
||||
if name in existing_names:
|
||||
continue
|
||||
_insert_bookmark_around_range(body, paragraphs, s, e, name, next_id)
|
||||
added.append(name)
|
||||
next_id += 1
|
||||
|
||||
_save_docx_xml(members, doc_tree, settings_tree, output_path)
|
||||
|
||||
missing = [n for n, _ in BLOCK_ORDER if n not in block_starts and n not in existing_names]
|
||||
logger.info("retrofit %s: added=%s missing=%s",
|
||||
docx_path.name, added, missing)
|
||||
return {
|
||||
"bookmarks_added": added,
|
||||
"missing_blocks": missing,
|
||||
"existing_bookmarks": existing_names,
|
||||
}
|
||||
Reference in New Issue
Block a user