"""Retroactively inject block bookmarks into DOCX files not produced by the exporter.

When a user uploads an `עריכה-v*.docx` file that was edited in Word outside
the system, it does not contain the bookmarks we expect
(block-alef ... block-yod-bet). This service locates the start of each block
from the Hebrew opening markers (א., ב., ... יב.) or from well-known section
headings, and injects bookmarkStart/bookmarkEnd elements accordingly.

The approach is defensive: a block that cannot be identified simply gets no
bookmark (it is reported under `missing_blocks` in the result), and the
server is expected to warn the user.
"""

from __future__ import annotations

import logging
import re
import shutil
import zipfile
from io import BytesIO
from pathlib import Path

from lxml import etree

from legal_mcp.services.docx_reviser import (
    NSMAP,
    _load_docx_xml,
    _save_docx_xml,
    _w,
)

logger = logging.getLogger(__name__)


# ── Block identification ──────────────────────────────────────────

# The 12 blocks in order, with their Hebrew letter marker
BLOCK_ORDER = [
    ("block-alef", "א"),
    ("block-bet", "ב"),
    ("block-gimel", "ג"),
    ("block-dalet", "ד"),
    ("block-heh", "ה"),
    ("block-vav", "ו"),
    ("block-zayin", "ז"),
    ("block-chet", "ח"),
    ("block-tet", "ט"),
    ("block-yod", "י"),
    ("block-yod-alef", "יא"),
    ("block-yod-bet", "יב"),
]

# Regex matching a paragraph that begins with a Hebrew block marker
# followed (after optional whitespace) by one of '.', ')' or '-'. Requiring
# a separator avoids matching ordinary words that merely start with one of
# these letters.
# NOTE(review): the '-' separator also matches prose such as "ב-2019 ..."
# (prefix letter + hyphen + number); consider tightening once verified
# against real documents.
_BLOCK_MARKERS_BY_LETTER: dict[str, str] = {letter: name for name, letter in BLOCK_ORDER}

# Longer markers (יא, יב) first so regex matches them before falling back to 'י'
_MARKER_ALTERNATION = "|".join(
    re.escape(letter)
    for letter in sorted(_BLOCK_MARKERS_BY_LETTER, key=len, reverse=True)
)
_BLOCK_MARKER_RE = re.compile(
    rf"^\s*({_MARKER_ALTERNATION})\s*[\.\)\-]\s*"
)

# Secondary heuristic: Hebrew section headings that reliably mark the
# start of each block in the Daphna Tamir style (used when markers
# "א.", "ב." etc. are missing — common in user-edited Word files).
#
# Key observations from the 12-block schema:
#   block-alef:     "בפני: דפנה תמיר" or decision number page
#   block-bet:      "ערר מספר" line
#   block-gimel:    appellants vs respondents (parties)
#   block-dalet:    bold "החלטה" centered
#   block-heh:      "רקע" / "רקע עובדתי" / "פתח דבר"
#   block-vav:      "תכניות חלות" / "ההליך שבפנינו" / "ההליכים בפני"
#   block-zayin:    "תמצית טענות" / "טענות הצדדים"
#   block-chet:     "תגובת המשיבה" / "עמדת הוועדה"
#   block-tet:      "ההליכים בפני ועדת הערר" / "הדיון בפנינו"
#   block-yod:      "דיון והכרעה" / "דיון"
#   block-yod-alef: "סוף דבר" / "סיכום"
#   block-yod-bet:  "ההחלטה" (signature / closing block)
_BLOCK_HEADING_PATTERNS: list[tuple[str, list[str]]] = [
    ("block-alef", [r"בפני[:\s]", r"ועדת הערר"]),
    ("block-bet", [r"^ערר\s+מספר", r"^ערר\s+\d"]),
    ("block-gimel", [r"^נגד\s*$", r"^—\s*נגד\s*—"]),
    ("block-dalet", [r"^החלטה\s*$"]),
    ("block-heh", [r"^רקע\s*$", r"^רקע\s+עובדתי", r"^פתח\s+דבר"]),
    ("block-vav", [
        r"^תכניות\s+חלות",
        r"^ההליכים?\s+שבפנינו",
        r"^ההליכים?\s+בפני\s+הוועדה\s+המקומית",
        r"^על\s+המקרקעין\s+חלות",
        r"^התכניות?\s+החלות",
        r"^במצב\s+התכנוני",
    ]),
    ("block-zayin", [
        r"^תמצית\s+טענות",
        r"^טענות\s+הצדדים",
        r"^טענות\s+העוררי",
        r"^טענות\s+העוררת",
    ]),
    ("block-chet", [
        r"^תגובת\s+המשיב",
        r"^עמדת\s+הוועדה\s+המקומית",
        r"^תשובת",
        r"^עיקר\s+תגובת\s+המשיב",
    ]),
    ("block-tet", [
        r"^ההליכים?\s+בפני\s+ועדת\s+הערר",
        r"^הדיון\s+בפנינו",
        r"^הדיון\s+בוועדת\s+הערר",
    ]),
    ("block-yod", [r"^דיון\s+והכרעה", r"^דיון\s*$", r"^ההכרעה"]),
    ("block-yod-alef", [r"^סוף\s+דבר", r"^סיכום\s*$"]),
    # block-yod-bet "על כן" must be operative — paired with אנו/הערר/הוועדה.
    # Loose `^על כן` alone matches mid-discussion transitions ("על כן, במקום בו...")
    # and steals the bookmark from block-yod-alef via forward-scan.
    ("block-yod-bet", [
        r"^ההחלטה\s*$",
        r"^על\s+כן[,\.\s]+(?:אנו|הערר|הוועדה|ועדת\s+הערר)\b",
    ]),
]

_COMPILED_HEADING_PATTERNS: list[tuple[str, list[re.Pattern[str]]]] = [
    (name, [re.compile(p) for p in patterns])
    for name, patterns in _BLOCK_HEADING_PATTERNS
]


def _paragraph_text(p: etree._Element) -> str:
    """Return the visible text of a paragraph, joining all w:t nodes.

    Only `w:t` descendants are read. Iterating every text node would also
    pick up deleted text (`w:delText`), field codes (`w:instrText`) and
    numeric DrawingML values, which would corrupt the start of the string
    that the `^`-anchored block patterns are matched against.
    """
    return "".join(t.text or "" for t in p.iter(_w("t"))).strip()


def _detect_block_starts(
    paragraphs: list[etree._Element],
) -> dict[str, int]:
    """Return a mapping of block_name → paragraph index (start of that block).

    Uses a greedy forward scan: for each non-empty paragraph, try the letter
    marker regex first and fall back to the heading-keyword patterns. A match
    is accepted only if the block has not been assigned yet and is at or
    after the next expected position in BLOCK_ORDER, so blocks are always
    assigned in canonical order (intermediate blocks may be skipped).
    """
    order_pos = {name: pos for pos, (name, _) in enumerate(BLOCK_ORDER)}
    total_blocks = len(order_pos)

    found: dict[str, int] = {}
    pointer = 0  # position in BLOCK_ORDER of the next block we may assign

    for idx, para in enumerate(paragraphs):
        if pointer >= total_blocks:
            # The last block has been assigned; nothing further can match.
            break

        text = _paragraph_text(para)
        if not text:
            continue

        matched_name: str | None = None

        # Try marker-based (א., ב., ...) first
        marker = _BLOCK_MARKER_RE.match(text)
        if marker:
            matched_name = _BLOCK_MARKERS_BY_LETTER.get(marker.group(1))

        # Fall back to heading-keyword heuristic (Daphna style)
        if matched_name is None:
            for name, patterns in _COMPILED_HEADING_PATTERNS:
                # Only consider blocks not yet assigned AND at/after the
                # pointer, to keep the same forward-scan semantics as the
                # marker path.
                if name in found or order_pos[name] < pointer:
                    continue
                if any(pat.search(text) for pat in patterns):
                    matched_name = name
                    break

        if matched_name is None or matched_name in found:
            continue

        pos = order_pos[matched_name]
        if pos >= pointer:
            found[matched_name] = idx
            pointer = pos + 1

    return found


def _insert_bookmark_around_range(
    body: etree._Element,
    paragraphs: list[etree._Element],
    start_idx: int,
    end_idx: int,
    name: str,
    bm_id: int,
) -> None:
    """Wrap paragraphs[start_idx..end_idx] in a bookmark named `name`.

    A `w:bookmarkStart` is placed at the beginning of the first paragraph's
    content and a matching `w:bookmarkEnd` is appended to the last paragraph.

    Args:
        body: the document body element (unused; kept for interface
            compatibility).
        paragraphs: the paragraph elements that the indices refer to.
        start_idx: index of the paragraph that receives the bookmarkStart.
        end_idx: index of the paragraph that receives the bookmarkEnd.
        name: bookmark name (`w:name`).
        bm_id: bookmark id shared by the start and end elements (`w:id`).
    """
    start_el = etree.Element(_w("bookmarkStart"))
    start_el.set(_w("id"), str(bm_id))
    start_el.set(_w("name"), name)

    end_el = etree.Element(_w("bookmarkEnd"))
    end_el.set(_w("id"), str(bm_id))

    start_p = paragraphs[start_idx]
    end_p = paragraphs[end_idx]

    # w:pPr, when present, must stay the first child of w:p, so the
    # bookmarkStart goes immediately after it rather than at position 0.
    ppr = start_p.find(_w("pPr"))
    insert_at = 0 if ppr is None else start_p.index(ppr) + 1
    start_p.insert(insert_at, start_el)
    end_p.append(end_el)


def _next_bookmark_id(doc_tree: etree._Element) -> int:
    """Return a bookmark id greater than every existing w:bookmarkStart id.

    The result is never below 10000; non-numeric ids are ignored.
    """
    max_id = 9999
    for el in doc_tree.iterfind(".//w:bookmarkStart", NSMAP):
        raw_id = el.get(_w("id"))
        if not raw_id:
            continue
        try:
            max_id = max(max_id, int(raw_id))
        except ValueError:
            # Malformed id attribute: cannot collide with a numeric id.
            continue
    return max_id + 1


# ── Public API ────────────────────────────────────────────────────


def retrofit_bookmarks(
    docx_path: str | Path,
    *,
    output_path: str | Path | None = None,
    backup: bool = True,
) -> dict[str, list[str]]:
    """Inject block-* bookmarks into an existing DOCX via heuristic detection.

    Args:
        docx_path: path to DOCX file (modified in place unless output_path set).
        output_path: if given, write to this path instead of overwriting.
        backup: if True and writing in place, save the original as
            `<name>.pre-retrofit.docx` first.

    Returns:
        {
          'bookmarks_added': ['block-alef', ...],
          'missing_blocks': ['block-heh', ...],
          'structural_fallback': [...]  # cover blocks anchored by fallback
          'existing_bookmarks': [...]   # bookmarks already on the doc
        }

    Raises:
        FileNotFoundError: if docx_path does not exist.
        ValueError: if the document XML has no w:body element.
    """
    docx_path = Path(docx_path)
    if not docx_path.exists():
        raise FileNotFoundError(str(docx_path))

    if output_path is None:
        output_path = docx_path
    output_path = Path(output_path)

    members, doc_tree, settings_tree = _load_docx_xml(docx_path)

    # Existing bookmarks
    existing_names: list[str] = [
        bm_name
        for bm_name in (
            el.get(_w("name"))
            for el in doc_tree.iterfind(".//w:bookmarkStart", NSMAP)
        )
        if bm_name
    ]

    # Collect *top-level* body paragraphs (don't descend into tables etc.
    # for now — MVP). The XPath ".//w:p" would include table cells too;
    # for retrofitting we only care about the main flow.
    body = doc_tree.find(f".//{_w('body')}")
    if body is None:
        raise ValueError("document has no <w:body>")

    paragraphs = [p for p in body if p.tag == _w("p")]
    if not paragraphs:
        return {
            "bookmarks_added": [],
            "missing_blocks": [n for n, _ in BLOCK_ORDER],
            "structural_fallback": [],
            "existing_bookmarks": existing_names,
        }

    block_starts = _detect_block_starts(paragraphs)

    # Cover-block fallback: alef/bet/gimel/dalet are template metadata
    # (judges, case number, parties, "החלטה" title) that don't appear in
    # the body of user-edited DOCX files — they live in headers/template.
    # Inject anchors near paragraph 0 so apply_user_edit can still target
    # them later.
    structural_fallback: list[str] = []
    cover_blocks = ["block-alef", "block-bet", "block-gimel", "block-dalet"]
    first_detected_idx = min(block_starts.values()) if block_starts else 0
    for cover_pos, cover_name in enumerate(cover_blocks):
        if cover_name not in block_starts:
            block_starts[cover_name] = min(
                cover_pos, max(0, first_detected_idx - 1)
            )
            structural_fallback.append(cover_name)

    # Calculate end_idx for each block = paragraph before the next block's
    # start, or last paragraph if this is the last block found.
    # Ties on the start index are ordered with fallback anchors first, then
    # by canonical block order, so that a genuinely detected block sharing a
    # paragraph with fallback anchors keeps its full extent instead of being
    # truncated to a single paragraph.
    order_pos = {name: pos for pos, (name, _) in enumerate(BLOCK_ORDER)}
    fallback_names = set(structural_fallback)
    ordered_found = sorted(
        block_starts.items(),
        key=lambda kv: (kv[1], kv[0] not in fallback_names, order_pos[kv[0]]),
    )
    last_paragraph_idx = len(paragraphs) - 1
    ranges: list[tuple[str, int, int]] = []
    for rank, (name, start_idx) in enumerate(ordered_found):
        if rank + 1 < len(ordered_found):
            end_idx = ordered_found[rank + 1][1] - 1
        else:
            end_idx = last_paragraph_idx
        ranges.append((name, start_idx, max(start_idx, end_idx)))

    # Backup if overwriting in place
    if backup and output_path.resolve() == docx_path.resolve():
        backup_path = docx_path.with_suffix(".pre-retrofit.docx")
        shutil.copy2(str(docx_path), str(backup_path))

    # Inject bookmarks, skipping any that already exist
    next_id = _next_bookmark_id(doc_tree)
    added: list[str] = []
    for name, start_idx, end_idx in ranges:
        if name in existing_names:
            continue
        _insert_bookmark_around_range(
            body, paragraphs, start_idx, end_idx, name, next_id
        )
        added.append(name)
        next_id += 1

    _save_docx_xml(members, doc_tree, settings_tree, output_path)

    missing = [
        n for n, _ in BLOCK_ORDER
        if n not in block_starts and n not in existing_names
    ]

    logger.info("retrofit %s: added=%s missing=%s structural=%s",
                docx_path.name, added, missing, structural_fallback)

    return {
        "bookmarks_added": added,
        "missing_blocks": missing,
        "structural_fallback": structural_fallback,
        "existing_bookmarks": existing_names,
    }