Files
legal-ai/mcp-server/tests/test_docx_retrofit.py
Chaim 36ca713dfa
All checks were successful
Build & Deploy / build-and-deploy (push) Successful in 6s
Retrofit: tighten yod-bet pattern, add cover-block fallback
The "על כן" pattern for block-yod-bet was too greedy and matched mid-discussion
transitional sentences (e.g. "על כן, במקום בו..."), which caused forward-scan
to skip block-yod-alef ("סוף דבר") via the pointer advance.

Tightened to require an operative subject (אנו / הערר / הוועדה / ועדת הערר)
so terminal "על כן, אנו מחליטים" still matches but mid-block transitions don't.

Added structural_fallback for cover blocks (alef/bet/gimel/dalet) — these are
template metadata not present in user-edited DOCX bodies. Inject zero-content
anchors so apply_user_edit can still target them later. The frontend toast
distinguishes real content gaps from fallback anchors.

Also expanded heading patterns based on training corpus inspection:
- block-vav: על המקרקעין חלות / במצב התכנוני / התכניות החלות
- block-zayin: טענות העוררת
- block-chet: עיקר תגובת המשיב
- block-tet: הדיון בוועדת הערר

For case 1130-25, this raises detection from 6/12 to 11/12 blocks — only
block-yod-bet remains missing (Daphna's edit ends at "סוף דבר" + numbered
ruling, no terminal "ההחלטה" or "על כן אנו מחליטים" paragraph).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-26 06:57:41 +00:00

220 lines
8.7 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""בדיקות docx_retrofit — הזרקת bookmarks רטרואקטיבית."""
from __future__ import annotations
from pathlib import Path
from docx import Document
from legal_mcp.services.docx_retrofit import (
BLOCK_ORDER,
retrofit_bookmarks,
)
from legal_mcp.services.docx_reviser import list_bookmarks
def _make_docx_with_hebrew_blocks(path: Path, markers: list[str]) -> None:
    """Write a DOCX to *path* holding two paragraphs per Hebrew block marker.

    For each marker the first paragraph opens with the marker followed by a
    period; the second is a follow-up paragraph that mentions the same marker.
    """
    document = Document()
    for marker in markers:
        opening = f"{marker}. תוכן הבלוק שמתחיל ב-{marker}"
        follow_up = f"עוד פסקה בבלוק {marker}"
        for text in (opening, follow_up):
            document.add_paragraph(text)
    document.save(str(path))
def test_retrofit_detects_all_standard_blocks(tmp_path: Path) -> None:
    """Twelve markers in the document yield twelve bookmarks and no gaps."""
    docx_path = tmp_path / "src.docx"
    all_markers = [
        "א", "ב", "ג", "ד", "ה", "ו", "ז", "ח", "ט", "י", "יא", "יב",
    ]
    _make_docx_with_hebrew_blocks(docx_path, all_markers)

    outcome = retrofit_bookmarks(docx_path, backup=False)

    assert len(outcome["bookmarks_added"]) == 12
    assert outcome["missing_blocks"] == []
    # The bookmarks found in the file must be exactly the names in BLOCK_ORDER.
    expected_names = {block_name for block_name, _ in BLOCK_ORDER}
    assert set(list_bookmarks(docx_path)) == expected_names
def test_retrofit_reports_missing_blocks(tmp_path: Path) -> None:
    """Blocks without a marker in the document show up in missing_blocks."""
    docx_path = tmp_path / "src.docx"
    # Only the first four blocks are present in the document.
    _make_docx_with_hebrew_blocks(docx_path, ["א", "ב", "ג", "ד"])

    outcome = retrofit_bookmarks(docx_path, backup=False)

    expected_added = ["block-alef", "block-bet", "block-gimel", "block-dalet"]
    assert outcome["bookmarks_added"] == expected_added

    missing = outcome["missing_blocks"]
    assert "block-heh" in missing
    assert "block-yod-bet" in missing
def test_retrofit_distinguishes_yod_from_yod_alef_yod_bet(tmp_path: Path) -> None:
    """The markers י, יא and יב must each be told apart — longer markers win."""
    docx_path = tmp_path / "src.docx"
    _make_docx_with_hebrew_blocks(docx_path, ["ט", "י", "יא", "יב"])

    outcome = retrofit_bookmarks(docx_path, backup=False)

    # All four content blocks must be detected; cover blocks may additionally
    # be added via fallback, hence a subset check rather than equality.
    content_blocks = {
        "block-tet",
        "block-yod",
        "block-yod-alef",
        "block-yod-bet",
    }
    assert content_blocks.issubset(outcome["bookmarks_added"])
def test_retrofit_skips_existing_bookmarks(tmp_path: Path) -> None:
    """A second retrofit pass over the same file adds no further bookmarks."""
    docx_path = tmp_path / "src.docx"
    _make_docx_with_hebrew_blocks(docx_path, ["א", "ב"])

    first_pass = retrofit_bookmarks(docx_path, backup=False)
    added_first = set(first_pass["bookmarks_added"])
    # alef/bet come from the markers; gimel/dalet from cover-block fallback.
    assert {"block-alef", "block-bet"}.issubset(added_first)

    second_pass = retrofit_bookmarks(docx_path, backup=False)
    assert second_pass["bookmarks_added"] == []  # nothing new
    # Everything the first pass added is reported as existing by the second.
    assert added_first.issubset(second_pass["existing_bookmarks"])
def test_retrofit_creates_backup(tmp_path: Path) -> None:
    """With the backup argument left at its default, a backup file appears."""
    docx_path = tmp_path / "file.docx"
    _make_docx_with_hebrew_blocks(docx_path, ["א", "ب"[:0] + "ב"])

    # No backup argument passed: the test relies on the default enabling it.
    retrofit_bookmarks(docx_path)

    expected_backup = docx_path.with_suffix(".pre-retrofit.docx")
    assert expected_backup.exists()
def test_retrofit_to_different_output_path_no_backup(tmp_path: Path) -> None:
    """Writing to a separate output path leaves the source file unchanged."""
    source_path = tmp_path / "src.docx"
    target_path = tmp_path / "out.docx"
    _make_docx_with_hebrew_blocks(source_path, ["א", "ב"])

    retrofit_bookmarks(source_path, output_path=target_path)

    # The source document must still carry no bookmarks.
    assert list_bookmarks(source_path) == []
    # The output carries them (alef+bet from markers; gimel+dalet via fallback).
    output_bookmarks = list_bookmarks(target_path)
    assert {"block-alef", "block-bet"}.issubset(output_bookmarks)
def test_retrofit_ignores_marker_in_middle_of_text(tmp_path: Path) -> None:
    """A lone 'י' inside body text (not at the start) is not a block marker."""
    docx_path = tmp_path / "src.docx"
    paragraph_texts = (
        "א. תחילת הבלוק",
        "טקסט עם האות י לא בתחילת שורה, זה לא בלוק.",
        "ב. בלוק שני",
    )
    document = Document()
    for text in paragraph_texts:
        document.add_paragraph(text)
    document.save(str(docx_path))

    added = retrofit_bookmarks(docx_path, backup=False)["bookmarks_added"]

    assert "block-alef" in added
    assert "block-bet" in added
    # The mid-sentence letter must NOT be picked up as block-yod.
    assert "block-yod" not in added
def test_retrofit_out_of_order_markers_picks_forward_only(tmp_path: Path) -> None:
    """A marker appearing after a later-ordered one is not claimed by the scan.

    The scanner moves forward through BLOCK_ORDER and does not go back for an
    earlier marker once a later one has been seen. Here block-alef is expected
    to surface through the cover-block fallback instead of its real marker.
    """
    docx_path = tmp_path / "src.docx"
    paragraph_texts = (
        "ב. מופיע ראשון",
        "א. מופיע אחרי — יידחה כי 'א' לפני 'ב'",
        "ג. בלוק גימל",
    )
    document = Document()
    for text in paragraph_texts:
        document.add_paragraph(text)
    document.save(str(docx_path))

    outcome = retrofit_bookmarks(docx_path, backup=False)

    added = outcome["bookmarks_added"]
    assert "block-bet" in added
    assert "block-gimel" in added
    # The 'א' marker was skipped by the forward scan, so block-alef is listed
    # as a structural fallback (no real content) rather than a real detection.
    assert "block-alef" in outcome["structural_fallback"]
def test_retrofit_empty_document_reports_all_missing(tmp_path: Path) -> None:
    """A document with no paragraphs adds nothing and reports 12 missing."""
    docx_path = tmp_path / "empty.docx"
    # Save a freshly created document without adding any paragraphs.
    Document().save(str(docx_path))

    outcome = retrofit_bookmarks(docx_path, backup=False)

    assert outcome["bookmarks_added"] == []
    assert len(outcome["missing_blocks"]) == 12
def test_retrofit_al_ken_midblock_does_not_capture_yod_bet(tmp_path: Path) -> None:
    """A mid-block 'על כן, במקום בו...' sentence must not count as yod-bet."""
    docx_path = tmp_path / "src.docx"
    paragraph_texts = (
        "פתח דבר",
        "רקע עובדתי קצר.",
        "דיון והכרעה",
        "על כן, במקום בו קיים פתרון חניה אין מקום להתערב.",
        "סוף דבר",
        "פסק דין סופי.",
    )
    document = Document()
    for text in paragraph_texts:
        document.add_paragraph(text)
    document.save(str(docx_path))

    added = retrofit_bookmarks(docx_path, backup=False)["bookmarks_added"]

    assert "block-yod-alef" in added
    assert "block-yod-bet" not in added
def test_retrofit_al_ken_operative_captures_yod_bet(tmp_path: Path) -> None:
    """A truly operative 'על כן, אנו מחליטים' must be captured as yod-bet."""
    docx_path = tmp_path / "src.docx"
    paragraph_texts = (
        "דיון והכרעה",
        "נימוקים מפורטים.",
        "סוף דבר",
        "על כן, אנו מחליטים לקבל את הערר.",
    )
    document = Document()
    for text in paragraph_texts:
        document.add_paragraph(text)
    document.save(str(docx_path))

    added = retrofit_bookmarks(docx_path, backup=False)["bookmarks_added"]

    assert "block-yod-alef" in added
    assert "block-yod-bet" in added
def test_retrofit_vav_al_hamekarkein_pattern(tmp_path: Path) -> None:
    """'על המקרקעין חלות התכניות' — a block-vav pattern from corpus 1130."""
    docx_path = tmp_path / "src.docx"
    paragraph_texts = (
        "פתח דבר",
        "המקרקעין מצויים בכתובת...",
        "על המקרקעין חלות התכניות הבאות",
        "פירוט תכניות.",
        "תמצית טענות הצדדים",
    )
    document = Document()
    for text in paragraph_texts:
        document.add_paragraph(text)
    document.save(str(docx_path))

    outcome = retrofit_bookmarks(docx_path, backup=False)

    assert "block-vav" in outcome["bookmarks_added"]
def test_retrofit_cover_blocks_structural_fallback(tmp_path: Path) -> None:
    """With alef–dalet absent from the file, expect them as structural_fallback."""
    docx_path = tmp_path / "src.docx"
    paragraph_texts = (
        "פתח דבר",
        "תוכן.",
        "דיון והכרעה",
        "הכרעה.",
    )
    document = Document()
    for text in paragraph_texts:
        document.add_paragraph(text)
    document.save(str(docx_path))

    outcome = retrofit_bookmarks(docx_path, backup=False)

    cover_blocks = ("block-alef", "block-bet", "block-gimel", "block-dalet")
    added = outcome["bookmarks_added"]
    missing = outcome["missing_blocks"]
    for cover_name in cover_blocks:
        # Each cover block counts as added and is not reported as missing.
        assert cover_name in added
        assert cover_name not in missing
    # The fallback list holds exactly the four cover blocks and nothing else.
    assert set(outcome["structural_fallback"]) == set(cover_blocks)
def test_retrofit_no_double_fallback_when_cover_present(tmp_path: Path) -> None:
    """When block-alef really is in the file, no structural fallback is used."""
    docx_path = tmp_path / "src.docx"
    all_markers = [
        "א", "ב", "ג", "ד", "ה", "ו", "ז", "ח", "ט", "י", "יא", "יב",
    ]
    _make_docx_with_hebrew_blocks(docx_path, all_markers)

    outcome = retrofit_bookmarks(docx_path, backup=False)

    assert outcome["structural_fallback"] == []