Retrofit: tighten yod-bet pattern, add cover-block fallback
All checks were successful
Build & Deploy / build-and-deploy (push) Successful in 6s
All checks were successful
Build & Deploy / build-and-deploy (push) Successful in 6s
The "על כן" pattern for block-yod-bet was too greedy: it also matched mid-discussion
transitional sentences (e.g. "על כן, במקום בו..."). Such a false match advanced the
forward-scan pointer past block-yod-alef ("סוף דבר"), so that block was skipped.
The pattern now requires an operative subject (אנו / הערר / הוועדה / ועדת הערר),
so a terminal "על כן, אנו מחליטים" still matches but mid-block transitions do not.
Added structural_fallback for the cover blocks (alef/bet/gimel/dalet). These are
template metadata that is not present in user-edited DOCX bodies, so the retrofit
now injects zero-content anchors for them and apply_user_edit can still target
them later. The frontend toast distinguishes real content gaps from fallback anchors.
Also expanded heading patterns based on training corpus inspection:
- block-vav: על המקרקעין חלות / במצב התכנוני / התכניות החלות
- block-zayin: טענות העוררת
- block-chet: עיקר תגובת המשיב
- block-tet: הדיון בוועדת הערר
For case 1130-25, this raises detection from 6/12 to 11/12 blocks — only
block-yod-bet remains missing (Daphna's edit ends at "סוף דבר" plus a numbered
ruling, with no terminal "ההחלטה" or "על כן אנו מחליטים" paragraph).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -56,9 +56,10 @@ def test_retrofit_distinguishes_yod_from_yod_alef_yod_bet(tmp_path: Path) -> Non
|
||||
_make_docx_with_hebrew_blocks(src, ["ט", "י", "יא", "יב"])
|
||||
|
||||
result = retrofit_bookmarks(src, backup=False)
|
||||
assert set(result["bookmarks_added"]) == {
|
||||
"block-tet", "block-yod", "block-yod-alef", "block-yod-bet",
|
||||
}
|
||||
# The four content blocks must all be detected; cover blocks added via fallback.
|
||||
assert {"block-tet", "block-yod", "block-yod-alef", "block-yod-bet"} <= set(
|
||||
result["bookmarks_added"]
|
||||
)
|
||||
|
||||
|
||||
def test_retrofit_skips_existing_bookmarks(tmp_path: Path) -> None:
|
||||
@@ -67,14 +68,13 @@ def test_retrofit_skips_existing_bookmarks(tmp_path: Path) -> None:
|
||||
_make_docx_with_hebrew_blocks(src, ["א", "ב"])
|
||||
|
||||
first = retrofit_bookmarks(src, backup=False)
|
||||
assert first["bookmarks_added"] == ["block-alef", "block-bet"]
|
||||
# alef/bet from markers; gimel/dalet from cover-block fallback
|
||||
assert {"block-alef", "block-bet"} <= set(first["bookmarks_added"])
|
||||
|
||||
second = retrofit_bookmarks(src, backup=False)
|
||||
assert second["bookmarks_added"] == [] # nothing new
|
||||
assert set(second["existing_bookmarks"]) == {"block-alef", "block-bet"}
|
||||
|
||||
# Final document should still have exactly 2 bookmarks
|
||||
assert set(list_bookmarks(src)) == {"block-alef", "block-bet"}
|
||||
# All previously added bookmarks now exist on the document
|
||||
assert set(first["bookmarks_added"]) <= set(second["existing_bookmarks"])
|
||||
|
||||
|
||||
def test_retrofit_creates_backup(tmp_path: Path) -> None:
|
||||
@@ -92,8 +92,8 @@ def test_retrofit_to_different_output_path_no_backup(tmp_path: Path) -> None:
|
||||
retrofit_bookmarks(src, output_path=out)
|
||||
# source untouched
|
||||
assert list_bookmarks(src) == []
|
||||
# output has bookmarks
|
||||
assert set(list_bookmarks(out)) == {"block-alef", "block-bet"}
|
||||
# output has bookmarks (alef+bet from markers; gimel+dalet via fallback)
|
||||
assert {"block-alef", "block-bet"} <= set(list_bookmarks(out))
|
||||
|
||||
|
||||
def test_retrofit_ignores_marker_in_middle_of_text(tmp_path: Path) -> None:
|
||||
@@ -116,7 +116,8 @@ def test_retrofit_out_of_order_markers_picks_forward_only(tmp_path: Path) -> Non
|
||||
"""If a later-ordered marker appears first, earlier ones are treated as missing.
|
||||
|
||||
Scanner advances forward through BLOCK_ORDER — it won't go back to claim
|
||||
an earlier marker after already seeing a later one.
|
||||
an earlier marker after already seeing a later one. block-alef will be
|
||||
surfaced via the cover-block fallback rather than from the actual marker.
|
||||
"""
|
||||
src = tmp_path / "src.docx"
|
||||
doc = Document()
|
||||
@@ -128,8 +129,9 @@ def test_retrofit_out_of_order_markers_picks_forward_only(tmp_path: Path) -> Non
|
||||
result = retrofit_bookmarks(src, backup=False)
|
||||
assert "block-bet" in result["bookmarks_added"]
|
||||
assert "block-gimel" in result["bookmarks_added"]
|
||||
# 'א' was not detected (the first paragraph was 'ב' — scanner advanced past א)
|
||||
assert "block-alef" in result["missing_blocks"]
|
||||
# 'א' marker was skipped by forward-scan, so it appears as a structural
|
||||
# fallback (no real content), not from real detection.
|
||||
assert "block-alef" in result["structural_fallback"]
|
||||
|
||||
|
||||
def test_retrofit_empty_document_reports_all_missing(tmp_path: Path) -> None:
|
||||
@@ -139,3 +141,79 @@ def test_retrofit_empty_document_reports_all_missing(tmp_path: Path) -> None:
|
||||
result = retrofit_bookmarks(src, backup=False)
|
||||
assert result["bookmarks_added"] == []
|
||||
assert len(result["missing_blocks"]) == 12
|
||||
|
||||
|
||||
def test_retrofit_al_ken_midblock_does_not_capture_yod_bet(tmp_path: Path) -> None:
|
||||
"""'על כן, במקום בו...' באמצע block-yod לא צריך להיתפס כ-yod-bet."""
|
||||
src = tmp_path / "src.docx"
|
||||
doc = Document()
|
||||
doc.add_paragraph("פתח דבר")
|
||||
doc.add_paragraph("רקע עובדתי קצר.")
|
||||
doc.add_paragraph("דיון והכרעה")
|
||||
doc.add_paragraph("על כן, במקום בו קיים פתרון חניה אין מקום להתערב.")
|
||||
doc.add_paragraph("סוף דבר")
|
||||
doc.add_paragraph("פסק דין סופי.")
|
||||
doc.save(str(src))
|
||||
|
||||
result = retrofit_bookmarks(src, backup=False)
|
||||
assert "block-yod-alef" in result["bookmarks_added"]
|
||||
assert "block-yod-bet" not in result["bookmarks_added"]
|
||||
|
||||
|
||||
def test_retrofit_al_ken_operative_captures_yod_bet(tmp_path: Path) -> None:
|
||||
"""'על כן, אנו מחליטים' באמת אופרטיבי — צריך להיתפס כ-yod-bet."""
|
||||
src = tmp_path / "src.docx"
|
||||
doc = Document()
|
||||
doc.add_paragraph("דיון והכרעה")
|
||||
doc.add_paragraph("נימוקים מפורטים.")
|
||||
doc.add_paragraph("סוף דבר")
|
||||
doc.add_paragraph("על כן, אנו מחליטים לקבל את הערר.")
|
||||
doc.save(str(src))
|
||||
|
||||
result = retrofit_bookmarks(src, backup=False)
|
||||
assert "block-yod-alef" in result["bookmarks_added"]
|
||||
assert "block-yod-bet" in result["bookmarks_added"]
|
||||
|
||||
|
||||
def test_retrofit_vav_al_hamekarkein_pattern(tmp_path: Path) -> None:
|
||||
"""'על המקרקעין חלות התכניות' — דפוס block-vav מקורפוס 1130."""
|
||||
src = tmp_path / "src.docx"
|
||||
doc = Document()
|
||||
doc.add_paragraph("פתח דבר")
|
||||
doc.add_paragraph("המקרקעין מצויים בכתובת...")
|
||||
doc.add_paragraph("על המקרקעין חלות התכניות הבאות")
|
||||
doc.add_paragraph("פירוט תכניות.")
|
||||
doc.add_paragraph("תמצית טענות הצדדים")
|
||||
doc.save(str(src))
|
||||
|
||||
result = retrofit_bookmarks(src, backup=False)
|
||||
assert "block-vav" in result["bookmarks_added"]
|
||||
|
||||
|
||||
def test_retrofit_cover_blocks_structural_fallback(tmp_path: Path) -> None:
|
||||
"""אם alef-dalet לא בקובץ — לקבל bookmarks ריקים בהתחלה (structural_fallback)."""
|
||||
src = tmp_path / "src.docx"
|
||||
doc = Document()
|
||||
doc.add_paragraph("פתח דבר")
|
||||
doc.add_paragraph("תוכן.")
|
||||
doc.add_paragraph("דיון והכרעה")
|
||||
doc.add_paragraph("הכרעה.")
|
||||
doc.save(str(src))
|
||||
|
||||
result = retrofit_bookmarks(src, backup=False)
|
||||
for name in ["block-alef", "block-bet", "block-gimel", "block-dalet"]:
|
||||
assert name in result["bookmarks_added"]
|
||||
assert name not in result["missing_blocks"]
|
||||
assert set(result["structural_fallback"]) == {
|
||||
"block-alef", "block-bet", "block-gimel", "block-dalet",
|
||||
}
|
||||
|
||||
|
||||
def test_retrofit_no_double_fallback_when_cover_present(tmp_path: Path) -> None:
|
||||
"""אם block-alef קיים בקובץ אמיתית — לא לזרוק fallback מבני."""
|
||||
src = tmp_path / "src.docx"
|
||||
_make_docx_with_hebrew_blocks(
|
||||
src, ["א", "ב", "ג", "ד", "ה", "ו", "ז", "ח", "ט", "י", "יא", "יב"],
|
||||
)
|
||||
result = retrofit_bookmarks(src, backup=False)
|
||||
assert result["structural_fallback"] == []
|
||||
|
||||
Reference in New Issue
Block a user