Retrofit: tighten yod-bet pattern, add cover-block fallback
All checks were successful
Build & Deploy / build-and-deploy (push) Successful in 6s
All checks were successful
Build & Deploy / build-and-deploy (push) Successful in 6s
The "על כן" pattern for block-yod-bet was too greedy: it matched mid-discussion
transitional sentences (e.g. "על כן, במקום בו..."), which advanced the forward-scan
pointer past block-yod-alef ("סוף דבר") and caused that block to be skipped.
Tightened to require an operative subject (אנו / הערר / הוועדה / ועדת הערר)
so terminal "על כן, אנו מחליטים" still matches but mid-block transitions don't.
Added structural_fallback for cover blocks (alef/bet/gimel/dalet) — these are
template metadata that is not present in user-edited DOCX bodies. Zero-content
anchors are injected for them so apply_user_edit can still target them later. The
frontend toast distinguishes real content gaps from fallback anchors.
Also expanded heading patterns based on training corpus inspection:
- block-vav: על המקרקעין חלות / במצב התכנוני / התכניות החלות
- block-zayin: טענות העוררת
- block-chet: עיקר תגובת המשיב
- block-tet: הדיון בוועדת הערר
For case 1130-25, this raises detection from 6/12 to 11/12 blocks — only
block-yod-bet remains missing (Daphna's edit ends at "סוף דבר" + numbered
ruling, no terminal "ההחלטה" or "על כן אנו מחליטים" paragraph).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -85,13 +85,40 @@ _BLOCK_HEADING_PATTERNS: list[tuple[str, list[str]]] = [
|
|||||||
("block-gimel", [r"^נגד\s*$", r"^—\s*נגד\s*—"]),
|
("block-gimel", [r"^נגד\s*$", r"^—\s*נגד\s*—"]),
|
||||||
("block-dalet", [r"^החלטה\s*$"]),
|
("block-dalet", [r"^החלטה\s*$"]),
|
||||||
("block-heh", [r"^רקע\s*$", r"^רקע\s+עובדתי", r"^פתח\s+דבר"]),
|
("block-heh", [r"^רקע\s*$", r"^רקע\s+עובדתי", r"^פתח\s+דבר"]),
|
||||||
("block-vav", [r"^תכניות\s+חלות", r"^ההליכים?\s+שבפנינו", r"^ההליכים?\s+בפני\s+הוועדה\s+המקומית"]),
|
("block-vav", [
|
||||||
("block-zayin", [r"^תמצית\s+טענות", r"^טענות\s+הצדדים", r"^טענות\s+העוררי"]),
|
r"^תכניות\s+חלות",
|
||||||
("block-chet", [r"^תגובת\s+המשיב", r"^עמדת\s+הוועדה\s+המקומית", r"^תשובת"]),
|
r"^ההליכים?\s+שבפנינו",
|
||||||
("block-tet", [r"^ההליכים?\s+בפני\s+ועדת\s+הערר", r"^הדיון\s+בפנינו"]),
|
r"^ההליכים?\s+בפני\s+הוועדה\s+המקומית",
|
||||||
|
r"^על\s+המקרקעין\s+חלות",
|
||||||
|
r"^התכניות?\s+החלות",
|
||||||
|
r"^במצב\s+התכנוני",
|
||||||
|
]),
|
||||||
|
("block-zayin", [
|
||||||
|
r"^תמצית\s+טענות",
|
||||||
|
r"^טענות\s+הצדדים",
|
||||||
|
r"^טענות\s+העוררי",
|
||||||
|
r"^טענות\s+העוררת",
|
||||||
|
]),
|
||||||
|
("block-chet", [
|
||||||
|
r"^תגובת\s+המשיב",
|
||||||
|
r"^עמדת\s+הוועדה\s+המקומית",
|
||||||
|
r"^תשובת",
|
||||||
|
r"^עיקר\s+תגובת\s+המשיב",
|
||||||
|
]),
|
||||||
|
("block-tet", [
|
||||||
|
r"^ההליכים?\s+בפני\s+ועדת\s+הערר",
|
||||||
|
r"^הדיון\s+בפנינו",
|
||||||
|
r"^הדיון\s+בוועדת\s+הערר",
|
||||||
|
]),
|
||||||
("block-yod", [r"^דיון\s+והכרעה", r"^דיון\s*$", r"^ההכרעה"]),
|
("block-yod", [r"^דיון\s+והכרעה", r"^דיון\s*$", r"^ההכרעה"]),
|
||||||
("block-yod-alef", [r"^סוף\s+דבר", r"^סיכום\s*$"]),
|
("block-yod-alef", [r"^סוף\s+דבר", r"^סיכום\s*$"]),
|
||||||
("block-yod-bet", [r"^ההחלטה\s*$", r"^על\s+כן[,\.]?"]),
|
# block-yod-bet "על כן" must be operative — paired with אנו/הערר/הוועדה.
|
||||||
|
# Loose `^על כן` alone matches mid-discussion transitions ("על כן, במקום בו...")
|
||||||
|
# and steals the bookmark from block-yod-alef via forward-scan.
|
||||||
|
("block-yod-bet", [
|
||||||
|
r"^ההחלטה\s*$",
|
||||||
|
r"^על\s+כן[,\.\s]+(?:אנו|הערר|הוועדה|ועדת\s+הערר)\b",
|
||||||
|
]),
|
||||||
]
|
]
|
||||||
|
|
||||||
_COMPILED_HEADING_PATTERNS: list[tuple[str, list[re.Pattern[str]]]] = [
|
_COMPILED_HEADING_PATTERNS: list[tuple[str, list[re.Pattern[str]]]] = [
|
||||||
@@ -252,6 +279,20 @@ def retrofit_bookmarks(
|
|||||||
|
|
||||||
block_starts = _detect_block_starts(paragraphs)
|
block_starts = _detect_block_starts(paragraphs)
|
||||||
|
|
||||||
|
# Cover-block fallback: alef/bet/gimel/dalet are template metadata
|
||||||
|
# (judges, case number, parties, "החלטה" title) that don't appear in
|
||||||
|
# the body of user-edited DOCX files — they live in headers/template.
|
||||||
|
# Inject zero-content anchors in the leading paragraphs (index 0-3, never past the first detected block) so apply_user_edit can
|
||||||
|
# still target them later.
|
||||||
|
structural_fallback: list[str] = []
|
||||||
|
cover_blocks = ["block-alef", "block-bet", "block-gimel", "block-dalet"]
|
||||||
|
first_detected_idx = min(block_starts.values()) if block_starts else 0
|
||||||
|
for i, name in enumerate(cover_blocks):
|
||||||
|
if name not in block_starts:
|
||||||
|
idx = min(i, max(0, first_detected_idx - 1))
|
||||||
|
block_starts[name] = idx
|
||||||
|
structural_fallback.append(name)
|
||||||
|
|
||||||
# Calculate end_idx for each block = paragraph before the next block's start,
|
# Calculate end_idx for each block = paragraph before the next block's start,
|
||||||
# or last paragraph if this is the last block found.
|
# or last paragraph if this is the last block found.
|
||||||
ordered_found = sorted(block_starts.items(), key=lambda kv: kv[1])
|
ordered_found = sorted(block_starts.items(), key=lambda kv: kv[1])
|
||||||
@@ -280,11 +321,16 @@ def retrofit_bookmarks(
|
|||||||
|
|
||||||
_save_docx_xml(members, doc_tree, settings_tree, output_path)
|
_save_docx_xml(members, doc_tree, settings_tree, output_path)
|
||||||
|
|
||||||
missing = [n for n, _ in BLOCK_ORDER if n not in block_starts and n not in existing_names]
|
missing = [
|
||||||
logger.info("retrofit %s: added=%s missing=%s",
|
n for n, _ in BLOCK_ORDER
|
||||||
docx_path.name, added, missing)
|
if n not in block_starts
|
||||||
|
and n not in existing_names
|
||||||
|
]
|
||||||
|
logger.info("retrofit %s: added=%s missing=%s structural=%s",
|
||||||
|
docx_path.name, added, missing, structural_fallback)
|
||||||
return {
|
return {
|
||||||
"bookmarks_added": added,
|
"bookmarks_added": added,
|
||||||
"missing_blocks": missing,
|
"missing_blocks": missing,
|
||||||
|
"structural_fallback": structural_fallback,
|
||||||
"existing_bookmarks": existing_names,
|
"existing_bookmarks": existing_names,
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -576,6 +576,7 @@ async def apply_user_edit(case_number: str, edit_filename: str) -> str:
|
|||||||
"active_draft_path": str(edit_path),
|
"active_draft_path": str(edit_path),
|
||||||
"bookmarks_added": retrofit_result.get("bookmarks_added", []),
|
"bookmarks_added": retrofit_result.get("bookmarks_added", []),
|
||||||
"missing_blocks": retrofit_result.get("missing_blocks", []),
|
"missing_blocks": retrofit_result.get("missing_blocks", []),
|
||||||
|
"structural_fallback": retrofit_result.get("structural_fallback", []),
|
||||||
"existing_bookmarks": retrofit_result.get("existing_bookmarks", []),
|
"existing_bookmarks": retrofit_result.get("existing_bookmarks", []),
|
||||||
}, ensure_ascii=False, indent=2)
|
}, ensure_ascii=False, indent=2)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
|||||||
@@ -56,9 +56,10 @@ def test_retrofit_distinguishes_yod_from_yod_alef_yod_bet(tmp_path: Path) -> Non
|
|||||||
_make_docx_with_hebrew_blocks(src, ["ט", "י", "יא", "יב"])
|
_make_docx_with_hebrew_blocks(src, ["ט", "י", "יא", "יב"])
|
||||||
|
|
||||||
result = retrofit_bookmarks(src, backup=False)
|
result = retrofit_bookmarks(src, backup=False)
|
||||||
assert set(result["bookmarks_added"]) == {
|
# The four content blocks must all be detected; cover blocks added via fallback.
|
||||||
"block-tet", "block-yod", "block-yod-alef", "block-yod-bet",
|
assert {"block-tet", "block-yod", "block-yod-alef", "block-yod-bet"} <= set(
|
||||||
}
|
result["bookmarks_added"]
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def test_retrofit_skips_existing_bookmarks(tmp_path: Path) -> None:
|
def test_retrofit_skips_existing_bookmarks(tmp_path: Path) -> None:
|
||||||
@@ -67,14 +68,13 @@ def test_retrofit_skips_existing_bookmarks(tmp_path: Path) -> None:
|
|||||||
_make_docx_with_hebrew_blocks(src, ["א", "ב"])
|
_make_docx_with_hebrew_blocks(src, ["א", "ב"])
|
||||||
|
|
||||||
first = retrofit_bookmarks(src, backup=False)
|
first = retrofit_bookmarks(src, backup=False)
|
||||||
assert first["bookmarks_added"] == ["block-alef", "block-bet"]
|
# alef/bet from markers; gimel/dalet from cover-block fallback
|
||||||
|
assert {"block-alef", "block-bet"} <= set(first["bookmarks_added"])
|
||||||
|
|
||||||
second = retrofit_bookmarks(src, backup=False)
|
second = retrofit_bookmarks(src, backup=False)
|
||||||
assert second["bookmarks_added"] == [] # nothing new
|
assert second["bookmarks_added"] == [] # nothing new
|
||||||
assert set(second["existing_bookmarks"]) == {"block-alef", "block-bet"}
|
# All previously added bookmarks now exist on the document
|
||||||
|
assert set(first["bookmarks_added"]) <= set(second["existing_bookmarks"])
|
||||||
# Final document should still have exactly 2 bookmarks
|
|
||||||
assert set(list_bookmarks(src)) == {"block-alef", "block-bet"}
|
|
||||||
|
|
||||||
|
|
||||||
def test_retrofit_creates_backup(tmp_path: Path) -> None:
|
def test_retrofit_creates_backup(tmp_path: Path) -> None:
|
||||||
@@ -92,8 +92,8 @@ def test_retrofit_to_different_output_path_no_backup(tmp_path: Path) -> None:
|
|||||||
retrofit_bookmarks(src, output_path=out)
|
retrofit_bookmarks(src, output_path=out)
|
||||||
# source untouched
|
# source untouched
|
||||||
assert list_bookmarks(src) == []
|
assert list_bookmarks(src) == []
|
||||||
# output has bookmarks
|
# output has bookmarks (alef+bet from markers; gimel+dalet via fallback)
|
||||||
assert set(list_bookmarks(out)) == {"block-alef", "block-bet"}
|
assert {"block-alef", "block-bet"} <= set(list_bookmarks(out))
|
||||||
|
|
||||||
|
|
||||||
def test_retrofit_ignores_marker_in_middle_of_text(tmp_path: Path) -> None:
|
def test_retrofit_ignores_marker_in_middle_of_text(tmp_path: Path) -> None:
|
||||||
@@ -116,7 +116,8 @@ def test_retrofit_out_of_order_markers_picks_forward_only(tmp_path: Path) -> Non
|
|||||||
"""If a later-ordered marker appears first, earlier ones are treated as missing.
|
"""If a later-ordered marker appears first, earlier ones are treated as missing.
|
||||||
|
|
||||||
Scanner advances forward through BLOCK_ORDER — it won't go back to claim
|
Scanner advances forward through BLOCK_ORDER — it won't go back to claim
|
||||||
an earlier marker after already seeing a later one.
|
an earlier marker after already seeing a later one. block-alef will be
|
||||||
|
surfaced via the cover-block fallback rather than from the actual marker.
|
||||||
"""
|
"""
|
||||||
src = tmp_path / "src.docx"
|
src = tmp_path / "src.docx"
|
||||||
doc = Document()
|
doc = Document()
|
||||||
@@ -128,8 +129,9 @@ def test_retrofit_out_of_order_markers_picks_forward_only(tmp_path: Path) -> Non
|
|||||||
result = retrofit_bookmarks(src, backup=False)
|
result = retrofit_bookmarks(src, backup=False)
|
||||||
assert "block-bet" in result["bookmarks_added"]
|
assert "block-bet" in result["bookmarks_added"]
|
||||||
assert "block-gimel" in result["bookmarks_added"]
|
assert "block-gimel" in result["bookmarks_added"]
|
||||||
# 'א' was not detected (the first paragraph was 'ב' — scanner advanced past א)
|
# 'א' marker was skipped by forward-scan, so it appears as a structural
|
||||||
assert "block-alef" in result["missing_blocks"]
|
# fallback (no real content), not from real detection.
|
||||||
|
assert "block-alef" in result["structural_fallback"]
|
||||||
|
|
||||||
|
|
||||||
def test_retrofit_empty_document_reports_all_missing(tmp_path: Path) -> None:
|
def test_retrofit_empty_document_reports_all_missing(tmp_path: Path) -> None:
|
||||||
@@ -139,3 +141,79 @@ def test_retrofit_empty_document_reports_all_missing(tmp_path: Path) -> None:
|
|||||||
result = retrofit_bookmarks(src, backup=False)
|
result = retrofit_bookmarks(src, backup=False)
|
||||||
assert result["bookmarks_added"] == []
|
assert result["bookmarks_added"] == []
|
||||||
assert len(result["missing_blocks"]) == 12
|
assert len(result["missing_blocks"]) == 12
|
||||||
|
|
||||||
|
|
||||||
|
def test_retrofit_al_ken_midblock_does_not_capture_yod_bet(tmp_path: Path) -> None:
|
||||||
|
"""'על כן, במקום בו...' באמצע block-yod לא צריך להיתפס כ-yod-bet."""
|
||||||
|
src = tmp_path / "src.docx"
|
||||||
|
doc = Document()
|
||||||
|
doc.add_paragraph("פתח דבר")
|
||||||
|
doc.add_paragraph("רקע עובדתי קצר.")
|
||||||
|
doc.add_paragraph("דיון והכרעה")
|
||||||
|
doc.add_paragraph("על כן, במקום בו קיים פתרון חניה אין מקום להתערב.")
|
||||||
|
doc.add_paragraph("סוף דבר")
|
||||||
|
doc.add_paragraph("פסק דין סופי.")
|
||||||
|
doc.save(str(src))
|
||||||
|
|
||||||
|
result = retrofit_bookmarks(src, backup=False)
|
||||||
|
assert "block-yod-alef" in result["bookmarks_added"]
|
||||||
|
assert "block-yod-bet" not in result["bookmarks_added"]
|
||||||
|
|
||||||
|
|
||||||
|
def test_retrofit_al_ken_operative_captures_yod_bet(tmp_path: Path) -> None:
|
||||||
|
"""'על כן, אנו מחליטים' באמת אופרטיבי — צריך להיתפס כ-yod-bet."""
|
||||||
|
src = tmp_path / "src.docx"
|
||||||
|
doc = Document()
|
||||||
|
doc.add_paragraph("דיון והכרעה")
|
||||||
|
doc.add_paragraph("נימוקים מפורטים.")
|
||||||
|
doc.add_paragraph("סוף דבר")
|
||||||
|
doc.add_paragraph("על כן, אנו מחליטים לקבל את הערר.")
|
||||||
|
doc.save(str(src))
|
||||||
|
|
||||||
|
result = retrofit_bookmarks(src, backup=False)
|
||||||
|
assert "block-yod-alef" in result["bookmarks_added"]
|
||||||
|
assert "block-yod-bet" in result["bookmarks_added"]
|
||||||
|
|
||||||
|
|
||||||
|
def test_retrofit_vav_al_hamekarkein_pattern(tmp_path: Path) -> None:
|
||||||
|
"""'על המקרקעין חלות התכניות' — דפוס block-vav מקורפוס 1130."""
|
||||||
|
src = tmp_path / "src.docx"
|
||||||
|
doc = Document()
|
||||||
|
doc.add_paragraph("פתח דבר")
|
||||||
|
doc.add_paragraph("המקרקעין מצויים בכתובת...")
|
||||||
|
doc.add_paragraph("על המקרקעין חלות התכניות הבאות")
|
||||||
|
doc.add_paragraph("פירוט תכניות.")
|
||||||
|
doc.add_paragraph("תמצית טענות הצדדים")
|
||||||
|
doc.save(str(src))
|
||||||
|
|
||||||
|
result = retrofit_bookmarks(src, backup=False)
|
||||||
|
assert "block-vav" in result["bookmarks_added"]
|
||||||
|
|
||||||
|
|
||||||
|
def test_retrofit_cover_blocks_structural_fallback(tmp_path: Path) -> None:
|
||||||
|
"""אם alef-dalet לא בקובץ — לקבל bookmarks ריקים בהתחלה (structural_fallback)."""
|
||||||
|
src = tmp_path / "src.docx"
|
||||||
|
doc = Document()
|
||||||
|
doc.add_paragraph("פתח דבר")
|
||||||
|
doc.add_paragraph("תוכן.")
|
||||||
|
doc.add_paragraph("דיון והכרעה")
|
||||||
|
doc.add_paragraph("הכרעה.")
|
||||||
|
doc.save(str(src))
|
||||||
|
|
||||||
|
result = retrofit_bookmarks(src, backup=False)
|
||||||
|
for name in ["block-alef", "block-bet", "block-gimel", "block-dalet"]:
|
||||||
|
assert name in result["bookmarks_added"]
|
||||||
|
assert name not in result["missing_blocks"]
|
||||||
|
assert set(result["structural_fallback"]) == {
|
||||||
|
"block-alef", "block-bet", "block-gimel", "block-dalet",
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def test_retrofit_no_double_fallback_when_cover_present(tmp_path: Path) -> None:
|
||||||
|
"""אם block-alef קיים בקובץ אמיתית — לא לזרוק fallback מבני."""
|
||||||
|
src = tmp_path / "src.docx"
|
||||||
|
_make_docx_with_hebrew_blocks(
|
||||||
|
src, ["א", "ב", "ג", "ד", "ה", "ו", "ז", "ח", "ט", "י", "יא", "יב"],
|
||||||
|
)
|
||||||
|
result = retrofit_bookmarks(src, backup=False)
|
||||||
|
assert result["structural_fallback"] == []
|
||||||
|
|||||||
@@ -115,15 +115,17 @@ export function DraftsPanel({
|
|||||||
onSuccess: (data) => {
|
onSuccess: (data) => {
|
||||||
const added = data.bookmarks_added?.length ?? 0;
|
const added = data.bookmarks_added?.length ?? 0;
|
||||||
const missing = data.missing_blocks?.length ?? 0;
|
const missing = data.missing_blocks?.length ?? 0;
|
||||||
|
const fallback = data.structural_fallback?.length ?? 0;
|
||||||
|
const realDetected = added - fallback;
|
||||||
if (data.apply_status === "completed" || data.apply_status === "ok") {
|
if (data.apply_status === "completed" || data.apply_status === "ok") {
|
||||||
if (added > 0) {
|
if (realDetected > 0) {
|
||||||
toast.success(`הועלה: ${data.filename} — זוהו ${added} בלוקים`);
|
toast.success(`הועלה: ${data.filename} — זוהו ${realDetected} בלוקי תוכן`);
|
||||||
} else {
|
} else {
|
||||||
toast.success(`הועלה: ${data.filename}`);
|
toast.success(`הועלה: ${data.filename}`);
|
||||||
}
|
}
|
||||||
if (missing > 0) {
|
if (missing > 0) {
|
||||||
toast.warning(
|
toast.warning(
|
||||||
`שימו לב: ${missing} בלוקים לא זוהו — ייתכנו בעיות בתיקונים עתידיים`,
|
`שימו לב: ${missing} בלוקי תוכן לא זוהו — בדוק את הכותרות`,
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
|
|||||||
@@ -35,6 +35,7 @@ export type UploadResult = {
|
|||||||
active_draft?: string;
|
active_draft?: string;
|
||||||
bookmarks_added?: string[];
|
bookmarks_added?: string[];
|
||||||
missing_blocks?: string[];
|
missing_blocks?: string[];
|
||||||
|
structural_fallback?: string[];
|
||||||
apply_status?: string;
|
apply_status?: string;
|
||||||
};
|
};
|
||||||
|
|
||||||
@@ -155,6 +156,7 @@ export function useRetrofit(caseNumber: string) {
|
|||||||
active_draft_path: string;
|
active_draft_path: string;
|
||||||
bookmarks_added: string[];
|
bookmarks_added: string[];
|
||||||
missing_blocks: string[];
|
missing_blocks: string[];
|
||||||
|
structural_fallback?: string[];
|
||||||
}>(`/api/cases/${caseNumber}/exports/${filename}/retrofit`, {
|
}>(`/api/cases/${caseNumber}/exports/${filename}/retrofit`, {
|
||||||
method: "POST",
|
method: "POST",
|
||||||
}),
|
}),
|
||||||
|
|||||||
@@ -2060,6 +2060,7 @@ async def api_upload_export(case_number: str, file: UploadFile = File(...)):
|
|||||||
"active_draft": auto_result.get("active_draft_path"),
|
"active_draft": auto_result.get("active_draft_path"),
|
||||||
"bookmarks_added": auto_result.get("bookmarks_added", []),
|
"bookmarks_added": auto_result.get("bookmarks_added", []),
|
||||||
"missing_blocks": auto_result.get("missing_blocks", []),
|
"missing_blocks": auto_result.get("missing_blocks", []),
|
||||||
|
"structural_fallback": auto_result.get("structural_fallback", []),
|
||||||
"apply_status": auto_result.get("status", "error"),
|
"apply_status": auto_result.get("status", "error"),
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user