From 36ca713dfa18ac36bf27790285003337e8297e0a Mon Sep 17 00:00:00 2001 From: Chaim Date: Sun, 26 Apr 2026 06:57:41 +0000 Subject: [PATCH] Retrofit: tighten yod-bet pattern, add cover-block fallback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The "על כן" pattern for block-yod-bet was too greedy and matched mid-discussion transitional sentences (e.g. "על כן, במקום בו..."), which caused forward-scan to skip block-yod-alef ("סוף דבר") via the pointer advance. Tightened to require an operative subject (אנו / הערר / הוועדה / ועדת הערר) so terminal "על כן, אנו מחליטים" still matches but mid-block transitions don't. Added structural_fallback for cover blocks (alef/bet/gimel/dalet) — these are template metadata not present in user-edited DOCX bodies. Inject zero-content anchors so apply_user_edit can still target them later. The frontend toast distinguishes real content gaps from fallback anchors. Also expanded heading patterns based on training corpus inspection: - block-vav: על המקרקעין חלות / במצב התכנוני / התכניות החלות - block-zayin: טענות העוררת - block-chet: עיקר תגובת המשיב - block-tet: הדיון בוועדת הערר For case 1130-25, this raises detection from 6/12 to 11/12 blocks — only block-yod-bet remains missing (Daphna's edit ends at "סוף דבר" + numbered ruling, no terminal "ההחלטה" or "על כן אנו מחליטים" paragraph). Co-Authored-By: Claude Opus 4.7 (1M context) --- .../src/legal_mcp/services/docx_retrofit.py | 62 +++++++++-- mcp-server/src/legal_mcp/tools/drafting.py | 1 + mcp-server/tests/test_docx_retrofit.py | 104 +++++++++++++++--- web-ui/src/components/cases/drafts-panel.tsx | 8 +- web-ui/src/lib/api/exports.ts | 2 + web/app.py | 1 + 6 files changed, 154 insertions(+), 24 deletions(-) diff --git a/mcp-server/src/legal_mcp/services/docx_retrofit.py b/mcp-server/src/legal_mcp/services/docx_retrofit.py index 4291259..bc1dc8e 100644 --- a/mcp-server/src/legal_mcp/services/docx_retrofit.py +++ b/mcp-server/src/legal_mcp/services/docx_retrofit.py @@ -85,13 +85,40 @@ _BLOCK_HEADING_PATTERNS: list[tuple[str, list[str]]] = [ ("block-gimel", [r"^נגד\s*$", r"^—\s*נגד\s*—"]), ("block-dalet", [r"^החלטה\s*$"]), ("block-heh", [r"^רקע\s*$", r"^רקע\s+עובדתי", r"^פתח\s+דבר"]), - ("block-vav", [r"^תכניות\s+חלות", r"^ההליכים?\s+שבפנינו", r"^ההליכים?\s+בפני\s+הוועדה\s+המקומית"]), - ("block-zayin", [r"^תמצית\s+טענות", r"^טענות\s+הצדדים", r"^טענות\s+העוררי"]), - ("block-chet", [r"^תגובת\s+המשיב", r"^עמדת\s+הוועדה\s+המקומית", r"^תשובת"]), - ("block-tet", [r"^ההליכים?\s+בפני\s+ועדת\s+הערר", r"^הדיון\s+בפנינו"]), + ("block-vav", [ + r"^תכניות\s+חלות", + r"^ההליכים?\s+שבפנינו", + r"^ההליכים?\s+בפני\s+הוועדה\s+המקומית", + r"^על\s+המקרקעין\s+חלות", + r"^התכניות?\s+החלות", + r"^במצב\s+התכנוני", + ]), + ("block-zayin", [ + r"^תמצית\s+טענות", + r"^טענות\s+הצדדים", + r"^טענות\s+העוררי", + r"^טענות\s+העוררת", + ]), + ("block-chet", [ + r"^תגובת\s+המשיב", + r"^עמדת\s+הוועדה\s+המקומית", + r"^תשובת", + r"^עיקר\s+תגובת\s+המשיב", + ]), + ("block-tet", [ + r"^ההליכים?\s+בפני\s+ועדת\s+הערר", + r"^הדיון\s+בפנינו", + r"^הדיון\s+בוועדת\s+הערר", + ]), ("block-yod", [r"^דיון\s+והכרעה", r"^דיון\s*$", r"^ההכרעה"]), ("block-yod-alef", [r"^סוף\s+דבר", r"^סיכום\s*$"]), - ("block-yod-bet", [r"^ההחלטה\s*$", r"^על\s+כן[,\.]?"]), + # block-yod-bet "על כן" must be operative — paired with אנו/הערר/הוועדה. + # Loose `^על כן` alone matches mid-discussion transitions ("על כן, במקום בו...") + # and steals the bookmark from block-yod-alef via forward-scan. + ("block-yod-bet", [ + r"^ההחלטה\s*$", + r"^על\s+כן[,\.\s]+(?:אנו|הערר|הוועדה|ועדת\s+הערר)\b", + ]), ] _COMPILED_HEADING_PATTERNS: list[tuple[str, list[re.Pattern[str]]]] = [ @@ -252,6 +279,20 @@ def retrofit_bookmarks( block_starts = _detect_block_starts(paragraphs) + # Cover-block fallback: alef/bet/gimel/dalet are template metadata + # (judges, case number, parties, "החלטה" title) that don't appear in + # the body of user-edited DOCX files — they live in headers/template. + # Inject zero-content anchors at paragraph 0 so apply_user_edit can + # still target them later. + structural_fallback: list[str] = [] + cover_blocks = ["block-alef", "block-bet", "block-gimel", "block-dalet"] + first_detected_idx = min(block_starts.values()) if block_starts else 0 + for i, name in enumerate(cover_blocks): + if name not in block_starts: + idx = min(i, max(0, first_detected_idx - 1)) + block_starts[name] = idx + structural_fallback.append(name) + # Calculate end_idx for each block = paragraph before the next block's start, # or last paragraph if this is the last block found. ordered_found = sorted(block_starts.items(), key=lambda kv: kv[1]) @@ -280,11 +321,16 @@ def retrofit_bookmarks( _save_docx_xml(members, doc_tree, settings_tree, output_path) - missing = [n for n, _ in BLOCK_ORDER if n not in block_starts and n not in existing_names] - logger.info("retrofit %s: added=%s missing=%s", - docx_path.name, added, missing) + missing = [ + n for n, _ in BLOCK_ORDER + if n not in block_starts + and n not in existing_names + ] + logger.info("retrofit %s: added=%s missing=%s structural=%s", + docx_path.name, added, missing, structural_fallback) return { "bookmarks_added": added, "missing_blocks": missing, + "structural_fallback": structural_fallback, "existing_bookmarks": existing_names, } diff --git a/mcp-server/src/legal_mcp/tools/drafting.py b/mcp-server/src/legal_mcp/tools/drafting.py index 81a69f1..eb955f2 100644 --- a/mcp-server/src/legal_mcp/tools/drafting.py +++ b/mcp-server/src/legal_mcp/tools/drafting.py @@ -576,6 +576,7 @@ async def apply_user_edit(case_number: str, edit_filename: str) -> str: "active_draft_path": str(edit_path), "bookmarks_added": retrofit_result.get("bookmarks_added", []), "missing_blocks": retrofit_result.get("missing_blocks", []), + "structural_fallback": retrofit_result.get("structural_fallback", []), "existing_bookmarks": retrofit_result.get("existing_bookmarks", []), }, ensure_ascii=False, indent=2) except Exception as e: diff --git a/mcp-server/tests/test_docx_retrofit.py b/mcp-server/tests/test_docx_retrofit.py index 40f30bd..834ab80 100644 --- a/mcp-server/tests/test_docx_retrofit.py +++ b/mcp-server/tests/test_docx_retrofit.py @@ -56,9 +56,10 @@ def test_retrofit_distinguishes_yod_from_yod_alef_yod_bet(tmp_path: Path) -> Non _make_docx_with_hebrew_blocks(src, ["ט", "י", "יא", "יב"]) result = retrofit_bookmarks(src, backup=False) - assert set(result["bookmarks_added"]) == { - "block-tet", "block-yod", "block-yod-alef", "block-yod-bet", - } + # The four content blocks must all be detected; cover blocks added via fallback. + assert {"block-tet", "block-yod", "block-yod-alef", "block-yod-bet"} <= set( + result["bookmarks_added"] + ) def test_retrofit_skips_existing_bookmarks(tmp_path: Path) -> None: @@ -67,14 +68,13 @@ def test_retrofit_skips_existing_bookmarks(tmp_path: Path) -> None: _make_docx_with_hebrew_blocks(src, ["א", "ב"]) first = retrofit_bookmarks(src, backup=False) - assert first["bookmarks_added"] == ["block-alef", "block-bet"] + # alef/bet from markers; gimel/dalet from cover-block fallback + assert {"block-alef", "block-bet"} <= set(first["bookmarks_added"]) second = retrofit_bookmarks(src, backup=False) assert second["bookmarks_added"] == [] # nothing new - assert set(second["existing_bookmarks"]) == {"block-alef", "block-bet"} - - # Final document should still have exactly 2 bookmarks - assert set(list_bookmarks(src)) == {"block-alef", "block-bet"} + # All previously added bookmarks now exist on the document + assert set(first["bookmarks_added"]) <= set(second["existing_bookmarks"]) def test_retrofit_creates_backup(tmp_path: Path) -> None: @@ -92,8 +92,8 @@ def test_retrofit_to_different_output_path_no_backup(tmp_path: Path) -> None: retrofit_bookmarks(src, output_path=out) # source untouched assert list_bookmarks(src) == [] - # output has bookmarks - assert set(list_bookmarks(out)) == {"block-alef", "block-bet"} + # output has bookmarks (alef+bet from markers; gimel+dalet via fallback) + assert {"block-alef", "block-bet"} <= set(list_bookmarks(out)) def test_retrofit_ignores_marker_in_middle_of_text(tmp_path: Path) -> None: @@ -116,7 +116,8 @@ def test_retrofit_out_of_order_markers_picks_forward_only(tmp_path: Path) -> Non """If a later-ordered marker appears first, earlier ones are treated as missing. Scanner advances forward through BLOCK_ORDER — it won't go back to claim - an earlier marker after already seeing a later one. + an earlier marker after already seeing a later one. block-alef will be + surfaced via the cover-block fallback rather than from the actual marker. """ src = tmp_path / "src.docx" doc = Document() @@ -128,8 +129,9 @@ def test_retrofit_out_of_order_markers_picks_forward_only(tmp_path: Path) -> Non result = retrofit_bookmarks(src, backup=False) assert "block-bet" in result["bookmarks_added"] assert "block-gimel" in result["bookmarks_added"] - # 'א' was not detected (the first paragraph was 'ב' — scanner advanced past א) - assert "block-alef" in result["missing_blocks"] + # 'א' marker was skipped by forward-scan, so it appears as a structural + # fallback (no real content), not from real detection. + assert "block-alef" in result["structural_fallback"] def test_retrofit_empty_document_reports_all_missing(tmp_path: Path) -> None: @@ -139,3 +141,79 @@ def test_retrofit_empty_document_reports_all_missing(tmp_path: Path) -> None: result = retrofit_bookmarks(src, backup=False) assert result["bookmarks_added"] == [] assert len(result["missing_blocks"]) == 12 + + +def test_retrofit_al_ken_midblock_does_not_capture_yod_bet(tmp_path: Path) -> None: + """'על כן, במקום בו...' באמצע block-yod לא צריך להיתפס כ-yod-bet.""" + src = tmp_path / "src.docx" + doc = Document() + doc.add_paragraph("פתח דבר") + doc.add_paragraph("רקע עובדתי קצר.") + doc.add_paragraph("דיון והכרעה") + doc.add_paragraph("על כן, במקום בו קיים פתרון חניה אין מקום להתערב.") + doc.add_paragraph("סוף דבר") + doc.add_paragraph("פסק דין סופי.") + doc.save(str(src)) + + result = retrofit_bookmarks(src, backup=False) + assert "block-yod-alef" in result["bookmarks_added"] + assert "block-yod-bet" not in result["bookmarks_added"] + + +def test_retrofit_al_ken_operative_captures_yod_bet(tmp_path: Path) -> None: + """'על כן, אנו מחליטים' באמת אופרטיבי — צריך להיתפס כ-yod-bet.""" + src = tmp_path / "src.docx" + doc = Document() + doc.add_paragraph("דיון והכרעה") + doc.add_paragraph("נימוקים מפורטים.") + doc.add_paragraph("סוף דבר") + doc.add_paragraph("על כן, אנו מחליטים לקבל את הערר.") + doc.save(str(src)) + + result = retrofit_bookmarks(src, backup=False) + assert "block-yod-alef" in result["bookmarks_added"] + assert "block-yod-bet" in result["bookmarks_added"] + + +def test_retrofit_vav_al_hamekarkein_pattern(tmp_path: Path) -> None: + """'על המקרקעין חלות התכניות' — דפוס block-vav מקורפוס 1130.""" + src = tmp_path / "src.docx" + doc = Document() + doc.add_paragraph("פתח דבר") + doc.add_paragraph("המקרקעין מצויים בכתובת...") + doc.add_paragraph("על המקרקעין חלות התכניות הבאות") + doc.add_paragraph("פירוט תכניות.") + doc.add_paragraph("תמצית טענות הצדדים") + doc.save(str(src)) + + result = retrofit_bookmarks(src, backup=False) + assert "block-vav" in result["bookmarks_added"] + + +def test_retrofit_cover_blocks_structural_fallback(tmp_path: Path) -> None: + """אם alef-dalet לא בקובץ — לקבל bookmarks ריקים בהתחלה (structural_fallback).""" + src = tmp_path / "src.docx" + doc = Document() + doc.add_paragraph("פתח דבר") + doc.add_paragraph("תוכן.") + doc.add_paragraph("דיון והכרעה") + doc.add_paragraph("הכרעה.") + doc.save(str(src)) + + result = retrofit_bookmarks(src, backup=False) + for name in ["block-alef", "block-bet", "block-gimel", "block-dalet"]: + assert name in result["bookmarks_added"] + assert name not in result["missing_blocks"] + assert set(result["structural_fallback"]) == { + "block-alef", "block-bet", "block-gimel", "block-dalet", + } + + +def test_retrofit_no_double_fallback_when_cover_present(tmp_path: Path) -> None: + """אם block-alef קיים בקובץ אמיתית — לא לזרוק fallback מבני.""" + src = tmp_path / "src.docx" + _make_docx_with_hebrew_blocks( + src, ["א", "ב", "ג", "ד", "ה", "ו", "ז", "ח", "ט", "י", "יא", "יב"], + ) + result = retrofit_bookmarks(src, backup=False) + assert result["structural_fallback"] == [] diff --git a/web-ui/src/components/cases/drafts-panel.tsx b/web-ui/src/components/cases/drafts-panel.tsx index 5433dae..6712d19 100644 --- a/web-ui/src/components/cases/drafts-panel.tsx +++ b/web-ui/src/components/cases/drafts-panel.tsx @@ -115,15 +115,17 @@ export function DraftsPanel({ onSuccess: (data) => { const added = data.bookmarks_added?.length ?? 0; const missing = data.missing_blocks?.length ?? 0; + const fallback = data.structural_fallback?.length ?? 0; + const realDetected = added - fallback; if (data.apply_status === "completed" || data.apply_status === "ok") { - if (added > 0) { - toast.success(`הועלה: ${data.filename} — זוהו ${added} בלוקים`); + if (realDetected > 0) { + toast.success(`הועלה: ${data.filename} — זוהו ${realDetected} בלוקי תוכן`); } else { toast.success(`הועלה: ${data.filename}`); } if (missing > 0) { toast.warning( - `שימו לב: ${missing} בלוקים לא זוהו — ייתכנו בעיות בתיקונים עתידיים`, + `שימו לב: ${missing} בלוקי תוכן לא זוהו — בדוק את הכותרות`, ); } } else { diff --git a/web-ui/src/lib/api/exports.ts b/web-ui/src/lib/api/exports.ts index a941453..d6b3cf3 100644 --- a/web-ui/src/lib/api/exports.ts +++ b/web-ui/src/lib/api/exports.ts @@ -35,6 +35,7 @@ export type UploadResult = { active_draft?: string; bookmarks_added?: string[]; missing_blocks?: string[]; + structural_fallback?: string[]; apply_status?: string; }; @@ -155,6 +156,7 @@ export function useRetrofit(caseNumber: string) { active_draft_path: string; bookmarks_added: string[]; missing_blocks: string[]; + structural_fallback?: string[]; }>(`/api/cases/${caseNumber}/exports/${filename}/retrofit`, { method: "POST", }), diff --git a/web/app.py b/web/app.py index a0c60fb..b95710c 100644 --- a/web/app.py +++ b/web/app.py @@ -2060,6 +2060,7 @@ async def api_upload_export(case_number: str, file: UploadFile = File(...)): "active_draft": auto_result.get("active_draft_path"), "bookmarks_added": auto_result.get("bookmarks_added", []), "missing_blocks": auto_result.get("missing_blocks", []), + "structural_fallback": auto_result.get("structural_fallback", []), "apply_status": auto_result.get("status", "error"), }