fix(cases): מספור 5-ספרתי לבל"מ — סיווג, ולידציה, וחיפוש פסיקה-חסרה

נוהל-יו"ר (2026-06-11): מבנה מספר-תיק = <סידורי>-<חודש>-<שנה>, ואורך הסידורי מקודד את סוג-ההליך — 4 ספרות = ערר, 5 ספרות = בל"מ. הספרה הראשונה ממשיכה לקבוע תחום בשני האורכים (1→רישוי, 8→היטל, 9→פיצויים). הכלל חד-כיווני: 5-ספרתי הוא תמיד בל"מ; 4-ספרתי אינו מחייב ערר (בל"מ-מורשת מזוהה מהנושא).

הבאג שדיווח עליו היו"ר: חיפוש פסיקה-חסרה לפי מספר-תיק החזיר 404 על כל ערך שאינו תיק קיים — שבר את הטבלה תוך כדי הקלדה ועל מספרי 5-ספרות.

תיקונים:
- web/app.py: GET /api/missing-precedents — מסנן case_number שלא תאם תיק מחזיר רשימה ריקה (200), לא 404. סמנטיקה תקינה ל-collection-filter.
- missing-precedents/page.tsx: debounce (350ms) על שדות-הסינון — קוורי אחד אחרי שמפסיקים להקליד, לא אחד לכל הקשה.
- practice_area.py: regex סידורי \d{4}→\d{4,5}; case_serial_digits() + is_blam_by_number() (5⇒בל"מ); derive_subtype_with_blam ו-derive_proceeding_type מזהים בל"מ גם מ-5-ספרות (בנוסף לנושא). callers: cases.py, internal_decisions.py.
- proofreader.py: דפוסי חילוץ-שם-קובץ \d{3,4}→\d{3,5}.
- web-ui: practice-area.ts (מראָה ל-backend), schemas/case.ts (regex serial-month-year, 4-or-5 ספרות, superRefine 5⇒בל"מ), placeholder בוויזרד.
- תיעוד: docs/spec/X1-identifiers.md §1א + legal-ai/CLAUDE.md.

Invariants: מקיים G1 (נרמול-במקור — ספרה ראשונה כמקור-אמת יחיד לתחום), G2 (מסלול-סיווג יחיד, אין כפילות), INV-DM/X1 (מפתח קנוני + proceeding_type).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-12 06:16:42 +00:00
parent 9cd290e08e
commit e8bcb9c1ea
11 changed files with 157 additions and 26 deletions
--- a/mcp-server/src/legal_mcp/services/internal_decisions.py
+++ b/mcp-server/src/legal_mcp/services/internal_decisions.py
@@ -58,6 +58,7 @@ def _internal_validate(inputs: dict) -> None:
 def _internal_derive(inputs: dict) -> dict:
    district = (inputs.get("district") or "").strip() or _district_from_court(inputs.get("court") or "")
    proc = (inputs.get("proceeding_type") or "").strip() or derive_proceeding_type(
+        case_number=inputs.get("case_number") or "",
        appeal_subtype=inputs.get("appeal_subtype") or "", subject=inputs.get("case_name") or "",
    )
    return {"district": district, "proceeding_type": proc}
--- a/mcp-server/src/legal_mcp/services/practice_area.py
+++ b/mcp-server/src/legal_mcp/services/practice_area.py
@@ -176,8 +176,12 @@ _APPEALS_COMMITTEE_DIGIT_TO_SUBTYPE = {

 # Match the case number (last numeric group) in formats like:
 #   ARAR-25-8126, ARAR-24-01-8007-33, 8126/25, 1170, ערר 1024-25
-_CASE_NUM = re.compile(r"(?:ARAR[-\s]*\d{2}[-\s]*(?:\d{2}[-\s]*)?)(\d{4})", re.IGNORECASE)
-_PLAIN_NUM = re.compile(r"(\d{4})")
+# Serial is 4 OR 5 digits: 5 = always בל"מ (extension-of-time); 4 = usually ערר
+# (appeal), but possibly a legacy בל"מ — see is_blam_by_number(). Jerusalem adopted
+# 5-digit בל"מ serials in the reform; Tel Aviv already used them (e.g. 81002-01-21).
+# The leading digit encodes the domain (1→רישוי, 8→היטל, 9→פיצויים) in BOTH widths.
+_CASE_NUM = re.compile(r"(?:ARAR[-\s]*\d{2}[-\s]*(?:\d{2}[-\s]*)?)(\d{4,5})", re.IGNORECASE)
+_PLAIN_NUM = re.compile(r"(\d{4,5})")


 _DOMAIN_TO_SUBTYPE: dict[str, str] = {
@@ -216,6 +220,29 @@ def derive_subtype(case_number: str, practice_area: str = DEFAULT_PRACTICE_AREA)
    return _APPEALS_COMMITTEE_DIGIT_TO_SUBTYPE.get(first_digit, "unknown")


+def case_serial_digits(case_number: str) -> int | None:
+    """Return the digit-count of the case serial, or None if unparseable.
+
+    The serial is the 4–5 digit group found by ``_CASE_NUM`` (after an ``ARAR-…``
+    prefix) or, failing that, the first such run: ``8126-03-25`` → 4, ``81002-01-21`` → 5.
+    """
+    cn = case_number or ""
+    m = _CASE_NUM.search(cn) or _PLAIN_NUM.search(cn)
+    return len(m.group(1)) if m else None
+
+
+def is_blam_by_number(case_number: str) -> bool:
+    """True iff the case serial has 5 digits.
+
+    Post-reform numbering convention: a 5-digit serial always marks a בל"מ
+    (בקשה להארכת מועד) and is the authoritative signal going forward. A 4-digit
+    serial is normally an ערר (appeal), but legacy בל"מ cases also have one and
+    are detected from the subject via ``is_blam_subject``. The rule is therefore
+    **one-directional**: 5 digits ⇒ בל"מ, but 4 digits does NOT imply ערר.
+    """
+    return case_serial_digits(case_number) == 5
+
+
 def derive_subtype_with_blam(
    case_number: str,
    subject: str = "",
@@ -236,9 +263,11 @@ def derive_subtype_with_blam(
        'building_permit'
    """
    base = derive_subtype(case_number, practice_area)
-    if not is_blam_subject(subject):
+    # בל"מ is signalled either by the subject text (legacy 4-digit cases) or by
+    # a 5-digit serial (post-reform convention).
+    if not (is_blam_subject(subject) or is_blam_by_number(case_number)):
        return base
-    # subject says it's בל"מ — return the matching extension_request_* variant.
+    # it's a בל"מ — return the matching extension_request_* variant.
    # For domain practice_area (axis B), use the direct mapping.
    if practice_area in DOMAIN_PRACTICE_AREAS:
        return _DOMAIN_TO_BLAM_SUBTYPE.get(practice_area, base)
@@ -263,15 +292,21 @@ def is_blam_subtype(appeal_subtype: str) -> bool:
    return appeal_subtype in BLAM_SUBTYPES


-def derive_proceeding_type(*, appeal_subtype: str = "", subject: str = "") -> str:
+def derive_proceeding_type(
+    *, case_number: str = "", appeal_subtype: str = "", subject: str = "",
+) -> str:
    """Return 'בל"מ' / 'ערר' for appeals-committee decisions/cases.

-    Priority: explicit subtype prefix → subject regex → default 'ערר'.
+    Priority: explicit subtype prefix → subject regex → 5-digit serial →
+    default 'ערר'. The 5-digit signal is one-directional (a 4-digit serial
+    does not force 'ערר' — a legacy 4-digit בל"מ is caught by the subject).
    """
    if appeal_subtype and appeal_subtype.startswith("extension_request_"):
        return 'בל"מ'
    if subject and is_blam_subject(subject):
        return 'בל"מ'
+    if case_number and is_blam_by_number(case_number):
+        return 'בל"מ'
    return "ערר"


--- a/mcp-server/src/legal_mcp/services/proofreader.py
+++ b/mcp-server/src/legal_mcp/services/proofreader.py
@@ -268,12 +268,13 @@ async def proofread(path: Path) -> tuple[str, dict]:

 # ── Metadata extraction ──────────────────────────────────────────

+# Serial is 3–5 digits: 4 = ערר, 5 = בל"מ (post-reform). 3 tolerates legacy short serials.
 FILENAME_NUMBER_PATTERNS = [
-    re.compile(r"^ARAR-(\d{2})-(\d{3,4})"),
-    re.compile(r"^ערר\s+(\d{3,4})-(\d{2})"),
-    re.compile(r"^ערר\s+(\d{3,4})\s*-"),
+    re.compile(r"^ARAR-(\d{2})-(\d{3,5})"),
+    re.compile(r"^ערר\s+(\d{3,5})-(\d{2})"),
+    re.compile(r"^ערר\s+(\d{3,5})\s*-"),
 ]
-LEGACY_MULTI_PATTERN = re.compile(r"(\d{3,4})\+(\d{3,4})")
+LEGACY_MULTI_PATTERN = re.compile(r"(\d{3,5})\+(\d{3,5})")


 def decision_number_from_filename(stem: str) -> str | None: