בהעלאה דרך "פסיקה-חסרה" (ענף ועדת-ערר), כשטופס case_number ריק המסלול נפל-לאחור
לציטוט המלא (committee_case_number = case_number.strip() or citation), כך
שמחרוזת-תצוגה עם שמות-צדדים הושתלה בשדה-המזהה — הפרת INV-ID2/INV-ID1 (X1).
נצפה על precedent 1bf0bae0 (ערר 85074-04-25 רפאל לוי/חולון):
case_number=85074/0425, case_name=ציטוט שלם.

תיקון (G1 — נרמול-במקור, G2 — שימוש-חוזר בפרסר הקנוני):
- court_citation.case_number_from_citation(citation) — מחזיר את אסימון-המספר
  המנורמל בלבד (classify; '' כשאין מספר). חולץ נכון 85074-04-25 גם מתוך
  "ערר (ת"א 85074-04-25) ...". reuse של הפרסר היחיד, בלי regex מקביל.
- web/app.py (ענף ועדת-ערר): fallback דרך case_number_from_citation; אם אין
  מספר — HTTPException 400 "נא להזין מספר-תיק ידנית" במקום השתלת ציטוט-מלא.
- db._canonical_case_number: מוקשח לחלץ את אסימון-המספר (זורק זנב שמות-צדדים),
  כך ששדה-המזהה לעולם לא נשמר מזוהם — גם בקריאה ישירה (committee + active
  cases). מספר נקי חוזר ללא שינוי; חודש לא מומצא (X1 §1).
- תיקון-נתון: scripts/fix_137_committee_case_number.py (בוצע) — 1bf0bae0:
  case_number→85074-04-25, case_name→צדדים, token ב-citation_formatted. אומת
  היחיד עם canon(num)≠num ב-internal_committee. אידמפוטנטי.

מחוץ-לתחום (תועד כ-follow-up): מסלול external (precedent_library) משתמש
בציטוט-מלא כמזהה-מורשת — זהו פריט-המיגרציה X1 §5 (138 רשומות
external/cited_only), לא הבאג הזה. prefill ב-UI של /missing-precedents — דורש
שער Claude Design.

בדיקות: test_court_citation (case_number_from_citation:
party-strip/forms/empty), test_canonical_case_number (harden). כל 339 בדיקות
mcp עוברות. guards נקיים.

Invariants: G1 (נרמול-במקור), INV-ID1/ID2 (מזהה מנורמל, אין ציטוט-מלא כמזהה),
G2 (פרסר יחיד), G12 (leak-guard נקי).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
230 lines
9.0 KiB
Python
230 lines
9.0 KiB
Python
"""Court-citation classifier for the auto-fetch subsystem (X13).

Given a raw citation string (typically a digest's ``underlying_citation``,
e.g. ``עת"מ 46111-12-22 יכין-אפק נ' הוועדה המחוזית``), decide:

* **which tier** can fetch it (``supreme`` | ``admin`` | ``skip``), and
* the **canonical case number** plus, for נט המשפט, the
  (file, month, year) triple the public case-search form needs.

A citation that matches none of the tiers below is classified ``unknown``.

Tier mapping (INV-CF6 — only court rulings are auto-fetched; ועדת-ערר is
never sent to a public fetch, it needs Nevo):

* ``supreme`` — Supreme Court prefixes (עע"מ/בג"ץ/ע"א/רע"א/דנ"א/בר"מ/בש"א).
  Fetched directly from ``supremedecisions.court.gov.il`` (Tier 0, no CAPTCHA).
* ``admin`` — district / administrative-court prefixes (עת"מ/עמ"נ/…) and
  the bare נט-המשפט "filed" format ``NNNNN-MM-YY``. Fetched via the
  host-side stealth browser against נט המשפט (Tier 1).
* ``skip`` — ועדת-ערר (ערר/בל"מ). Not publicly fetchable → missing_precedent.

Regex families intentionally mirror ``citation_extractor.py`` (the canonical
prefix/number patterns) so the two stay in sync — we reuse ``_NUM_RX`` shape
and ``_normalize_case_number`` semantics rather than inventing a parallel
parser (INV-CF1 / engineering "symmetry" rule).
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
import re
|
||
from dataclasses import dataclass
|
||
|
||
# Canonical number core, identical shape to citation_extractor._NUM_RX:
|
||
# 3-5 digits, optional separator + 2-4 digits, optional third group
|
||
# (the NNNNN-MM-YY "filed" format — 46111-12-22 = file 46111, month 12, yr 22).
|
||
_NUM_RX = r"\d{1,5}(?:[-/]\d{1,4}(?:[-/]\d{2,4})?)?"
|
||
|
||
# Hebrew gershayim: straight (") or curly (״).
|
||
_Q = r"[\"״]"
|
||
|
||
# Optional leading one-letter Hebrew preposition/conjunction (ב/ל/ה/ו/כ/מ/ש)
|
||
# attached to the prefix — e.g. "בערר", "וערר", "כפי שקבעתי בערר". Anchored by
|
||
# a lookbehind that forbids a *preceding* Hebrew letter, so we don't match a
|
||
# prefix buried inside a longer word. Regex backtracking lets the preposition
|
||
# match empty when the prefix itself starts with one of these letters (בג"ץ).
|
||
_LEAD = r"(?<![א-ת])(?:[בלהוכמש])?"
|
||
|
||
# Supreme Court prefixes → Tier 0 (supremedecisions public download API).
|
||
_SUPREME_PREFIXES = [
|
||
rf"עע{_Q}מ", # ערעור מנהלי (לעליון)
|
||
rf"בג{_Q}ץ", # בג"ץ
|
||
rf"בג{_Q}צ", # variant spelling
|
||
rf"דנג{_Q}ץ", # דיון נוסף בג"ץ
|
||
rf"ע{_Q}א", # ערעור אזרחי
|
||
rf"רע{_Q}א", # רשות ערעור אזרחי
|
||
rf"דנ{_Q}א", # דיון נוסף אזרחי
|
||
rf"בר{_Q}מ", # בקשת רשות ערעור מנהלי (עליון)
|
||
rf"בש{_Q}א", # בקשת רשות … (עליון)
|
||
]
|
||
|
||
# District / administrative-court prefixes → Tier 1 (נט המשפט case viewer).
|
||
_ADMIN_PREFIXES = [
|
||
rf"עת{_Q}מ", # עתירה מנהלית (בימ"ש לעניינים מנהליים)
|
||
rf"עמ{_Q}נ", # ערעור מנהלי (מחוזי)
|
||
rf"ת{_Q}א", # תביעה אזרחית (מחוזי/שלום)
|
||
rf"ה{_Q}פ", # המרצת פתיחה
|
||
]
|
||
|
||
# Appeals-committee → skip (needs Nevo; never auto-fetched).
|
||
_SKIP_PREFIXES = [
|
||
rf"ערר",
|
||
rf"בל{_Q}מ",
|
||
]
|
||
|
||
_SUPREME_RX = re.compile(
|
||
_LEAD + r"(" + "|".join(_SUPREME_PREFIXES) + r")\s*(" + _NUM_RX + r")",
|
||
re.UNICODE,
|
||
)
|
||
_ADMIN_RX = re.compile(
|
||
_LEAD + r"(" + "|".join(_ADMIN_PREFIXES) + r")\s*(" + _NUM_RX + r")",
|
||
re.UNICODE,
|
||
)
|
||
_SKIP_RX = re.compile(
|
||
_LEAD + r"(" + "|".join(_SKIP_PREFIXES) + r")" + r"(?:\s*\([^)\n]{0,80}\))?\s*(" + _NUM_RX + r")",
|
||
re.UNICODE,
|
||
)
|
||
|
||
# Bare נט-המשפט filed format with no prefix: 46111-12-22 (5/4-digit file,
|
||
# 1-2 digit month, 2-4 digit year). Used when a digest gives just the number.
|
||
_BARE_FILED_RX = re.compile(r"(?<!\d)(\d{1,5})-(\d{1,2})-(\d{2,4})(?!\d)", re.UNICODE)
|
||
|
||
|
||
@dataclass
|
||
class CourtCitation:
|
||
"""Result of classifying a citation for auto-fetch routing."""
|
||
|
||
tier: str # "supreme" | "admin" | "skip" | "unknown"
|
||
court_prefix: str # e.g. 'עת"מ', or "" for bare/unknown
|
||
case_number_raw: str # the matched number as written, e.g. "46111-12-22"
|
||
case_number_norm: str # canonical: slashes→dashes, digits/sep only
|
||
# נט-המשפט form fields (only when the filed format NNNNN-MM-YY is present):
|
||
file_number: str | None = None
|
||
month: str | None = None
|
||
year: str | None = None
|
||
|
||
@property
|
||
def fetchable(self) -> bool:
|
||
return self.tier in ("supreme", "admin")
|
||
|
||
|
||
def normalize_case_number(raw: str) -> str:
    """Canonicalize a case number for idempotency keys / matching.

    Intended to mirror ``citation_extractor._normalize_case_number``: every
    character other than a digit, ``/`` or ``-`` is dropped, ``/`` is unified
    to ``-``, and leading/trailing dashes are trimmed. A falsy ``raw``
    (``None`` or ``""``) yields ``""``.

    The display value is never derived from this.
    """
    cleaned = re.sub(r"[^\d/\-]", "", raw or "")
    return cleaned.replace("/", "-").strip("-")
|
||
|
||
|
||
def case_number_from_citation(citation: str) -> str:
    """Return the canonical ``case_number`` parsed from a citation, or ``''``.

    Only the normalized number token is returned (e.g. ``85074-04-25``) —
    NEVER the full citation string with party names / court / date. This is
    the identifier-field rule from X1 (INV-ID2): a citation like
    ``ערר (ת"א 85074-04-25) רפאל לוי ואח' נ' הוועדה … - חולון`` yields
    ``85074-04-25``, not the whole display string.

    Delegates to ``classify`` (the one canonical citation parser) so callers
    that need a case_number out of an arbitrary citation never roll their own
    regex (#137, G2).

    Returns:
        The ``case_number_norm`` of the classification result; ``''`` when no
        number can be parsed. The caller MUST treat ``''`` as "needs a manual
        case_number" and never fall back to the raw citation.
    """
    return classify(citation).case_number_norm
|
||
|
||
|
||
def _split_filed(num_norm: str) -> tuple[str, str, str] | None:
    """Split a normalized NNNNN-MM-YY number into (file, month, year).

    Only the three-group "filed" format yields a Net HaMishpat triple;
    two-group formats (1234-22 / 1234/22) are Supreme-style serials and
    return None.

    Args:
        num_norm: A number already passed through ``normalize_case_number``
            (dash-separated); the whole string must match the filed pattern.

    Returns:
        ``(file, month, year)`` as the matched digit strings, or ``None`` when
        the shape does not match or the month is outside 1-12.
    """
    m = _BARE_FILED_RX.fullmatch(num_norm)
    if not m:
        return None
    file_no, month, year = m.group(1), m.group(2), m.group(3)
    # Plausibility: the regex already bounds the year to 2-4 digits; reject an
    # implausible month so a number that merely has three groups is not
    # mis-read as a filed triple.
    if not 1 <= int(month) <= 12:
        return None
    return file_no, month, year
|
||
|
||
|
||
def _build_court_citation(tier: str, prefix: str, raw: str) -> CourtCitation:
    """Build a result for ``raw``, attaching the filed triple when it has one."""
    norm = normalize_case_number(raw)
    file_number, month, year = _split_filed(norm) or (None, None, None)
    return CourtCitation(
        tier=tier,
        court_prefix=prefix,
        case_number_raw=raw,
        case_number_norm=norm,
        file_number=file_number,
        month=month,
        year=year,
    )


def classify(citation: str) -> CourtCitation:
    """Classify a raw citation string into a fetch tier + parsed number.

    Resolution order: the appeals-committee pattern (skip) is checked FIRST so
    an "ערר" prefix is never mis-routed to a court tier; then Supreme
    prefixes; then admin prefixes; then a bare filed number defaults to
    ``admin`` (per the module docs, Net HaMishpat is the only public source
    for prefix-less numbers).

    Args:
        citation: The raw citation text; ``None``/empty/whitespace-only is
            accepted.

    Returns:
        A ``CourtCitation``. When nothing matches (or the input is empty) the
        result has tier ``"unknown"`` and empty strings for prefix and
        numbers. Skip-tier results never carry the (file, month, year)
        triple; court-tier results carry it only when the number has the
        three-group filed shape with a month in 1-12.
    """
    text = (citation or "").strip()
    if not text:
        return CourtCitation("unknown", "", "", "")

    # 1. Appeals committee → skip (must win over any court match).
    m = _SKIP_RX.search(text)
    if m:
        raw = m.group(2)
        return CourtCitation(
            tier="skip",
            court_prefix=m.group(1),
            case_number_raw=raw,
            case_number_norm=normalize_case_number(raw),
        )

    # 2. Supreme Court prefix → Tier 0, then 3. district / admin prefix →
    #    Tier 1. A filed triple is still parsed for Supreme numbers that carry
    #    one (e.g. בר"מ 72182-06-25) so the orchestrator can route them to the
    #    Tier-1 flow instead of the serial-only Tier-0.
    for tier, rx in (("supreme", _SUPREME_RX), ("admin", _ADMIN_RX)):
        m = rx.search(text)
        if m:
            return _build_court_citation(tier, m.group(1), m.group(2))

    # 4. Bare filed number (no prefix) → default admin. Walk every candidate
    #    rather than only the first: an earlier N-N-N token with an
    #    implausible month must not hide a valid filed number later on.
    for m in _BARE_FILED_RX.finditer(text):
        candidate = _build_court_citation("admin", "", m.group(0))
        if candidate.file_number is not None:
            return candidate

    return CourtCitation("unknown", "", "", "")
|