"""Court-citation classifier for the auto-fetch subsystem (X13).

Given a raw citation string (typically a digest's ``underlying_citation``,
e.g. ``עת"מ 46111-12-22 יכין-אפק נ' הוועדה המחוזית``), decide:

* **which tier** can fetch it (``supreme`` | ``admin`` | ``skip``), and
* the **canonical case number** plus, for נט המשפט, the (file, month, year)
  triple the public case-search form needs.

Tier mapping (INV-CF6 — only court rulings are auto-fetched; ועדת-ערר is
never sent to a public fetch, it needs Nevo):

* ``supreme`` — Supreme Court prefixes (עע"מ/בג"ץ/ע"א/רע"א/דנ"א/בר"מ/בש"א).
  Fetched directly from ``supremedecisions.court.gov.il`` (Tier 0, no
  CAPTCHA).
* ``admin`` — district / administrative-court prefixes (עת"מ/עמ"נ/…) and the
  bare נט-המשפט "filed" format ``NNNNN-MM-YY``. Fetched via the host-side
  stealth browser against נט המשפט (Tier 1).
* ``skip`` — ועדת-ערר (ערר/בל"מ). Not publicly fetchable → missing_precedent.

A citation that matches none of the above (including an empty string) is
returned with tier ``unknown`` and empty fields.

Regex families intentionally mirror ``citation_extractor.py`` (the canonical
prefix/number patterns) so the two stay in sync — we reuse ``_NUM_RX`` shape
and ``_normalize_case_number`` semantics rather than inventing a parallel
parser (INV-CF1 / engineering "symmetry" rule).
"""

from __future__ import annotations

import re
from dataclasses import dataclass

# Canonical number core, intended to have the same shape as
# citation_extractor._NUM_RX: 1-5 digits, then optionally a separator
# (``-`` or ``/``) + 1-4 digits, then optionally a second separator + 2-4
# digits. The three-group form is the NNNNN-MM-YY "filed" format
# (46111-12-22 = file 46111, month 12, yr 22).
_NUM_RX = r"\d{1,5}(?:[-/]\d{1,4}(?:[-/]\d{2,4})?)?"

# Hebrew gershayim: straight (") or curly (״).
_Q = r"[\"״]"

# NOTE(review): in the copy of this file that was reviewed, the text between
# the opening ``(?`` of ``_LEAD`` and the ``-> bool`` return annotation of the
# ``CourtCitation`` helper was missing (it looks like a tag-stripping step ate
# everything between a ``<`` and a ``>``). ``_LEAD``, the four compiled
# patterns and the ``CourtCitation`` class below are therefore RECONSTRUCTED
# from the module docstring, the surviving comments and the way ``classify``
# and ``_split_filed`` use them. Confirm each against version control and
# citation_extractor.py before merging.

# Optional leading one-letter Hebrew preposition/conjunction (ב/ל/ה/ו/כ/מ/ש)
# attached to the prefix — e.g. "בערר", "וערר", "כפי שקבעתי בערר". Anchored by
# a lookbehind that forbids a *preceding* Hebrew letter, so we don't match a
# prefix buried inside a longer word. Regex backtracking lets the preposition
# match empty when the prefix itself starts with one of these letters (בג"ץ).
_LEAD = r"(?<![א-ת])[בלהוכמש]?"


def _prefix_rx(*prefixes: str) -> re.Pattern[str]:
    """Compile a ``<lead><prefix> <number>`` pattern for a prefix family.

    Each prefix is written with a straight ``"``; it is widened to ``_Q`` so
    the curly gershayim is accepted too. Group 1 is the court prefix (without
    any leading preposition), group 2 is the raw case number.
    """
    alternation = "|".join(p.replace('"', _Q) for p in prefixes)
    return re.compile(rf"{_LEAD}({alternation})\s*({_NUM_RX})")


# ועדת-ערר prefixes — never publicly fetchable.
_SKIP_RX = _prefix_rx("ערר", 'בל"מ')

# Supreme Court prefixes (Tier 0).
_SUPREME_RX = _prefix_rx(
    'עע"מ', 'בג"ץ', 'רע"א', 'דנ"א', 'בר"מ', 'בש"א', 'ע"א'
)

# District / administrative-court prefixes (Tier 1).
# NOTE(review): the module docstring lists these as 'עת"מ/עמ"נ/…' — the
# prefixes hidden behind the ellipsis could not be recovered; extend this
# list from citation_extractor.py.
_ADMIN_RX = _prefix_rx('עת"מ', 'עמ"נ')

# Bare "filed" number NNNNN-MM-YY: groups are (file, month, year). Digit
# look-arounds keep it from matching inside a longer digit run.
# NOTE(review): only ``-`` is accepted as separator here so that slash dates
# such as 12/05/2022 are not read as case numbers — confirm against the
# original pattern.
_BARE_FILED_RX = re.compile(r"(?<!\d)(\d{1,5})-(\d{1,2})-(\d{2,4})(?!\d)")


@dataclass(frozen=True)  # NOTE(review): ``frozen`` is an assumption
class CourtCitation:
    """Result of classifying one citation string.

    ``tier`` is one of ``supreme`` / ``admin`` / ``skip`` / ``unknown``.
    ``file_number`` / ``month`` / ``year`` are only populated when the number
    is in the three-group "filed" format; otherwise they stay ``None``.
    """

    tier: str
    court_prefix: str
    case_number_raw: str
    case_number_norm: str
    file_number: str | None = None
    month: str | None = None
    year: str | None = None

    # NOTE(review): only the return expression of this helper survived; its
    # name and whether it is a property are assumptions — align with callers.
    @property
    def is_fetchable(self) -> bool:
        """True when the citation belongs to a publicly fetchable tier."""
        return self.tier in ("supreme", "admin")


def normalize_case_number(raw: str) -> str:
    """Canonicalize a case number for idempotency keys / matching.

    Mirrors ``citation_extractor._normalize_case_number``: strip everything
    but digits and separators, unify ``/`` → ``-``, and drop leading/trailing
    separators. ``None`` or an empty string yields ``""``. Display value is
    never derived from this.
    """
    cleaned = re.sub(r"[^\d/\-]", "", raw or "")
    return cleaned.replace("/", "-").strip("-")


def _split_filed(num_norm: str) -> tuple[str, str, str] | None:
    """Split a normalized NNNNN-MM-YY number into (file, month, year).

    Only the three-group "filed" format yields a נט-המשפט triple; two-group
    formats (1234-22 / 1234/22) are Supreme-style serials and return None.
    The parts are returned as the original digit strings (leading zeros
    preserved).
    """
    m = _BARE_FILED_RX.fullmatch(num_norm)
    if not m:
        return None
    file_no, month, year = m.group(1), m.group(2), m.group(3)
    # Plausibility: reject implausible months (avoids mis-reading a number
    # that merely happens to have three groups).
    if not (1 <= int(month) <= 12):
        return None
    return file_no, month, year


def classify(citation: str) -> CourtCitation:
    """Classify a raw citation string into a fetch tier + parsed number.

    Resolution order: ועדת-ערר (skip) is checked FIRST so an "ערר" prefix is
    never mis-routed to a court tier; then Supreme prefixes; then admin
    prefixes; then a bare filed number defaults to ``admin`` (נט המשפט is the
    only public source for prefix-less district/שלום numbers).

    Returns a ``CourtCitation``; when nothing matches (or the input is empty
    / ``None``) its tier is ``unknown`` and all string fields are empty.
    """
    text = (citation or "").strip()
    if not text:
        return CourtCitation("unknown", "", "", "")

    # 1. ועדת-ערר → skip (must win over any court match).
    # 2. Supreme Court prefix → Tier 0.
    for tier, rx in (("skip", _SKIP_RX), ("supreme", _SUPREME_RX)):
        m = rx.search(text)
        if m:
            raw = m.group(2)
            return CourtCitation(
                tier=tier,
                court_prefix=m.group(1),
                case_number_raw=raw,
                case_number_norm=normalize_case_number(raw),
            )

    # 3. District / admin prefix → Tier 1. The filed triple is attached only
    #    when the number really is in NNNNN-MM-YY form.
    m = _ADMIN_RX.search(text)
    if m:
        raw = m.group(2)
        norm = normalize_case_number(raw)
        filed = _split_filed(norm)
        return CourtCitation(
            tier="admin",
            court_prefix=m.group(1),
            case_number_raw=raw,
            case_number_norm=norm,
            file_number=filed[0] if filed else None,
            month=filed[1] if filed else None,
            year=filed[2] if filed else None,
        )

    # 4. Bare filed number (no prefix) → default admin (נט המשפט). Every
    #    candidate is tried in order, so an implausible first hit (e.g.
    #    month 13) no longer hides a valid number later in the string.
    for m in _BARE_FILED_RX.finditer(text):
        raw = m.group(0)
        norm = normalize_case_number(raw)
        filed = _split_filed(norm)
        if filed:
            return CourtCitation(
                tier="admin",
                court_prefix="",
                case_number_raw=raw,
                case_number_norm=norm,
                file_number=filed[0],
                month=filed[1],
                year=filed[2],
            )

    return CourtCitation("unknown", "", "", "")