# legal-ai/mcp-server/src/legal_mcp/services/court_citation.py

"""Court-citation classifier for the auto-fetch subsystem (X13).

Given a raw citation string (typically a digest's ``underlying_citation``,
e.g. ``עת"מ 46111-12-22 יכין-אפק נ' הוועדה המחוזית``), decide:

  * **which tier** can fetch it (``supreme`` | ``admin`` | ``skip``, or
    ``unknown`` when no recognizable case number is found), and
  * the **canonical case number** plus, for נט המשפט, the
    (file, month, year) triple the public case-search form needs.

Tier mapping (INV-CF6 — only court rulings are auto-fetched; ועדת-ערר is
never sent to a public fetch, it needs Nevo):

  * ``supreme`` — Supreme Court prefixes (עע"מ/בג"ץ/ע"א/רע"א/דנ"א/בר"מ/בש"א).
    Fetched directly from ``supremedecisions.court.gov.il`` (Tier 0, no CAPTCHA).
  * ``admin``   — district / administrative-court prefixes (עת"מ/עמ"נ/…) and
    the bare נט-המשפט "filed" format ``NNNNN-MM-YY``. Fetched via the
    host-side stealth browser against נט המשפט (Tier 1).
  * ``skip``    — ועדת-ערר (ערר/בל"מ). Not publicly fetchable → missing_precedent.

Regex families intentionally mirror ``citation_extractor.py`` (the canonical
prefix/number patterns) so the two stay in sync — we reuse ``_NUM_RX`` shape
and ``_normalize_case_number`` semantics rather than inventing a parallel
parser (INV-CF1 / engineering "symmetry" rule).
"""

from __future__ import annotations

import re
from dataclasses import dataclass

# Shared case-number core used by every prefixed pattern below:
#   * 1-5 digits,
#   * optionally a separator (- or /) followed by 1-4 digits,
#   * optionally a second separator followed by 2-4 digits — the three-part
#     "filed" form, e.g. 46111-12-22 = file 46111, month 12, year 22.
# NOTE(review): intended to have the same shape as citation_extractor._NUM_RX;
# that module is not visible from here — confirm the two still agree.
_NUM_RX = r"\d{1,5}(?:[-/]\d{1,4}(?:[-/]\d{2,4})?)?"

# Gershayim inside a Hebrew abbreviation: ASCII double quote or the Hebrew
# gershayim character (״).
_Q = r"[\"״]"

# Optional one-letter Hebrew proclitic (ב/ל/ה/ו/כ/מ/ש) glued to the prefix,
# as in "בערר" or "וערר". The negative lookbehind refuses a match that starts
# right after another Hebrew letter, so a prefix buried inside a longer word
# is ignored. Because the letter is optional, the regex engine can fall back
# to "no proclitic" when the prefix itself begins with one of these letters
# (e.g. בג"ץ).
_LEAD = r"(?<![א-ת])(?:[בלהוכמש])?"

# Supreme Court prefixes → Tier 0 (supremedecisions public download API).
_SUPREME_PREFIXES = [
    rf"עע{_Q}מ",   # administrative appeal (to the Supreme Court)
    rf"בג{_Q}ץ",   # High Court of Justice petition
    rf"בג{_Q}צ",   # same abbreviation, alternative spelling
    rf"דנג{_Q}ץ",  # further hearing, High Court of Justice
    rf"ע{_Q}א",    # civil appeal
    rf"רע{_Q}א",   # leave to appeal, civil
    rf"דנ{_Q}א",   # further hearing, civil
    rf"בר{_Q}מ",   # request for leave to appeal, administrative (Supreme)
    rf"בש{_Q}א",   # application (Supreme) — NOTE(review): confirm exact expansion
]

# District / administrative-court prefixes → Tier 1 (נט המשפט case viewer).
_ADMIN_PREFIXES = [
    rf"עת{_Q}מ",   # administrative petition (court for administrative affairs)
    rf"עמ{_Q}נ",   # administrative appeal (district court)
    rf"ת{_Q}א",    # civil claim (district / magistrates court)
    rf"ה{_Q}פ",    # originating motion
]

# Appeals-committee prefixes → skip (needs Nevo; never auto-fetched).
_SKIP_PREFIXES = [
    r"ערר",
    rf"בל{_Q}מ",
]


def _compile_tier_rx(prefixes: list[str], bridge: str = r"\s*") -> re.Pattern[str]:
    """Compile ``<lead>(<prefix alternatives>)<bridge>(<number>)``.

    Group 1 captures the matched prefix (without any leading proclitic) and
    group 2 the case number. ``bridge`` is the regex fragment allowed between
    the prefix and the number; by default just optional whitespace.
    """
    alternation = "|".join(prefixes)
    return re.compile(f"{_LEAD}({alternation}){bridge}({_NUM_RX})", re.UNICODE)


_SUPREME_RX = _compile_tier_rx(_SUPREME_PREFIXES)
_ADMIN_RX = _compile_tier_rx(_ADMIN_PREFIXES)
# Appeals-committee citations may carry a short parenthesised qualifier (up to
# 80 characters, no newline) between the prefix and the number; it is skipped
# and not captured.
_SKIP_RX = _compile_tier_rx(
    _SKIP_PREFIXES,
    bridge=r"(?:\s*\([^)\n]{0,80}\))?\s*",
)

# Prefix-less נט-המשפט "filed" number, e.g. 46111-12-22. Used when a digest
# gives only the number. The lookarounds keep the match from starting or
# ending in the middle of a longer digit run.
_BARE_FILED_RX = re.compile(
    r"(?<!\d)"     # not preceded by a digit
    r"(\d{1,5})-"  # file number: 1-5 digits
    r"(\d{1,2})-"  # month: 1-2 digits
    r"(\d{2,4})"   # year: 2-4 digits
    r"(?!\d)",     # not followed by a digit
    re.UNICODE,
)


@dataclass
class CourtCitation:
    """Outcome of classifying one citation for auto-fetch routing.

    The first four fields are always set (possibly to empty strings); the
    three נט-המשפט form fields stay ``None`` unless the case number was in
    the three-part filed format NNNNN-MM-YY.
    """

    # Routing tier: "supreme" | "admin" | "skip" | "unknown".
    tier: str
    # Court prefix as matched, e.g. 'עת"מ'; "" for a bare or unknown citation.
    court_prefix: str
    # Case number exactly as written in the citation, e.g. "46111-12-22".
    case_number_raw: str
    # Canonical form: slashes turned into dashes, digits and separators only.
    case_number_norm: str
    # נט-המשפט search-form fields (filed format only).
    file_number: str | None = None
    month: str | None = None
    year: str | None = None

    @property
    def fetchable(self) -> bool:
        """Whether the tier is one of the auto-fetched ones (supreme or admin)."""
        return self.tier == "supreme" or self.tier == "admin"


def normalize_case_number(raw: str) -> str:
    """Return the canonical form of a case number for keys and matching.

    Everything except digits, ``/`` and ``-`` is dropped, every ``/`` becomes
    ``-``, and leading/trailing dashes are trimmed. A falsy input (``None``,
    ``""``) yields ``""``.

    Intended to follow ``citation_extractor._normalize_case_number``; the
    result is a matching key, not a display value.
    """
    if not raw:
        return ""
    digits_and_separators = re.sub(r"[^\d/\-]", "", raw)
    dashed = digits_and_separators.replace("/", "-")
    return dashed.strip("-")


def _split_filed(num_norm: str) -> tuple[str, str, str] | None:
    """Break a normalized three-part number into ``(file, month, year)``.

    Returns ``None`` unless the whole string has the filed shape
    NNNNN-MM-YY; one- and two-part numbers (e.g. ``1234-22``) therefore
    yield ``None``. A three-part number whose middle part is not a calendar
    month (1-12) is rejected as well, so it is not mistaken for a filed number.
    """
    parts = _BARE_FILED_RX.fullmatch(num_norm)
    if parts is None:
        return None
    file_no, month, year = parts.groups()
    # Month sanity check; the year is only constrained by the regex (2-4 digits).
    if int(month) < 1 or int(month) > 12:
        return None
    return file_no, month, year


def _prefixed_result(
    tier: str, match: re.Match[str], *, with_triple: bool
) -> CourtCitation:
    """Build a CourtCitation from a prefix-pattern match.

    ``match`` must come from one of the tier patterns, where group 1 is the
    court prefix and group 2 the case number. When ``with_triple`` is true
    and the number is in the three-part filed format, the נט-המשפט
    (file, month, year) fields are filled in; otherwise they stay ``None``.
    """
    raw = match.group(2)
    norm = normalize_case_number(raw)
    filed = _split_filed(norm) if with_triple else None
    file_number, month, year = filed if filed else (None, None, None)
    return CourtCitation(
        tier=tier,
        court_prefix=match.group(1),
        case_number_raw=raw,
        case_number_norm=norm,
        file_number=file_number,
        month=month,
        year=year,
    )


def classify(citation: str) -> CourtCitation:
    """Classify a raw citation string into a fetch tier plus parsed number.

    Resolution order:

    1. Appeals-committee prefixes (``skip``) are tried FIRST, so such a
       citation is never mis-routed to a court tier. No filed triple is
       parsed for this tier.
    2. Supreme Court prefixes (``supreme``). A filed triple is still parsed
       when the number carries one, so the caller can choose the נט המשפט
       flow instead of the serial-only one.
    3. District / administrative prefixes (``admin``).
    4. A bare filed number with no prefix defaults to ``admin``. Every
       NNNNN-MM-YY-shaped token in the text is considered in order, and the
       first one with a plausible month wins; an earlier date-like token
       with an impossible month no longer hides a valid number after it.

    Args:
        citation: Raw citation text; ``None`` or blank is accepted.

    Returns:
        A ``CourtCitation``. When nothing matches (or the input is blank),
        the tier is ``"unknown"`` and all string fields are empty.
    """
    text = (citation or "").strip()
    if not text:
        return CourtCitation("unknown", "", "", "")

    # Steps 1-3: prefixed citations, in strict priority order. Each pattern
    # is searched over the whole text before the next tier is considered.
    prefixed_tiers = (
        ("skip", _SKIP_RX, False),
        ("supreme", _SUPREME_RX, True),
        ("admin", _ADMIN_RX, True),
    )
    for tier, pattern, with_triple in prefixed_tiers:
        m = pattern.search(text)
        if m:
            return _prefixed_result(tier, m, with_triple=with_triple)

    # Step 4: bare filed number → admin. Walk all candidates instead of only
    # the first, because _split_filed rejects tokens with an implausible month.
    for m in _BARE_FILED_RX.finditer(text):
        raw = m.group(0)
        norm = normalize_case_number(raw)
        filed = _split_filed(norm)
        if filed:
            return CourtCitation(
                tier="admin",
                court_prefix="",
                case_number_raw=raw,
                case_number_norm=norm,
                file_number=filed[0],
                month=filed[1],
                year=filed[2],
            )

    return CourtCitation("unknown", "", "", "")