"""Practice area + appeal subtype: derivation, validation, constants. Two orthogonal axes used to separate legal domains across the system: practice_area — top-level domain (multi-tenant axis). Examples: appeals_committee, national_insurance, labor_law. appeal_subtype — refines within a domain. For appeals_committee: building_permit (1xxx), betterment_levy (8xxx), compensation_197 (9xxx), unknown. Both columns are denormalized into documents/chunks/decisions/style_corpus so vector searches can filter cheaply. """ from __future__ import annotations import re # ── Enums ────────────────────────────────────────────────────────── PRACTICE_AREAS: set[str] = { "appeals_committee", "national_insurance", "labor_law", } APPEALS_COMMITTEE_SUBTYPES: set[str] = { "building_permit", "betterment_levy", "compensation_197", "unknown", } DEFAULT_PRACTICE_AREA = "appeals_committee" # Subtypes per practice_area (extend when adding domains) SUBTYPES_BY_AREA: dict[str, set[str]] = { "appeals_committee": APPEALS_COMMITTEE_SUBTYPES, "national_insurance": {"unknown"}, "labor_law": {"unknown"}, } # ── Derivation ───────────────────────────────────────────────────── _APPEALS_COMMITTEE_DIGIT_TO_SUBTYPE = { "1": "building_permit", "8": "betterment_levy", "9": "compensation_197", } # Match the case number (last numeric group) in formats like: # ARAR-25-8126, ARAR-24-01-8007-33, 8126/25, 1170, ערר 1024-25 _CASE_NUM = re.compile(r"(?:ARAR[-\s]*\d{2}[-\s]*(?:\d{2}[-\s]*)?)(\d{4})", re.IGNORECASE) _PLAIN_NUM = re.compile(r"(\d{4})") def derive_subtype(case_number: str, practice_area: str = DEFAULT_PRACTICE_AREA) -> str: """Infer the appeal_subtype from case_number. For appeals_committee, the convention is: 1xxx → building_permit, 8xxx → betterment_levy, 9xxx → compensation_197. Handles multiple formats: ARAR-25-8126, 8126/25, 1170, ערר 1024-25. """ if practice_area != "appeals_committee": return "unknown" cn = case_number or "" # Try ARAR format first (extracts the 4-digit case number after year prefix) m = _CASE_NUM.search(cn) if not m: # Fallback: first 4-digit number in the string m = _PLAIN_NUM.search(cn) if not m: return "unknown" first_digit = m.group(1)[0] return _APPEALS_COMMITTEE_DIGIT_TO_SUBTYPE.get(first_digit, "unknown") # ── Validation ───────────────────────────────────────────────────── def validate(practice_area: str, appeal_subtype: str | None) -> None: """Raise ValueError on unknown values. appeal_subtype=None is allowed.""" if practice_area not in PRACTICE_AREAS: raise ValueError( f"unknown practice_area: {practice_area!r}. " f"expected one of {sorted(PRACTICE_AREAS)}" ) if appeal_subtype is None: return allowed = SUBTYPES_BY_AREA.get(practice_area, {"unknown"}) if appeal_subtype not in allowed: raise ValueError( f"unknown appeal_subtype {appeal_subtype!r} for practice_area " f"{practice_area!r}. expected one of {sorted(allowed)}" ) def is_override(case_number: str, practice_area: str, appeal_subtype: str) -> bool: """True iff the user-supplied subtype disagrees with what derive_subtype would have produced (and the derived value is not 'unknown').""" derived = derive_subtype(case_number, practice_area) return derived != "unknown" and derived != appeal_subtype