Support ingestion of betterment levy (היטל השבחה) decisions into a separate training corpus (CMPA). Key changes: - Add .doc file extraction via LibreOffice conversion in extractor - Add practice_area/appeal_subtype columns to style_corpus table - Route training files to cmp/ or cmpa/ subdirs based on appeal subtype - Fix derive_subtype to handle ARAR-YY-NNNN format (was matching year digit) - Expose practice_area/appeal_subtype params in MCP upload_training tool - Add appeal_subtype filter to analyze_style for per-type style analysis - Update betterment levy methodology in lessons.py: checklist (from generic to corpus-based), opening/closing strategies, and discussion rules Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
105 lines
3.8 KiB
Python
105 lines
3.8 KiB
Python
"""Practice area + appeal subtype: derivation, validation, constants.

Two orthogonal axes used to separate legal domains across the system:

  practice_area  — top-level domain (multi-tenant axis). Examples:
                   appeals_committee, national_insurance, labor_law.
  appeal_subtype — refines within a domain. For appeals_committee:
                   building_permit (1xxx), betterment_levy (8xxx),
                   compensation_197 (9xxx), unknown.

Both columns are denormalized into documents/chunks/decisions/style_corpus
so vector searches can filter cheaply.
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import re
|
|
|
|
# ── Enums ──────────────────────────────────────────────────────────
|
|
|
|
# Every practice_area value accepted by this module.
PRACTICE_AREAS: set[str] = {"appeals_committee", "national_insurance", "labor_law"}

# Every appeal_subtype value accepted for the appeals_committee practice area.
APPEALS_COMMITTEE_SUBTYPES: set[str] = {
    "building_permit",
    "betterment_levy",
    "compensation_197",
    "unknown",
}

# Practice area used when a caller does not supply one.
DEFAULT_PRACTICE_AREA = "appeals_committee"

# Allowed subtypes keyed by practice_area (extend when adding domains).
# The appeals_committee entry is the same set object as
# APPEALS_COMMITTEE_SUBTYPES, not a copy.
SUBTYPES_BY_AREA: dict[str, set[str]] = {
    "appeals_committee": APPEALS_COMMITTEE_SUBTYPES,
    "national_insurance": {"unknown"},
    "labor_law": {"unknown"},
}
|
|
|
|
|
|
# ── Derivation ─────────────────────────────────────────────────────
|
|
|
|
# Leading digit of the 4-digit case number → appeals_committee subtype.
# Digits that are not listed here have no mapping.
_APPEALS_COMMITTEE_DIGIT_TO_SUBTYPE = {
    "1": "building_permit",
    "8": "betterment_levy",
    "9": "compensation_197",
}

# ARAR-style numbers such as ARAR-25-8126 or ARAR-24-01-8007-33: skip the
# "ARAR" prefix, a 2-digit group and an optional second 2-digit group
# (separated by dashes/whitespace), then capture the 4 digits that follow.
_CASE_NUM = re.compile(
    r"(?:ARAR[-\s]*\d{2}[-\s]*(?:\d{2}[-\s]*)?)(\d{4})",
    re.IGNORECASE,
)

# Generic pattern for inputs such as 8126/25, 1170 or ערר 1024-25:
# captures the first run of four consecutive digits found in the string.
_PLAIN_NUM = re.compile(r"(\d{4})")
|
|
|
|
|
|
def derive_subtype(case_number: str, practice_area: str = DEFAULT_PRACTICE_AREA) -> str:
    """Infer the appeal_subtype from a case number.

    Only the appeals_committee practice area has a derivation rule; any
    other practice_area yields "unknown". For appeals_committee the leading
    digit of the 4-digit case number selects the subtype:
    1xxx → building_permit, 8xxx → betterment_levy, 9xxx → compensation_197.

    Accepted inputs include ARAR-25-8126, 8126/25, 1170 and ערר 1024-25.

    Args:
        case_number: Raw case identifier; a falsy value is treated as "".
        practice_area: Practice area the case belongs to.

    Returns:
        The derived subtype, or "unknown" when no 4-digit number is found
        or its leading digit has no mapping.
    """
    if practice_area != "appeals_committee":
        return "unknown"

    text = case_number or ""
    # Prefer the ARAR layout (so the year prefix is skipped); otherwise take
    # the first 4-digit run anywhere in the string. Match objects are always
    # truthy, so `or` falls through only when the first search found nothing.
    found = _CASE_NUM.search(text) or _PLAIN_NUM.search(text)
    if found is None:
        return "unknown"

    leading_digit = found.group(1)[0]
    return _APPEALS_COMMITTEE_DIGIT_TO_SUBTYPE.get(leading_digit, "unknown")
|
|
|
|
|
|
# ── Validation ─────────────────────────────────────────────────────
|
|
|
|
|
|
def validate(practice_area: str, appeal_subtype: str | None) -> None:
    """Check a (practice_area, appeal_subtype) pair against the known values.

    Args:
        practice_area: Must be a member of PRACTICE_AREAS.
        appeal_subtype: Must be listed for ``practice_area`` in
            SUBTYPES_BY_AREA; ``None`` skips the subtype check.

    Raises:
        ValueError: If ``practice_area`` is not recognised, or if a non-None
            ``appeal_subtype`` is not allowed for that practice area.
    """
    if practice_area not in PRACTICE_AREAS:
        raise ValueError(
            f"unknown practice_area: {practice_area!r}. "
            f"expected one of {sorted(PRACTICE_AREAS)}"
        )

    # Areas without an entry in SUBTYPES_BY_AREA accept only "unknown".
    allowed = SUBTYPES_BY_AREA.get(practice_area, {"unknown"})
    if appeal_subtype is None or appeal_subtype in allowed:
        return

    raise ValueError(
        f"unknown appeal_subtype {appeal_subtype!r} for practice_area "
        f"{practice_area!r}. expected one of {sorted(allowed)}"
    )
|
|
|
|
|
|
def is_override(case_number: str, practice_area: str, appeal_subtype: str) -> bool:
    """Report whether a supplied subtype contradicts the derived one.

    Args:
        case_number: Case identifier passed to derive_subtype.
        practice_area: Practice area passed to derive_subtype.
        appeal_subtype: The subtype supplied by the caller.

    Returns:
        True iff derive_subtype yields something other than "unknown" and
        that value differs from ``appeal_subtype``.
    """
    inferred = derive_subtype(case_number, practice_area)
    # Nothing could be derived, so there is nothing to override.
    if inferred == "unknown":
        return False
    return inferred != appeal_subtype
|