diff --git a/mcp-server/src/legal_mcp/services/precedent_metadata_extractor.py b/mcp-server/src/legal_mcp/services/precedent_metadata_extractor.py
index 7209c59..1a689d5 100644
--- a/mcp-server/src/legal_mcp/services/precedent_metadata_extractor.py
+++ b/mcp-server/src/legal_mcp/services/precedent_metadata_extractor.py
@@ -15,6 +15,7 @@ in ``apply_to_record``.
 from __future__ import annotations
 
 import logging
+import re
 from datetime import date as date_type
 from uuid import UUID
 
@@ -220,6 +221,31 @@ async def extract_metadata(case_law_id: UUID | str) -> dict:
     return out
 
 
+# Israeli court docket: digits with slash/dash separators, no spaces, no letters
+# (e.g. "1132-09-24", "4768/22", "35758-09-25"). Used to (a) detect a
+# citation-shaped case_number that must be normalized and (b) guard against ever
+# writing a non-docket string into the identity field.
+_DOCKET_RE = re.compile(r"\d{1,6}(?:[-/]\d{1,4}){1,2}")
+
+
+def _is_clean_docket(s: str) -> bool:
+    return bool(_DOCKET_RE.fullmatch((s or "").strip()))
+
+
+def _source_type_for_level(level: str) -> str:
+    """Derive source_type from precedent_level — the library section is driven by
+    source_type, so the two MUST agree (an LLM slip pairing
+    precedent_level='ועדת_ערר_מחוזית' with source_type='court_ruling' files a
+    committee decision under "court rulings"). Empty when the level is
+    indeterminate (don't force a guess)."""
+    level = (level or "").strip()
+    if level.startswith("ועדת_ערר"):
+        return "appeals_committee"
+    if level in ("עליון", "מנהלי"):
+        return "court_ruling"
+    return ""
+
+
 async def apply_to_record(
     case_law_id: UUID | str,
     suggested: dict,
@@ -327,10 +353,23 @@ async def apply_to_record(
         if pt and (record.get("source_kind") == "internal_committee"):
             fields_to_update["proceeding_type"] = pt
 
-    if overwrite_case_number:
-        cn = (suggested.get("case_number_clean") or "").strip()
-        if cn:
-            fields_to_update["case_number"] = cn
+    # case_number normalization. The precedent upload / missing-precedent flow
+    # stores the FULL citation string into case_number (precedent_library:
+    # case_number=citation). Replace it with the clean docket when the LLM gives
+    # one AND either (a) caller forces it (overwrite_case_number — migrations) or
+    # (b) the stored value is clearly citation-shaped (has a space / is long — a
+    # real docket never is). Guard: only write a value that IS a clean docket, so
+    # a bad LLM output can never corrupt the identity field.
+    cn_clean = (suggested.get("case_number_clean") or "").strip()
+    cur_cn = cur_case_number
+    citation_shaped = bool(cur_cn) and (" " in cur_cn or len(cur_cn) > 20)
+    if (
+        cn_clean
+        and _is_clean_docket(cn_clean)
+        and cn_clean != cur_cn
+        and (overwrite_case_number or citation_shaped)
+    ):
+        fields_to_update["case_number"] = cn_clean
 
     # citation_formatted — full citation per Israeli citation rules. Only
     # fill if empty; user edits in /precedents/[id] are preserved.
@@ -355,6 +394,26 @@ async def apply_to_record(
         if s:
             fields_to_update["district"] = s
 
+    # Enforce source_type ↔ precedent_level consistency in CODE (the LLM prompt
+    # asks for it, but a slip would file a ועדת-ערר decision under "court
+    # rulings"). Derive from the EFFECTIVE level (this run's update or the stored
+    # value) and override an inconsistent source_type — even one already on the
+    # record, since the library section depends on it.
+    eff_level = (
+        fields_to_update.get("precedent_level")
+        or record.get("precedent_level")
+        or ""
+    ).strip()
+    derived_st = _source_type_for_level(eff_level)
+    if derived_st:
+        eff_st = (
+            fields_to_update.get("source_type")
+            or record.get("source_type")
+            or ""
+        ).strip()
+        if eff_st != derived_st:
+            fields_to_update["source_type"] = derived_st
+
     if not fields_to_update:
         return {"updated": False, "fields": []}
 