2026-06-14 20:57:34 +00:00
1 changed files with 63 additions and 4 deletions
--- a/mcp-server/src/legal_mcp/services/precedent_metadata_extractor.py
+++ b/mcp-server/src/legal_mcp/services/precedent_metadata_extractor.py
@@ -15,6 +15,7 @@ in ``apply_to_record``.
 from __future__ import annotations

 import logging
+import re
 from datetime import date as date_type
 from uuid import UUID

@@ -220,6 +221,31 @@ async def extract_metadata(case_law_id: UUID | str) -> dict:
    return out


+# Israeli court docket: ASCII digits joined by one or two slash/dash separators,
+# no spaces, no letters (e.g. "1132-09-24", "4768/22", "35758-09-25"). re.ASCII
+# stops \d from matching non-ASCII Unicode digits, so the guard that protects the
+# case_number identity field only ever accepts plain 0-9 docket strings.
+_DOCKET_RE = re.compile(r"\d{1,6}(?:[-/]\d{1,4}){1,2}", re.ASCII)
+
+
+def _is_clean_docket(s: str) -> bool:
+    return _DOCKET_RE.fullmatch((s or "").strip()) is not None  # whole value must match
+
+
+def _source_type_for_level(level: str) -> str:
+    """Map a precedent_level to the source_type that must accompany it.
+
+    Levels starting with 'ועדת_ערר' yield 'appeals_committee'; 'עליון' and
+    'מנהלי' yield 'court_ruling'; any other level (or None/blank) yields '' so
+    the caller does not force a guess.
+    """
+    # Tolerate None and surrounding whitespace before comparing.
+    normalized = (level or "").strip()
+    if normalized.startswith("ועדת_ערר"):
+        return "appeals_committee"
+    return "court_ruling" if normalized in {"עליון", "מנהלי"} else ""
+
+
 async def apply_to_record(
    case_law_id: UUID | str,
    suggested: dict,
@@ -327,10 +353,23 @@ async def apply_to_record(
        if pt and (record.get("source_kind") == "internal_committee"):
            fields_to_update["proceeding_type"] = pt

-    if overwrite_case_number:
-        cn = (suggested.get("case_number_clean") or "").strip()
-        if cn:
-            fields_to_update["case_number"] = cn
+    # case_number normalization. The precedent upload / missing-precedent flow
+    # stores the FULL citation string into case_number (precedent_library:
+    # case_number=citation). Replace it with the clean docket when the LLM gives
+    # one AND either (a) caller forces it (overwrite_case_number — migrations) or
+    # (b) the stored value is clearly citation-shaped (has a space / is long — a
+    # real docket never is). Guard: only write a value that IS a clean docket, so
+    # a bad LLM output can never corrupt the identity field.
+    cn_clean = (suggested.get("case_number_clean") or "").strip()
+    cur_cn = cur_case_number
+    citation_shaped = bool(cur_cn) and (" " in cur_cn or len(cur_cn) > 20)
+    if (
+        cn_clean
+        and _is_clean_docket(cn_clean)
+        and cn_clean != cur_cn
+        and (overwrite_case_number or citation_shaped)
+    ):
+        fields_to_update["case_number"] = cn_clean

    # citation_formatted — full citation per Israeli citation rules. Only
    # fill if empty; user edits in /precedents/[id] are preserved.
@@ -355,6 +394,26 @@ async def apply_to_record(
            if s:
                fields_to_update["district"] = s

+    # Enforce source_type ↔ precedent_level consistency in CODE (the LLM prompt
+    # asks for it, but a slip would file a ועדת-ערר decision under "court
+    # rulings"). Derive from the EFFECTIVE level (this run's update or the stored
+    # value) and override an inconsistent source_type — even one already on the
+    # record, since the library section depends on it.
+    eff_level = (
+        fields_to_update.get("precedent_level")
+        or record.get("precedent_level")
+        or ""
+    ).strip()
+    derived_st = _source_type_for_level(eff_level)
+    if derived_st:
+        eff_st = (
+            fields_to_update.get("source_type")
+            or record.get("source_type")
+            or ""
+        ).strip()
+        if eff_st != derived_st:
+            fields_to_update["source_type"] = derived_st
+
    if not fields_to_update:
        return {"updated": False, "fields": []}