Merge pull request 'feat(precedents): citation_formatted דטרמיניסטי בקוד — Gemini מחלץ רכיבים, לא מעצב (#145)' (#262) from worktree-precedent-deterministic-citation into main

2026-06-15 03:38:51 +00:00
parent 33b07eebcf d6608ce849
commit 7043de0ac2
4 changed files with 261 additions and 33 deletions
--- a/mcp-server/src/legal_mcp/services/db.py
+++ b/mcp-server/src/legal_mcp/services/db.py
@@ -1560,6 +1560,18 @@ CREATE INDEX IF NOT EXISTS idx_plans_meta_tsv ON plans USING gin(meta_tsv);
 """


+# ── V39: case_law.parties ──────────────────────────────────────────
+# The "עורר נ' משיב" line, extracted from the caption as a structured component.
+# It is the re-derivable BASIS for the deterministic citation_formatted
+# (format_precedent_citation) — the LLM extracts the party line (a reliable caption
+# read) instead of formatting the whole Markdown citation, which it dropped outright
+# (#145). citation_formatted stays a DERIVED display field (X1 §3 / INV-ID2); this
+# column is its irreducible bold component.
+SCHEMA_V39_SQL = """
+ALTER TABLE case_law ADD COLUMN IF NOT EXISTS parties TEXT DEFAULT '';
+"""
+
+
 # Stable, arbitrary key for the session-level advisory lock that serialises
 # schema DDL across processes. Every short-lived process (cron drains, services)
 # re-runs the idempotent migrations on startup; without this lock two processes
@@ -1620,6 +1632,7 @@ async def _apply_schema_ddl(conn: asyncpg.Connection) -> None:
        await conn.execute(SCHEMA_V36_SQL)
        await conn.execute(SCHEMA_V37_SQL)
        await conn.execute(SCHEMA_V38_SQL)
+        await conn.execute(SCHEMA_V39_SQL)


 async def init_schema() -> None:
@@ -3513,6 +3526,88 @@ def format_plan_citation(plan: dict) -> str:
    return sentence


+# Clean court docket inside a possibly citation-shaped case_number
+# ("עע\"מ 683/13" → "683/13"). Legacy court-ruling rows stored the full citation
+# in the identity field (X1 §4 known violation); pull the docket out so the
+# assembled citation never doubles the prefix.
+_CITATION_DOCKET_RE = re.compile(r"\d{1,6}(?:[-/]\d{1,4}){1,2}")
+
+# District → administrative-court abbreviation as it appears in citations
+# (`עת"מ (י-ם) 1234/56 ...`). Empty/unknown → the abbrev parenthetical is omitted
+# rather than guessed.
+_DISTRICT_COURT_ABBREV = {
+    "ירושלים": "י-ם",
+    "תל אביב": 'ת"א',
+    "מרכז": "מרכז",
+    "חיפה": "חי'",
+    "צפון": "נצ'",
+    "דרום": 'ב"ש',
+}
+
+
+def _citation_docket(case_number: str) -> str:  # bare docket from a possibly citation-shaped case_number
+    s = (case_number or "").strip()  # tolerate None/empty input and surrounding whitespace
+    if not s:
+        return ""
+    m = _CITATION_DOCKET_RE.search(s)  # first docket-shaped token anywhere in the string
+    return m.group(0) if m else s  # nothing docket-shaped: fall back to the stripped input unchanged
+
+
+def format_precedent_citation(
+    record: dict, *, parties: str | None = None, court_prefix: str = "",
+) -> str:
+    """Deterministically render a precedent's unified-rules citation (מראה מקום).
+
+    DERIVED display field (X1 §3 / INV-ID2) assembled from stored components — NEVER
+    formatted by an LLM, which proved to drop the field outright (#145). The LLM's job
+    shrinks to extracting reliable COMPONENTS (the ``parties`` line and, for court
+    rulings, the caption ``court_prefix``); the formatted string is built here.
+
+      • ועדת-ערר family — prefix from ``proceeding_type`` ('ערר'/'בל"מ'), forum from
+        ``district``, national level → 'ערר ארצי'. Reporter: our own decisions
+        (``source_kind='internal_committee'``) are unpublished → date only; external /
+        Nevo rows → 'נבו '.
+      • court rulings (עליון/מנהלי) — prefix from the caption (``court_prefix``, e.g.
+        'ע"א'/'עת"מ'/'ת"א'); admin-court district abbrev when known; reporter 'נבו '.
+
+    Abstains (returns '') when an essential component is missing — parties, docket, date, a
+    non-national committee's district, or an indeterminate court prefix — never inventing one (INV-AH).
+    """
+    parties = (parties if parties is not None else (record.get("parties") or "")).strip()
+    docket = _citation_docket(record.get("case_number") or "")
+    d = _coerce_plan_date(record.get("date"))  # NOTE(review): presumably a date or a falsy value — confirm
+    if not (parties and docket and d):
+        return ""
+    date_str = f"{d.day}.{d.month}.{d.year}"  # day.month.year order

+    level = (record.get("precedent_level") or "").strip()
+    source_type = (record.get("source_type") or "").strip()
+    is_committee = level.startswith("ועדת_ערר") or source_type == "appeals_committee"  # either signal suffices

+    if is_committee:
+        reporter = "" if record.get("source_kind") == "internal_committee" else "נבו "
+        if level == "ועדת_ערר_ארצית":
+            head = f"ערר ארצי {docket}"  # national level: no district parenthetical
+        else:
+            prefix = 'בל"מ' if (record.get("proceeding_type") or "").strip() == 'בל"מ' else "ערר"
+            district = (record.get("district") or "").strip()
+            if not district:
+                return ""  # the forum parenthetical needs the district; abstain rather than guess
+            head = f"{prefix} (ועדות ערר - מחוז {district}) {docket}"
+    else:
+        prefix = (court_prefix or "").strip()
+        if not prefix:
+            return ""  # court-ruling prefix is not derivable from structured fields
+        reporter = "נבו "
+        if level == "מנהלי":
+            abbrev = _DISTRICT_COURT_ABBREV.get((record.get("district") or "").strip(), "")
+            head = f"{prefix} ({abbrev}) {docket}" if abbrev else f"{prefix} {docket}"
+        else:
+            head = f"{prefix} {docket}"

+    return f"{head} **{parties}** ({reporter}{date_str})"  # only the parties are wrapped in ** (Markdown bold)
+
+
 def _plan_row_to_dict(row) -> dict | None:
    if row is None:
        return None
@@ -4259,7 +4354,7 @@ async def update_case_law(case_law_id: UUID, **fields) -> dict | None:
        "case_number", "case_name", "court", "date", "practice_area", "appeal_subtype",
        "subject_tags", "summary", "headnote", "nevo_ratio", "key_quote", "source_url",
        "source_type", "precedent_level", "is_binding", "district", "chair_name",
-        "proceeding_type", "citation_formatted",
+        "proceeding_type", "citation_formatted", "parties",
    }
    updates = {k: v for k, v in fields.items() if k in allowed}
    if not updates:
--- a/mcp-server/src/legal_mcp/services/precedent_metadata_extractor.py
+++ b/mcp-server/src/legal_mcp/services/precedent_metadata_extractor.py
@@ -1,12 +1,18 @@
 """Auto-extract precedent metadata from a freshly-uploaded ruling.

-Runs after chunking. Reads the precedent's full_text and asks Claude to
+Runs after chunking. Reads the precedent's full_text and asks Gemini to
 fill in the metadata fields that an upload form usually leaves empty:
 short case_name, summary, headnote, key_quote, subject_tags,
 appeal_subtype, decision_date, precedent_level, court — plus
 chair_name + district for internal_committee rows (which the upload
 path stamps with PLACEHOLDER_PENDING_EXTRACTION when missing).

+The full citation (citation_formatted) is NOT formatted by the LLM — a Flash
+model reliably extracts the party line but drops the formatted string outright
+(#145). Instead the LLM extracts COMPONENTS (parties, citation_prefix) and
+``apply_to_record`` assembles the citation deterministically via
+``db.format_precedent_citation`` (X1 §3 / INV-ID2 — a derived display field).
+
 Caller policy: only empty user-supplied fields are filled. Anything the
 chair already typed in the upload form is preserved. This is enforced
 in ``apply_to_record``.
@@ -64,7 +70,8 @@ METADATA_EXTRACTION_PROMPT = """אתה מסייע משפטי בכיר. קרא א
  "case_number_clean": "מספר הערר/תיק כפי שמופיע בכותרת — רק הספרות והאלכסון, למשל '1062/24' או '8031/21'. ללא המילה 'ערר', ללא שם הצדדים, ללא סוגריים. אם יש כמה עררים מאוחדים — הרשום הראשון. מחרוזת ריקה אם לא ניתן לזהות.",
  "chair_name": "שם יו\\\"ר ההרכב — רלוונטי **רק להחלטות ועדת ערר**, לא לפסקי בית משפט. חפש בכותרת/חתימה: 'עו\\\"ד דפנה תמיר, יו\\\"ר ועדת הערר', 'בפני: עו\\\"ד פלוני אלמוני (יו\\\"ר)'. השאר שם פרטי+משפחה בלי תוארים ('עו\\\"ד', 'אדריכל'). אם זה פסק דין של בית משפט — מחרוזת ריקה.",
  "district": "מחוז ועדת הערר — רלוונטי **רק להחלטות ועדת ערר**. ערכים מותרים: 'ירושלים', 'תל אביב', 'מרכז', 'חיפה', 'צפון', 'דרום', 'ארצית'. זהה מהכותרת ('ועדת הערר לתכנון ובניה — מחוז ירושלים' → 'ירושלים'; 'ועדות ערר - תכנון ובנייה תל אביב-יפו' → 'תל אביב'). אם זה פסק דין של בית משפט — מחרוזת ריקה.",
-  "citation_formatted": "המראה מקום המלא לפי **כללי הציטוט האחיד**, בפורמט Markdown — שמות הצדדים בלבד מוקפים בכפול-כוכבית (`**…**`), הכל השאר רגיל. ראה כללים מפורטים בסעיף 12 למטה."
+  "parties": "שמות הצדדים בשורה אחת בצורה 'עורר נ\\' משיב' — בדיוק כפי שמופיעים בכותרת/רובריקה. בלי הדגשה, בלי מספר-תיק, בלי תוארים מיותרים. למשל 'ישיבת חברת אהבת שלום נ\\' תאיה' או 'ראם חיים נ\\' הוועדה המקומית לתכנון ובניה ירושלים'. אם הצדדים אינם מופיעים בטקסט (למשל החלטה שמתחילה בגוף בלי רובריקה) — מחרוזת ריקה. **אל תמציא שמות.**",
+  "citation_prefix": "קידומת-ההליך של פסיקת בית-משפט בלבד, כפי שמופיעה בראש הכותרת: ע\\"א / רע\\"א / בג\\"ץ / עע\\"מ / עת\\"מ / ע\\"פ / דנ\\"א / ת\\"א וכד'. **רק לפסקי בית-משפט (עליון/מנהלי)** — להחלטות ועדת-ערר השאר ריק (הקוד גוזר 'ערר'/'בל\\"מ' מעצמו). אם לא ברור — מחרוזת ריקה."
 }

 ## כללי איכות
@@ -80,22 +87,10 @@ METADATA_EXTRACTION_PROMPT = """אתה מסייע משפטי בכיר. קרא א
 10. **court** — מהכותרת הראשית של הפסק. ניסוח מלא (לא קיצור). מחרוזת ריקה אם לא ניתן לזהות.
 11. **proceeding_type** — חובה לזהות עבור החלטות ועדת ערר; ריק עבור פסיקת בית משפט. הסימן הברור: בכותרת הראשונה של המסמך כתוב "ערר (ועדות ערר ...) NNNN/YY" → 'ערר'; "בל\"מ NNNN/YY" או הנושא "בקשה להארכת מועד להגשת ערר" → 'בל\"מ'. שני הסוגים יכולים לחלוק אותו מספר תיק — לכן חשוב להבחין מפורשות.
 12. **chair_name / district** — חובה למלא רק עבור החלטות ועדת ערר (source_type='appeals_committee'). chair_name נמצא בכותרת ("בפני: עו\"ד פלוני אלמוני, יו\"ר") או בחתימה. district = מחוז הוועדה, מתוך רשימה סגורה. עבור פסקי בית משפט — שני השדות ריקים.
-13. **citation_formatted — כללי הציטוט האחיד הישראלי**. הרכב את המראה מקום במחרוזת אחת בפורמט Markdown, **כשרק שמות הצדדים מודגשים** (מוקפים ב-`**…**`). כל השאר — קיצור הערכאה, סוגריים של הרכב/מחוז, מספר תיק, מאגר/תאריך — **רגיל ללא הדגשה**.
-
-   תבניות לסוגי פסיקה:
-   * **בית משפט עליון — לא פורסם:** `ע"א 1234/56 **פלוני נ' אלמוני** (נבו 1.2.3456)`
-   * **בית משפט עליון — פורסם:** `ע"א 1234/56 **פלוני נ' אלמוני**, פ"ד יב(3) 456 (1990)`
-   * **בית משפט מנהלי:** `עת"מ (י-ם) 1234/56 **פלוני נ' הוועדה** (נבו 1.2.3456)` — "(י-ם)" / "(ת"א)" / וכד' = קיצור המחוז
-   * **ועדת ערר תכנון ובנייה (מחוזית):** `ערר (ועדות ערר - תכנון ובנייה ת"א-יפו) 81002-01-21 **אברהם אגסי נ' הועדה המקומית לתכנון ובנייה תל אביב** (נבו 25.9.2025)`
-   * **בל"מ (בקשה להארכת מועד):** `בל"מ (ועדות ערר - ירושלים) 1028/20 **חלוואני ריאד נ' רשות הרישוי - הוועדה המקומית ירושלים** (נבו 7.1.2021)`
-   * **ועדת ערר ארצית:** `ערר ארצי 8047/23 **פלוני נ' אלמוני** (נבו 1.2.3456)`
-
-   כללים:
-   - **הצדדים מודגשים בלבד** — כל השאר רגיל. אל תדגיש את "ע"א" / "ערר" / מספר התיק / "(נבו ...)" / "פ"ד".
-   - הצדדים = מי שמופיע **בין מספר התיק לבין הסוגריים הסופיים** (תאריך/מאגר), כלומר "[עורר/מבקש] נ' [משיב]".
-   - תאריך בסוגריים סופיים בפורמט עברי "(נבו 25.9.2025)" — יום.חודש.שנה ללא אפסים מובילים.
-   - אם המאגר הוא נבו והפסיקה לא פורסמה ב-פ"ד — השתמש ב-"(נבו DATE)". אם פורסמה ב-פ"ד — הוסף את ההפניה הפורמלית אחרי הצדדים: `..., פ"ד יב(3) 456 (1990)`.
-   - אם לא ניתן לזהות איזשהו רכיב במדויק — השאר את **כל** השדה ריק. אל תניח / תמציא.
+13. **parties / citation_prefix — רכיבי המראה-מקום (לא המראה-מקום עצמו)**. אינך מרכיב את הציטוט המעוצב — המערכת מרכיבה אותו דטרמיניסטית מהרכיבים. עליך רק **לחלץ** שני רכיבים נקיים:
+   - **parties** — שורת הצדדים "[עורר/מבקש] נ' [משיב]" כפי שמופיעה בכותרת/רובריקה. בלי מספר-תיק, בלי קידומת-הליך, בלי הדגשה. הצדדים = מי שמופיע בין מספר-התיק לבין שם-הערכאה/התאריך. אם אין רובריקה עם צדדים (החלטה שפותחת ישר בגוף) — השאר ריק; **אל תמציא שמות**.
+   - **citation_prefix** — קידומת-ההליך **רק לפסקי בית-משפט** (ע"א / רע"א / בג"ץ / עע"מ / עת"מ / ע"פ / דנ"א / ת"א…), כפי שכתובה בראש הכותרת. להחלטות ועדת-ערר — ריק (המערכת גוזרת 'ערר'/'בל"מ' מ-proceeding_type).
+   - שניהם רשות; ריק עדיף על ניחוש (INV-AH — abstention על המצאה).
 """


@@ -210,14 +205,14 @@ async def extract_metadata(case_law_id: UUID | str) -> dict:
        # silently storing free-text in what callers treat as a filter facet.
        if d in {"ירושלים", "תל אביב", "מרכז", "חיפה", "צפון", "דרום", "ארצית"}:
            out["district"] = d
-    if isinstance(result.get("citation_formatted"), str):
-        cf = result["citation_formatted"].strip()
-        # Sanity check: a valid citation should contain at least one bold
-        # marker pair (the parties) AND a closing paren (the reporter/date).
-        # If the LLM returned a half-formed string, drop it rather than
-        # store junk that the UI then has to special-case.
-        if cf.count("**") >= 2 and ")" in cf:
-            out["citation_formatted"] = cf
+    # parties / citation_prefix — COMPONENTS of the citation, not the formatted
+    # string. citation_formatted itself is assembled deterministically by
+    # db.format_precedent_citation in apply_to_record (#145): a Flash model reliably
+    # extracts the party line but dropped the formatted citation outright.
+    if isinstance(result.get("parties"), str):
+        out["parties"] = result["parties"].strip()
+    if isinstance(result.get("citation_prefix"), str):
+        out["citation_prefix"] = result["citation_prefix"].strip()
    return out


@@ -371,12 +366,12 @@ async def apply_to_record(
    ):
        fields_to_update["case_number"] = cn_clean

-    # citation_formatted — full citation per Israeli citation rules. Only
-    # fill if empty; user edits in /precedents/[id] are preserved.
-    if not (record.get("citation_formatted") or "").strip():
-        s = (suggested.get("citation_formatted") or "").strip()
-        if s:
-            fields_to_update["citation_formatted"] = s
+    # parties — store the extracted "עורר נ' משיב" line (the re-derivable basis for
+    # the deterministic citation). Only fill when empty; chair edits are preserved.
+    if not (record.get("parties") or "").strip():
+        p = (suggested.get("parties") or "").strip()
+        if p:
+            fields_to_update["parties"] = p

    # chair_name / district — only for internal_committee rows. The DB CHECK
    # forces these to be non-empty, so the upload endpoint stamps the row
@@ -414,6 +409,25 @@ async def apply_to_record(
        if eff_st != derived_st:
            fields_to_update["source_type"] = derived_st

+    # citation_formatted — DERIVED deterministically from the effective record
+    # (db.format_precedent_citation), NEVER formatted by the LLM (#145, INV-ID2).
+    # Built last, so it sees this run's component updates (case_number/date/level/
+    # source_type/district/proceeding_type/parties). Only fill when empty so chair
+    # edits in /precedents/[id] are preserved; abstains (no write) when a component
+    # is missing.
+    if not (record.get("citation_formatted") or "").strip():
+        eff = {**record, **fields_to_update}
+        eff_parties = (
+            fields_to_update.get("parties") or record.get("parties") or ""
+        ).strip()
+        cit = db.format_precedent_citation(
+            eff,
+            parties=eff_parties,
+            court_prefix=(suggested.get("citation_prefix") or "").strip(),
+        )
+        if cit:
+            fields_to_update["citation_formatted"] = cit
+
    if not fields_to_update:
        return {"updated": False, "fields": []}