feat(precedents): citation_formatted דטרמיניסטי בקוד — Gemini מחלץ רכיבים, לא מעצב (#145)
הבעיה (#145): מחלץ-המטא ביקש מ-Gemini Flash *לעצב* את מראה-המקום המלא (citation_formatted). ב-JSON-mode חופשי (ללא responseSchema) המודל החזיר JSON תקין ומלא אך **השמיט בעקביות** דווקא את השדה הזה — אומת על 8070-05-25, 1194-12-25, 1200-12-25 (וגם כשהצדדים זוהו). השדה הקשה ביותר (עיצוב מחרוזת) + היתר-בפרומפט להשאיר ריק → Flash מפיל אותו. הפתרון: citation_formatted הוא **שדה-תצוגה נגזר** (X1 §3 / INV-ID2) — מורכב דטרמיניסטית מרכיבים מובְנים, לא מעוצב ע"י LLM. תפקיד ה-LLM מצטמצם לחילוץ רכיבים אמינים (שורת-הצדדים, קידומת-ההליך לפסקי-בית-משפט). - db.format_precedent_citation(record) — מרכיב לפי כללי-הציטוט-האחיד: ועדת-ערר (מחוזית/ארצית/בל"מ) מ-proceeding_type+district+source_kind; פסקי-בית-משפט מ-court_prefix(LLM)+district-abbrev. מוציא docket נקי מ-case_number מזוהם ("עע\"מ 683/13"→"683/13"). נמנע ('') כשחסר רכיב (צדדים/docket/תאריך/קידומת) — abstention על המצאה (INV-AH). - case_law.parties (V39) — שורת "עורר נ' משיב" כבסיס re-derivable. - מחלץ-המטא: הפרומפט מחלץ parties+citation_prefix (לא citation_formatted); apply_to_record מרכיב דטרמיניסטית מהרשומה-האפקטיבית וממלא רק שדה ריק (עריכות-יו"ר נשמרות). - scripts/backfill_precedent_citations.py — backfill 2-מעברים (דטרמיניסטי→LLM), מדווח שורות-נמנעות, idempotent. אומת: 3 הרשומות הידניות משוחזרות תו-בתו; פסק עליון אמיתי מולא end-to-end (עע"מ 683/13 ... נבו 3.9.2015). test_fu2b_reconcile ✓. Invariants: INV-ID2/X1§3 (ציטוט=תצוגה נגזר, לא מפתח) · INV-AH (abstention, אפס המצאה) · G1 (docket נקי) · G2 (מסלול-יחיד — מחליף את נתיב-ה-LLM, לא מקביל). Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -1560,6 +1560,18 @@ CREATE INDEX IF NOT EXISTS idx_plans_meta_tsv ON plans USING gin(meta_tsv);
|
||||
"""
|
||||
|
||||
|
||||
# ── V39: case_law.parties ──────────────────────────────────────────
# Adds the `parties` column to case_law: the "עורר נ' משיב" (appellant v.
# respondent) line, extracted from the caption as a structured component.
# It is the re-derivable BASIS for the deterministic citation_formatted
# (format_precedent_citation): the LLM only extracts the party line (a reliable
# caption read) instead of formatting the whole Markdown citation, a field it
# dropped outright (#145). citation_formatted stays a DERIVED display field
# (X1 §3 / INV-ID2); this column holds the part of it that is rendered in bold
# and cannot be rebuilt from the other stored fields.
# ADD COLUMN IF NOT EXISTS keeps the statement safe to re-run.
SCHEMA_V39_SQL = """
ALTER TABLE case_law ADD COLUMN IF NOT EXISTS parties TEXT DEFAULT '';
"""
|
||||
|
||||
|
||||
# Stable, arbitrary key for the session-level advisory lock that serialises
|
||||
# schema DDL across processes. Every short-lived process (cron drains, services)
|
||||
# re-runs the idempotent migrations on startup; without this lock two processes
|
||||
@@ -1620,6 +1632,7 @@ async def _apply_schema_ddl(conn: asyncpg.Connection) -> None:
|
||||
await conn.execute(SCHEMA_V36_SQL)
|
||||
await conn.execute(SCHEMA_V37_SQL)
|
||||
await conn.execute(SCHEMA_V38_SQL)
|
||||
await conn.execute(SCHEMA_V39_SQL)
|
||||
|
||||
|
||||
async def init_schema() -> None:
|
||||
@@ -3502,6 +3515,88 @@ def format_plan_citation(plan: dict) -> str:
|
||||
return sentence
|
||||
|
||||
|
||||
# Clean court docket inside a possibly citation-shaped case_number
|
||||
# ("עע\"מ 683/13" → "683/13"). Legacy court-ruling rows stored the full citation
|
||||
# in the identity field (X1 §4 known violation); pull the docket out so the
|
||||
# assembled citation never doubles the prefix.
|
||||
_CITATION_DOCKET_RE = re.compile(r"\d{1,6}(?:[-/]\d{1,4}){1,2}")
|
||||
|
||||
# District → administrative-court abbreviation as it appears in citations
# (`עת"מ (י-ם) 1234/56 ...`). Keys are district names as stored on the record;
# values are the text placed inside the parentheses after the proceeding prefix.
# Empty/unknown → the abbrev parenthetical is omitted rather than guessed.
_DISTRICT_COURT_ABBREV = {
    "ירושלים": "י-ם",
    "תל אביב": 'ת"א',
    "מרכז": "מרכז",
    "חיפה": "חי'",
    "צפון": "נצ'",
    "דרום": 'ב"ש',
}
|
||||
|
||||
|
||||
def _citation_docket(case_number: str) -> str:
|
||||
s = (case_number or "").strip()
|
||||
if not s:
|
||||
return ""
|
||||
m = _CITATION_DOCKET_RE.search(s)
|
||||
return m.group(0) if m else s
|
||||
|
||||
|
||||
def format_precedent_citation(
    record: dict,
    *,
    parties: str | None = None,
    court_prefix: str = "",
) -> str:
    """Assemble a precedent's unified-rules citation (מראה מקום) in code.

    The citation is a DERIVED display field (X1 §3 / INV-ID2): it is built here
    from stored components and is never formatted by an LLM, which dropped the
    field outright (#145). The LLM only supplies components: the ``parties``
    line and, for court rulings, the caption ``court_prefix``.

    Args:
        record: The precedent row. Keys read: ``parties``, ``case_number``,
            ``date``, ``precedent_level``, ``source_type``, ``source_kind``,
            ``proceeding_type``, ``district``.
        parties: Party line to use instead of ``record["parties"]``. ``None``
            means "use the record"; an explicit empty string is NOT replaced by
            the record's value.
        court_prefix: Proceeding prefix read from the caption (e.g. 'ע"א',
            'עת"מ', 'ת"א'). Only used for non-committee rows.

    Returns:
        ``"<head> **<parties>** (<reporter><day>.<month>.<year>)"``, or ``""``
        when an essential component is missing (INV-AH: abstain, never invent).

        * Appeals-committee rows (``precedent_level`` starting with 'ועדת_ערר',
          or ``source_type == "appeals_committee"``): the national level renders
          as 'ערר ארצי <docket>'; otherwise the prefix is 'בל"מ' or 'ערר'
          (from ``proceeding_type``) followed by the district forum, and a
          missing district abstains. The reporter is empty when
          ``source_kind == "internal_committee"`` and 'נבו ' otherwise.
        * All other rows (court rulings): the head is ``court_prefix`` plus the
          docket, and an empty prefix abstains. For level 'מנהלי' a known
          district adds its abbreviation in parentheses. Reporter is 'נבו '.
    """
    if parties is None:
        parties = record.get("parties") or ""
    party_line = parties.strip()
    docket = _citation_docket(record.get("case_number") or "")
    decided = _coerce_plan_date(record.get("date"))
    # Parties, docket and date are mandatory for every citation shape.
    if not party_line or not docket or not decided:
        return ""
    date_str = f"{decided.day}.{decided.month}.{decided.year}"

    level = (record.get("precedent_level") or "").strip()
    source_type = (record.get("source_type") or "").strip()

    if level.startswith("ועדת_ערר") or source_type == "appeals_committee":
        # Rows marked internal_committee carry the date only; every other
        # committee row gets the 'נבו ' reporter.
        if record.get("source_kind") == "internal_committee":
            reporter = ""
        else:
            reporter = "נבו "

        if level == "ועדת_ערר_ארצית":
            head = f"ערר ארצי {docket}"
        else:
            proceeding = (record.get("proceeding_type") or "").strip()
            prefix = 'בל"מ' if proceeding == 'בל"מ' else "ערר"
            district = (record.get("district") or "").strip()
            if not district:
                # The forum parenthetical needs a district; abstain without one.
                return ""
            head = f"{prefix} (ועדות ערר - מחוז {district}) {docket}"
    else:
        prefix = (court_prefix or "").strip()
        if not prefix:
            return ""  # court-ruling prefix is not derivable from structured fields
        reporter = "נבו "

        # Default head; administrative-court rows with a known district
        # abbreviation get the abbreviation parenthetical inserted.
        head = f"{prefix} {docket}"
        if level == "מנהלי":
            district = (record.get("district") or "").strip()
            abbrev = _DISTRICT_COURT_ABBREV.get(district, "")
            if abbrev:
                head = f"{prefix} ({abbrev}) {docket}"

    return f"{head} **{party_line}** ({reporter}{date_str})"
|
||||
|
||||
|
||||
def _plan_row_to_dict(row) -> dict | None:
|
||||
if row is None:
|
||||
return None
|
||||
@@ -4248,7 +4343,7 @@ async def update_case_law(case_law_id: UUID, **fields) -> dict | None:
|
||||
"case_number", "case_name", "court", "date", "practice_area", "appeal_subtype",
|
||||
"subject_tags", "summary", "headnote", "nevo_ratio", "key_quote", "source_url",
|
||||
"source_type", "precedent_level", "is_binding", "district", "chair_name",
|
||||
"proceeding_type", "citation_formatted",
|
||||
"proceeding_type", "citation_formatted", "parties",
|
||||
}
|
||||
updates = {k: v for k, v in fields.items() if k in allowed}
|
||||
if not updates:
|
||||
|
||||
Reference in New Issue
Block a user