# legal-ai/mcp-server/src/legal_mcp/services/precedent_metadata_extractor.py

"""Auto-extract precedent metadata from a freshly-uploaded ruling.

Runs after chunking. Reads the precedent's full_text and asks the LLM
(currently Gemini, via ``gemini_session.query_json``) to fill in the
metadata fields that an upload form usually leaves empty: short
case_name, summary, headnote, key_quote, subject_tags, appeal_subtype,
decision_date, precedent_level, source_type, proceeding_type, court,
citation_formatted — plus chair_name + district for internal_committee
rows (which the upload path stamps with PLACEHOLDER_PENDING_EXTRACTION
when missing).

Caller policy: only empty user-supplied fields are filled. Anything the
chair already typed in the upload form is preserved. This is enforced
in ``apply_to_record``.
"""

from __future__ import annotations

import logging
from datetime import date as date_type
from uuid import UUID

from legal_mcp.config import parse_llm_json
from legal_mcp.services import db, gemini_session

logger = logging.getLogger(__name__)


# Sentinel inserted by the upload endpoint when a committee row is created
# without chair_name/district (the DB CHECK forces non-empty). The Hebrew
# text reads "(not yet extracted)". ``apply_to_record`` treats this value
# exactly like an empty string for chair_name and district, so the
# LLM-extracted values overwrite it.
PLACEHOLDER_PENDING_EXTRACTION = "(טרם חולץ)"


# Size of the text window sent to the LLM. A ruling longer than
# _HEAD_CHARS + _TAIL_CHARS is reduced by ``_build_text_window`` to its
# first _HEAD_CHARS characters plus its last _TAIL_CHARS characters, with
# a marker in place of the omitted middle; shorter rulings are sent whole.
# The head is expected to carry the header/parties, the tail the
# conclusion and signature.
_HEAD_CHARS = 12_000
_TAIL_CHARS = 6_000


# Static instruction text for the metadata extraction call. It is passed
# verbatim as the ``system`` argument in ``extract_metadata``; the
# per-precedent content travels separately in the user message.
#
# Never run this text through str.format(): the JSON example below contains
# literal '{' / '}' which str.format would interpret as placeholders and
# fail on the field names.
#
# The numbered rules under "כללי איכות" are referenced from inside the JSON
# example (citation_formatted → rule 13), so keep the cross-reference in
# sync when rules are added or reordered.
METADATA_EXTRACTION_PROMPT = """אתה מסייע משפטי בכיר. קרא את פסק הדין/ההחלטה הבא וחלץ ממנו מטא-דאטה לקטלוג הקורפוס.

המטרה: למלא שדות בטופס העלאה שהמשתמש הזין באופן חלקי. **אל תמציא** — אם המידע לא מופיע בטקסט, השאר ריק (מחרוזת ריקה / מערך ריק).

## פלט נדרש
החזר JSON אחד (object — לא array) בפורמט הבא, ללא markdown וללא הסברים:

{
  "case_name_short": "שם קצר ל-3-6 מילים (למשל 'אהרון ברק' או 'ב. קרן-נכסים'). אל תכלול מספר תיק. שם המבקש/העורר העיקרי. אם זו החלטה מאוחדת — שם הצד המוביל.",
  "appeal_subtype": "תת-סוג ספציפי בתוך תחום המשפט (למשל 'תכנית רחביה', 'מימוש במכר', 'תמ\\"א 38', 'שימוש חורג', 'סופיות ההחלטה'). מילה אחת או צירוף קצר.",
  "summary": "תקציר עניני 2-3 משפטים: מה הייתה השאלה, מה הוכרע. בלי שיפוט.",
  "headnote": "headnote בסגנון נבו: 1-2 משפטים שמסכמים את העיקרון שנקבע/יושם בפסק. למשל 'תכנית רחביה — היטל השבחה במימוש במכר — אין לחייב כשהזכויות צפות'.",
  "key_quote": "ציטוט מילולי בודד, 30-100 מילים, שמייצג את לב הפסק. חייב להופיע מילה במילה בטקסט. אם אין ציטוט מתאים — מחרוזת ריקה.",
  "subject_tags": ["תגיות", "נושא", "בעברית"],
  "decision_date_iso": "YYYY-MM-DD — תאריך מתן ההחלטה כפי שמופיע בטקסט (בכותרת או בחתימה הסופית). אם לא ניתן לזהות במדויק — מחרוזת ריקה.",
  "precedent_level": "אחד מ-4: 'עליון' / 'מנהלי' / 'ועדת_ערר_ארצית' / 'ועדת_ערר_מחוזית'. בחר לפי הערכאה שמסומנת בכותרת הפסק. אם לא ברור — מחרוזת ריקה.",
  "source_type": "אחד מ-2: 'court_ruling' (פסק דין של בית משפט — עליון/מנהלי) / 'appeals_committee' (החלטה של ועדת ערר). אם לא ברור — מחרוזת ריקה.",
  "proceeding_type": "אחד מ-2 (רק להחלטות ועדת ערר): 'ערר' (הליך ערר עיקרי על החלטת ועדה מקומית) / 'בל\\\"מ' (בקשה להארכת מועד להגשת ערר). זהה דרך כותרת המסמך: 'ערר (ועדות ערר ...) NNNN/YY' → 'ערר'; 'בל\\\"מ NNNN/YY' או נושא 'בקשה להארכת מועד להגשת ערר' → 'בל\\\"מ'. בפסיקת בית משפט (לא ועדת ערר) — מחרוזת ריקה.",
  "court": "שם הערכאה כפי שהוא מופיע בכותרת (למשל 'בית המשפט העליון', 'בית המשפט המחוזי בירושלים בשבתו כבית משפט לעניינים מנהליים', 'ועדת הערר לתכנון ובניה פיצויים והיטלי השבחה — מחוז ירושלים'). מחרוזת ריקה אם לא ניתן לזהות.",
  "case_number_clean": "מספר הערר/תיק כפי שמופיע בכותרת — רק הספרות והאלכסון, למשל '1062/24' או '8031/21'. ללא המילה 'ערר', ללא שם הצדדים, ללא סוגריים. אם יש כמה עררים מאוחדים — הרשום הראשון. מחרוזת ריקה אם לא ניתן לזהות.",
  "chair_name": "שם יו\\\"ר ההרכב — רלוונטי **רק להחלטות ועדת ערר**, לא לפסקי בית משפט. חפש בכותרת/חתימה: 'עו\\\"ד דפנה תמיר, יו\\\"ר ועדת הערר', 'בפני: עו\\\"ד פלוני אלמוני (יו\\\"ר)'. השאר שם פרטי+משפחה בלי תוארים ('עו\\\"ד', 'אדריכל'). אם זה פסק דין של בית משפט — מחרוזת ריקה.",
  "district": "מחוז ועדת הערר — רלוונטי **רק להחלטות ועדת ערר**. ערכים מותרים: 'ירושלים', 'תל אביב', 'מרכז', 'חיפה', 'צפון', 'דרום', 'ארצית'. זהה מהכותרת ('ועדת הערר לתכנון ובניה — מחוז ירושלים' → 'ירושלים'; 'ועדות ערר - תכנון ובנייה תל אביב-יפו' → 'תל אביב'). אם זה פסק דין של בית משפט — מחרוזת ריקה.",
  "citation_formatted": "המראה מקום המלא לפי **כללי הציטוט האחיד**, בפורמט Markdown — שמות הצדדים בלבד מוקפים בכפול-כוכבית (`**…**`), הכל השאר רגיל. ראה כללים מפורטים בסעיף 13 למטה."
}

## כללי איכות
1. **case_name_short** — שם בולט וקצר. בלי 'נ\\'' / 'נגד' / מספרי תיק.
2. **appeal_subtype** — אופציונלי. אם הסוגיה רחבה ולא מסווגת — השאר ריק.
3. **summary** — תיאור ניטרלי, גוף שלישי.
4. **headnote** — לא מצטטים, מסכמים. סגנון נבו: ביטוי קצר אחד.
5. **key_quote** — חייב להיות הדבקה מילולית מהקלט. אם אין ציטוט בולט — השאר ריק.
6. **subject_tags** — 3-7 תגיות בעברית, snake_case (חניה, קווי_בניין, שיקול_דעת, פגם_פרוצדורלי, סמכות, מועדים, פגיעה_במקרקעין, ירידת_ערך, תכנית_רחביה, מימוש_במכר, וכד'). שייך לתחום של ועדת ערר תכנון ובניה.
7. **decision_date_iso** — תאריך מדויק בלבד. אם בטקסט יש "ניתנה היום, ט' באלול תשפ"א, 5 בספטמבר 2022" — הפלט: "2022-09-05".
8. **precedent_level** — קבע לפי הערכאה: בית המשפט העליון = "עליון"; בית משפט מחוזי בשבתו כבית משפט לעניינים מנהליים = "מנהלי"; ועדת ערר ארצית = "ועדת_ערר_ארצית"; ועדת ערר מחוזית (כמו ועדות תכנון ובניה ירושלים/מחוז המרכז וכד') = "ועדת_ערר_מחוזית". השתמש ב-underscore כפי שמופיע — לא ברווח.
9. **source_type** — שני ערכים בלבד: "court_ruling" כשהמסמך הוא פסק דין/החלטה של בית משפט (עליון/בג"ץ/מנהלי/מחוזי); "appeals_committee" כשהמסמך הוא החלטה של ועדת ערר (ארצית או מחוזית). זה משלים את `precedent_level` — שני השדות צריכים להיות תואמים.
10. **court** — מהכותרת הראשית של הפסק. ניסוח מלא (לא קיצור). מחרוזת ריקה אם לא ניתן לזהות.
11. **proceeding_type** — חובה לזהות עבור החלטות ועדת ערר; ריק עבור פסיקת בית משפט. הסימן הברור: בכותרת הראשונה של המסמך כתוב "ערר (ועדות ערר ...) NNNN/YY" → 'ערר'; "בל\"מ NNNN/YY" או הנושא "בקשה להארכת מועד להגשת ערר" → 'בל\"מ'. שני הסוגים יכולים לחלוק אותו מספר תיק — לכן חשוב להבחין מפורשות.
12. **chair_name / district** — חובה למלא רק עבור החלטות ועדת ערר (source_type='appeals_committee'). chair_name נמצא בכותרת ("בפני: עו\"ד פלוני אלמוני, יו\"ר") או בחתימה. district = מחוז הוועדה, מתוך רשימה סגורה. עבור פסקי בית משפט — שני השדות ריקים.
13. **citation_formatted — כללי הציטוט האחיד הישראלי**. הרכב את המראה מקום במחרוזת אחת בפורמט Markdown, **כשרק שמות הצדדים מודגשים** (מוקפים ב-`**…**`). כל השאר — קיצור הערכאה, סוגריים של הרכב/מחוז, מספר תיק, מאגר/תאריך — **רגיל ללא הדגשה**.

   תבניות לסוגי פסיקה:
   * **בית משפט עליון — לא פורסם:** `ע"א 1234/56 **פלוני נ' אלמוני** (נבו 1.2.3456)`
   * **בית משפט עליון — פורסם:** `ע"א 1234/56 **פלוני נ' אלמוני**, פ"ד יב(3) 456 (1990)`
   * **בית משפט מנהלי:** `עת"מ (י-ם) 1234/56 **פלוני נ' הוועדה** (נבו 1.2.3456)` — "(י-ם)" / "(ת"א)" / וכד' = קיצור המחוז
   * **ועדת ערר תכנון ובנייה (מחוזית):** `ערר (ועדות ערר - תכנון ובנייה ת"א-יפו) 81002-01-21 **אברהם אגסי נ' הועדה המקומית לתכנון ובנייה תל אביב** (נבו 25.9.2025)`
   * **בל"מ (בקשה להארכת מועד):** `בל"מ (ועדות ערר - ירושלים) 1028/20 **חלוואני ריאד נ' רשות הרישוי - הוועדה המקומית ירושלים** (נבו 7.1.2021)`
   * **ועדת ערר ארצית:** `ערר ארצי 8047/23 **פלוני נ' אלמוני** (נבו 1.2.3456)`

   כללים:
   - **הצדדים מודגשים בלבד** — כל השאר רגיל. אל תדגיש את "ע"א" / "ערר" / מספר התיק / "(נבו ...)" / "פ"ד".
   - הצדדים = מי שמופיע **בין מספר התיק לבין הסוגריים הסופיים** (תאריך/מאגר), כלומר "[עורר/מבקש] נ' [משיב]".
   - תאריך בסוגריים סופיים בפורמט עברי "(נבו 25.9.2025)" — יום.חודש.שנה ללא אפסים מובילים.
   - אם המאגר הוא נבו והפסיקה לא פורסמה ב-פ"ד — השתמש ב-"(נבו DATE)". אם פורסמה ב-פ"ד — הוסף את ההפניה הפורמלית אחרי הצדדים: `..., פ"ד יב(3) 456 (1990)`.
   - אם לא ניתן לזהות איזשהו רכיב במדויק — השאר את **כל** השדה ריק. אל תניח / תמציא.
"""


def _build_text_window(full_text: str) -> str:
    """Return the head + tail of the ruling, with a marker if truncated.

    Most rulings have the parties/subject in the head and the conclusion
    in the tail; the middle is the discussion which is captured via the
    halacha extractor independently. Sending head+tail keeps the prompt
    cheap while preserving naming and conclusion context.
    """
    if len(full_text) <= _HEAD_CHARS + _TAIL_CHARS:
        return full_text
    return (
        full_text[:_HEAD_CHARS]
        + "\n\n[... חלק האמצע הושמט עקב אורך — ראה את החלק האחרון של הפסק להלן ...]\n\n"
        + full_text[-_TAIL_CHARS:]
    )


# Keys of the LLM response that are copied through as stripped strings with
# no further validation (an empty string is kept as an empty string).
_FREE_TEXT_KEYS = (
    "case_name_short",
    "appeal_subtype",
    "summary",
    "headnote",
    "key_quote",
    "decision_date_iso",
    "court",
    "case_number_clean",
    "chair_name",
)

# Keys restricted to a closed vocabulary. A value outside its set is dropped
# rather than stored, so free text never lands in a field that callers treat
# as an enum / filter facet. proceeding_type deliberately allows "" (the
# prompt asks for an empty string on court rulings).
_ENUM_VALUES = {
    "precedent_level": frozenset(
        {"עליון", "מנהלי", "ועדת_ערר_ארצית", "ועדת_ערר_מחוזית"}
    ),
    "source_type": frozenset({"court_ruling", "appeals_committee"}),
    "proceeding_type": frozenset({"ערר", 'בל"מ', ""}),
    "district": frozenset(
        {"ירושלים", "תל אביב", "מרכז", "חיפה", "צפון", "דרום", "ארצית"}
    ),
}


def _normalize_llm_metadata(result: dict) -> dict:
    """Validate and clean the raw LLM response into the suggestion dict."""
    out: dict = {}

    for key in _FREE_TEXT_KEYS:
        value = result.get(key)
        if isinstance(value, str):
            out[key] = value.strip()

    for key, allowed in _ENUM_VALUES.items():
        value = result.get(key)
        if isinstance(value, str) and value.strip() in allowed:
            out[key] = value.strip()

    tags = result.get("subject_tags") or []
    if isinstance(tags, list):
        # Keep real strings only: coercing with str() would turn a JSON
        # null / number / object into a bogus tag such as "None".
        cleaned = [t.strip() for t in tags if isinstance(t, str) and t.strip()]
        # dict.fromkeys drops duplicates while preserving first-seen order.
        out["subject_tags"] = list(dict.fromkeys(cleaned))

    citation = result.get("citation_formatted")
    if isinstance(citation, str):
        citation = citation.strip()
        # Sanity check: a valid citation should contain at least one bold
        # marker pair (the parties) AND a closing paren (the reporter/date).
        # A half-formed string is dropped rather than stored.
        if citation.count("**") >= 2 and ")" in citation:
            out["citation_formatted"] = citation

    return out


async def extract_metadata(case_law_id: UUID | str) -> dict:
    """Run metadata extraction. Returns a dict with the suggested values.

    Does NOT write to the DB — caller decides what to merge.

    Args:
        case_law_id: Id of the case_law row; a string is converted with
            ``UUID(...)``.

    Returns:
        A dict holding only the keys the LLM returned in a usable form.
        Enum-like keys (precedent_level, source_type, proceeding_type,
        district) are present only when the value is in the allowed set;
        citation_formatted only when it passes a basic shape check.
        decision_date_iso is passed through as a stripped string and is
        not parsed here. An empty dict is returned when the row does not
        exist, has no full_text, the LLM call raises, or the response is
        not a JSON object.

    Raises:
        ValueError: If ``case_law_id`` is a string that is not a valid UUID.
    """
    if isinstance(case_law_id, str):
        case_law_id = UUID(case_law_id)

    record = await db.get_case_law(case_law_id)
    if not record:
        return {}
    full_text = (record.get("full_text") or "").strip()
    if not full_text:
        return {}

    citation = record.get("case_number") or ""
    court = record.get("court") or ""
    date_str = str(record.get("date") or "")
    practice_area = record.get("practice_area") or ""

    context = (
        f"מראה מקום: {citation}\n"
        f"ערכאה: {court}\n"
        f"תאריך: {date_str}\n"
        f"תחום: {practice_area}"
    )
    text_window = _build_text_window(full_text)
    # Static instructions go via `system` so the SDK path can cache them
    # across uploads. Per-precedent content goes in the user prompt.
    user_msg = (
        f"## הקלט\n{context}\n\n"
        f"--- תחילת הטקסט ---\n{text_window}\n--- סוף הטקסט ---"
    )

    try:
        # Bounded structured extraction → Gemini Flash (JSON mode). The agentic
        # claude CLI hit error_max_turns on this single-shot task; see
        # gemini_session.py. Voice-sensitive/agentic work stays on claude_session.
        result = await gemini_session.query_json(
            user_msg, system=METADATA_EXTRACTION_PROMPT,
        )
    except Exception as e:
        # Best-effort enrichment: a failed LLM call must not fail the upload,
        # so it is logged and reported as "no suggestion".
        logger.warning("precedent_metadata_extractor: query failed: %s", e)
        return {}

    if not isinstance(result, dict):
        logger.warning(
            "precedent_metadata_extractor: expected dict, got %s",
            type(result).__name__,
        )
        return {}

    return _normalize_llm_metadata(result)


async def apply_to_record(
    case_law_id: UUID | str,
    suggested: dict,
    overwrite_case_number: bool = False,
) -> dict:
    """Merge suggested metadata into the case_law row, filling ONLY empty fields.

    Empty rules:
      - string field == "" (or NULL / whitespace only) → fill from suggested
      - list field == [] → fill from suggested
      - if suggested key is missing or empty, skip

    Special cases:
      - case_name: if the current case_name equals the case_number (a
        tell-tale sign of the upload form sending the long citation into
        both fields), treat it as empty and overwrite.
      - subject_tags: a list that looks like a JSON string split into
        single characters is treated as empty.
      - date: filled only when currently NULL and the suggestion parses as
        an ISO date; an unparseable suggestion is ignored.
      - proceeding_type, chair_name, district: filled only on rows whose
        source_kind is "internal_committee". For chair_name / district the
        PLACEHOLDER_PENDING_EXTRACTION sentinel counts as empty.

    Args:
        case_law_id: Id of the case_law row; a string is converted with
            ``UUID(...)``.
        suggested: Suggestion dict as produced by ``extract_metadata``.
            ``None`` is treated as an empty suggestion.
        overwrite_case_number: when True, update case_number from
            case_number_clean even if the field already has a value (used
            for one-time migration enrichment). A suggestion identical to
            the stored value is skipped so it is not reported as a change.

    Returns:
        ``{"updated": bool, "fields": [column names written]}``. Nothing is
        written, and ``updated`` is False, when the row does not exist or
        no field qualifies.

    Raises:
        ValueError: If ``case_law_id`` is a string that is not a valid UUID.
    """
    if isinstance(case_law_id, str):
        case_law_id = UUID(case_law_id)
    record = await db.get_case_law(case_law_id)
    if not record:
        return {"updated": False, "fields": []}

    # Tolerate a missing suggestion instead of failing on ``None.get``.
    suggested = suggested or {}
    fields_to_update: dict = {}

    def current(column: str) -> str:
        """Stored value of a text column, with NULL mapped to ''."""
        return (record.get(column) or "").strip()

    def proposed(key: str) -> str:
        """Suggested value for a key, with missing/None mapped to ''."""
        return (suggested.get(key) or "").strip()

    def fill_if_empty(column: str, empty_values: tuple = ("",)) -> None:
        """Queue the suggestion for `column` when the stored value is empty."""
        value = proposed(column)
        if value and current(column) in empty_values:
            fields_to_update[column] = value

    cur_case_name = current("case_name")
    cur_case_number = current("case_number")
    short_name = proposed("case_name_short")
    if short_name and (not cur_case_name or cur_case_name == cur_case_number):
        fields_to_update["case_name"] = short_name

    for column in ("appeal_subtype", "summary", "headnote", "key_quote"):
        fill_if_empty(column)

    cur_tags = record.get("subject_tags") or []
    # Treat character-by-character corruption as empty. Early ingest
    # pipelines stored a JSON string (`'["היטל השבחה"]'`) into a TEXT[]
    # column, which Postgres split into individual chars:
    # `['[', '"', 'ה', 'י', 'ט', 'ל', ' ', 'ה', 'ש', ...]`. Detection:
    # 3+ elements where every element is at most 2 chars (legitimate
    # tags are multi-character Hebrew words like `היטל_השבחה`).
    is_corrupt = (
        len(cur_tags) >= 3
        and all(isinstance(t, str) and len(t) <= 2 for t in cur_tags)
    )
    if not cur_tags or is_corrupt:
        sug_tags = suggested.get("subject_tags") or []
        if sug_tags:
            fields_to_update["subject_tags"] = sug_tags

    # decision_date — only fill if currently null. The DB column is DATE,
    # so we parse the LLM's ISO string into a date object before passing
    # it to update_case_law (asyncpg won't coerce a string to DATE).
    if record.get("date") is None:
        iso = proposed("decision_date_iso")
        if iso:
            try:
                # Only the first 10 chars are parsed, so a trailing time or
                # explanation after YYYY-MM-DD is tolerated.
                fields_to_update["date"] = date_type.fromisoformat(iso[:10])
            except ValueError:
                logger.debug(
                    "metadata_extractor: ignoring invalid decision_date_iso=%r",
                    iso,
                )

    for column in ("precedent_level", "source_type", "court"):
        fill_if_empty(column)

    is_internal_committee = record.get("source_kind") == "internal_committee"

    # proceeding_type — only fill for internal_committee rows (the field is
    # meaningless for court rulings, which we keep as '').
    if is_internal_committee:
        fill_if_empty("proceeding_type")

    if overwrite_case_number:
        clean_number = proposed("case_number_clean")
        # Skip a suggestion identical to the stored value: writing it would
        # change nothing yet still report "updated" to the caller.
        if clean_number and clean_number != record.get("case_number"):
            fields_to_update["case_number"] = clean_number

    # citation_formatted — full citation per Israeli citation rules. Only
    # fill if empty; user edits in /precedents/[id] are preserved.
    fill_if_empty("citation_formatted")

    # chair_name / district — only for internal_committee rows. The DB CHECK
    # forces these to be non-empty, so the upload endpoint stamps the row
    # with "(טרם חולץ)" as a placeholder. Treat that placeholder as empty
    # so the LLM-extracted value can overwrite it.
    if is_internal_committee:
        for column in ("chair_name", "district"):
            fill_if_empty(column, ("", PLACEHOLDER_PENDING_EXTRACTION))

    if not fields_to_update:
        return {"updated": False, "fields": []}

    await db.update_case_law(case_law_id, **fields_to_update)
    return {"updated": True, "fields": list(fields_to_update.keys())}


async def extract_and_apply(
    case_law_id: UUID | str,
    overwrite_case_number: bool = False,
) -> dict:
    """Extract metadata for one precedent and merge it into its row.

    Chains ``extract_metadata`` and ``apply_to_record``; when the merge
    wrote at least one field, ``db.recompute_searchable`` is called for
    the row afterwards.

    Args:
        case_law_id: Id of the case_law row (UUID or its string form).
        overwrite_case_number: Forwarded unchanged to ``apply_to_record``.

    Returns:
        ``{"status": "no_metadata", "fields": []}`` when extraction
        produced nothing; otherwise a dict with ``status`` ("completed"
        if any field was written, "no_changes" if not), ``fields`` (the
        columns written) and ``suggested`` (the full suggestion dict).
    """
    suggestion = await extract_metadata(case_law_id)
    if not suggestion:
        return {"status": "no_metadata", "fields": []}

    outcome = await apply_to_record(
        case_law_id,
        suggestion,
        overwrite_case_number=overwrite_case_number,
    )

    changed = outcome["updated"]
    if changed:
        await db.recompute_searchable(case_law_id)

    status = "completed" if changed else "no_changes"
    return {
        "status": status,
        "fields": outcome["fields"],
        "suggested": suggestion,
    }