# legal-ai/mcp-server/src/legal_mcp/services/bulletin_splitter.py

"""Split a monthly "עו"ד על נדל"ן" bulletin into typed radar items (X12).

The monthly bulletin (a SEPARATE publication from the daily "כל יום" digest) is
multi-topic: it bundles a featured ARTICLE, a list of legislative updates, and a
set of CASE-LAW pointers grouped by topic. The chair chose to catalog the
**case-law pointers** (each → a digest, like the daily issue) and the
**articles** (deep-context background) — legislative updates are skipped.

This module is the LLM splitter only. ``bulletin_library.ingest_bulletin`` turns
its output into digest rows. Like the daily extractor it is LOCAL-ONLY (claude
CLI) and MUST NOT be imported from the FastAPI container path.
"""
from __future__ import annotations

import logging

from legal_mcp import config

logger = logging.getLogger(__name__)

# Closed whitelist of practice-area identifiers accepted from the model output;
# any other value is normalized to "" by ``_norm_pa``. A frozenset keeps this
# module-level constant immutable while preserving membership/equality checks.
_VALID_PRACTICE_AREAS = frozenset(
    {"rishuy_uvniya", "betterment_levy", "compensation_197"}
)

# System prompt (Hebrew) that ``split`` passes to ``claude_session.query_json``.
# It asks the model for a single JSON object with two arrays, "cases" and
# "articles", and tells it to ignore legislative updates entirely. The field
# names listed in the prompt are the same keys that ``split`` reads back when
# normalizing the result.
# Notes on the literal itself:
#   * the trailing backslash after the opening quotes suppresses the leading
#     newline, so the prompt starts directly with the first Hebrew line;
#   * ``\\"`` inside this (non-raw) string yields a literal ``\"`` in the
#     prompt text, i.e. an escaped quote inside the JSON example values.
BULLETIN_SPLIT_PROMPT = """\
אתה מקבל טקסט מלא של **עלון חודשי "עו"ד על נדל"ן"** (פרסום מקצועי רב-נושאי בתחום
תכנון ובנייה, מקרקעין, היטל השבחה, פיצויים והתחדשות עירונית). פצל אותו לפריטים.

העלון בנוי משלושה חלקים: (א) **מאמר** מקצועי ארוך אחד או יותר; (ב) **עדכוני חקיקה**
(תיקוני-חוק, אישורי-תכניות, חוזרים) — **התעלם מהם, אל תחלץ**; (ג) **עדכוני פסיקה**
מקובצים לפי נושא — כל פריט = מראה-מקום של פסק דין/החלטה + שורת-תקציר.

**אל תמציא** — חלץ רק מה שמופיע בטקסט. שדה חסר → מחרוזת ריקה.

## פלט נדרש
החזר JSON אחד (object), ללא markdown:

{
  "cases": [
    {
      "underlying_citation": "מראה-המקום המלא של הפסק כפי שמופיע, מילה במילה (למשל 'ערר 8018-02-22 הועדה המקומית בת ים נ' קבוצת מזרחי ובניו השקעות בע\\"מ'). השדה הקריטי.",
      "concept_tag": "הנושא/הכותרת שתחתיה מופיע הפריט (למשל 'היטל השבחה', 'הפקעות', 'פירוק שיתוף').",
      "headline_holding": "שורת-התקציר/הכותרת של הפריט — מה נקבע/השאלה (למשל 'חוסר וודאות בין תכנית קודמת לבין ההקלה').",
      "summary": "תקציר ניטרלי קצר אם יש פירוט נוסף בגוף; אחרת חזור על headline_holding.",
      "underlying_court": "הערכאה אם מצוינת (למשל 'בית המשפט המחוזי', 'ועדת ערר').",
      "practice_area": "אחד מ: 'rishuy_uvniya' / 'betterment_levy' / 'compensation_197' — אם ברור מהנושא; אחרת ריק.",
      "subject_tags": ["2-5 תגיות snake_case בעברית"]
    }
  ],
  "articles": [
    {
      "title": "כותרת המאמר (למשל 'הפקעת קרקעות כיום - על המחוקק לתקן את העיוות שנוצר').",
      "authors": "שמות המחברים (למשל 'עו\\"ד צבי שוב, עו\\"ד רונית אלפר').",
      "summary": "2-4 משפטים: על מה המאמר ומה הטענה המרכזית.",
      "body": "הטקסט המלא של המאמר (כל הפסקאות), לצורך embedding וחיפוש-עומק.",
      "practice_area": "אחד מ-3 אם ברור; אחרת ריק.",
      "subject_tags": ["2-5 תגיות snake_case"]
    }
  ]
}

## כללים
1. **underlying_citation** — חלץ במלואו ובדיוק; הוא הגשר לפסק. פריט-פסיקה בלי מראה-מקום ברור → דלג עליו.
2. **cases** — כל מצביעי-הפסיקה בעלון, גם אם תחת נושאים שונים. אל תאחד פריטים נפרדים.
3. **articles** — רק מאמרי-עומק (לא רשימת עדכונים). body = הטקסט המלא.
4. **עדכוני חקיקה/אישורי-תכניות/חוזרים — לא לחלץ כלל.**
5. אם אין מאמר או אין פסיקה — החזר מערך ריק לאותו מפתח.
"""


def _norm_str(d: dict, key: str) -> str:
    """Return ``d[key]`` stripped of surrounding whitespace.

    Anything that is not a ``str`` (including a missing key or ``None``)
    yields the empty string.
    """
    value = d.get(key)
    if not isinstance(value, str):
        return ""
    return value.strip()


def _norm_tags(d: dict, limit: int = 8) -> list[str]:
    """Return the cleaned ``subject_tags`` list from *d*.

    Only string entries are kept, each stripped of surrounding whitespace;
    entries that are empty after stripping are dropped. Non-string entries
    (``None``, numbers, nested lists/dicts) are treated as malformed and
    skipped rather than stringified, so values like ``"None"`` never end up
    as tags. This mirrors ``_norm_str``, which also accepts strings only.

    Args:
        d: A raw item dict; ``subject_tags`` is expected to be a list.
        limit: Maximum number of tags to return (default 8).

    Returns:
        At most *limit* tags in their original order; ``[]`` when
        ``subject_tags`` is missing or not a list.
    """
    tags = d.get("subject_tags")
    if not isinstance(tags, list):
        return []

    cleaned: list[str] = []
    for tag in tags:
        if len(cleaned) >= limit:
            break  # cap reached; no need to look at the remaining entries
        if not isinstance(tag, str):
            continue
        stripped = tag.strip()
        if stripped:
            cleaned.append(stripped)
    return cleaned


def _norm_pa(d: dict) -> str:
    """Return the item's ``practice_area`` if it is a whitelisted value.

    The raw value is normalized via ``_norm_str``; anything not found in
    ``_VALID_PRACTICE_AREAS`` (including a missing or non-string value)
    becomes the empty string.
    """
    candidate = _norm_str(d, "practice_area")
    if candidate not in _VALID_PRACTICE_AREAS:
        return ""
    return candidate


async def split(raw_text: str, model: str | None = None) -> dict:
    """Return ``{"cases": [...], "articles": [...]}`` extracted from a bulletin.

    Empty lists on any failure (surfaced as a warning, never raised) so the
    batch keeps going. Each item is type-normalized; malformed items are dropped.
    """
    from legal_mcp.services import claude_session

    text = (raw_text or "").strip()
    if not text:
        return {"cases": [], "articles": []}

    try:
        result = await claude_session.query_json(
            text,
            system=BULLETIN_SPLIT_PROMPT,
            model=(model or config.DIGEST_EXTRACT_MODEL or None),
            tools="",  # pure text→JSON; disable tools (avoids error_max_turns)
        )
    except Exception as e:  # §6 — surfaced, not swallowed
        logger.warning("bulletin_splitter: query failed: %s", e)
        return {"cases": [], "articles": []}

    if not isinstance(result, dict):
        logger.warning("bulletin_splitter: expected dict, got %s", type(result).__name__)
        return {"cases": [], "articles": []}

    cases: list[dict] = []
    for c in result.get("cases") or []:
        if not isinstance(c, dict):
            continue
        citation = _norm_str(c, "underlying_citation")
        if not citation:  # rule 1: no anchor → skip
            continue
        cases.append({
            "underlying_citation": citation,
            "concept_tag": _norm_str(c, "concept_tag"),
            "headline_holding": _norm_str(c, "headline_holding"),
            "summary": _norm_str(c, "summary") or _norm_str(c, "headline_holding"),
            "underlying_court": _norm_str(c, "underlying_court"),
            "practice_area": _norm_pa(c),
            "subject_tags": _norm_tags(c),
        })

    articles: list[dict] = []
    for a in result.get("articles") or []:
        if not isinstance(a, dict):
            continue
        title = _norm_str(a, "title")
        body = _norm_str(a, "body")
        if not (title or body):
            continue
        articles.append({
            "title": title,
            "authors": _norm_str(a, "authors"),
            "summary": _norm_str(a, "summary"),
            "body": body,
            "practice_area": _norm_pa(a),
            "subject_tags": _norm_tags(a),
        })

    return {"cases": cases, "articles": articles}