# legal-ai/scripts/decompose-decisions.py

#!/usr/bin/env python3
"""Decompose 6 final decisions into 12-block structure.

Uses heuristic parsing based on known section headers in Dafna's decisions.
"""

import asyncio
import json
import re
import sys
from pathlib import Path
from uuid import UUID

sys.path.insert(0, str(Path(__file__).parent.parent / "mcp-server" / "src"))

from legal_mcp.services.db import get_pool, init_schema, close_pool


# ═══════════════════════════════════════════════════════════════════
# Block definitions with detection patterns
# ═══════════════════════════════════════════════════════════════════

# The 12 decision blocks in document order. Each entry carries the block's
# identifier, its 1-based position, its Hebrew title and the generation type
# stored alongside it. English glosses of the titles are given per row.
BLOCKS = [
    {
        "block_id": block_id,
        "block_index": position,
        "title": title,
        "generation_type": generation_type,
    }
    for position, (block_id, title, generation_type) in enumerate(
        (
            ("block-alef", "כותרת מוסדית", "template-fill"),  # institutional header
            ("block-bet", "הרכב הוועדה", "template-fill"),  # committee panel
            ("block-gimel", "צדדים", "template-fill"),  # parties
            ("block-dalet", "כותרת החלטה", "template-fill"),  # decision title
            ("block-he", "פתיחה", "paraphrase"),  # opening
            ("block-vav", "רקע עובדתי", "reproduction"),  # factual background
            ("block-zayin", "טענות הצדדים", "paraphrase"),  # parties' claims
            ("block-chet", "הליכים בפני ועדת הערר", "reproduction"),  # proceedings before the appeals committee
            ("block-tet", "תכניות חלות", "guided-synthesis"),  # applicable plans
            ("block-yod", "דיון והכרעה", "rhetorical-construction"),  # discussion and ruling
            ("block-yod-alef", "סיכום", "paraphrase"),  # summary
            ("block-yod-bet", "חתימות", "template-fill"),  # signatures
        ),
        start=1,
    )
]

# Section header patterns (Hebrew).
# Each value is a compiled regex. find_section_start() reports the offset of
# the FIRST match in the document, so a pattern that can also match ordinary
# prose will place the section boundary too early.
SECTION_PATTERNS = {
    "claims": re.compile(r"תמצית\s*טענות\s*הצדדים|טענות\s*הצדדים|טענות\s*העוררי"),
    "proceedings": re.compile(r"ההליכים\s*בפני\s*ועדת\s*הערר|הליכים\s*בפני\s*הוועדה|הדיון\s*בפני\s*ועדת\s*הערר"),
    "plans": re.compile(r"תכניות\s*חלות|המסגרת\s*התכנונית|הוראות\s*התכנית"),
    # The full phrase may appear anywhere on a line (e.g. after a numbering
    # prefix). The bare words "דיון" / "הכרעה" also occur in running prose and
    # inside the "proceedings" header ("הדיון בפני ועדת הערר"), so they only
    # count as a header when they stand alone on a line, optionally followed
    # by a colon.
    "discussion": re.compile(
        r"דיון\s*והכרעה|^[ \t]*(?:דיון|הכרעה)[ \t]*:?[ \t]*\r?$",
        re.MULTILINE,
    ),
    "summary": re.compile(r"^סיכום$|^סוף\s*דבר$", re.MULTILINE),
    "appellant_claims": re.compile(r"טענות\s*העוררי|טענות\s*העורר"),
    "respondent_claims": re.compile(r"עמדת\s*הוועדה\s*המקומית|תגובת\s*המשיבה|עמדת\s*המשיב"),
    "permit_applicant": re.compile(r"עמדת\s*מבקש|עמדת\s*מגיש|עמדת\s*היזם"),
    "panel": re.compile(r"בפני[:\s]|יו\"ר"),
    "parties_vs": re.compile(r"\s*נגד\s*"),
    "decision_title": re.compile(r"^החלטה$", re.MULTILINE),
    # MULTILINE is required here: the opening paragraph comes after the header
    # blocks, so "^" has to match at the start of any line. Without the flag
    # the pattern could only ever match at offset 0 of the whole document.
    "opening": re.compile(r"^לפנינו\s|^בפנינו\s", re.MULTILINE),
    "signature": re.compile(r"ניתנה?\s*(היום|פה\s*אחד|ביום)|חתימ"),
}


def find_section_start(text: str, pattern: re.Pattern) -> int:
    """Return the offset of the first match of *pattern* in *text*.

    Returns -1 when the pattern does not occur at all; 0 is a valid offset.
    """
    hit = pattern.search(text)
    if hit is None:
        return -1
    return hit.start()


def _first_after(anchor: int, candidates: tuple[int, ...], default: int) -> int:
    """Return the first candidate offset strictly greater than *anchor*.

    Candidates are tried in the order given (not by smallest value). A missing
    section (-1) never qualifies because it cannot exceed a valid anchor.
    Falls back to *default* when no candidate qualifies.
    """
    for candidate in candidates:
        if candidate > anchor:
            return candidate
    return default


def decompose_decision(text: str) -> list[dict]:
    """Parse decision text into blocks based on section headers.

    Every section start is the offset of the first match of the corresponding
    SECTION_PATTERNS regex, or -1 when the header is absent. Offset 0 is a
    valid match, so presence is tested with ``>= 0``.

    Args:
        text: Full extracted text of one decision.

    Returns:
        Exactly 12 dicts, in block order (block-alef .. block-yod-bet), each
        with the keys ``block_id`` and ``content``. Sections that were not
        detected get an empty ``content``; block-dalet is always the literal
        title "החלטה".
    """
    total_len = len(text)

    def locate(key: str) -> int:
        return find_section_start(text, SECTION_PATTERNS[key])

    # Find key section boundaries
    pos_claims = locate("claims")
    pos_proceedings = locate("proceedings")
    pos_plans = locate("plans")
    pos_discussion = locate("discussion")
    pos_summary = locate("summary")
    pos_signature = locate("signature")
    pos_opening = locate("opening")
    pos_decision_title = locate("decision_title")
    pos_panel = locate("panel")
    pos_parties = locate("parties_vs")

    content: dict[str, str] = {}

    # Blocks א-ד: header area. It ends at the opening paragraph, else at the
    # claims header, else after a fixed 500-character fallback window.
    if pos_opening >= 0:
        header_end = pos_opening
    elif pos_claims >= 0:
        header_end = pos_claims
    else:
        header_end = 500

    if 0 <= pos_panel < header_end:
        # Institutional header is everything before the panel line.
        content["block-alef"] = text[:pos_panel].strip()
        if 0 <= pos_parties < header_end:
            content["block-bet"] = text[pos_panel:pos_parties].strip()
            # Parties run up to the "החלטה" title when it sits in the header,
            # otherwise to the end of the header area.
            if 0 <= pos_decision_title < header_end:
                parties_end = pos_decision_title
            else:
                parties_end = header_end
            content["block-gimel"] = text[pos_parties:parties_end].strip()
        else:
            content["block-bet"] = text[pos_panel:header_end].strip()
            content["block-gimel"] = ""
    else:
        # Can't split — put the whole header in alef.
        content["block-alef"] = text[:header_end].strip()
        content["block-bet"] = ""
        content["block-gimel"] = ""
    content["block-dalet"] = "החלטה"

    # Block ה: opening — from the opening line towards the claims section,
    # capped at 1000 characters and then trimmed to the first paragraphs.
    if pos_opening >= 0:
        opening_end = _first_after(pos_opening, (pos_claims, pos_discussion), total_len)
        opening_text = text[pos_opening:min(pos_opening + 1000, opening_end)].strip()
        # Newlines within the first 50 characters are ignored; the opening is
        # cut at the second newline found after that point.
        para_breaks = [i for i, ch in enumerate(opening_text) if ch == "\n" and i > 50]
        if len(para_breaks) >= 2:
            opening_text = opening_text[:para_breaks[1]].strip()
        content["block-he"] = opening_text

        # Block ו: background — whatever lies between the opening and claims.
        if pos_claims > pos_opening:
            bg_start = pos_opening + len(opening_text)
            content["block-vav"] = text[bg_start:pos_claims].strip()
        else:
            content["block-vav"] = ""
    else:
        content["block-he"] = ""
        content["block-vav"] = ""

    # Block ז: claims
    if pos_claims >= 0:
        claims_end = _first_after(
            pos_claims, (pos_proceedings, pos_discussion, pos_summary), total_len
        )
        content["block-zayin"] = text[pos_claims:claims_end].strip()
    else:
        content["block-zayin"] = ""

    # Block ח: proceedings (optional)
    if pos_proceedings >= 0:
        proc_end = _first_after(
            pos_proceedings, (pos_plans, pos_discussion, pos_summary), total_len
        )
        content["block-chet"] = text[pos_proceedings:proc_end].strip()
    else:
        content["block-chet"] = ""

    # Block ט: plans (optional) — only accepted when it precedes the discussion.
    plans_limit = pos_discussion if pos_discussion >= 0 else total_len
    if 0 <= pos_plans < plans_limit:
        plans_end = _first_after(pos_plans, (pos_discussion, pos_summary), total_len)
        content["block-tet"] = text[pos_plans:plans_end].strip()
    else:
        content["block-tet"] = ""

    # Block י: discussion
    if pos_discussion >= 0:
        disc_end = _first_after(pos_discussion, (pos_summary, pos_signature), total_len)
        content["block-yod"] = text[pos_discussion:disc_end].strip()
    else:
        content["block-yod"] = ""

    # Block יא: summary
    if pos_summary >= 0:
        summ_end = _first_after(pos_summary, (pos_signature,), total_len)
        content["block-yod-alef"] = text[pos_summary:summ_end].strip()
    else:
        content["block-yod-alef"] = ""

    # Block יב: signatures — from the signature marker to the end of the text.
    if pos_signature >= 0:
        content["block-yod-bet"] = text[pos_signature:].strip()
    else:
        content["block-yod-bet"] = ""

    block_order = (
        "block-alef", "block-bet", "block-gimel", "block-dalet",
        "block-he", "block-vav", "block-zayin", "block-chet",
        "block-tet", "block-yod", "block-yod-alef", "block-yod-bet",
    )
    return [{"block_id": block_id, "content": content[block_id]} for block_id in block_order]


def _merge_block_metadata(blocks: list[dict], total_words: int) -> list[dict]:
    """Combine parsed block contents with the static BLOCKS metadata.

    For every entry of BLOCKS the first parsed block with the same block_id
    supplies the content (empty string when none matches). Adds word_count,
    weight_percent (share of *total_words*, rounded to 2 decimals) and a
    status of "final" for non-empty content, "empty" otherwise.
    """
    content_by_id: dict[str, str] = {}
    for parsed in blocks:
        # setdefault keeps the first occurrence of each block_id.
        content_by_id.setdefault(parsed["block_id"], parsed["content"])

    merged = []
    for block_def in BLOCKS:
        content = content_by_id.get(block_def["block_id"], "")
        word_count = len(content.split()) if content else 0
        if total_words > 0 and word_count > 0:
            weight = round((word_count / total_words * 100), 2)
        else:
            weight = 0
        merged.append({
            **block_def,
            "content": content,
            "word_count": word_count,
            "weight_percent": weight,
            "status": "final" if content else "empty",
        })
    return merged


async def _process_decisions(pool) -> tuple[int, int]:
    """Decompose every final decision and store its blocks.

    Returns a ``(total_blocks, non_empty_blocks)`` pair counted from the
    decision_blocks table after all decisions were processed.
    """
    async with pool.acquire() as conn:
        decisions = await conn.fetch(
            """SELECT d.id as decision_id, c.case_number, c.title, d.total_words,
                      doc.extracted_text
               FROM decisions d
               JOIN cases c ON c.id = d.case_id
               JOIN documents doc ON doc.case_id = d.case_id AND doc.doc_type = 'decision'
               WHERE d.status = 'final'
               ORDER BY c.case_number"""
        )

    for dec in decisions:
        decision_id = dec["decision_id"]
        case_number = dec["case_number"]
        text = dec["extracted_text"]
        if text is None:
            # Guard in case extracted_text is NULL: there is nothing to parse,
            # and the existing blocks of this decision are left untouched.
            print(f"מדלג על {case_number}: אין טקסט מחולץ")
            continue
        total_words = len(text.split())

        print(f"\n{'='*60}")
        print(f"מפרק: {case_number} — {dec['title']}")
        print(f"{'='*60}")

        # Decompose and merge with block metadata
        block_data = _merge_block_metadata(decompose_decision(text), total_words)

        # Print summary
        for b in block_data:
            status = "✅" if b["word_count"] > 0 else "⬜"
            print(f"  {status} {b['block_id']:18s} | {b['title']:25s} | {b['word_count']:5d} מילים | {b['weight_percent']:5.1f}%")

        # Store in DB. The delete, the inserts and the paragraph-count update
        # run in one transaction so a failure cannot leave a decision with
        # missing or partial blocks.
        # NOTE(review): assumes the pool yields asyncpg connections (fetch /
        # fetchval / $n placeholders suggest so) — confirm transaction() exists.
        async with pool.acquire() as conn, conn.transaction():
            # Delete existing blocks for this decision
            await conn.execute(
                "DELETE FROM decision_blocks WHERE decision_id = $1", decision_id
            )

            for b in block_data:
                await conn.execute(
                    """INSERT INTO decision_blocks
                       (decision_id, block_id, block_index, title, content,
                        word_count, weight_percent, generation_type, status)
                       VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9)""",
                    decision_id,
                    b["block_id"], b["block_index"], b["title"],
                    b["content"], b["word_count"], b["weight_percent"],
                    b["generation_type"], b["status"],
                )

            # Count paragraphs in discussion block: non-blank lines longer
            # than 20 characters after stripping.
            discussion = next(b for b in block_data if b["block_id"] == "block-yod")
            if discussion["content"]:
                paragraphs = [p.strip() for p in discussion["content"].split("\n") if p.strip() and len(p.strip()) > 20]
                await conn.execute(
                    "UPDATE decisions SET total_paragraphs = $1 WHERE id = $2",
                    len(paragraphs), decision_id,
                )

    # Final summary
    async with pool.acquire() as conn:
        block_count = await conn.fetchval("SELECT count(*) FROM decision_blocks")
        non_empty = await conn.fetchval("SELECT count(*) FROM decision_blocks WHERE status = 'final'")

    return block_count, non_empty


async def main():
    """Decompose all final decisions into blocks and persist them.

    Initialises the schema, fetches every decision with status 'final' together
    with its 'decision' document text, splits each text into the 12 blocks,
    prints a per-block word-count table, replaces the decision's rows in
    decision_blocks, and finally prints the overall block totals. The pool is
    closed even when processing fails.
    """
    await init_schema()
    pool = await get_pool()

    try:
        block_count, non_empty = await _process_decisions(pool)
    finally:
        # Always release the pool, including on errors part-way through.
        await close_pool()

    print(f"\n{'='*60}")
    print(f"✅ סה\"כ בלוקים: {block_count} ({non_empty} עם תוכן)")


# Script entry point: run the async pipeline only when executed directly,
# not when the module is imported.
if __name__ == "__main__":
    asyncio.run(main())