legal-ai/scripts/.archive/extract_claims_8174.py

#!/usr/bin/env python3
"""One-shot: extract appellant claims for case 8174-24.

The analyst (CMPA-13) finished but `extract_claims` timed out three times on
the main 25K-char appeal document, so we have only 19 committee/response
claims in DB and zero appellant claims. This script reruns extraction with
a higher timeout and parallel chunks.

Targets:
  • כתב ערר 18.12.24 (appeal, 25,474 chars) — appellant claims
  • השלמת מסמכים תמ״א 38 (decision, 3,718 chars) — supplementary appeal filing

After phase 1.1-1.3 lands, this script becomes obsolete.

Usage: /home/chaim/legal-ai/mcp-server/.venv/bin/python scripts/extract_claims_8174.py
"""

from __future__ import annotations

import asyncio
import json
import sys
import time
from pathlib import Path
from uuid import UUID

# Ensure we can import legal_mcp from this repo's mcp-server tree.
# The directory is located by walking up from this file instead of assuming a
# fixed depth, so the script resolves the same tree whether it lives in
# scripts/ or one level deeper (e.g. scripts/.archive/).
def _add_mcp_server_src_to_path() -> None:
    """Prepend the nearest ancestor's ``mcp-server/src`` directory to sys.path.

    Does nothing when no ancestor contains such a directory; the subsequent
    ``legal_mcp`` import then fails (or succeeds via an installed package)
    exactly as it would without a path entry.
    """
    for ancestor in Path(__file__).resolve().parents:
        candidate = ancestor / "mcp-server" / "src"
        if candidate.is_dir():
            sys.path.insert(0, str(candidate))
            return


_add_mcp_server_src_to_path()

from legal_mcp.services import claims_extractor, claude_session, db


# ── Patch claude_session to enforce a 30-min timeout floor ───────────
# The hard-coded timeout=120 in claims_extractor.extract_claims_with_ai is
# what kept failing. Force every claude_session call made while this script
# runs to use at least _MIN_TIMEOUT_S seconds.
#
# NOTE(review): rebinding the module attributes only affects callers that
# look the functions up as `claude_session.query_json` / `claude_session.query`
# at call time. If claims_extractor binds them with
# `from ... import query_json`, the patch is bypassed — confirm.
_MIN_TIMEOUT_S = 1800

# Keep references to the real implementations before they are replaced below.
_orig_query_json = claude_session.query_json
_orig_query = claude_session.query


def _patched_query_json(prompt: str, timeout: int = 120, **kwargs):
    """Call the original ``query_json`` with the timeout raised to the floor.

    Args:
        prompt: Prompt forwarded unchanged.
        timeout: Requested timeout in seconds; values below the floor are
            replaced by ``_MIN_TIMEOUT_S``, larger values are kept.
        **kwargs: Any further keyword arguments, forwarded untouched so the
            patch does not narrow the signature callers may rely on.

    Returns:
        Whatever the original ``claude_session.query_json`` returns.
    """
    return _orig_query_json(prompt, timeout=max(timeout, _MIN_TIMEOUT_S), **kwargs)


def _patched_query(prompt: str, timeout: int = 120, max_turns: int = 1, **kwargs):
    """Call the original ``query`` with the timeout raised to the floor.

    Args:
        prompt: Prompt forwarded unchanged.
        timeout: Requested timeout in seconds; values below the floor are
            replaced by ``_MIN_TIMEOUT_S``, larger values are kept.
        max_turns: Forwarded unchanged.
        **kwargs: Any further keyword arguments, forwarded untouched.

    Returns:
        Whatever the original ``claude_session.query`` returns.
    """
    return _orig_query(
        prompt,
        timeout=max(timeout, _MIN_TIMEOUT_S),
        max_turns=max_turns,
        **kwargs,
    )


# Install the wrappers; must happen after the originals were captured above.
claude_session.query_json = _patched_query_json
claude_session.query = _patched_query


# Case whose claims this script re-extracts.
CASE_NUMBER = "8174-24"

# Documents to feed to the extractor. Each entry is a 4-tuple:
#   (doc_id, title hint, doc_type override, party_hint)
TARGETS = [
    (
        "655f96f7-d406-44ac-bb53-6b2c1ab2909c",
        "כתב ערר 18.12.24",
        "appeal",
        "יואל גולדמן",
    ),
    (
        "13b4795a-4fb7-460e-bddf-a5d282a1a67f",
        "השלמת מסמכים תמ״א 38",
        "appeal",
        "יואל גולדמן",
    ),
]


async def main() -> int:
    """Re-run claim extraction for every entry in TARGETS and print a tally.

    Looks up case ``CASE_NUMBER``, then for each target fetches the document
    text and calls ``claims_extractor.extract_and_store_claims`` with the
    doc-type override and party hint. A target without text is skipped; a
    target whose extraction raises is reported and the loop moves on to the
    next one. Finally prints the count of claims stored for the case, grouped
    by party role, claim type and source document.

    Returns:
        Process exit status: 0 when no extraction raised, 1 when the case
        was not found or at least one extraction raised.
    """
    case = await db.get_case_by_number(CASE_NUMBER)
    if not case:
        print(f"ERROR: case {CASE_NUMBER} not found")
        return 1
    # Go through str() so this works whether the DB layer returns the id as
    # text or as an already-constructed UUID (UUID(UUID(...)) raises).
    case_id = UUID(str(case["id"]))
    print(f"=== Case {CASE_NUMBER} — {case['title']} ===")
    print()

    failures = 0
    for doc_id, label, doc_type, party_hint in TARGETS:
        document_id = UUID(doc_id)
        text = await db.get_document_text(document_id)
        if not text:
            print(f"SKIP {label} — no extracted_text")
            continue

        chars = len(text)
        print(f"--- {label} ({chars:,} chars, doc_type={doc_type}) ---")
        t0 = time.monotonic()
        try:
            result = await claims_extractor.extract_and_store_claims(
                case_id=case_id,
                document_id=document_id,
                text=text,
                doc_type=doc_type,
                party_hint=party_hint,
            )
        except Exception as e:
            # Deliberately best-effort: one failing document must not stop
            # the others. Timeout exceptions commonly have an empty message,
            # so the exception type is printed alongside it.
            dt = time.monotonic() - t0
            print(f"  FAILED after {dt:.1f}s: {type(e).__name__}: {e}")
            failures += 1
            continue
        dt = time.monotonic() - t0
        # default=str keeps this diagnostic print from raising (after the
        # claims were already stored) if the result holds values json cannot
        # encode natively.
        summary = json.dumps(result, ensure_ascii=False, default=str)
        print(f"  done in {dt:.1f}s — {summary}")
        print()

    # Final tally
    pool = await db.get_pool()
    async with pool.acquire() as conn:
        rows = await conn.fetch(
            """SELECT party_role, claim_type, source_document, count(*) as n
               FROM claims WHERE case_id = $1
               GROUP BY 1, 2, 3 ORDER BY 1, 3""",
            case_id,
        )
    print("=== Final claims breakdown ===")
    total = 0
    for r in rows:
        n = r["n"]
        total += n
        # !s converts before padding, so a NULL column prints as "None"
        # instead of raising on the width format spec.
        print(f"  {r['party_role']!s:12} {r['claim_type']!s:10} ({n:3}) ← {r['source_document']}")
    print(f"  TOTAL: {total} claims")
    # Surface extraction failures to the caller instead of always reporting success.
    return 1 if failures else 0


if __name__ == "__main__":
    # Drive the async entry point to completion and use its return value as
    # the process exit status.
    raise SystemExit(asyncio.run(main()))