feat: external precedent library with auto halacha extraction

Adds a third corpus of legal authority distinct from style_corpus (Daphna's prior decisions for voice) and case_precedents (chair-attached quotes per case). The new corpus holds chair-uploaded court rulings and other appeals committee decisions, with binding rules (הלכות) extracted automatically and queued for chair approval. Pipeline (web/app.py + services/precedent_library.py): file → extract → chunk → Voyage embed → halacha_extractor → store + publish progress over the existing Redis SSE channel. Schema V7 (services/db.py): extends case_law with source_kind + extraction status fields under a CHECK constraint pinning practice_area to the three appeals committee domains (rishuy_uvniya, betterment_levy, compensation_197). New precedent_chunks (vector(1024)) and halachot tables (vector(1024) over rule_statement, IVFFlat indexes, gin on practice_areas/subject_tags). Halachot start as pending_review; only approved/published rows are visible to search_precedent_library. Agents: legal-writer, legal-researcher, legal-analyst, legal-ceo, legal-qa get search_precedent_library. legal-writer prompt explains the three-corpus distinction and CREAC use; legal-qa now verifies that every cited halacha resolves to an approved row in the corpus. UI: /precedents page with four tabs — library / semantic search / pending review (J/K nav, A/R/E shortcuts, badge count) / stats. Reuses the existing upload-sheet progress + SSE pattern. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-03 08:38:18 +00:00
parent a6edb75bbf
commit 7ee90dce31
23 changed files with 3853 additions and 67 deletions
--- a/mcp-server/src/legal_mcp/services/precedent_library.py
+++ b/mcp-server/src/legal_mcp/services/precedent_library.py
@@ -0,0 +1,309 @@
+"""Orchestrator for the External Precedent Library.
+
+Ingest pipeline (one upload):
+    file → stage copy → extract_text → strip Nevo preamble
+        → INSERT case_law (source_kind='external_upload')
+        → chunk → embed → store precedent_chunks
+        → set extraction_status='completed'
+        → halacha_extractor.extract (halachot are produced by that module)
+
+Progress is reported via a caller-supplied async callback so the
+web layer can pipe updates into the existing Redis ProgressStore /
+SSE plumbing without this module knowing about Redis.
+"""
+
+from __future__ import annotations
+
+import logging
+import re
+import shutil
+from datetime import date
+from pathlib import Path
+from typing import Awaitable, Callable
+from uuid import UUID, uuid4
+
+from legal_mcp import config
+from legal_mcp.services import (
+    chunker,
+    db,
+    embeddings,
+    extractor,
+    halacha_extractor,
+)
+
+logger = logging.getLogger(__name__)
+
+
+# Shape of the progress hook: awaited as ``callback(status, percent, message)``.
+ProgressCb = Callable[[str, int, str], Awaitable[None]]
+
+
+# Directory under the configured data dir where uploaded files are staged.
+PRECEDENT_LIBRARY_DIR = Path(config.DATA_DIR) / "precedent-library"
+
+
+# Closed vocabularies for the classification fields of an upload. The empty
+# string is a member of each set; it is also the default value of the
+# corresponding ``ingest_precedent`` parameters.
+_VALID_PRACTICE_AREAS = {
+    "",
+    "rishuy_uvniya",
+    "betterment_levy",
+    "compensation_197",
+}
+_VALID_SOURCE_TYPES = {
+    "",
+    "court_ruling",
+    "appeals_committee",
+}
+# Hebrew labels first, then what appear to be their English counterparts.
+_VALID_PRECEDENT_LEVELS = {
+    "",
+    "עליון",
+    "מנהלי",
+    "ועדת_ערר_ארצית",
+    "ועדת_ערר_מחוזית",
+    "supreme",
+    "administrative",
+    "national_appeals_committee",
+    "district_appeals_committee",
+}
+
+
+async def _noop_progress(_status: str, _percent: int, _msg: str) -> None:
+    """Default progress callback: accept an update and discard it."""
+
+
+# Matches every character that is NOT a word character, ".", "-", "+",
+# a Hebrew letter (א-ת) or a space.
+_UNSAFE_FILENAME_CHARS = re.compile(r"[^\w.\-+א-ת ]")
+
+
+def _safe_filename(name: str) -> str:
+    """Reduce a user-supplied name to a single safe path component.
+
+    Only the final path component is kept, and each disallowed character is
+    replaced with ``_``. If nothing is left, a random ``upload-<8 hex>``
+    name is generated instead.
+    """
+    leaf = Path(name).name
+    cleaned = _UNSAFE_FILENAME_CHARS.sub("_", leaf)
+    if cleaned:
+        return cleaned
+    return f"upload-{uuid4().hex[:8]}"
+
+
+def _stage_file(src_path: Path, source_type: str) -> Path:
+    """Copy an upload into ``PRECEDENT_LIBRARY_DIR/<bucket>/`` and return the copy's path.
+
+    The bucket is ``source_type`` when it is ``court_ruling`` or
+    ``appeals_committee``, otherwise ``other``. The copy is named
+    ``<8 hex chars>_<sanitized original name>``. The source file itself is
+    left in place.
+    """
+    if source_type in {"court_ruling", "appeals_committee"}:
+        bucket = source_type
+    else:
+        bucket = "other"
+
+    target_dir = PRECEDENT_LIBRARY_DIR / bucket
+    target_dir.mkdir(parents=True, exist_ok=True)
+
+    cleaned_name = _safe_filename(src_path.name)
+    unique_prefix = uuid4().hex[:8]
+    target = target_dir / f"{unique_prefix}_{cleaned_name}"
+    shutil.copy2(src_path, target)
+    return target
+
+
+def _coerce_date(value) -> date | None:
+    """Normalize a loosely typed date value to a plain ``datetime.date``.
+
+    Args:
+        value: ``None``, an empty string, a ``date``, a ``datetime`` or an
+            ISO-8601 string (``YYYY-MM-DD`` optionally followed by a time
+            part, with optional surrounding whitespace).
+
+    Returns:
+        The calendar date, or ``None`` when the value is empty, is not an
+        ISO date string, or is of an unsupported type.
+    """
+    # Imported locally because the module only imports ``date``.
+    from datetime import datetime
+
+    if value is None or value == "":
+        return None
+    # ``datetime`` subclasses ``date``, so it must be checked first;
+    # otherwise a timestamp would be returned unchanged.
+    if isinstance(value, datetime):
+        return value.date()
+    if isinstance(value, date):
+        return value
+    if isinstance(value, str):
+        try:
+            # Strip before slicing so leading whitespace cannot push the
+            # date out of the first ten characters.
+            return date.fromisoformat(value.strip()[:10])
+        except ValueError:
+            return None
+    return None
+
+
+async def ingest_precedent(
+    *,
+    file_path: str | Path,
+    citation: str,
+    case_name: str = "",
+    court: str = "",
+    decision_date=None,
+    source_type: str = "",
+    precedent_level: str = "",
+    practice_area: str = "",
+    appeal_subtype: str = "",
+    subject_tags: list[str] | None = None,
+    is_binding: bool = True,
+    headnote: str = "",
+    summary: str = "",
+    document_id: UUID | None = None,
+    progress: ProgressCb | None = None,
+) -> dict:
+    """Ingest a single uploaded precedent through the full pipeline.
+
+    Steps: validate arguments → stage a copy of the file → extract text →
+    strip the Nevo preamble → insert the case_law row → chunk → embed →
+    store chunks → mark extraction completed → run the halacha extractor.
+
+    Required: file_path + citation. Everything else has a default.
+
+    Args:
+        file_path: Path of the uploaded file; it is copied, never deleted.
+        citation: Stored as the row's ``case_number``; must be non-blank.
+        case_name: Falls back to the citation when blank.
+        decision_date: ``date`` or ISO string; anything else is stored as None.
+        source_type / precedent_level / practice_area: Must belong to the
+            module's ``_VALID_*`` sets (the empty string is allowed).
+        progress: Optional async callback ``(status, percent, message)``.
+
+    Returns:
+        ``{"status": "completed", "case_law_id": "...", "chunks": N,
+        "halachot": M}``. When chunks were produced the dict also carries
+        ``halachot_extracted_raw``, ``halachot_verified`` and ``pages``.
+
+    Raises:
+        FileNotFoundError: ``file_path`` is not an existing file.
+        ValueError: blank citation, invalid classification value, or no
+            extractable text.
+        Exception: Any pipeline error is re-raised after a ``failed``
+            progress event has been published.
+    """
+    progress = progress or _noop_progress
+    src = Path(file_path)
+    if not src.is_file():
+        raise FileNotFoundError(f"file not found: {src}")
+    if not citation.strip():
+        raise ValueError("citation is required")
+    if practice_area not in _VALID_PRACTICE_AREAS:
+        raise ValueError(f"invalid practice_area: {practice_area!r}")
+    if source_type not in _VALID_SOURCE_TYPES:
+        raise ValueError(f"invalid source_type: {source_type!r}")
+    # The allowed levels were declared at module scope but never enforced.
+    if precedent_level not in _VALID_PRECEDENT_LEVELS:
+        raise ValueError(f"invalid precedent_level: {precedent_level!r}")
+
+    await progress("staging", 5, "מעתיק את הקובץ לאחסון")
+
+    staged = _stage_file(src, source_type)
+
+    # Until a case_law row exists, nothing refers to the staged copy, so a
+    # failure in this phase must not leave it orphaned on disk.
+    row_created = False
+    try:
+        await progress("extracting", 15, "מחלץ טקסט מהקובץ")
+        try:
+            text, page_count = await extractor.extract_text(str(staged))
+        except Exception as e:
+            await progress("failed", 100, f"כשל בחילוץ טקסט: {e}")
+            raise
+
+        text = (text or "").strip()
+        if not text:
+            await progress("failed", 100, "לא נמצא טקסט בקובץ")
+            raise ValueError("no extractable text in file")
+
+        # Strip any Nevo preamble that might wrap court rulings downloaded from Nevo.
+        text = extractor.strip_nevo_preamble(text)
+
+        await progress("storing_metadata", 25, "שומר את הפסיקה במסד הנתונים")
+        try:
+            record = await db.create_external_case_law(
+                case_number=citation.strip(),
+                case_name=case_name.strip() or citation.strip(),
+                full_text=text,
+                court=court.strip(),
+                decision_date=_coerce_date(decision_date),
+                practice_area=practice_area,
+                appeal_subtype=appeal_subtype.strip(),
+                subject_tags=list(subject_tags or []),
+                summary=summary.strip(),
+                headnote=headnote.strip(),
+                source_type=source_type,
+                precedent_level=precedent_level,
+                is_binding=is_binding,
+                document_id=document_id,
+            )
+        except Exception as e:
+            # Without this the progress stream would stay on "storing_metadata".
+            await progress("failed", 100, f"כשל בעיבוד: {e}")
+            raise
+        row_created = True
+    finally:
+        if not row_created:
+            try:
+                staged.unlink(missing_ok=True)
+            except OSError:
+                logger.warning(
+                    "could not remove staged file %s", staged, exc_info=True
+                )
+
+    case_law_id = UUID(str(record["id"]))
+
+    try:
+        await progress("chunking", 40, f"מחלק את הטקסט ל-chunks ({page_count} עמ')")
+        chunks = chunker.chunk_document(text)
+        if not chunks:
+            await db.set_case_law_extraction_status(case_law_id, "completed")
+            await db.set_case_law_halacha_status(case_law_id, "completed")
+            await progress("completed", 100, "אין טקסט לעיבוד")
+            return {
+                "status": "completed",
+                "case_law_id": str(case_law_id),
+                "chunks": 0,
+                "halachot": 0,
+            }
+
+        await progress("embedding", 55, f"מייצר embeddings ל-{len(chunks)} chunks")
+        chunk_texts = [c.content for c in chunks]
+        chunk_vectors = list(
+            await embeddings.embed_texts(chunk_texts, input_type="document")
+        )
+        # zip() would silently drop trailing chunks on a short response.
+        if len(chunk_vectors) != len(chunks):
+            raise RuntimeError(
+                f"embedding count mismatch: {len(chunk_vectors)} vectors "
+                f"for {len(chunks)} chunks"
+            )
+
+        chunk_dicts = [
+            {
+                "chunk_index": c.chunk_index,
+                "content": c.content,
+                "section_type": c.section_type,
+                "page_number": c.page_number,
+                "embedding": v,
+            }
+            for c, v in zip(chunks, chunk_vectors)
+        ]
+        stored_chunks = await db.store_precedent_chunks(case_law_id, chunk_dicts)
+
+        await progress("extracting_halachot", 75, "מחלץ הלכות מחייבות")
+        # NOTE(review): extraction is marked completed before the halacha
+        # step; if that step raises, the handler below flips it to "failed"
+        # even though the chunks are stored — confirm this is intended.
+        await db.set_case_law_extraction_status(case_law_id, "completed")
+        halacha_result = await halacha_extractor.extract(case_law_id)
+
+        await progress(
+            "completed",
+            100,
+            f"הוכנס לספרייה: {stored_chunks} chunks, "
+            f"{halacha_result.get('stored', 0)} הלכות ממתינות לאישור",
+        )
+
+        return {
+            "status": "completed",
+            "case_law_id": str(case_law_id),
+            "chunks": stored_chunks,
+            "halachot": halacha_result.get("stored", 0),
+            "halachot_extracted_raw": halacha_result.get("extracted", 0),
+            "halachot_verified": halacha_result.get("verified", 0),
+            "pages": page_count,
+        }
+
+    except Exception as e:
+        logger.exception("precedent_library.ingest_precedent failed: %s", e)
+        # A failing status update must not mask the original error or
+        # suppress the "failed" progress event.
+        try:
+            await db.set_case_law_extraction_status(case_law_id, "failed")
+        except Exception:
+            logger.exception(
+                "could not mark case_law %s as failed", case_law_id
+            )
+        await progress("failed", 100, f"כשל בעיבוד: {e}")
+        raise
+
+
+async def reextract_halachot(
+    case_law_id: UUID | str,
+    progress: ProgressCb | None = None,
+) -> dict:
+    """Re-run the halacha extractor on an existing chair-uploaded precedent.
+
+    NOTE(review): the original docstring called this idempotent; that
+    depends on how ``halacha_extractor.extract`` treats previously stored
+    halachot, which is not visible here — confirm.
+
+    Args:
+        case_law_id: Row id, as ``UUID`` or its string form.
+        progress: Optional async callback ``(status, percent, message)``.
+
+    Returns:
+        The dict returned by ``halacha_extractor.extract``.
+
+    Raises:
+        ValueError: malformed id, unknown row, or a row whose
+            ``source_kind`` is not ``external_upload``.
+        Exception: Extractor errors are logged, published as ``failed``
+            and re-raised.
+    """
+    progress = progress or _noop_progress
+    if isinstance(case_law_id, str):
+        case_law_id = UUID(case_law_id)
+
+    record = await db.get_case_law(case_law_id)
+    if not record or record.get("source_kind") != "external_upload":
+        raise ValueError("precedent not found or not chair-uploaded")
+
+    await progress("extracting_halachot", 50, "מחלץ הלכות מחדש")
+    try:
+        result = await halacha_extractor.extract(case_law_id)
+    except Exception as e:
+        # Mirror ingest_precedent: without a terminal event the progress
+        # stream would stay on "extracting_halachot".
+        logger.exception("precedent_library.reextract_halachot failed: %s", e)
+        await progress("failed", 100, f"כשל בעיבוד: {e}")
+        raise
+    await progress(
+        "completed",
+        100,
+        f"הופקו {result.get('stored', 0)} הלכות (ממתינות לאישור)",
+    )
+    return result
+
+
+async def delete_precedent(case_law_id: UUID | str) -> bool:
+    """Delete a precedent row through ``db.delete_case_law``.
+
+    A string id is parsed into a ``UUID`` first. Removal of the related
+    chunks and halachot is presumably handled by a database cascade; that
+    is not visible in this module. The result of the DB call is returned
+    unchanged.
+    """
+    target_id = UUID(case_law_id) if isinstance(case_law_id, str) else case_law_id
+    was_deleted = await db.delete_case_law(target_id)
+    return was_deleted
+
+
+async def get_precedent(case_law_id: UUID | str) -> dict | None:
+    """Fetch one precedent and attach its halachot under the ``halachot`` key.
+
+    A string id is parsed into a ``UUID`` first. Returns ``None`` when the
+    row lookup yields nothing. At most 500 halachot are requested.
+    """
+    precedent_id = UUID(case_law_id) if isinstance(case_law_id, str) else case_law_id
+    precedent = await db.get_case_law(precedent_id)
+    if precedent:
+        precedent["halachot"] = await db.list_halachot(
+            case_law_id=precedent_id, limit=500
+        )
+        return precedent
+    return None
+
+
+async def list_precedents(
+    practice_area: str = "",
+    court: str = "",
+    precedent_level: str = "",
+    source_type: str = "",
+    search: str = "",
+    limit: int = 100,
+    offset: int = 0,
+) -> list[dict]:
+    """List library precedents by forwarding all filters to the DB layer.
+
+    Every argument is passed unchanged to ``db.list_external_case_law``;
+    filtering and paging semantics are defined there.
+    """
+    filters = {
+        "practice_area": practice_area,
+        "court": court,
+        "precedent_level": precedent_level,
+        "source_type": source_type,
+        "search": search,
+    }
+    rows = await db.list_external_case_law(**filters, limit=limit, offset=offset)
+    return rows
+
+
+async def search_library(
+    query: str,
+    practice_area: str = "",
+    court: str = "",
+    precedent_level: str = "",
+    appeal_subtype: str = "",
+    is_binding: bool | None = None,
+    subject_tag: str = "",
+    limit: int = 10,
+    include_halachot: bool = True,
+) -> list[dict]:
+    """Run a semantic search over the precedent library.
+
+    The query is embedded once and the vector, together with all filters,
+    is handed to ``db.search_precedent_library_semantic``. A blank query
+    returns an empty list without calling the embedder.
+
+    Per the chair-review policy only ``approved`` / ``published`` halachot
+    should come back, while chunks are returned regardless of halacha
+    review status. That filtering is expected to live in the DB query; it
+    is not applied in this function.
+    """
+    if query.strip() == "":
+        return []
+
+    vector = await embeddings.embed_query(query)
+    hits = await db.search_precedent_library_semantic(
+        query_embedding=vector,
+        practice_area=practice_area,
+        court=court,
+        precedent_level=precedent_level,
+        appeal_subtype=appeal_subtype,
+        is_binding=is_binding,
+        subject_tag=subject_tag,
+        limit=limit,
+        include_halachot=include_halachot,
+    )
+    return hits