feat: external precedent library with auto halacha extraction
All checks were successful
Build & Deploy / build-and-deploy (push) Successful in 1m27s
All checks were successful
Build & Deploy / build-and-deploy (push) Successful in 1m27s
Adds a third corpus of legal authority distinct from style_corpus (Daphna's prior decisions for voice) and case_precedents (chair-attached quotes per case). The new corpus holds chair-uploaded court rulings and other appeals committee decisions, with binding rules (הלכות) extracted automatically and queued for chair approval. Pipeline (web/app.py + services/precedent_library.py): file → extract → chunk → Voyage embed → halacha_extractor → store + publish progress over the existing Redis SSE channel. Schema V7 (services/db.py): extends case_law with source_kind + extraction status fields under a CHECK constraint pinning practice_area to the three appeals committee domains (rishuy_uvniya, betterment_levy, compensation_197). New precedent_chunks (vector(1024)) and halachot tables (vector(1024) over rule_statement, IVFFlat indexes, gin on practice_areas/subject_tags). Halachot start as pending_review; only approved/published rows are visible to search_precedent_library. Agents: legal-writer, legal-researcher, legal-analyst, legal-ceo, legal-qa get search_precedent_library. legal-writer prompt explains the three-corpus distinction and CREAC use; legal-qa now verifies that every cited halacha resolves to an approved row in the corpus. UI: /precedents page with four tabs — library / semantic search / pending review (J/K nav, A/R/E shortcuts, badge count) / stats. Reuses the existing upload-sheet progress + SSE pattern. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
309
mcp-server/src/legal_mcp/services/precedent_library.py
Normal file
309
mcp-server/src/legal_mcp/services/precedent_library.py
Normal file
@@ -0,0 +1,309 @@
|
||||
"""Orchestrator for the External Precedent Library.

Ingest pipeline (one upload):
    file → stage copy → extract_text → strip_nevo_preamble
         → INSERT case_law (source_kind='external_upload')
         → chunk → embed → store precedent_chunks
         → set extraction_status='completed'
         → halacha_extractor.extract → embed halachot → store halachot
           (halachot wait for chair approval)

Progress is reported via a caller-supplied async callback so the
web layer can pipe updates into the existing Redis ProgressStore /
SSE plumbing without this module knowing about Redis.
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import re
|
||||
import shutil
|
||||
from datetime import date
|
||||
from pathlib import Path
|
||||
from typing import Awaitable, Callable
|
||||
from uuid import UUID, uuid4
|
||||
|
||||
from legal_mcp import config
|
||||
from legal_mcp.services import (
|
||||
chunker,
|
||||
db,
|
||||
embeddings,
|
||||
extractor,
|
||||
halacha_extractor,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)


# Async progress hook, called as ``await cb(status, percent, message)``.
ProgressCb = Callable[[str, int, str], Awaitable[None]]


# Root folder that receives the staged copies of uploaded files.
PRECEDENT_LIBRARY_DIR = Path(config.DATA_DIR, "precedent-library")


# Accepted values for the matching keyword arguments; "" means "not given".
_VALID_PRACTICE_AREAS = {
    "",
    "rishuy_uvniya",
    "betterment_levy",
    "compensation_197",
}
_VALID_SOURCE_TYPES = {
    "",
    "court_ruling",
    "appeals_committee",
}
# NOTE(review): nothing in the visible part of this module checks
# ``precedent_level`` against this set — confirm whether validation is
# intended to happen here or elsewhere.
_VALID_PRECEDENT_LEVELS = {
    "",
    "עליון",
    "מנהלי",
    "ועדת_ערר_ארצית",
    "ועדת_ערר_מחוזית",
    "supreme",
    "administrative",
    "national_appeals_committee",
    "district_appeals_committee",
}
|
||||
|
||||
|
||||
async def _noop_progress(_status: str, _percent: int, _msg: str) -> None:
|
||||
return None
|
||||
|
||||
|
||||
def _safe_filename(name: str) -> str:
|
||||
"""Strip path separators and unsafe chars from a user-provided name."""
|
||||
base = Path(name).name
|
||||
return re.sub(r"[^\w.\-+א-ת ]", "_", base) or f"upload-{uuid4().hex[:8]}"
|
||||
|
||||
|
||||
def _stage_file(src_path: Path, source_type: str) -> Path:
    """Copy an uploaded file into ``PRECEDENT_LIBRARY_DIR/<bucket>/``.

    The bucket is ``source_type`` when it is ``court_ruling`` or
    ``appeals_committee``; anything else lands in ``other``. The copy is
    named ``<8 hex chars>_<sanitised original name>``.

    The source file is left in place; removing it is up to the caller.

    Returns:
        Path of the newly written copy.
    """
    known_kinds = ("court_ruling", "appeals_committee")
    bucket = source_type if source_type in known_kinds else "other"

    target_dir = PRECEDENT_LIBRARY_DIR / bucket
    target_dir.mkdir(parents=True, exist_ok=True)

    unique_prefix = uuid4().hex[:8]
    target = target_dir / f"{unique_prefix}_{_safe_filename(src_path.name)}"
    shutil.copy2(src_path, target)
    return target
|
||||
|
||||
|
||||
def _coerce_date(value) -> date | None:
|
||||
if value is None or value == "":
|
||||
return None
|
||||
if isinstance(value, date):
|
||||
return value
|
||||
if isinstance(value, str):
|
||||
try:
|
||||
return date.fromisoformat(value[:10])
|
||||
except ValueError:
|
||||
return None
|
||||
return None
|
||||
|
||||
|
||||
async def ingest_precedent(
    *,
    file_path: str | Path,
    citation: str,
    case_name: str = "",
    court: str = "",
    decision_date=None,
    source_type: str = "",
    precedent_level: str = "",
    practice_area: str = "",
    appeal_subtype: str = "",
    subject_tags: list[str] | None = None,
    is_binding: bool = True,
    headnote: str = "",
    summary: str = "",
    document_id: UUID | None = None,
    progress: ProgressCb | None = None,
) -> dict:
    """Ingest a single uploaded precedent through the full pipeline.

    Steps: validate arguments → stage a copy of the file → extract text →
    strip the Nevo preamble → create the ``case_law`` row → chunk → embed →
    store chunks → mark extraction completed → run the halacha extractor.

    Required: ``file_path`` + ``citation``. Everything else has a default.

    Args:
        file_path: Path of the uploaded file; must exist.
        citation: Case citation; stored as the case number and used as the
            case name when ``case_name`` is blank.
        decision_date: ``date``, ISO string, or ``None``; normalised with
            ``_coerce_date``.
        source_type: One of ``_VALID_SOURCE_TYPES``.
        practice_area: One of ``_VALID_PRACTICE_AREAS``.
        progress: Optional async callback ``(status, percent, message)``;
            defaults to a no-op.

    Returns:
        A dict with ``status``, ``case_law_id``, ``chunks``, ``halachot``,
        ``halachot_extracted_raw``, ``halachot_verified`` and ``pages``.

    Raises:
        FileNotFoundError: ``file_path`` is not an existing file.
        ValueError: blank citation, invalid ``practice_area`` /
            ``source_type``, or a file with no extractable text.
        Exception: any error from extraction, embedding or storage is
            re-raised after a ``failed`` progress event is attempted.
    """
    progress = progress or _noop_progress
    src = Path(file_path)
    citation_clean = citation.strip()

    if not src.is_file():
        raise FileNotFoundError(f"file not found: {src}")
    if not citation_clean:
        raise ValueError("citation is required")
    if practice_area not in _VALID_PRACTICE_AREAS:
        raise ValueError(f"invalid practice_area: {practice_area!r}")
    if source_type not in _VALID_SOURCE_TYPES:
        raise ValueError(f"invalid source_type: {source_type!r}")
    # NOTE(review): precedent_level is passed through unvalidated even though
    # _VALID_PRECEDENT_LEVELS exists at module level — confirm whether it
    # should be checked here like the two fields above.

    await progress("staging", 5, "מעתיק את הקובץ לאחסון")
    staged = _stage_file(src, source_type)

    await progress("extracting", 15, "מחלץ טקסט מהקובץ")
    try:
        text, page_count = await extractor.extract_text(str(staged))
    except Exception as e:
        await progress("failed", 100, f"כשל בחילוץ טקסט: {e}")
        raise

    text = (text or "").strip()
    if not text:
        await progress("failed", 100, "לא נמצא טקסט בקובץ")
        raise ValueError("no extractable text in file")

    # Strip any Nevo preamble that might wrap court rulings downloaded from Nevo.
    text = extractor.strip_nevo_preamble(text)

    await progress("storing_metadata", 25, "שומר את הפסיקה במסד הנתונים")
    record = await db.create_external_case_law(
        case_number=citation_clean,
        case_name=case_name.strip() or citation_clean,
        full_text=text,
        court=court.strip(),
        decision_date=_coerce_date(decision_date),
        practice_area=practice_area,
        appeal_subtype=appeal_subtype.strip(),
        subject_tags=list(subject_tags or []),
        summary=summary.strip(),
        headnote=headnote.strip(),
        source_type=source_type,
        precedent_level=precedent_level,
        is_binding=is_binding,
        document_id=document_id,
    )
    case_law_id = UUID(str(record["id"]))

    try:
        await progress("chunking", 40, f"מחלק את הטקסט ל-chunks ({page_count} עמ')")
        chunks = chunker.chunk_document(text)
        if not chunks:
            await db.set_case_law_extraction_status(case_law_id, "completed")
            await db.set_case_law_halacha_status(case_law_id, "completed")
            await progress("completed", 100, "אין טקסט לעיבוד")
            # Same keys as the normal return so callers need no special case.
            return {
                "status": "completed",
                "case_law_id": str(case_law_id),
                "chunks": 0,
                "halachot": 0,
                "halachot_extracted_raw": 0,
                "halachot_verified": 0,
                "pages": page_count,
            }

        await progress("embedding", 55, f"מייצר embeddings ל-{len(chunks)} chunks")
        chunk_texts = [c.content for c in chunks]
        chunk_vectors = list(
            await embeddings.embed_texts(chunk_texts, input_type="document")
        )
        # zip() would silently drop trailing chunks on a short response;
        # fail loudly instead of storing a partially embedded document.
        if len(chunk_vectors) != len(chunks):
            raise RuntimeError(
                f"embedding count mismatch: got {len(chunk_vectors)} vectors "
                f"for {len(chunks)} chunks"
            )

        chunk_dicts = [
            {
                "chunk_index": c.chunk_index,
                "content": c.content,
                "section_type": c.section_type,
                "page_number": c.page_number,
                "embedding": v,
            }
            for c, v in zip(chunks, chunk_vectors)
        ]
        stored_chunks = await db.store_precedent_chunks(case_law_id, chunk_dicts)

        await progress("extracting_halachot", 75, "מחלץ הלכות מחייבות")
        await db.set_case_law_extraction_status(case_law_id, "completed")
        halacha_result = await halacha_extractor.extract(case_law_id)

        await progress(
            "completed",
            100,
            f"הוכנס לספרייה: {stored_chunks} chunks, "
            f"{halacha_result.get('stored', 0)} הלכות ממתינות לאישור",
        )

        return {
            "status": "completed",
            "case_law_id": str(case_law_id),
            "chunks": stored_chunks,
            "halachot": halacha_result.get("stored", 0),
            "halachot_extracted_raw": halacha_result.get("extracted", 0),
            "halachot_verified": halacha_result.get("verified", 0),
            "pages": page_count,
        }

    except Exception as e:
        logger.exception("precedent_library.ingest_precedent failed: %s", e)
        # Failure bookkeeping is best-effort: if the DB or the progress
        # channel is itself the problem, the original error must still be
        # the one that reaches the caller.
        # NOTE(review): this also flips extraction_status to "failed" when
        # only the halacha step failed after chunks were stored — confirm
        # that is the intended status model.
        try:
            await db.set_case_law_extraction_status(case_law_id, "failed")
        except Exception:
            logger.exception(
                "precedent_library.ingest_precedent: could not mark %s as failed",
                case_law_id,
            )
        try:
            await progress("failed", 100, f"כשל בעיבוד: {e}")
        except Exception:
            logger.exception(
                "precedent_library.ingest_precedent: could not publish failure for %s",
                case_law_id,
            )
        raise
|
||||
|
||||
|
||||
async def reextract_halachot(
    case_law_id: UUID | str,
    progress: ProgressCb | None = None,
) -> dict:
    """Re-run the halacha extractor on an existing uploaded precedent.

    The original author describes this as idempotent; that property
    depends on ``halacha_extractor.extract`` — NOTE(review): confirm.

    Args:
        case_law_id: Id of the precedent, as ``UUID`` or its string form.
        progress: Optional async callback ``(status, percent, message)``;
            defaults to a no-op.

    Returns:
        Whatever ``halacha_extractor.extract`` returns for this precedent.

    Raises:
        ValueError: the id is not a valid UUID string, the record does not
            exist, or its ``source_kind`` is not ``external_upload``.
        Exception: extractor errors are re-raised after a ``failed``
            progress event is attempted.
    """
    progress = progress or _noop_progress
    if isinstance(case_law_id, str):
        case_law_id = UUID(case_law_id)

    record = await db.get_case_law(case_law_id)
    if not record or record.get("source_kind") != "external_upload":
        raise ValueError("precedent not found or not chair-uploaded")

    await progress("extracting_halachot", 50, "מחלץ הלכות מחדש")
    try:
        result = await halacha_extractor.extract(case_law_id)
    except Exception as e:
        # Mirror ingest_precedent: without a terminal event, a progress
        # listener would stay at 50% after a failure.
        logger.exception("precedent_library.reextract_halachot failed: %s", e)
        try:
            await progress("failed", 100, f"כשל בעיבוד: {e}")
        except Exception:
            logger.exception(
                "precedent_library.reextract_halachot: could not publish failure for %s",
                case_law_id,
            )
        raise

    await progress(
        "completed",
        100,
        f"הופקו {result.get('stored', 0)} הלכות (ממתינות לאישור)",
    )
    return result
|
||||
|
||||
|
||||
async def delete_precedent(case_law_id: UUID | str) -> bool:
    """Delete a precedent by id and return the db layer's result.

    A string id is parsed into a ``UUID`` first. Removal of the related
    chunks and halachot is expected to be handled by ``db.delete_case_law``
    (cascade) — it is not done in this function.
    """
    target_id = UUID(case_law_id) if isinstance(case_law_id, str) else case_law_id
    return await db.delete_case_law(target_id)
|
||||
|
||||
|
||||
async def get_precedent(case_law_id: UUID | str) -> dict | None:
    """Fetch one precedent with its halachot attached.

    Returns ``None`` when no record exists. Otherwise the record gains a
    ``"halachot"`` key holding up to 500 halachot for that precedent.
    """
    precedent_id = UUID(case_law_id) if isinstance(case_law_id, str) else case_law_id

    record = await db.get_case_law(precedent_id)
    if record:
        record["halachot"] = await db.list_halachot(
            case_law_id=precedent_id,
            limit=500,
        )
        return record
    return None
|
||||
|
||||
|
||||
async def list_precedents(
    practice_area: str = "",
    court: str = "",
    precedent_level: str = "",
    source_type: str = "",
    search: str = "",
    limit: int = 100,
    offset: int = 0,
) -> list[dict]:
    """List uploaded precedents, forwarding all filters to the db layer.

    Every argument is passed unchanged to ``db.list_external_case_law``;
    no filtering or paging happens in this function.
    """
    filters = {
        "practice_area": practice_area,
        "court": court,
        "precedent_level": precedent_level,
        "source_type": source_type,
        "search": search,
        "limit": limit,
        "offset": offset,
    }
    return await db.list_external_case_law(**filters)
|
||||
|
||||
|
||||
async def search_library(
    query: str,
    practice_area: str = "",
    court: str = "",
    precedent_level: str = "",
    appeal_subtype: str = "",
    is_binding: bool | None = None,
    subject_tag: str = "",
    limit: int = 10,
    include_halachot: bool = True,
) -> list[dict]:
    """Semantic search over the precedent library.

    A blank query returns ``[]`` without calling the embedder. Otherwise
    the query is embedded and handed, together with all filters, to
    ``db.search_precedent_library_semantic``.

    Per the chair-review policy, only ``approved`` / ``published`` halachot
    are meant to be returned, while chunks are returned regardless of
    halacha review status; that filtering is expected to live in the db
    layer — it is not performed here.
    """
    if query.strip() == "":
        return []

    query_vec = await embeddings.embed_query(query)
    search_args = {
        "query_embedding": query_vec,
        "practice_area": practice_area,
        "court": court,
        "precedent_level": precedent_level,
        "appeal_subtype": appeal_subtype,
        "is_binding": is_binding,
        "subject_tag": subject_tag,
        "limit": limit,
        "include_halachot": include_halachot,
    }
    return await db.search_precedent_library_semantic(**search_args)
|
||||
Reference in New Issue
Block a user