Files
legal-ai/mcp-server/src/legal_mcp/services/precedent_library.py
Chaim 885cba543e feat(halacha): lighter effort for BULK queue-drain extraction (speed at scale)
xhigh is the quality sweet-spot for a single precedent but very slow at scale
(64-chunk case ≈ 20 min). Bulk queue-drains (process_pending over many
precedents) now use a lighter effort to cut wall-clock; interactive single
re-extraction keeps xhigh quality.

- config.HALACHA_BULK_EXTRACT_EFFORT (env, default 'high'; set 'medium' for max
  speed, 'xhigh' to match single).
- extract()/_extract_impl()/_extract_chunk() take an `effort` override threaded
  to claude_session.query_json; None falls back to HALACHA_EXTRACT_EFFORT (xhigh).
- process_pending_extractions(kind='halacha') passes the bulk effort; single
  reextract_halachot keeps xhigh.

Verified end-to-end (mocked LLM): _extract_chunk(effort='medium') → query_json
effort='medium'; effort=None → 'xhigh' fallback. Closes the open item in #72.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-05-31 21:34:13 +00:00

411 lines
16 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""Orchestrator for the External Precedent Library.
Ingest pipeline (one upload):
file → extract_text → proofread → INSERT case_law (source_kind='external_upload')
→ chunk → embed → store precedent_chunks
→ halacha_extractor.extract → embed halachot → store halachot
→ set extraction_status='completed'
Progress is reported via a caller-supplied async callback so the
web layer can pipe updates into the existing Redis ProgressStore /
SSE plumbing without this module knowing about Redis.
"""
from __future__ import annotations
import asyncio
import logging
from pathlib import Path
from typing import Awaitable, Callable
from uuid import UUID
from legal_mcp import config
from legal_mcp.services import db, embeddings, hybrid_search, ingest # noqa: F401
# Note: halacha_extractor and precedent_metadata_extractor are NOT imported
# at module load. They are imported lazily inside the dedicated re-extract
# entry points so that `ingest_precedent` (called from the FastAPI container,
# where `claude` CLI is unavailable) cannot accidentally pull them in. See
# the architectural rule in services/claude_session.py.
logger = logging.getLogger(__name__)
ProgressCb = Callable[[str, int, str], Awaitable[None]]
PRECEDENT_LIBRARY_DIR = Path(config.DATA_DIR) / "precedent-library"
_VALID_PRACTICE_AREAS = frozenset({"", "rishuy_uvniya", "betterment_levy", "compensation_197"})
_VALID_SOURCE_TYPES = frozenset({"", "court_ruling", "appeals_committee"})
_VALID_PRECEDENT_LEVELS = {
"", "עליון", "מנהלי", "ועדת_ערר_ארצית", "ועדת_ערר_מחוזית",
"supreme", "administrative", "national_appeals_committee", "district_appeals_committee",
}
async def _noop_progress(_status: str, _percent: int, _msg: str) -> None:
return None
def _external_validate(inputs: dict) -> None:
citation = (inputs.get("citation") or "").strip()
if not citation:
raise ValueError("citation is required")
if citation.startswith(("ערר ", "ערר(", 'בל"מ ', 'בל"מ(', "ARAR ")):
raise ValueError(
"ציטוט שמתחיל ב-'ערר' או 'בל\"מ' הוא החלטת ועדת ערר. "
"השתמש ב-internal_decision_upload (דורש chair_name + district), "
"לא ב-precedent_library_upload."
)
def _external_staging_subdir(inputs: dict) -> str:
st = inputs.get("source_type") or ""
return st if st in {"court_ruling", "appeals_committee"} else "other"
async def _create_external_record(**kw) -> dict:
"""Adapter: maps canonical inputs (citation) to create_external_case_law(case_number)."""
return await db.create_external_case_law(
case_number=kw["citation"].strip(),
case_name=kw["case_name"],
full_text=kw["full_text"],
court=(kw.get("court") or "").strip(),
decision_date=kw.get("decision_date"),
practice_area=kw.get("practice_area", ""),
appeal_subtype=(kw.get("appeal_subtype") or "").strip(),
subject_tags=list(kw.get("subject_tags") or []),
summary=(kw.get("summary") or "").strip(),
headnote=(kw.get("headnote") or "").strip(),
source_type=kw.get("source_type", ""),
precedent_level=kw.get("precedent_level", ""),
is_binding=kw.get("is_binding", True),
document_id=kw.get("document_id"),
)
_EXTERNAL_SPEC = ingest.IntakeSpec(
source_kind="external_upload",
id_field="citation",
staging_root=PRECEDENT_LIBRARY_DIR,
staging_subdir=_external_staging_subdir,
validate=_external_validate,
enum_fields={"practice_area": _VALID_PRACTICE_AREAS, "source_type": _VALID_SOURCE_TYPES},
derive=lambda inputs: {},
display_name_fallback="citation",
create_record=_create_external_record,
)
async def ingest_precedent(
*,
file_path: str | Path,
citation: str,
case_name: str = "",
court: str = "",
decision_date=None,
source_type: str = "",
precedent_level: str = "",
practice_area: str = "",
appeal_subtype: str = "",
subject_tags: list[str] | None = None,
is_binding: bool = True,
headnote: str = "",
summary: str = "",
document_id: UUID | None = None,
progress: ingest.ProgressCb | None = None,
) -> dict:
"""Ingest one external precedent. Thin wrapper over the canonical pipeline."""
inputs = {
"citation": citation, "case_name": case_name, "court": court,
"decision_date": decision_date, "source_type": source_type,
"precedent_level": precedent_level, "practice_area": practice_area,
"appeal_subtype": appeal_subtype, "subject_tags": subject_tags,
"is_binding": is_binding, "headnote": headnote, "summary": summary,
}
return await ingest.ingest_document(
_EXTERNAL_SPEC, inputs=inputs, file_path=file_path,
document_id=document_id, progress=progress,
)
async def reextract_halachot(
case_law_id: UUID | str,
progress: ProgressCb | None = None,
) -> dict:
"""Re-run the halacha extractor on an existing precedent. Idempotent.
**MCP-tool-only path.** This function calls into ``halacha_extractor``,
which calls ``claude_session`` — the local CLI is required. Invoking
this from the FastAPI container will raise ``Claude CLI not found``.
See the architectural rule in ``services/claude_session.py``.
"""
from legal_mcp.services import halacha_extractor
progress = progress or _noop_progress
if isinstance(case_law_id, str):
case_law_id = UUID(case_law_id)
record = await db.get_case_law(case_law_id)
if not record:
raise ValueError("precedent not found")
# Was restricted to source_kind='external_upload'; opened 2026-05-06 so
# internal_committee rows can also be re-extracted when ingest produced
# bad data. See note in db.request_metadata_extraction.
await progress("extracting_halachot", 50, "מחלץ הלכות מחדש")
# Explicit re-extraction = clean slate (force): wipe prior halachot +
# per-chunk checkpoints and redo all. (Queue draining / resume uses the
# default force=False so an interrupted run continues where it stopped.)
result = await halacha_extractor.extract(case_law_id, force=True)
# Clear the queue timestamp on completion so the UI badge / worker queue
# don't keep showing this row. The queue worker (process_pending_extractions)
# already does this; mirror it here so per-record extraction drains too.
if result.get("status") in ("completed", "no_halachot"):
await db.clear_extraction_request(case_law_id, kind="halacha")
await progress(
"completed",
100,
f"הופקו {result.get('stored', 0)} הלכות (ממתינות לאישור)",
)
return result
# Wait this many seconds between precedents in a multi-precedent run.
# Anthropic rate-limits across the org, so back-to-back extractions of large
# rulings (e.g. 129 chunks for one, then 79 for another) can spill the second
# precedent into a 429 storm. Observed 2026-05-03: 1110/20 succeeded with 9
# halachot, 317/10 immediately after returned silent no_halachot.
INTER_PRECEDENT_COOLDOWN_SEC = 30
# How many times to retry a precedent that came back as 'extraction_failed'
# (i.e. >50% chunks crashed). Each retry uses a longer cooldown.
PRECEDENT_RETRY_ATTEMPTS = 1
PRECEDENT_RETRY_COOLDOWN_SEC = 60
async def process_pending_extractions(kind: str = "metadata", limit: int = 20) -> dict:
"""Drain the extraction queue (UI-button-stamped requests).
The button in the web UI cannot run claude_session itself (it lives in
the container, no CLI). It just stamps ``metadata_extraction_requested_at``
on the row. This function — called from local Claude Code via the MCP
tool — picks each stamped row up, runs the extractor, and clears the
timestamp.
Sequencing: precedents are processed serially (never in parallel) and
each is followed by a short cooldown so the Anthropic rate-limit
counter has time to drain before the next big precedent starts. If
halacha extraction comes back as ``extraction_failed`` we retry the
same precedent once with a longer cooldown — matching the empirical
pattern where the second precedent in a back-to-back run gets
rate-limited but recovers after a brief pause.
Args:
kind: 'metadata' or 'halacha'.
limit: max rows to process this run.
"""
from legal_mcp.services import halacha_extractor, precedent_metadata_extractor
if kind not in {"metadata", "halacha"}:
raise ValueError("kind must be 'metadata' or 'halacha'")
pending = await db.list_pending_extraction_requests(kind=kind, limit=limit)
if not pending:
return {"status": "no_pending", "kind": kind, "processed": 0, "results": []}
async def _run_once(cid: UUID) -> dict:
if kind == "metadata":
return await precedent_metadata_extractor.extract_and_apply(cid)
# Bulk queue-drain → lighter effort (config.HALACHA_BULK_EXTRACT_EFFORT,
# default 'high') to cut wall-clock at scale. Resume (force=False) so an
# interrupted drain continues per-chunk. Single re-extract stays xhigh.
return await halacha_extractor.extract(
cid, effort=config.HALACHA_BULK_EXTRACT_EFFORT,
)
results: list[dict] = []
processed = 0
for idx, row in enumerate(pending):
if idx > 0:
await asyncio.sleep(INTER_PRECEDENT_COOLDOWN_SEC)
cid = UUID(str(row["id"]))
attempts = 0
result: dict = {}
try:
result = await _run_once(cid)
# Retry only on systematic extraction failure (rate-limit storm).
# Don't retry on 'no_halachot' — that means Claude looked and
# genuinely found nothing.
while (
result.get("status") == "extraction_failed"
and attempts < PRECEDENT_RETRY_ATTEMPTS
):
attempts += 1
logger.warning(
"process_pending_extractions: %s returned extraction_failed "
"(%d/%d chunks crashed), retry %d/%d after %ds cooldown",
cid,
result.get("failed_chunks", 0),
result.get("total_chunks", 0),
attempts, PRECEDENT_RETRY_ATTEMPTS,
PRECEDENT_RETRY_COOLDOWN_SEC,
)
await asyncio.sleep(PRECEDENT_RETRY_COOLDOWN_SEC)
result = await _run_once(cid)
# Finalise: success or terminal failure both clear the request
# so the queue moves on. (Use 'failed' DB state for terminal
# extraction_failed so the UI shows the warning chip.)
if kind == "halacha" and result.get("status") == "extraction_failed":
await db.set_case_law_halacha_status(cid, "failed")
await db.clear_extraction_request(cid, kind=kind)
processed += 1
results.append({
"case_law_id": str(cid),
"case_number": row.get("case_number", ""),
"status": result.get("status", "unknown"),
"fields": result.get("fields", []),
"stored": result.get("stored", 0),
"retry_attempts": attempts,
})
except Exception as e:
logger.exception("process_pending_extractions failed for %s: %s", cid, e)
results.append({
"case_law_id": str(cid),
"case_number": row.get("case_number", ""),
"status": "failed",
"error": str(e),
"retry_attempts": attempts,
})
# Don't clear the request — it stays for the next run.
return {
"status": "completed",
"kind": kind,
"processed": processed,
"total_pending": len(pending),
"results": results,
}
async def reextract_metadata(
case_law_id: UUID | str,
progress: ProgressCb | None = None,
) -> dict:
"""Re-run metadata extraction on an existing precedent.
Only fills empty fields (subject_tags, summary, headnote, key_quote,
appeal_subtype, and case_name when it equals the citation). User
values are preserved.
**MCP-tool-only path** — same constraint as :func:`reextract_halachot`.
"""
from legal_mcp.services import precedent_metadata_extractor
progress = progress or _noop_progress
if isinstance(case_law_id, str):
case_law_id = UUID(case_law_id)
record = await db.get_case_law(case_law_id)
if not record:
raise ValueError("precedent not found")
# See note in db.request_metadata_extraction — opened to all source kinds.
await progress("extracting_metadata", 40, "מחלץ מטא-דאטה (תקציר, תגיות)")
result = await precedent_metadata_extractor.extract_and_apply(case_law_id)
# Clear the queue timestamp so the UI / worker stop showing this row.
# See note in reextract_halachot.
if result.get("status") in ("completed", "no_changes"):
await db.clear_extraction_request(case_law_id, kind="metadata")
fields = result.get("fields") or []
msg = (
f"מולאו {len(fields)} שדות: {', '.join(fields)}"
if fields
else "לא נמצא מה למלא (כל השדות מאוכלסים או לא ניתן לחלץ)"
)
await progress("completed", 100, msg)
return result
async def delete_precedent(case_law_id: UUID | str) -> bool:
"""Delete a precedent and cascade chunks + halachot."""
if isinstance(case_law_id, str):
case_law_id = UUID(case_law_id)
return await db.delete_case_law(case_law_id)
async def get_precedent(case_law_id: UUID | str) -> dict | None:
"""Get a precedent with its halachot and related cases attached."""
if isinstance(case_law_id, str):
case_law_id = UUID(case_law_id)
record = await db.get_case_law(case_law_id)
if not record:
return None
record["halachot"] = await db.list_halachot(case_law_id=case_law_id, limit=500)
record["related_cases"] = await db.get_case_law_relations(case_law_id)
return record
async def list_precedents(
practice_area: str = "",
court: str = "",
precedent_level: str = "",
source_type: str = "",
search: str = "",
source_kind: str = "external_upload",
limit: int = 100,
offset: int = 0,
) -> list[dict]:
return await db.list_external_case_law(
practice_area=practice_area,
court=court,
precedent_level=precedent_level,
source_type=source_type,
search=search,
source_kind=source_kind,
limit=limit,
offset=offset,
)
async def search_library(
query: str,
practice_area: str = "",
court: str = "",
precedent_level: str = "",
appeal_subtype: str = "",
is_binding: bool | None = None,
subject_tag: str = "",
limit: int = 10,
include_halachot: bool = True,
) -> list[dict]:
"""Semantic search merging halachot (rule-level) and chunks (passage-level).
Only ``approved`` / ``published`` halachot are returned, per chair-review
policy. Chunks are returned regardless of halacha review status.
When ``VOYAGE_RERANK_ENABLED`` is set, results are passed through
voyage rerank-2 (cross-encoder). The +0.05 halacha boost from
``search_precedent_library_semantic`` is preserved before rerank
but the rerank scores ultimately decide the order.
"""
if not query.strip():
return []
query_vec = await embeddings.embed_query(query)
return await hybrid_search.search_precedent_library_hybrid(
query=query,
query_text_embedding=query_vec,
limit=limit,
practice_area=practice_area,
court=court,
precedent_level=precedent_level,
appeal_subtype=appeal_subtype,
is_binding=is_binding,
subject_tag=subject_tag,
include_halachot=include_halachot,
)