מאגר חדש ליומוני "כל יום" (עפר טויסטר) כשכבת-גילוי מעל קורפוסי-הפסיקה:
מקור-משני המצביע על פסק הדין המקורי, נקלט לטבלה נפרדת `digests`, נחפש
סמנטית, ומקושר לפסק המקורי בספריית הפסיקה — אך לעולם אינו מצוטט בהחלטה
ואינו מחלץ הלכות.
Phase 0 (spec):
- docs/spec/X12-digests-radar.md — INV-DIG1 (מצביע לא מצוטט) /
INV-DIG2 (מסלול-קליטה נפרד, לא מקביל — מקיים G2) / INV-DIG3 (קישור-לפסק
הוא הגשר; חוסר-קישור = פער גלוי). עדכון אינדקס 00/03/README.
Phase 1 (MVP):
- SCHEMA_V30: טבלת `digests` (HNSW על embedding — לא ivfflat, להימנע מ-recall
cliff בקורפוס קטן/צומח) + GIN/FTS + UNIQUE חלקי ל-idempotent.
- services/digest_metadata_extractor.py — חילוץ-LLM (claude_session local-only,
ייבוא lazy): תג-מושג, כותרת-הלכה, מראה-מקום, שני-תאריכים מובחנים, תגיות.
- services/digest_library.py — מסלול קצר עצמאי (INV-DIG2): extract→hash→LLM→
embedding יחיד→autolink. לא משתמש ב-ingest.ingest_document.
- tools/digests.py + רישום 7 כלים ב-server.py (digest_upload/list/get/link/
relink/delete + search_digests).
- scripts/ingest_digests_batch.py — קליטה ידנית מ-data/digests/incoming.
- legal-researcher.md: שלב 2ב.0 (סריקת-radar לפני אימות) + סעיף-דוח ט +
3 כלים ב-frontmatter. HEARTBEAT §8: ניתוב יומון→digest_upload.
אומת end-to-end: 4 יומונים נקלטו (מטא-דאטה מדויק), חיפוש סמנטי מדרג נכון
("היטל השבחה"→5160, "תמא 38"→5158), link/relink/autolink/revert + מעטפת-MCP.
Invariants: מוסיף INV-DIG1/2/3 (X12). מקיים G2 (bounded context נפרד, לא
מסלול מקביל), G3 (idempotent upsert), G4 (אין בליעה שקטה — פער-קישור מוצף),
G9 (עקיבוּת — היומון מצביע על מקור עקיב). נוגע G7 (RRF) — נדחה, חיפוש
סמנטי-בלבד בשלב 1 (FTS index מוכן).
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
269 lines
9.9 KiB
Python
269 lines
9.9 KiB
Python
"""Orchestrator for the Digests radar (X12).
|
|
|
|
A digest ("כל יום" daily one-pager) is a SECONDARY source that POINTS at a
|
|
ruling — it is never cited in a decision (INV-DIG1) and never enters the
|
|
precedent/halacha pipeline (INV-DIG2). Ingest is therefore a short, standalone
|
|
path that reuses only ATOMIC services (extract_text, embeddings), NOT the
|
|
canonical ``ingest.ingest_document`` (which is bound to case_law):
|
|
|
|
file → extract_text → content_hash (idempotent) → LLM metadata extract
|
|
→ create_digest → single embedding (concept+headline+summary+analysis)
|
|
→ try_autolink(underlying_citation → case_law) [INV-DIG3]
|
|
→ extraction_status='completed'
|
|
|
|
claude_session rule: ``digest_metadata_extractor`` (local CLI) is imported
|
|
LAZILY inside ``ingest_digest`` only, so this module is import-safe from the
|
|
FastAPI container for the search/list/link/delete paths (DB + voyage only).
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import logging
|
|
from datetime import date
|
|
from pathlib import Path
|
|
from typing import Awaitable, Callable
|
|
from uuid import UUID
|
|
|
|
from legal_mcp import config
|
|
from legal_mcp.services import db, embeddings, extractor, ingest
|
|
|
|
# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# Async progress callback, called as (status, percent, message).
ProgressCb = Callable[[str, int, str], Awaitable[None]]

# Base directory handed to the staging helper when a digest file is ingested.
DIGEST_LIBRARY_DIR = Path(config.DATA_DIR) / "digests"

# practice_area values accepted from the caller of ingest_digest.
# The empty string stands for "not specified".
_VALID_PRACTICE_AREAS = frozenset(
    ("", "rishuy_uvniya", "betterment_levy", "compensation_197")
)
|
|
|
|
|
|
async def _noop_progress(_status: str, _percent: int, _msg: str) -> None:
    """Default progress callback: accept the three arguments and do nothing."""
|
|
|
|
|
|
def _embedding_text(fields: dict) -> str:
    """Build the text that is embedded as the digest's single vector.

    Reads ``concept_tag``, ``headline_holding``, ``summary`` and
    ``analysis_text`` from *fields* (in that order), drops the missing or
    empty ones, joins the rest with newlines and strips the outer whitespace.
    """
    ordered_keys = ("concept_tag", "headline_holding", "summary", "analysis_text")
    candidates = [fields.get(key, "") for key in ordered_keys]
    return "\n".join(filter(None, candidates)).strip()
|
|
|
|
|
|
async def try_autolink(digest_id: UUID | str, underlying_citation: str) -> str | None:
|
|
"""Best-effort link of a digest to the underlying ruling in case_law
|
|
(INV-DIG3). Returns the case_law_id (str) if linked, else None. Never raises."""
|
|
citation = (underlying_citation or "").strip()
|
|
if not citation:
|
|
return None
|
|
try:
|
|
match = await db.find_case_law_by_citation_fuzzy(citation)
|
|
except Exception as e:
|
|
logger.warning("digest try_autolink lookup failed for %r: %s", citation, e)
|
|
return None
|
|
if not match:
|
|
return None
|
|
await db.link_digest_to_case_law(digest_id, match["id"])
|
|
return str(match["id"])
|
|
|
|
|
|
def _coerce_date(value) -> date | None:
    """Best-effort conversion of a caller-supplied value to a ``date``.

    ``date`` instances are returned unchanged; non-empty strings are parsed
    from their first 10 characters as ISO ``YYYY-MM-DD``. Everything else
    (``None``, ``""``, unparseable strings, other types) yields ``None``.
    """
    if isinstance(value, date):
        return value
    if isinstance(value, str) and value:
        try:
            return date.fromisoformat(value[:10])
        except ValueError:
            return None
    return None


async def ingest_digest(
    *,
    file_path: str | Path,
    yomon_number: str = "",
    digest_date: date | str | None = None,
    practice_area: str = "",
    appeal_subtype: str = "",
    subject_tags: list[str] | None = None,
    progress: ProgressCb | None = None,
) -> dict:
    """Ingest one digest. **MCP-tool-only** (uses the local LLM extractor).

    Pipeline: stage file -> extract text -> content-hash duplicate check ->
    LLM metadata extraction -> create row -> single embedding -> autolink ->
    mark ``extraction_status='completed'``.

    User-supplied args win over LLM-extracted values for the same field
    (the chair typed them deliberately); empty args are filled from the LLM.
    A duplicate is detected here by content hash of the extracted text.
    NOTE(review): idempotency on ``yomon_number`` (INV-G3) is not enforced in
    this function — presumably handled by ``db.create_digest``; confirm.

    Args:
        file_path: path of the digest file to ingest.
        yomon_number: optional digest number; overrides the LLM value.
        digest_date: optional date (``date`` or ISO string); overrides the LLM
            value when it can be parsed.
        practice_area: optional practice area; must be one of
            ``_VALID_PRACTICE_AREAS`` when non-empty.
        appeal_subtype: optional appeal subtype; overrides the LLM value.
        subject_tags: optional tags; override the LLM value when non-empty.
        progress: optional async callback ``(status, percent, message)``.

    Returns:
        ``{"status": "exists", ...}`` when identical text is already stored,
        otherwise ``{"status": "completed", ...}`` including
        ``linked_case_law_id`` (``None`` if no link was made) and ``embedded``
        (``False`` if the embedding could not be computed or stored).

    Raises:
        ValueError: invalid ``practice_area``, missing file, or no text could
            be extracted from the file.
    """
    progress = progress or _noop_progress
    if practice_area and practice_area not in _VALID_PRACTICE_AREAS:
        raise ValueError(f"invalid practice_area: {practice_area!r}")

    src = Path(file_path)
    # is_file() rather than exists(): a directory is not an ingestible digest.
    if not src.is_file():
        raise ValueError(f"file not found: {file_path}")

    await progress("staging", 5, "מעתיק קובץ")
    staged = ingest._stage_file(src, DIGEST_LIBRARY_DIR, "incoming")
    # Store the path relative to DATA_DIR when the staged file lives under it.
    # relative_to() is a real path-component check; a string prefix test would
    # wrongly match sibling directories such as "<DATA_DIR>2/...".
    try:
        rel_path = str(staged.relative_to(config.DATA_DIR))
    except ValueError:
        rel_path = str(staged)

    await progress("extracting_text", 20, "מחלץ טקסט")
    raw_text, _page_count, _offsets = await extractor.extract_text(str(staged))
    raw_text = (raw_text or "").strip()
    if not raw_text:
        raise ValueError("no text extracted from digest")

    # Idempotency: identical text already ingested → return existing row.
    content_hash = db._content_hash(raw_text)
    existing = await db.get_digest_by_content_hash(content_hash)
    if existing:
        await progress("completed", 100, "יומון זהה כבר קיים — לא נוצר כפל")
        return {
            "status": "exists",
            "digest_id": existing["id"],
            "yomon_number": existing.get("yomon_number", ""),
            "linked_case_law_id": existing.get("linked_case_law_id"),
        }

    # LLM metadata extraction (lazy import — keeps this module container-safe).
    await progress("extracting_metadata", 45, "מחלץ מטא-דאטה (LLM)")
    from legal_mcp.services import digest_metadata_extractor
    # Tolerate an extractor that returns None instead of an empty mapping.
    extracted = await digest_metadata_extractor.extract(raw_text) or {}

    # The caller's practice_area was validated above; apply the same rule to
    # the LLM's value so an invented label never reaches the table.
    llm_practice_area = extracted.get("practice_area", "") or ""
    if llm_practice_area not in _VALID_PRACTICE_AREAS:
        logger.warning(
            "digest ingest: dropping invalid LLM practice_area %r", llm_practice_area
        )
        llm_practice_area = ""

    # Merge: explicit user args win; otherwise fall back to LLM extraction.
    fields = {
        "analysis_text": raw_text,
        "yomon_number": (yomon_number or "").strip() or extracted.get("yomon_number", ""),
        "digest_date": _coerce_date(digest_date) or extracted.get("digest_date"),
        "concept_tag": extracted.get("concept_tag", ""),
        "headline_holding": extracted.get("headline_holding", ""),
        "summary": extracted.get("summary", ""),
        "underlying_citation": extracted.get("underlying_citation", ""),
        "underlying_court": extracted.get("underlying_court", ""),
        "underlying_date": extracted.get("underlying_date"),
        "underlying_judge": extracted.get("underlying_judge", ""),
        "practice_area": practice_area or llm_practice_area,
        "appeal_subtype": (appeal_subtype or "").strip() or extracted.get("appeal_subtype", ""),
        "subject_tags": list(subject_tags) if subject_tags else extracted.get("subject_tags", []),
        "source_document_path": rel_path,
        "extraction_status": "processing",
    }

    await progress("storing", 70, "שומר רשומה")
    record = await db.create_digest(**fields)
    digest_id = record["id"]

    # Single embedding for the whole digest (atomic discovery unit — X12 §6).
    await progress("embedding", 85, "מחשב embedding")
    embedded = False
    emb_text = _embedding_text(fields)
    if emb_text:
        try:
            vecs = await embeddings.embed_texts([emb_text], input_type="document")
            if vecs:
                await db.store_digest_embedding(digest_id, vecs[0])
                embedded = True
        except Exception as e:
            # Best-effort: the row is kept, and the failure is reported to the
            # caller through the "embedded" flag in addition to the log.
            logger.warning("digest embedding failed for %s: %s", digest_id, e)

    # Bridge to the underlying ruling if it is already in the library (INV-DIG3).
    await progress("linking", 95, "מנסה לקשר לפסק המקורי")
    try:
        linked_id = await try_autolink(digest_id, fields["underlying_citation"])
    except Exception as e:
        # A failed link is a visible gap, not a failed ingest; do not leave
        # the row stuck in 'processing' because of it.
        logger.warning("digest autolink failed for %s: %s", digest_id, e)
        linked_id = None

    await db.update_digest(digest_id, extraction_status="completed")
    await progress("completed", 100, "הושלם")
    return {
        "status": "completed",
        "digest_id": digest_id,
        "yomon_number": fields["yomon_number"],
        "underlying_citation": fields["underlying_citation"],
        "linked_case_law_id": linked_id,
        "embedded": embedded,
        "fields_extracted": sorted(extracted),
    }
|
|
|
|
|
|
async def link_digest(digest_id: UUID | str, case_law_id: UUID | str) -> dict:
    """Manually attach a digest to its underlying ruling (INV-DIG3).

    Both rows are looked up first; the link is written only when both exist.

    Returns:
        A dict with ``linked=True``, both ids as strings, the ruling's
        ``case_number`` and the row returned by the link write (``digest``).

    Raises:
        ValueError: the digest or the case_law row was not found (also raised
            by ``UUID(...)`` when *case_law_id* is not a valid UUID string).
    """
    if not await db.get_digest(digest_id):
        raise ValueError("digest not found")

    # The case_law lookup is given a UUID instance; the link write below gets
    # the id exactly as the caller passed it.
    if isinstance(case_law_id, UUID):
        ruling_key = case_law_id
    else:
        ruling_key = UUID(str(case_law_id))
    ruling = await db.get_case_law(ruling_key)
    if not ruling:
        raise ValueError("case_law not found")

    linked_row = await db.link_digest_to_case_law(digest_id, case_law_id)
    result = {"linked": True}
    result["digest_id"] = str(digest_id)
    result["case_law_id"] = str(case_law_id)
    result["case_number"] = ruling.get("case_number")
    result["digest"] = linked_row
    return result
|
|
|
|
|
|
async def relink_digest(digest_id: UUID | str) -> dict:
    """Re-run autolink for a digest whose underlying ruling may now be in the
    library. No-op if already linked or no match found.

    Returns:
        A dict with ``linked`` (bool), ``digest_id`` (str), ``case_law_id``
        (str, or ``None`` when still unlinked) and ``changed`` (``True`` only
        when this call created the link).

    Raises:
        ValueError: the digest does not exist.
    """
    digest = await db.get_digest(digest_id)
    if not digest:
        raise ValueError("digest not found")

    current_link = digest.get("linked_case_law_id")
    if current_link:
        # Stringified so both branches return the same shape: the freshly
        # linked branch below already yields a str from try_autolink.
        return {
            "linked": True,
            "digest_id": str(digest_id),
            "case_law_id": str(current_link),
            "changed": False,
        }

    linked_id = await try_autolink(digest_id, digest.get("underlying_citation", ""))
    newly_linked = linked_id is not None
    return {
        "linked": newly_linked,
        "digest_id": str(digest_id),
        "case_law_id": linked_id,
        "changed": newly_linked,
    }
|
|
|
|
|
|
async def search_digests(
    query: str,
    practice_area: str = "",
    subject_tag: str = "",
    concept_tag: str = "",
    limit: int = 10,
) -> list[dict]:
    """Semantic search over the digests radar. Container-safe (voyage + DB).

    A blank query returns an empty list without calling the embedding
    service. Otherwise the query is embedded and passed, together with the
    filters and *limit*, to ``db.search_digests_semantic``.
    """
    stripped = query.strip()
    if not stripped:
        return []

    # The embedding is computed from the query as given, not the stripped copy.
    vector = await embeddings.embed_query(query)
    filters = {
        "practice_area": practice_area,
        "subject_tag": subject_tag,
        "concept_tag": concept_tag,
        "limit": limit,
    }
    return await db.search_digests_semantic(query_embedding=vector, **filters)
|
|
|
|
|
|
async def get_digest(digest_id: UUID | str) -> dict | None:
    """Fetch one digest by id; returns whatever ``db.get_digest`` returns."""
    row = await db.get_digest(digest_id)
    return row
|
|
|
|
|
|
async def list_digests(
    practice_area: str = "",
    concept_tag: str = "",
    linked: bool | None = None,
    search: str = "",
    limit: int = 100,
    offset: int = 0,
) -> list[dict]:
    """List digests; every argument is forwarded by name to ``db.list_digests``."""
    query_args = {
        "practice_area": practice_area,
        "concept_tag": concept_tag,
        "linked": linked,
        "search": search,
        "limit": limit,
        "offset": offset,
    }
    return await db.list_digests(**query_args)
|
|
|
|
|
|
async def delete_digest(digest_id: UUID | str) -> bool:
    """Delete one digest by id; returns the result of ``db.delete_digest``."""
    outcome = await db.delete_digest(digest_id)
    return outcome
|