feat(digests): קורפוס יומונים כשכבת-גילוי (radar) — X12

מאגר חדש ליומוני "כל יום" (עפר טויסטר) כשכבת-גילוי מעל קורפוסי-הפסיקה:
מקור-משני המצביע על פסק הדין המקורי, נקלט לטבלה נפרדת `digests`, נחפש
סמנטית, ומקושר לפסק המקורי בספריית הפסיקה — אך לעולם אינו מצוטט בהחלטה
ואינו מחלץ הלכות.

Phase 0 (spec):
- docs/spec/X12-digests-radar.md — INV-DIG1 (מצביע לא מצוטט) /
  INV-DIG2 (מסלול-קליטה נפרד, לא מקביל — מקיים G2) / INV-DIG3 (קישור-לפסק
  הוא הגשר; חוסר-קישור = פער גלוי). עדכון אינדקס 00/03/README.

Phase 1 (MVP):
- SCHEMA_V30: טבלת `digests` (HNSW על embedding — לא ivfflat, להימנע מ-recall
  cliff בקורפוס קטן/צומח) + GIN/FTS + UNIQUE חלקי ל-idempotent.
- services/digest_metadata_extractor.py — חילוץ-LLM (claude_session local-only,
  ייבוא lazy): תג-מושג, כותרת-הלכה, מראה-מקום, שני-תאריכים מובחנים, תגיות.
- services/digest_library.py — מסלול קצר עצמאי (INV-DIG2): extract→hash→LLM→
  embedding יחיד→autolink. לא משתמש ב-ingest.ingest_document.
- tools/digests.py + רישום 7 כלים ב-server.py (digest_upload/list/get/link/
  relink/delete + search_digests).
- scripts/ingest_digests_batch.py — קליטה ידנית מ-data/digests/incoming.
- legal-researcher.md: שלב 2ב.0 (סריקת-radar לפני אימות) + סעיף-דוח ט +
  3 כלים ב-frontmatter. HEARTBEAT §8: ניתוב יומון→digest_upload.

אומת end-to-end: 4 יומונים נקלטו (מטא-דאטה מדויק), חיפוש סמנטי מדרג נכון
("היטל השבחה"→5160, "תמא 38"→5158), link/relink/autolink/revert + מעטפת-MCP.

Invariants: מוסיף INV-DIG1/2/3 (X12). מקיים G2 (bounded context נפרד, לא
מסלול מקביל), G3 (idempotent upsert), G4 (אין בליעה שקטה — פער-קישור מוצף),
G9 (עקיבוּת — היומון מצביע על מקור עקיב). נוגע G7 (RRF) — נדחה, חיפוש
סמנטי-בלבד בשלב 1 (FTS index מוכן).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-06-07 17:49:00 +00:00
parent 9eaabffba4
commit 8171572cdd
13 changed files with 1353 additions and 5 deletions

View File

@@ -58,6 +58,7 @@ from legal_mcp.tools import ( # noqa: E402
missing_precedents as mp_tools,
citations as cit_tools,
training_enrichment as train_tools,
digests as digest_tools,
)
@@ -340,6 +341,75 @@ async def search_precedent_library(
)
# Digests radar (X12) — secondary discovery layer; NOT a citation corpus.
# MCP tool wrapper: forwards every argument, positionally and unchanged, to
# digest_tools.digest_upload and returns its string result.
# NOTE(review): the Hebrew docstring is presumably what @mcp.tool() publishes as
# the tool description, so it is kept verbatim — confirm before rewording it.
@mcp.tool()
async def digest_upload(
    file_path: str,
    yomon_number: str = "",
    digest_date: str = "",
    practice_area: str = "",
    appeal_subtype: str = "",
    subject_tags: list[str] | None = None,
) -> str:
    """העלאת יומון ("כל יום") לקורפוס-הגילוי (radar) + חילוץ מטא-דאטה אוטומטי. היומון הוא מקור-משני המצביע על הפסק המקורי — אינו מצוטט בהחלטה ואינו מחלץ הלכות (INV-DIG1/2). practice_area: rishuy_uvniya / betterment_levy / compensation_197."""
    return await digest_tools.digest_upload(
        file_path, yomon_number, digest_date, practice_area,
        appeal_subtype, subject_tags,
    )
# MCP tool wrapper: lists digests through digest_tools.digest_list. The filters
# are forwarded unchanged; only `limit` is passed through _clamp_limit first.
# NOTE(review): the Hebrew docstring is presumably what @mcp.tool() publishes as
# the tool description, so it is kept verbatim — confirm before rewording it.
@mcp.tool()
async def digest_list(
    practice_area: str = "",
    concept_tag: str = "",
    linked: bool | None = None,
    search: str = "",
    limit: int = 100,
) -> str:
    """רשימת יומונים בקורפוס-הגילוי, עם פילטרים. linked=false → יומונים שהפסק המקורי שלהם עוד לא נקלט לספריית הפסיקה (פער-ידע גלוי, INV-DIG3)."""
    return await digest_tools.digest_list(
        practice_area, concept_tag, linked, search, _clamp_limit(limit),
    )
# MCP tool wrapper: fetches one digest by id via digest_tools.digest_get.
# NOTE(review): the Hebrew docstring is presumably what @mcp.tool() publishes as
# the tool description, so it is kept verbatim — confirm before rewording it.
@mcp.tool()
async def digest_get(digest_id: str) -> str:
    """יומון ספציפי לפי מזהה (כולל מראה-מקום, ניתוח, וקישור לפסק המקורי אם קיים)."""
    return await digest_tools.digest_get(digest_id)
# MCP tool wrapper: manually links a digest to a case_law row (INV-DIG3) via
# digest_tools.digest_link; both ids are forwarded unchanged.
# NOTE(review): the Hebrew docstring is presumably what @mcp.tool() publishes as
# the tool description, so it is kept verbatim — confirm before rewording it.
@mcp.tool()
async def digest_link(digest_id: str, case_law_id: str) -> str:
    """קישור ידני של יומון לפסק הדין המקורי בספריית הפסיקה (INV-DIG3). idempotent."""
    return await digest_tools.digest_link(digest_id, case_law_id)
# MCP tool wrapper: re-runs the automatic link attempt for one digest via
# digest_tools.digest_relink.
# NOTE(review): the Hebrew docstring is presumably what @mcp.tool() publishes as
# the tool description, so it is kept verbatim — confirm before rewording it.
@mcp.tool()
async def digest_relink(digest_id: str) -> str:
    """ניסיון-קישור מחדש: בודק אם פסק הדין המקורי של היומון כבר בספרייה ומקשר אוטומטית."""
    return await digest_tools.digest_relink(digest_id)
# MCP tool wrapper: deletes one digest from the radar corpus via
# digest_tools.digest_delete.
# NOTE(review): the Hebrew docstring is presumably what @mcp.tool() publishes as
# the tool description, so it is kept verbatim — confirm before rewording it.
@mcp.tool()
async def digest_delete(digest_id: str) -> str:
    """מחיקת יומון מקורפוס-הגילוי."""
    return await digest_tools.digest_delete(digest_id)
# MCP tool wrapper: semantic search over the digests radar via
# digest_tools.search_digests. The query and filters are forwarded unchanged;
# only `limit` is passed through _clamp_limit first.
# NOTE(review): the Hebrew docstring is presumably what @mcp.tool() publishes as
# the tool description, so it is kept verbatim — confirm before rewording it.
@mcp.tool()
async def search_digests(
    query: str,
    practice_area: str = "",
    subject_tag: str = "",
    concept_tag: str = "",
    limit: int = 10,
) -> str:
    """חיפוש סמנטי בקורפוס-הגילוי (יומוני "כל יום") — מצפן-מחקר (radar). מחזיר את היומון הרלוונטי + מראה-המקום של הפסק המקורי. ⚠️ היומון אינו מצוטט בהחלטה (INV-DIG1) — הצטט מהפסק המקורי דרך search_precedent_library. החוקר משתמש בזה בשלב 2ב.0 לפני האימות."""
    return await digest_tools.search_digests(
        query, practice_area, subject_tag, concept_tag, _clamp_limit(limit),
    )
@mcp.tool()
async def halacha_review(
halacha_id: str,

View File

@@ -1287,6 +1287,71 @@ ALTER TABLE halacha_goldset ADD COLUMN IF NOT EXISTS ai_generated_at TIMESTAMPTZ
"""
# Schema v30: the `digests` table and its indexes (X12 radar layer).
# Executed by _run_schema_migrations right after SCHEMA_V29_SQL. Every statement
# is guarded with IF NOT EXISTS / IF EXISTS, so the script can be re-run on each
# startup. The two UNIQUE indexes are partial (they skip '' values); the
# ON CONFLICT clause in create_digest repeats the yomon_number predicate.
SCHEMA_V30_SQL = """
-- digests (X12): Ofer Toister daily "כל יום" one-pagers. A SECONDARY,
-- discovery-layer ("radar") source — NOT authoritative law. Kept in its OWN
-- table (never case_law) so it cannot pollute the precedent corpus, never
-- enters the halacha pipeline (INV-DIG2), and is never cited directly in a
-- decision (INV-DIG1). Its only job is to point the researcher at the
-- UNDERLYING ruling, which is ingested separately into case_law and cited from
-- there. linked_case_law_id is the bridge (INV-DIG3): filled once the
-- underlying ruling is in the library; NULL = an open knowledge gap.
CREATE TABLE IF NOT EXISTS digests (
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
yomon_number TEXT NOT NULL DEFAULT '', -- "5163"
digest_date DATE, -- date of the yomon ISSUE
publication TEXT NOT NULL DEFAULT 'כל יום',
source_firm TEXT NOT NULL DEFAULT 'עפר טויסטר, עורכי דין',
concept_tag TEXT NOT NULL DEFAULT '', -- "שיקול הדעת המצומצם"
headline_holding TEXT NOT NULL DEFAULT '', -- bold subtitle = the holding
analysis_text TEXT NOT NULL DEFAULT '', -- the 1-2 page body (raw text)
summary TEXT NOT NULL DEFAULT '', -- 2-3 sentence LLM summary
underlying_citation TEXT NOT NULL DEFAULT '', -- 'עת"מ 46111-12-22 יכין-אפק...'
underlying_court TEXT NOT NULL DEFAULT '',
underlying_date DATE, -- date the RULING was given (≠ digest_date)
underlying_judge TEXT NOT NULL DEFAULT '',
practice_area TEXT NOT NULL DEFAULT '', -- rishuy_uvniya/betterment_levy/compensation_197
appeal_subtype TEXT NOT NULL DEFAULT '',
subject_tags TEXT[] NOT NULL DEFAULT '{}',
linked_case_law_id UUID REFERENCES case_law(id) ON DELETE SET NULL,
embedding vector(1024), -- single vector of concept+headline+summary+analysis
source_document_path TEXT NOT NULL DEFAULT '', -- staged PDF path (rel to DATA_DIR)
content_hash TEXT NOT NULL DEFAULT '', -- sha256 of extracted text — idempotent upload
extraction_status TEXT NOT NULL DEFAULT 'pending', -- pending/processing/completed/failed
content_tsv tsvector GENERATED ALWAYS AS (
to_tsvector('simple',
coalesce(concept_tag,'') || ' ' || coalesce(headline_holding,'') || ' ' ||
coalesce(summary,'') || ' ' || coalesce(analysis_text,''))
) STORED,
created_at TIMESTAMPTZ DEFAULT now(),
updated_at TIMESTAMPTZ DEFAULT now()
);
-- Idempotent re-upload (INV-G3): same yomon number = same digest. yomon_number
-- can be '' transiently (before extraction), so the unique index is partial.
CREATE UNIQUE INDEX IF NOT EXISTS uq_digests_yomon_number
ON digests(yomon_number) WHERE yomon_number <> '';
-- Secondary dedup key when yomon_number couldn't be parsed.
CREATE UNIQUE INDEX IF NOT EXISTS uq_digests_content_hash
ON digests(content_hash) WHERE content_hash <> '';
-- HNSW (not ivfflat): the digests radar is a small, slowly-growing corpus
-- (~1/day). ivfflat trains `lists` centroids and probes a subset at query time,
-- so on a small table a single probe can hit an empty list and return 0 rows
-- (recall cliff). HNSW has no list-training/probe step — correct recall from
-- the first row — so it is the right index for a corpus that starts ~empty.
DROP INDEX IF EXISTS idx_digests_embedding; -- drop any pre-existing ivfflat
CREATE INDEX IF NOT EXISTS idx_digests_embedding_hnsw
ON digests USING hnsw (embedding vector_cosine_ops);
CREATE INDEX IF NOT EXISTS idx_digests_linked ON digests(linked_case_law_id);
CREATE INDEX IF NOT EXISTS idx_digests_practice_area ON digests(practice_area);
CREATE INDEX IF NOT EXISTS idx_digests_concept_tag ON digests(concept_tag);
CREATE INDEX IF NOT EXISTS idx_digests_subject_tags ON digests USING gin(subject_tags);
-- Lexical half of a future hybrid (Phase-1 search is semantic-only; index is ready).
CREATE INDEX IF NOT EXISTS idx_digests_content_tsv ON digests USING gin(content_tsv);
"""
async def _run_schema_migrations(pool: asyncpg.Pool) -> None:
async with pool.acquire() as conn:
await conn.execute(SCHEMA_SQL)
@@ -1319,7 +1384,8 @@ async def _run_schema_migrations(pool: asyncpg.Pool) -> None:
await conn.execute(SCHEMA_V27_SQL)
await conn.execute(SCHEMA_V28_SQL)
await conn.execute(SCHEMA_V29_SQL)
logger.info("Database schema initialized (v1-v29)")
await conn.execute(SCHEMA_V30_SQL)
logger.info("Database schema initialized (v1-v30)")
async def init_schema() -> None:
@@ -3494,6 +3560,311 @@ async def delete_case_law(case_law_id: UUID) -> bool:
return result == "DELETE 1"
# ── Digests (X12 — radar layer; separate table, INV-DIG1/2/3) ────────
# Column list shared by every digests SELECT / RETURNING in this module.
# The table's `embedding` and `content_tsv` columns are not part of it.
# Keep the ", " separator between names exactly as written:
# search_digests_semantic splits this string on ", " to prefix each column
# with the "d." table alias.
_DIGEST_COLS = (
    "id, yomon_number, digest_date, publication, source_firm, concept_tag, "
    "headline_holding, analysis_text, summary, underlying_citation, "
    "underlying_court, underlying_date, underlying_judge, practice_area, "
    "appeal_subtype, subject_tags, linked_case_law_id, source_document_path, "
    "content_hash, extraction_status, created_at, updated_at"
)
# Whitelist of column names update_digest may interpolate into its SET clause;
# any other keyword passed to update_digest is dropped. linked_case_law_id is
# not listed here — it is written by link_digest_to_case_law instead.
_DIGEST_UPDATE_ALLOWED = {
    "yomon_number", "digest_date", "publication", "source_firm", "concept_tag",
    "headline_holding", "analysis_text", "summary", "underlying_citation",
    "underlying_court", "underlying_date", "underlying_judge", "practice_area",
    "appeal_subtype", "subject_tags", "source_document_path", "content_hash",
    "extraction_status",
}
def _row_to_digest(row: asyncpg.Record | dict | None) -> dict | None:
    """Turn a digests row into a plain, JSON-friendly dict.

    Returns None for a None row. Otherwise copies the row into a dict and:
    * replaces the date/timestamp columns with their ``isoformat()`` string
      (only when the value is not None and exposes ``isoformat``);
    * replaces a missing or NULL ``subject_tags`` with an empty list;
    * stringifies ``id`` and ``linked_case_law_id`` when they are not None.
    """
    if row is None:
        return None
    digest = dict(row)
    for field in ("digest_date", "underlying_date", "created_at", "updated_at"):
        value = digest.get(field)
        if value is not None and hasattr(value, "isoformat"):
            digest[field] = value.isoformat()
    if digest.get("subject_tags") is None:
        digest["subject_tags"] = []
    for field in ("id", "linked_case_law_id"):
        value = digest.get(field)
        if value is not None:
            digest[field] = str(value)
    return digest
async def create_digest(
    *,
    analysis_text: str,
    yomon_number: str = "",
    digest_date: date | None = None,
    publication: str = "כל יום",
    source_firm: str = "עפר טויסטר, עורכי דין",
    concept_tag: str = "",
    headline_holding: str = "",
    summary: str = "",
    underlying_citation: str = "",
    underlying_court: str = "",
    underlying_date: date | None = None,
    underlying_judge: str = "",
    practice_area: str = "",
    appeal_subtype: str = "",
    subject_tags: list[str] | None = None,
    source_document_path: str = "",
    extraction_status: str = "processing",
) -> dict:
    """Upsert a digest row (X12) and return it normalized via _row_to_digest.

    Idempotent on yomon_number (INV-G3): a repeat upload of the same non-empty
    yomon number updates the existing row in place. On that update path
    digest_date and underlying_date keep their stored value when the new value
    is NULL, source_document_path keeps its stored value when the new one is
    '', and every other listed column is overwritten. linked_case_law_id and
    embedding are not written by this statement.

    content_hash is derived here from analysis_text via _content_hash.

    NOTE(review): ON CONFLICT targets only the yomon_number index, so a
    collision on uq_digests_content_hash alone is not absorbed here and would
    presumably surface as a unique-violation error. Callers are expected to
    pre-check the hash (ingest_digest does) — confirm.
    """
    pool = await get_pool()
    content_hash = _content_hash(analysis_text)
    async with pool.acquire() as conn:
        # Upsert against the partial unique index uq_digests_yomon_number
        # (yomon_number WHERE yomon_number <> ''). The index predicate is
        # repeated in ON CONFLICT, as required for partial indexes.
        row = await conn.fetchrow(
            f"""
INSERT INTO digests (
yomon_number, digest_date, publication, source_firm, concept_tag,
headline_holding, analysis_text, summary, underlying_citation,
underlying_court, underlying_date, underlying_judge, practice_area,
appeal_subtype, subject_tags, source_document_path,
content_hash, extraction_status
) VALUES (
$1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13,
$14, $15, $16, $17, $18
)
ON CONFLICT (yomon_number) WHERE yomon_number <> ''
DO UPDATE SET
digest_date = COALESCE(EXCLUDED.digest_date, digests.digest_date),
publication = EXCLUDED.publication,
source_firm = EXCLUDED.source_firm,
concept_tag = EXCLUDED.concept_tag,
headline_holding = EXCLUDED.headline_holding,
analysis_text = EXCLUDED.analysis_text,
summary = EXCLUDED.summary,
underlying_citation = EXCLUDED.underlying_citation,
underlying_court = EXCLUDED.underlying_court,
underlying_date = COALESCE(EXCLUDED.underlying_date, digests.underlying_date),
underlying_judge = EXCLUDED.underlying_judge,
practice_area = EXCLUDED.practice_area,
appeal_subtype = EXCLUDED.appeal_subtype,
subject_tags = EXCLUDED.subject_tags,
source_document_path = COALESCE(NULLIF(EXCLUDED.source_document_path, ''), digests.source_document_path),
content_hash = EXCLUDED.content_hash,
extraction_status = EXCLUDED.extraction_status,
updated_at = now()
RETURNING {_DIGEST_COLS}
""",
            yomon_number, digest_date, publication, source_firm, concept_tag,
            headline_holding, analysis_text, summary, underlying_citation,
            underlying_court, underlying_date, underlying_judge, practice_area,
            # subject_tags=None is sent as an empty array (column is NOT NULL).
            appeal_subtype, list(subject_tags or []), source_document_path,
            content_hash, extraction_status,
        )
    return _row_to_digest(row)
async def get_digest(digest_id: UUID | str) -> dict | None:
    """Fetch one digest by primary key.

    Accepts a UUID or anything whose ``str()`` parses as one. Returns the row
    normalized by _row_to_digest, or None when no row has that id.
    """
    pool = await get_pool()
    if isinstance(digest_id, UUID):
        digest_uuid = digest_id
    else:
        digest_uuid = UUID(str(digest_id))
    query = f"SELECT {_DIGEST_COLS} FROM digests WHERE id = $1"
    found = await pool.fetchrow(query, digest_uuid)
    return _row_to_digest(found)
async def get_digest_by_content_hash(content_hash: str) -> dict | None:
    """Look up a digest by its content hash.

    An empty hash short-circuits to None without touching the database.
    Otherwise returns the matching row normalized by _row_to_digest, or None.
    """
    if content_hash:
        pool = await get_pool()
        query = f"SELECT {_DIGEST_COLS} FROM digests WHERE content_hash = $1"
        found = await pool.fetchrow(query, content_hash)
        return _row_to_digest(found)
    return None
async def update_digest(digest_id: UUID | str, **fields) -> dict | None:
    """Patch metadata columns on one digest row.

    Only keywords named in _DIGEST_UPDATE_ALLOWED are applied; anything else is
    ignored. With nothing left to apply, the current row is returned unchanged
    (via get_digest). Otherwise the row is updated, ``updated_at`` is bumped,
    and the new row is returned normalized — or None if the id does not exist.
    """
    digest_uuid = digest_id if isinstance(digest_id, UUID) else UUID(str(digest_id))
    allowed = {
        name: value
        for name, value in fields.items()
        if name in _DIGEST_UPDATE_ALLOWED
    }
    if not allowed:
        return await get_digest(digest_uuid)

    pool = await get_pool()
    args: list = [digest_uuid]  # $1 is always the row id
    assignments = []
    for column, value in allowed.items():
        if column == "subject_tags":
            value = list(value or [])
        args.append(value)
        # Column names come from the whitelist, so interpolating them is safe;
        # values always travel as bind parameters.
        assignments.append(f"{column} = ${len(args)}")
    assignments.append("updated_at = now()")
    sql = f"UPDATE digests SET {', '.join(assignments)} WHERE id = $1 RETURNING {_DIGEST_COLS}"
    updated = await pool.fetchrow(sql, *args)
    return _row_to_digest(updated)
async def store_digest_embedding(digest_id: UUID | str, vector: list[float]) -> None:
    """Write the digest's single embedding vector and bump ``updated_at``.

    ``digest_id`` may be a UUID or a value whose ``str()`` parses as one.
    """
    pool = await get_pool()
    if isinstance(digest_id, UUID):
        digest_uuid = digest_id
    else:
        digest_uuid = UUID(str(digest_id))
    statement = "UPDATE digests SET embedding = $2, updated_at = now() WHERE id = $1"
    await pool.execute(statement, digest_uuid, vector)
async def link_digest_to_case_law(
    digest_id: UUID | str, case_law_id: UUID | str | None,
) -> dict | None:
    """Point a digest at its underlying ruling, or clear the pointer (INV-DIG3).

    Passing ``case_law_id=None`` stores NULL. Returns the updated row
    normalized by _row_to_digest, or None when no digest has that id.
    """
    pool = await get_pool()
    digest_uuid = digest_id if isinstance(digest_id, UUID) else UUID(str(digest_id))
    if case_law_id is None or isinstance(case_law_id, UUID):
        ruling_uuid = case_law_id
    else:
        ruling_uuid = UUID(str(case_law_id))
    statement = (
        "UPDATE digests SET linked_case_law_id = $2, updated_at = now() "
        f"WHERE id = $1 RETURNING {_DIGEST_COLS}"
    )
    updated = await pool.fetchrow(statement, digest_uuid, ruling_uuid)
    return _row_to_digest(updated)
async def delete_digest(digest_id: UUID | str) -> bool:
    """Delete one digest by id; True only when exactly one row was removed."""
    pool = await get_pool()
    if isinstance(digest_id, UUID):
        digest_uuid = digest_id
    else:
        digest_uuid = UUID(str(digest_id))
    status = await pool.execute("DELETE FROM digests WHERE id = $1", digest_uuid)
    # The driver reports the command tag; "DELETE 1" means one row was deleted.
    return status == "DELETE 1"
async def list_digests(
    practice_area: str = "",
    concept_tag: str = "",
    linked: bool | None = None,
    search: str = "",
    limit: int = 100,
    offset: int = 0,
) -> list[dict]:
    """Return digests matching the given filters, newest issue first.

    Empty-string filters are skipped. ``practice_area`` is an exact match;
    ``concept_tag`` and ``search`` are case-insensitive substring matches
    (``search`` looks at yomon_number, concept_tag, headline_holding,
    underlying_citation and summary). ``linked`` True/False keeps only rows
    with / without a linked ruling (INV-DIG3 gap surfacing); None keeps both.
    Rows are ordered by digest_date (NULLs last) then created_at, descending.
    """
    pool = await get_pool()
    clauses: list[str] = []
    args: list = []

    def bind(value) -> str:
        """Register a bind parameter and return its $n placeholder."""
        args.append(value)
        return f"${len(args)}"

    if practice_area:
        clauses.append(f"practice_area = {bind(practice_area)}")
    if concept_tag:
        tag_pattern = f"%{concept_tag}%"
        clauses.append(f"concept_tag ILIKE {bind(tag_pattern)}")
    if linked is True:
        clauses.append("linked_case_law_id IS NOT NULL")
    elif linked is False:
        clauses.append("linked_case_law_id IS NULL")
    if search:
        search_pattern = f"%{search}%"
        # One parameter, referenced by all five columns.
        ph = bind(search_pattern)
        clauses.append(
            f"(yomon_number ILIKE {ph} OR concept_tag ILIKE {ph} "
            f"OR headline_holding ILIKE {ph} OR underlying_citation ILIKE {ph} "
            f"OR summary ILIKE {ph})"
        )

    where_sql = ""
    if clauses:
        where_sql = " WHERE " + " AND ".join(clauses)
    limit_ph = bind(limit)
    offset_ph = bind(offset)
    sql = (
        f"SELECT {_DIGEST_COLS} FROM digests{where_sql} "
        f"ORDER BY digest_date DESC NULLS LAST, created_at DESC "
        f"LIMIT {limit_ph} OFFSET {offset_ph}"
    )
    records = await pool.fetch(sql, *args)
    return [_row_to_digest(record) for record in records]
async def search_digests_semantic(
    query_embedding: list[float],
    practice_area: str = "",
    subject_tag: str = "",
    concept_tag: str = "",
    limit: int = 10,
) -> list[dict]:
    """Pure-semantic search over the digests radar (X12).

    Each digest has a single vector (no chunks/halachot), so there is no RRF
    here — see X12 §6. Rows without an embedding are excluded. Results are
    ordered by the ``<=>`` distance to ``query_embedding`` and carry
    ``score = 1 - distance``, ``type = "digest"`` and the linked ruling's
    case_number / case_name / searchable (NULL when unlinked) so the researcher
    sees the pointer target directly.
    """
    pool = await get_pool()
    # $1 = query vector, $2 = limit; optional filters start at $3.
    args: list = [query_embedding, limit]
    clauses = ["d.embedding IS NOT NULL"]

    def bind(value) -> str:
        """Register a bind parameter and return its $n placeholder."""
        args.append(value)
        return f"${len(args)}"

    if practice_area:
        clauses.append(f"d.practice_area = {bind(practice_area)}")
    if subject_tag:
        clauses.append(f"{bind(subject_tag)} = ANY(d.subject_tags)")
    if concept_tag:
        tag_pattern = f"%{concept_tag}%"
        clauses.append(f"d.concept_tag ILIKE {bind(tag_pattern)}")

    # _DIGEST_COLS is a ", "-separated list; alias every column to table d.
    select_cols = ", ".join("d." + col for col in _DIGEST_COLS.split(", "))
    where_clause = " AND ".join(clauses)
    sql = f"""
SELECT {select_cols},
cl.case_number AS linked_case_number,
cl.case_name AS linked_case_name,
cl.searchable AS linked_searchable,
1 - (d.embedding <=> $1) AS score
FROM digests d
LEFT JOIN case_law cl ON cl.id = d.linked_case_law_id
WHERE {where_clause}
ORDER BY d.embedding <=> $1
LIMIT $2
"""
    records = await pool.fetch(sql, *args)
    hits = []
    for record in records:
        hit = _row_to_digest(record)
        hit.update(
            linked_case_number=record["linked_case_number"],
            linked_case_name=record["linked_case_name"],
            linked_searchable=record["linked_searchable"],
            score=float(record["score"]),
            type="digest",
        )
        hits.append(hit)
    return hits
async def find_case_law_by_citation_fuzzy(citation: str) -> dict | None:
    """Best-effort match of a digest's underlying_citation to a case_law row.

    Used by autolink (INV-DIG3). Tries, in order:

    1. exact equality between the whole citation and ``case_number``;
    2. the first docket-like token in the citation (digits joined by '-' or
       '/', e.g. ``46111-12-22`` or ``3975/22``) appearing inside a
       ``case_number`` as a whole number group — i.e. not preceded or followed
       by another digit. A plain substring match would let ``3975/22`` match
       ``13975/22`` or ``3975/221`` and link a digest to the wrong ruling.

    Returns the first match (oldest ``created_at`` for the docket fallback) or
    None. Read-only: never mutates. Blank input returns None without touching
    the database; database errors propagate to the caller.
    """
    citation = (citation or "").strip()
    if not citation:
        return None
    pool = await get_pool()
    row = await pool.fetchrow(
        "SELECT * FROM case_law WHERE case_number = $1 LIMIT 1",
        citation,
    )
    if row:
        return _row_to_case_law(row)
    # Extract a docket-like token: digits with '-' or '/' separators.
    m = re.search(r"\d+[-/]\d+(?:[-/]\d+)?", citation)
    if not m:
        return None
    docket = m.group(0)
    # The token consists solely of digits, '-' and '/', none of which is a
    # regex metacharacter outside a bracket expression, so it can be embedded
    # in the POSIX pattern as-is. The guards on both sides require a non-digit
    # (or string edge) so only a complete docket number matches.
    pattern = f"(^|[^0-9]){docket}([^0-9]|$)"
    row = await pool.fetchrow(
        "SELECT * FROM case_law "
        "WHERE case_number ~ $1 ORDER BY created_at LIMIT 1",
        pattern,
    )
    return _row_to_case_law(row) if row else None
async def store_precedent_chunks(
case_law_id: UUID, chunks: list[dict],
) -> int:

View File

@@ -0,0 +1,268 @@
"""Orchestrator for the Digests radar (X12).
A digest ("כל יום" daily one-pager) is a SECONDARY source that POINTS at a
ruling — it is never cited in a decision (INV-DIG1) and never enters the
precedent/halacha pipeline (INV-DIG2). Ingest is therefore a short, standalone
path that reuses only ATOMIC services (extract_text, embeddings), NOT the
canonical ``ingest.ingest_document`` (which is bound to case_law):
file → extract_text → content_hash (idempotent) → LLM metadata extract
→ create_digest → single embedding (concept+headline+summary+analysis)
→ try_autolink(underlying_citation → case_law) [INV-DIG3]
→ extraction_status='completed'
claude_session rule: ``digest_metadata_extractor`` (local CLI) is imported
LAZILY inside ``ingest_digest`` only, so this module is import-safe from the
FastAPI container for the search/list/link/delete paths (DB + voyage only).
"""
from __future__ import annotations
import logging
from datetime import date
from pathlib import Path
from typing import Awaitable, Callable
from uuid import UUID
from legal_mcp import config
from legal_mcp.services import db, embeddings, extractor, ingest
logger = logging.getLogger(__name__)
# Progress callback type: awaited as progress(status, percent, message).
ProgressCb = Callable[[str, int, str], Awaitable[None]]
# Root directory for digest files; ingest_digest passes it to
# ingest._stage_file when staging an upload.
DIGEST_LIBRARY_DIR = Path(config.DATA_DIR) / "digests"
# practice_area values ingest_digest accepts. ingest_digest only validates
# non-empty values, so "" (unspecified) never actually reaches the check.
_VALID_PRACTICE_AREAS = frozenset(
    {"", "rishuy_uvniya", "betterment_levy", "compensation_197"}
)
async def _noop_progress(_status: str, _percent: int, _msg: str) -> None:
    """Default progress callback: accept a progress update and do nothing."""
def _embedding_text(fields: dict) -> str:
    """Build the text that is embedded as the digest's single vector.

    Joins concept_tag, headline_holding, summary and analysis_text (in that
    order, newline-separated), skipping any that are missing or empty, and
    strips leading/trailing whitespace from the result.
    """
    ordered_keys = ("concept_tag", "headline_holding", "summary", "analysis_text")
    pieces = []
    for key in ordered_keys:
        value = fields.get(key, "")
        if value:
            pieces.append(value)
    return "\n".join(pieces).strip()
async def try_autolink(digest_id: UUID | str, underlying_citation: str) -> str | None:
    """Best-effort link of a digest to the underlying ruling in case_law (INV-DIG3).

    Looks the citation up with db.find_case_law_by_citation_fuzzy and, on a
    match, stores the link with db.link_digest_to_case_law.

    Returns the case_law id (str) when the link was actually written, else
    None — for a blank citation, no match, a digest row that no longer exists,
    or any lookup/link failure. Never raises: failures are logged as warnings
    so a linking problem cannot abort the surrounding ingest.
    """
    citation = (underlying_citation or "").strip()
    if not citation:
        return None
    try:
        match = await db.find_case_law_by_citation_fuzzy(citation)
    except Exception as e:
        logger.warning("digest try_autolink lookup failed for %r: %s", citation, e)
        return None
    if not match:
        return None
    try:
        updated = await db.link_digest_to_case_law(digest_id, match["id"])
    except Exception as e:
        # The write is guarded as well — otherwise the "never raises" contract
        # only covered the lookup half.
        logger.warning(
            "digest try_autolink link failed for %s -> %s: %s",
            digest_id, match["id"], e,
        )
        return None
    if updated is None:
        # UPDATE matched no digest row, so nothing was linked.
        logger.warning("digest try_autolink: digest %s not found", digest_id)
        return None
    return str(match["id"])
async def ingest_digest(
    *,
    file_path: str | Path,
    yomon_number: str = "",
    digest_date: date | str | None = None,
    practice_area: str = "",
    appeal_subtype: str = "",
    subject_tags: list[str] | None = None,
    progress: ProgressCb | None = None,
) -> dict:
    """Ingest one digest. **MCP-tool-only** (uses the local LLM extractor).

    Pipeline: stage file → extract text → content-hash idempotency check →
    LLM metadata extraction → upsert row → single embedding → autolink →
    mark ``extraction_status='completed'``.

    User-supplied args win over LLM-extracted values for the same field
    (the chair typed them deliberately); empty args are filled from the LLM.
    Idempotent on yomon_number / content_hash (INV-G3).

    Args:
        file_path: digest file to ingest; must exist.
        yomon_number, digest_date, practice_area, appeal_subtype, subject_tags:
            optional overrides for the extracted metadata. ``digest_date`` may
            be a ``date`` or an ISO ``YYYY-MM-DD`` string; an unparseable
            string is logged and ignored (the LLM value is used instead).
        progress: optional ``async (status, percent, message)`` callback.

    Returns:
        ``{"status": "exists", ...}`` when identical text was already ingested,
        otherwise ``{"status": "completed", ...}`` including ``embedded``
        (False when no vector was stored, i.e. the digest will not appear in
        semantic search) and ``linked_case_law_id`` (None = open gap).

    Raises:
        ValueError: invalid ``practice_area``, missing file, or no text
            could be extracted.
    """
    progress = progress or _noop_progress
    if practice_area and practice_area not in _VALID_PRACTICE_AREAS:
        raise ValueError(f"invalid practice_area: {practice_area!r}")
    src = Path(file_path)
    if not src.exists():
        raise ValueError(f"file not found: {file_path}")

    await progress("staging", 5, "מעתיק קובץ")
    staged = ingest._stage_file(src, DIGEST_LIBRARY_DIR, "incoming")
    # Store the path relative to DATA_DIR when the staged file is under it.
    # relative_to() compares path components; a string-prefix test would treat
    # e.g. "/data2/x" as inside "/data" and then crash in relative_to().
    try:
        rel_path = str(staged.relative_to(config.DATA_DIR))
    except ValueError:
        rel_path = str(staged)

    await progress("extracting_text", 20, "מחלץ טקסט")
    raw_text, _page_count, _offsets = await extractor.extract_text(str(staged))
    raw_text = (raw_text or "").strip()
    if not raw_text:
        raise ValueError("no text extracted from digest")

    # Idempotency: identical text already ingested → return existing row.
    content_hash = db._content_hash(raw_text)
    existing = await db.get_digest_by_content_hash(content_hash)
    if existing:
        await progress("completed", 100, "יומון זהה כבר קיים — לא נוצר כפל")
        return {
            "status": "exists",
            "digest_id": existing["id"],
            "yomon_number": existing.get("yomon_number", ""),
            "linked_case_law_id": existing.get("linked_case_law_id"),
        }

    # LLM metadata extraction (lazy import — keeps this module container-safe).
    await progress("extracting_metadata", 45, "מחלץ מטא-דאטה (LLM)")
    from legal_mcp.services import digest_metadata_extractor
    extracted = await digest_metadata_extractor.extract(raw_text)

    def _coerce_date(v) -> date | None:
        """Accept a date or an ISO date string; anything else becomes None."""
        if v is None or v == "":
            return None
        if isinstance(v, date):
            return v
        if isinstance(v, str):
            try:
                return date.fromisoformat(v[:10])
            except ValueError:
                # Surface the ignored override instead of dropping it silently.
                logger.warning("ingest_digest: ignoring invalid digest_date %r", v)
                return None
        return None

    # Merge: explicit user args win; otherwise fall back to LLM extraction.
    fields = {
        "analysis_text": raw_text,
        "yomon_number": yomon_number.strip() or extracted.get("yomon_number", ""),
        "digest_date": _coerce_date(digest_date) or extracted.get("digest_date"),
        "concept_tag": extracted.get("concept_tag", ""),
        "headline_holding": extracted.get("headline_holding", ""),
        "summary": extracted.get("summary", ""),
        "underlying_citation": extracted.get("underlying_citation", ""),
        "underlying_court": extracted.get("underlying_court", ""),
        "underlying_date": extracted.get("underlying_date"),
        "underlying_judge": extracted.get("underlying_judge", ""),
        "practice_area": practice_area or extracted.get("practice_area", ""),
        "appeal_subtype": appeal_subtype.strip() or extracted.get("appeal_subtype", ""),
        "subject_tags": list(subject_tags) if subject_tags else extracted.get("subject_tags", []),
        "source_document_path": rel_path,
        "extraction_status": "processing",
    }

    await progress("storing", 70, "שומר רשומה")
    record = await db.create_digest(**fields)
    digest_id = record["id"]

    # Single embedding for the whole digest (atomic discovery unit — X12 §6).
    # A failure here must not abort the ingest, but it is reported back via
    # `embedded` because semantic search skips rows that have no vector.
    await progress("embedding", 85, "מחשב embedding")
    embedded = False
    emb_text = _embedding_text(fields)
    if emb_text:
        try:
            vecs = await embeddings.embed_texts([emb_text], input_type="document")
            if vecs:
                await db.store_digest_embedding(digest_id, vecs[0])
                embedded = True
            else:
                logger.warning("digest embedding returned no vector for %s", digest_id)
        except Exception as e:  # surfaced, not swallowed (§6)
            logger.warning("digest embedding failed for %s: %s", digest_id, e)

    # Bridge to the underlying ruling if it is already in the library (INV-DIG3).
    await progress("linking", 95, "מנסה לקשר לפסק המקורי")
    linked_id = await try_autolink(digest_id, fields["underlying_citation"])

    await db.update_digest(digest_id, extraction_status="completed")
    await progress("completed", 100, "הושלם")
    return {
        "status": "completed",
        "digest_id": digest_id,
        "yomon_number": fields["yomon_number"],
        "underlying_citation": fields["underlying_citation"],
        "linked_case_law_id": linked_id,
        "embedded": embedded,
        "fields_extracted": sorted(extracted.keys()),
    }
async def link_digest(digest_id: UUID | str, case_law_id: UUID | str) -> dict:
    """Manually link a digest to its underlying ruling (INV-DIG3). Idempotent.

    Verifies that both the digest and the case_law row exist before writing.

    Raises:
        ValueError: the digest or the case_law row does not exist.
    """
    if not await db.get_digest(digest_id):
        raise ValueError("digest not found")

    if isinstance(case_law_id, UUID):
        ruling_uuid = case_law_id
    else:
        ruling_uuid = UUID(str(case_law_id))
    ruling = await db.get_case_law(ruling_uuid)
    if not ruling:
        raise ValueError("case_law not found")

    updated = await db.link_digest_to_case_law(digest_id, case_law_id)
    result = {
        "linked": True,
        "digest_id": str(digest_id),
        "case_law_id": str(case_law_id),
        "case_number": ruling.get("case_number"),
        "digest": updated,
    }
    return result
async def relink_digest(digest_id: UUID | str) -> dict:
    """Retry autolink for a digest whose ruling may have reached the library.

    A digest that is already linked is reported as-is (``changed=False``)
    without a new lookup. Otherwise try_autolink runs on the stored
    underlying_citation; ``linked`` and ``changed`` are both True only when it
    produced a link.

    Raises:
        ValueError: the digest does not exist.
    """
    digest = await db.get_digest(digest_id)
    if not digest:
        raise ValueError("digest not found")

    if digest.get("linked_case_law_id"):
        return {
            "linked": True,
            "digest_id": str(digest_id),
            "case_law_id": digest["linked_case_law_id"],
            "changed": False,
        }

    new_link = await try_autolink(digest_id, digest.get("underlying_citation", ""))
    was_linked = new_link is not None
    return {
        "linked": was_linked,
        "digest_id": str(digest_id),
        "case_law_id": new_link,
        "changed": was_linked,
    }
async def search_digests(
    query: str,
    practice_area: str = "",
    subject_tag: str = "",
    concept_tag: str = "",
    limit: int = 10,
) -> list[dict]:
    """Semantic search over the digests radar. Container-safe (voyage + DB).

    A blank query returns an empty list without embedding anything. Otherwise
    the query is embedded and handed to db.search_digests_semantic together
    with the filters.
    """
    if query.strip():
        query_vec = await embeddings.embed_query(query)
        hits = await db.search_digests_semantic(
            query_embedding=query_vec,
            practice_area=practice_area,
            subject_tag=subject_tag,
            concept_tag=concept_tag,
            limit=limit,
        )
        return hits
    return []
async def get_digest(digest_id: UUID | str) -> dict | None:
    """Return one digest by id (delegates to db.get_digest); None if absent."""
    found = await db.get_digest(digest_id)
    return found
async def list_digests(
    practice_area: str = "",
    concept_tag: str = "",
    linked: bool | None = None,
    search: str = "",
    limit: int = 100,
    offset: int = 0,
) -> list[dict]:
    """List digests with the given filters (delegates to db.list_digests)."""
    filters = {
        "practice_area": practice_area,
        "concept_tag": concept_tag,
        "linked": linked,
        "search": search,
        "limit": limit,
        "offset": offset,
    }
    return await db.list_digests(**filters)
async def delete_digest(digest_id: UUID | str) -> bool:
    """Delete one digest (delegates to db.delete_digest) and return its result."""
    removed = await db.delete_digest(digest_id)
    return removed

View File

@@ -0,0 +1,137 @@
"""Auto-extract catalog metadata from a "כל יום" daily digest (X12).
A digest is a one-page secondary summary (Ofer Toister) of a single ruling.
This module reads its raw text and asks the local Claude CLI to extract the
fields the radar needs: yomon number, concept tag, headline holding, a short
summary, the UNDERLYING ruling's citation (the critical bridge field — INV-DIG3),
its court / date / judge, practice area and subject tags.
claude_session rule: this module imports ``claude_session`` (the local CLI),
so it is **MCP-tool-only** — never import it from the FastAPI container. It is
pulled in lazily inside ``digest_library.ingest_digest`` only.
Unlike ``precedent_metadata_extractor`` (which patches a DB row), this returns
a plain dict from raw text; ``digest_library`` decides how to merge/store it.
"""
from __future__ import annotations
import logging
from datetime import date as date_type
from legal_mcp.config import parse_llm_json
from legal_mcp.services import claude_session
logger = logging.getLogger(__name__)
# practice_area values extract() accepts from the LLM. "" is a member, but
# extract() additionally requires a non-empty value before keeping the field.
_VALID_PRACTICE_AREAS = {"", "rishuy_uvniya", "betterment_levy", "compensation_197"}
# System prompt for the digest metadata extraction. extract() passes it
# verbatim as `system=`; the digest text goes in a separate user message.
# Never run this through str.format(): the JSON example below contains
# '{' / '}', which str.format would treat as placeholders and crash on
# (same trap documented in precedent_metadata_extractor).
# The *_iso key names requested here are the ones extract() reads back.
DIGEST_EXTRACTION_PROMPT = """אתה מסייע משפטי בכיר. לפניך "יומון" — סיכום עמוד-אחד של משרד עפר טויסטר (עלון "כל יום")
על פסק דין/החלטה אחת בתחום תכנון ובנייה / היטל השבחה / פיצויים (ס' 197). חלץ ממנו מטא-דאטה לקטלוג.
**אל תמציא** — שדה שלא מופיע בטקסט → השאר ריק (מחרוזת ריקה / מערך ריק).
## פלט נדרש
החזר JSON אחד (object — לא array), ללא markdown וללא הסברים:
{
"yomon_number": "מספר היומון מהכותרת ('יומון מס' 5163''5163'). ספרות בלבד. אם אין — ריק.",
"digest_date_iso": "YYYY-MM-DD — תאריך גיליון היומון (בכותרת, למשל '7 ביוני 2026''2026-06-07').",
"concept_tag": "תג-המושג שבמרכאות בראש העמוד (למשל 'שיקול הדעת המצומצם', 'Cherry-picking'). ביטוי קצר אחד.",
"headline_holding": "כותרת-ההלכה המודגשת מתחת לתג — משפט אחד שמסכם מה נקבע (למשל 'ביהמ\\"ש - שיקול דעת הוועדה המחוזית אינו מצומצם לטעות חמורה').",
"summary": "תקציר ניטרלי 2-3 משפטים בגוף שלישי: מה הייתה השאלה ומה הוכרע. בלי שיפוט.",
"underlying_citation": "מראה-המקום של פסק הדין/ההחלטה המקורי, כפי שמופיע בתחתית היומון, מילה במילה (למשל 'עת\\"מ 46111-12-22 יכין-אפק בע\\"מ נ' הוועדה המחוזית'). זהו השדה הקריטי — חלץ אותו במלואו ובדיוק.",
"underlying_court": "הערכאה שנתנה את פסק הדין המקורי (למשל 'בית המשפט לעניינים מנהליים מרכז-לוד', 'ועדת הערר מחוז ירושלים').",
"underlying_date_iso": "YYYY-MM-DD — תאריך מתן פסק הדין/ההחלטה המקורי (לרוב 'ניתן ביום DD.M.YY' בתחתית). שים לב: זה שונה מתאריך גיליון היומון!",
"underlying_judge": "שם השופט/ת או יו\\"ר ההרכב שנתן את פסק הדין המקורי (למשל 'יעל טויסטר ישראלי'). בלי תארים ('עו\\"ד', 'כב' השופט').",
"practice_area": "אחד מ-3: 'rishuy_uvniya' (רישוי ובנייה/הקלות/שימוש חורג) / 'betterment_levy' (היטל השבחה) / 'compensation_197' (פיצויים ס'197). אם לא ברור — ריק.",
"appeal_subtype": "תת-סוג קצר אם בולט (למשל 'הקלה', 'שיקול דעת הוועדה', 'מימוש במכר'). אחרת ריק.",
"subject_tags": ["3-7 תגיות בעברית snake_case (שיקול_דעת, הקלה, ועדה_מחוזית, היטל_השבחה, ...)"]
}
## כללי איכות
1. **underlying_citation** — השדה החשוב ביותר; הוא הגשר לפסק הדין המקורי. חלץ מההערות/התחתית, מילה במילה.
2. **הבחן בין שני התאריכים**: digest_date_iso = תאריך גיליון היומון (בכותרת); underlying_date_iso = מועד מתן פסק הדין (בתחתית, 'ניתן ביום...'). אל תבלבל.
3. **summary** — ניטרלי, גוף שלישי, בלי מילות שיפוט.
4. **subject_tags** — snake_case, תחום ועדת ערר תכנון ובנייה בלבד.
5. אם רכיב לא מופיע בבירור — השאר את אותו שדה ריק. אל תנחש.
"""
def _norm_str(result: dict, key: str) -> str:
    """Return ``result[key]`` stripped, or "" when it is missing or not a str."""
    value = result.get(key)
    if isinstance(value, str):
        return value.strip()
    return ""
def _norm_date(result: dict, key: str) -> date_type | None:
v = result.get(key)
if not isinstance(v, str) or not v.strip():
return None
try:
return date_type.fromisoformat(v.strip()[:10])
except ValueError:
logger.debug("digest_metadata_extractor: ignoring invalid %s=%r", key, v)
return None
async def extract(raw_text: str) -> dict:
    """Extract digest metadata from the raw text of a digest.

    The text is wrapped in begin/end markers and sent to
    ``claude_session.query_json`` with ``DIGEST_EXTRACTION_PROMPT`` as the
    system prompt; the reply is then normalised field by field.

    Args:
        raw_text: Full text of the digest. ``None``, empty or
            whitespace-only input returns ``{}`` without querying the model.

    Returns:
        A dict that may contain: yomon_number, digest_date (date),
        concept_tag, headline_holding, summary, underlying_citation,
        underlying_court, underlying_date (date), underlying_judge,
        practice_area, appeal_subtype, subject_tags (list[str]).
        Missing/invalid fields are omitted so the caller's merge keeps user
        values. ``{}`` is returned when the query fails or when the reply is
        not a dict; both cases are logged at WARNING level rather than
        raised.
    """
    text = (raw_text or "").strip()
    if not text:
        return {}

    user_msg = f"--- תחילת היומון ---\n{text}\n--- סוף היומון ---"
    try:
        result = await claude_session.query_json(
            user_msg, system=DIGEST_EXTRACTION_PROMPT,
        )
    except Exception as e:  # surfaced as warning, not swallowed silently (§6)
        logger.warning("digest_metadata_extractor: query failed: %s", e)
        return {}
    if not isinstance(result, dict):
        logger.warning(
            "digest_metadata_extractor: expected dict, got %s",
            type(result).__name__,
        )
        return {}

    out: dict = {}

    # Plain text fields: copied only when non-empty after stripping.
    for key in (
        "yomon_number", "concept_tag", "headline_holding", "summary",
        "underlying_citation", "underlying_court", "underlying_judge",
        "appeal_subtype",
    ):
        s = _norm_str(result, key)
        if s:
            out[key] = s

    # The model answers with *_iso keys; the output uses the shorter names.
    dd = _norm_date(result, "digest_date_iso")
    if dd is not None:
        out["digest_date"] = dd
    ud = _norm_date(result, "underlying_date_iso")
    if ud is not None:
        out["underlying_date"] = ud

    pa = _norm_str(result, "practice_area")
    if pa in _VALID_PRACTICE_AREAS and pa:
        out["practice_area"] = pa

    tags = result.get("subject_tags")
    if isinstance(tags, list):
        # Keep only genuine strings, mirroring _norm_str: coercing with str()
        # would turn a JSON null into the bogus tag "None" and nested
        # objects into their repr.
        stripped = (t.strip() for t in tags if isinstance(t, str))
        clean = [t for t in stripped if t]
        if clean:
            out["subject_tags"] = clean
    return out

View File

@@ -0,0 +1,161 @@
"""MCP tools for the Digests radar (X12).
A digest ("כל יום" daily one-pager, Ofer Toister) is a SECONDARY, discovery-
layer source that POINTS at a ruling. It is distinct from the three citation
corpora:
- ``search_precedent_library`` — authoritative external court rulings.
- ``search_internal_decisions`` — appeals-committee decisions.
- ``search_decisions`` — Dafna's prior decisions (style corpus).
A digest is NEVER cited in a decision (INV-DIG1) and NEVER enters the halacha
pipeline (INV-DIG2). ``search_digests`` is a research compass: it surfaces the
relevant digest + the UNDERLYING ruling's citation, which is then ingested into
the precedent library and cited from there.
"""
from __future__ import annotations
import time
from uuid import UUID
from legal_mcp.services import db, digest_library, telemetry
from legal_mcp.tools.envelope import empty, err as _err, ok as _ok
async def digest_upload(
    file_path: str,
    yomon_number: str = "",
    digest_date: str = "",
    practice_area: str = "",
    appeal_subtype: str = "",
    subject_tags: list[str] | None = None,
) -> str:
    """Upload a daily digest ("Kol Yom") to the discovery corpus and
    auto-extract its metadata.

    The digest is a secondary source that points at the original ruling; it
    is not cited in a decision.

    Args:
        file_path: Full path to the digest's PDF/DOCX file.
        yomon_number: Digest number (optional; extracted from the text if
            empty).
        digest_date: ISO date of the digest issue (optional). An empty
            string is forwarded as ``None``.
        practice_area: rishuy_uvniya / betterment_levy / compensation_197.
        appeal_subtype: Sub-type label, forwarded to ingestion as given.
        subject_tags: Subject tags (optional; extracted if empty). An empty
            list is forwarded as ``None``.

    Returns: JSON with the digest_id, the digest number, the citation, and
    the automatic link if one was found. Any exception raised by ingestion
    is returned as an error envelope carrying its message.
    """
    ingest_kwargs = {
        "file_path": file_path,
        "yomon_number": yomon_number,
        "digest_date": digest_date or None,
        "practice_area": practice_area,
        "appeal_subtype": appeal_subtype,
        "subject_tags": subject_tags or None,
    }
    try:
        outcome = await digest_library.ingest_digest(**ingest_kwargs)
    except Exception as exc:
        return _err(str(exc))
    else:
        return _ok(outcome)
async def digest_list(
    practice_area: str = "",
    concept_tag: str = "",
    linked: bool | None = None,
    search: str = "",
    limit: int = 100,
) -> str:
    """List digests in the discovery corpus, with filters.

    ``linked=false`` selects digests whose original ruling has not yet been
    ingested into the precedent library (a visible knowledge gap).
    """
    filters = {
        "practice_area": practice_area,
        "concept_tag": concept_tag,
        "linked": linked,
        "search": search,
        "limit": limit,
    }
    matching = await digest_library.list_digests(**filters)
    return _ok(matching)
async def digest_get(digest_id: str) -> str:
    """Fetch a single digest by its identifier.

    Returns an error envelope when ``digest_id`` is not a valid UUID string
    or when no matching digest is found.
    """
    try:
        parsed_id = UUID(digest_id)
    except ValueError:
        return _err("digest_id לא תקין")

    found = await digest_library.get_digest(parsed_id)
    return _ok(found) if found else _err("יומון לא נמצא")
async def digest_link(digest_id: str, case_law_id: str) -> str:
    """Manually link a digest to the original ruling in the precedent
    library (INV-DIG3).

    Both identifiers are validated as UUID strings first; the original
    strings (not the parsed UUIDs) are what is handed to the library call.
    Any exception raised by the link operation is returned as an error
    envelope carrying its message.
    """
    # Validate in argument order so the first malformed id short-circuits.
    for candidate in (digest_id, case_law_id):
        try:
            UUID(candidate)
        except ValueError:
            return _err("מזהה לא תקין")

    try:
        outcome = await digest_library.link_digest(digest_id, case_law_id)
    except Exception as exc:
        return _err(str(exc))
    else:
        return _ok(outcome)
async def digest_relink(digest_id: str) -> str:
    """Retry linking: check whether the digest's original ruling is already
    in the library and, if so, link it.

    ``digest_id`` is validated as a UUID string, then passed on unchanged.
    Any exception raised by the relink operation is returned as an error
    envelope carrying its message.
    """
    try:
        UUID(digest_id)
    except ValueError:
        return _err("digest_id לא תקין")

    try:
        outcome = await digest_library.relink_digest(digest_id)
    except Exception as exc:
        return _err(str(exc))
    else:
        return _ok(outcome)
async def digest_delete(digest_id: str) -> str:
    """Delete a digest from the discovery corpus.

    Returns an error envelope when ``digest_id`` is not a valid UUID string
    or when the library reports that nothing was deleted.
    """
    try:
        parsed_id = UUID(digest_id)
    except ValueError:
        return _err("digest_id לא תקין")

    removed = await digest_library.delete_digest(parsed_id)
    if removed:
        return _ok({"deleted": True, "digest_id": digest_id})
    return _err("יומון לא נמצא")
async def search_digests(
    query: str,
    practice_area: str = "",
    subject_tag: str = "",
    concept_tag: str = "",
    limit: int = 10,
) -> str:
    """Semantic search over the discovery corpus ("Kol Yom" daily digests).

    A research compass only: it returns the relevant digest together with
    the citation of the original ruling (radar). The digest itself is not
    cited in a decision (INV-DIG1); quote from the original ruling via
    search_precedent_library.

    Queries shorter than 2 characters after stripping are rejected with an
    "empty" envelope before any search runs. Every executed search is
    reported to telemetry with its wall-clock duration in milliseconds.
    """
    q = (query or "").strip()
    if len(q) < 2:
        return empty("שאילתה קצרה מדי (פחות מ-2 תווים).")

    started = time.perf_counter()
    hits = await digest_library.search_digests(
        query=q,
        practice_area=practice_area,
        subject_tag=subject_tag,
        concept_tag=concept_tag,
        limit=limit,
    )
    duration_ms = int((time.perf_counter() - started) * 1000)

    # Telemetry is recorded for empty result sets as well.
    telemetry.log_search_bg(
        search_type="digests",
        query=q,
        results=hits,
        duration_ms=duration_ms,
        practice_area=practice_area or None,
        user_agent="unknown",
    )

    return _ok(hits) if hits else empty("לא נמצאו יומונים תואמים.")