feat: external precedent library with auto halacha extraction
All checks were successful
Build & Deploy / build-and-deploy (push) Successful in 1m27s

Adds a third corpus of legal authority distinct from style_corpus
(Daphna's prior decisions for voice) and case_precedents (chair-attached
quotes per case). The new corpus holds chair-uploaded court rulings and
other appeals committee decisions, with binding rules (הלכות) extracted
automatically and queued for chair approval.

Pipeline (web/app.py + services/precedent_library.py):
file → extract → chunk → Voyage embed → halacha_extractor → store +
publish progress over the existing Redis SSE channel.

Schema V7 (services/db.py): extends case_law with source_kind +
extraction status fields under a CHECK constraint pinning practice_area
to the three appeals committee domains (rishuy_uvniya, betterment_levy,
compensation_197). New precedent_chunks (vector(1024)) and halachot
tables (vector(1024) over rule_statement, IVFFlat indexes, gin on
practice_areas/subject_tags). Halachot start as pending_review; only
approved/published rows are visible to search_precedent_library.

Agents: legal-writer, legal-researcher, legal-analyst, legal-ceo,
legal-qa get search_precedent_library. legal-writer prompt explains
the three-corpus distinction and CREAC use; legal-qa now verifies that
every cited halacha resolves to an approved row in the corpus.

UI: /precedents page with four tabs — library / semantic search /
pending review (J/K nav, A/R/E shortcuts, badge count) / stats.
Reuses the existing upload-sheet progress + SSE pattern.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-03 08:38:18 +00:00
parent a6edb75bbf
commit 7ee90dce31
23 changed files with 3853 additions and 67 deletions

View File

@@ -47,6 +47,7 @@ mcp = FastMCP(
from legal_mcp.tools import ( # noqa: E402
cases, documents, search, drafting, workflow, precedents,
precedent_library as plib,
)
@@ -142,10 +143,114 @@ async def precedent_remove(precedent_id: str) -> str:
async def precedent_search_library(
query: str, practice_area: str = "", limit: int = 10,
) -> str:
"""חיפוש בספרייה הרוחבית של ציטוטים שנצברו בין תיקים."""
"""חיפוש בציטוטים שדפנה צירפה ידנית לתיקים בעבר (case_precedents).
שונה מ-search_precedent_library שמחפש בקורפוס הפסיקה הסמכותית."""
return await precedents.precedent_search_library(query, practice_area, limit)
# ── External Precedent Library — authoritative case-law corpus ─────
# Distinct from precedent_search_library above (chair-attached quotes)
# and from search_decisions (Daphna's style corpus).
@mcp.tool()
async def precedent_library_upload(
file_path: str,
citation: str,
case_name: str = "",
court: str = "",
decision_date: str = "",
source_type: str = "",
precedent_level: str = "",
practice_area: str = "",
appeal_subtype: str = "",
subject_tags: list[str] | None = None,
is_binding: bool = True,
headnote: str = "",
summary: str = "",
) -> str:
"""העלאת פסיקה חיצונית (פס"ד / החלטה של ועדה אחרת) לקורפוס הסמכותי. מחלץ הלכות אוטומטית — כולן ממתינות לאישור היו"ר. practice_area: rishuy_uvniya / betterment_levy / compensation_197."""
return await plib.precedent_library_upload(
file_path, citation, case_name, court, decision_date,
source_type, precedent_level, practice_area, appeal_subtype,
subject_tags, is_binding, headnote, summary,
)
@mcp.tool()
async def precedent_library_list(
practice_area: str = "",
court: str = "",
precedent_level: str = "",
source_type: str = "",
search: str = "",
limit: int = 100,
) -> str:
"""רשימת הפסיקה בקורפוס הסמכותי, עם פילטרים."""
return await plib.precedent_library_list(
practice_area, court, precedent_level, source_type, search, limit,
)
@mcp.tool()
async def precedent_library_get(case_law_id: str) -> str:
"""פסיקה ספציפית בקורפוס + רשימת ההלכות שחולצו ממנה (כולל ממתינות לאישור)."""
return await plib.precedent_library_get(case_law_id)
@mcp.tool()
async def precedent_library_delete(case_law_id: str) -> str:
"""מחיקת פסיקה מהקורפוס (cascade: chunks + halachot)."""
return await plib.precedent_library_delete(case_law_id)
@mcp.tool()
async def precedent_extract_halachot(case_law_id: str) -> str:
"""הרצה מחדש של חילוץ הלכות לפסיקה קיימת. ההלכות הקיימות נמחקות, החדשות חוזרות לסטטוס pending_review."""
return await plib.precedent_extract_halachot(case_law_id)
@mcp.tool()
async def search_precedent_library(
query: str,
practice_area: str = "",
court: str = "",
precedent_level: str = "",
appeal_subtype: str = "",
subject_tag: str = "",
limit: int = 10,
include_halachot: bool = True,
) -> str:
"""חיפוש סמנטי בקורפוס הפסיקה הסמכותית. מחזיר הלכות (מאושרות בלבד) + קטעי טקסט. השתמש כש-legal-writer צריך לצטט פסיקה מחייבת בבלוק י (CREAC: rule + explanation)."""
return await plib.search_precedent_library(
query, practice_area, court, precedent_level, appeal_subtype,
None, subject_tag, limit, include_halachot,
)
@mcp.tool()
async def halacha_review(
halacha_id: str,
status: str,
reviewer: str = "דפנה",
rule_statement: str = "",
reasoning_summary: str = "",
subject_tags: list[str] | None = None,
practice_areas: list[str] | None = None,
) -> str:
"""אישור / דחייה / עריכה של הלכה שחולצה אוטומטית. status: pending_review / approved / rejected / published."""
return await plib.halacha_review(
halacha_id, status, reviewer, rule_statement, reasoning_summary,
subject_tags, practice_areas,
)
@mcp.tool()
async def halachot_pending(limit: int = 100) -> str:
"""תור ההלכות הממתינות לאישור."""
return await plib.halachot_pending(limit)
# Documents
@mcp.tool()
async def document_upload(

View File

@@ -7,14 +7,16 @@ from dataclasses import dataclass, field
from legal_mcp import config
# Hebrew legal section headers
# Hebrew legal section headers.
# Covers both appeals committee decisions and external court rulings —
# court rulings use slightly different vocabulary (פסק דין, נימוקים, סוף דבר).
SECTION_PATTERNS = [
(r"רקע\s*עובדתי|רקע\s*כללי|העובדות|הרקע", "facts"),
(r"טענות\s*העוררי[םן]|טענות\s*המערערי[םן]|עיקר\s*טענות\s*העוררי[םן]", "appellant_claims"),
(r"טענות\s*המשיבי[םן]|תשובת\s*המשיבי[םן]|עיקר\s*טענות\s*המשיבי[םן]", "respondent_claims"),
(r"דיון\s*והכרעה|דיון|הכרעה|ניתוח\s*משפטי|המסגרת\s*המשפטית", "legal_analysis"),
(r"מסקנ[הות]|סיכום", "conclusion"),
(r"החלטה|לפיכך\s*אני\s*מחליט|התוצאה", "ruling"),
(r"דיון\s*והכרעה|דיון|הכרעה|ניתוח\s*משפטי|המסגרת\s*המשפטית|נימוקים", "legal_analysis"),
(r"מסקנ[הות]|סיכום|סוף\s*דבר", "conclusion"),
(r"פסק[- ]?דין|החלטה|לפיכך\s*אני\s*מחליט|התוצאה", "ruling"),
(r"מבוא|פתיחה|לפניי", "intro"),
]

View File

@@ -518,6 +518,91 @@ CREATE INDEX IF NOT EXISTS idx_cases_archived ON cases(archived_at) WHERE archiv
"""
# ── V7: External Precedent Library + halacha extraction ──────────
# Chair-uploaded external court rulings and other appeals committee decisions
# become an authoritative law corpus. Distinct from style_corpus (Daphna's
# style) and case_precedents (chair-attached quotes scoped to a single case).
SCHEMA_V7_SQL = """
-- case_law extensions: distinguish chair-uploaded full rulings from
-- auto-extracted citation stubs, and track ingestion progress.
ALTER TABLE case_law ADD COLUMN IF NOT EXISTS source_kind TEXT DEFAULT 'cited_only';
-- 'external_upload' (chair uploaded full ruling) | 'cited_only' (stub from
-- references_extractor) | 'nevo_seed' (future: auto-fetched from Nevo).
ALTER TABLE case_law ADD COLUMN IF NOT EXISTS document_id UUID REFERENCES documents(id) ON DELETE SET NULL;
ALTER TABLE case_law ADD COLUMN IF NOT EXISTS extraction_status TEXT DEFAULT 'pending';
-- 'pending' | 'processing' | 'completed' | 'failed'
ALTER TABLE case_law ADD COLUMN IF NOT EXISTS halacha_extraction_status TEXT DEFAULT 'pending';
ALTER TABLE case_law ADD COLUMN IF NOT EXISTS practice_area TEXT DEFAULT '';
ALTER TABLE case_law ADD COLUMN IF NOT EXISTS appeal_subtype TEXT DEFAULT '';
ALTER TABLE case_law ADD COLUMN IF NOT EXISTS headnote TEXT DEFAULT '';
-- chair-editable abstract shown in search results.
ALTER TABLE case_law ADD COLUMN IF NOT EXISTS source_type TEXT DEFAULT '';
-- 'court_ruling' | 'appeals_committee'
-- practice_area is closed to the three appeals committee domains.
DO $$ BEGIN
ALTER TABLE case_law ADD CONSTRAINT case_law_practice_area_check
CHECK (practice_area IN ('', 'rishuy_uvniya', 'betterment_levy', 'compensation_197'));
EXCEPTION WHEN duplicate_object THEN NULL; END $$;
CREATE INDEX IF NOT EXISTS idx_case_law_source_kind ON case_law(source_kind);
CREATE INDEX IF NOT EXISTS idx_case_law_practice ON case_law(practice_area, appeal_subtype);
-- precedent_chunks: full-text chunks of an uploaded ruling, with embeddings.
-- Analog of document_chunks for case_law rows where source_kind='external_upload'.
CREATE TABLE IF NOT EXISTS precedent_chunks (
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
case_law_id UUID REFERENCES case_law(id) ON DELETE CASCADE,
chunk_index INTEGER NOT NULL,
content TEXT NOT NULL,
section_type TEXT DEFAULT 'other',
-- intro | facts | legal_analysis | ruling | conclusion | other
page_number INTEGER,
embedding vector(1024),
created_at TIMESTAMPTZ DEFAULT now()
);
CREATE INDEX IF NOT EXISTS idx_precedent_chunks_case_law ON precedent_chunks(case_law_id);
CREATE INDEX IF NOT EXISTS idx_precedent_chunks_section ON precedent_chunks(case_law_id, section_type);
CREATE INDEX IF NOT EXISTS idx_precedent_chunks_vec
ON precedent_chunks USING ivfflat (embedding vector_cosine_ops) WITH (lists = 50);
-- halachot: extracted binding rules. One halacha = one rule + verbatim quote.
-- Embedded separately for rule-precision semantic match (chunks centroid is
-- dominated by surrounding context). All halachot start as pending_review;
-- only approved/published rows are visible to search_precedent_library.
CREATE TABLE IF NOT EXISTS halachot (
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
case_law_id UUID REFERENCES case_law(id) ON DELETE CASCADE,
halacha_index INTEGER NOT NULL,
rule_statement TEXT NOT NULL,
rule_type TEXT DEFAULT 'binding',
-- binding | interpretive | procedural | obiter
reasoning_summary TEXT DEFAULT '',
supporting_quote TEXT NOT NULL,
page_reference TEXT DEFAULT '',
practice_areas TEXT[] DEFAULT '{}',
subject_tags TEXT[] DEFAULT '{}',
cites TEXT[] DEFAULT '{}',
confidence NUMERIC(3,2) DEFAULT 0.0,
quote_verified BOOLEAN DEFAULT FALSE,
review_status TEXT DEFAULT 'pending_review',
-- pending_review | approved | rejected | published
reviewer TEXT DEFAULT '',
reviewed_at TIMESTAMPTZ,
embedding vector(1024),
created_at TIMESTAMPTZ DEFAULT now(),
updated_at TIMESTAMPTZ DEFAULT now()
);
CREATE INDEX IF NOT EXISTS idx_halachot_case_law ON halachot(case_law_id);
CREATE INDEX IF NOT EXISTS idx_halachot_status ON halachot(review_status);
CREATE INDEX IF NOT EXISTS idx_halachot_practice ON halachot USING gin(practice_areas);
CREATE INDEX IF NOT EXISTS idx_halachot_tags ON halachot USING gin(subject_tags);
CREATE INDEX IF NOT EXISTS idx_halachot_vec
ON halachot USING ivfflat (embedding vector_cosine_ops) WITH (lists = 50);
"""
async def init_schema() -> None:
pool = await get_pool()
async with pool.acquire() as conn:
@@ -528,7 +613,8 @@ async def init_schema() -> None:
await conn.execute(SCHEMA_V4_SQL)
await conn.execute(SCHEMA_V5_SQL)
await conn.execute(SCHEMA_V6_SQL)
logger.info("Database schema initialized (v1-v6)")
await conn.execute(SCHEMA_V7_SQL)
logger.info("Database schema initialized (v1-v7)")
# ── Case CRUD ───────────────────────────────────────────────────────
@@ -1518,3 +1604,590 @@ async def detect_appraiser_conflicts(case_id: UUID) -> list[dict]:
"entries": entries,
})
return conflicts
# ── V7: External precedent library + halachot ─────────────────────
def _row_to_case_law(row: asyncpg.Record) -> dict:
"""Normalize a case_law row, parsing subject_tags JSONB to list."""
d = dict(row)
if isinstance(d.get("subject_tags"), str):
try:
d["subject_tags"] = json.loads(d["subject_tags"])
except (TypeError, ValueError):
d["subject_tags"] = []
if d.get("date") is not None:
d["date"] = d["date"].isoformat()
return d
async def get_case_law(case_law_id: UUID) -> dict | None:
pool = await get_pool()
row = await pool.fetchrow(
"SELECT * FROM case_law WHERE id = $1", case_law_id,
)
return _row_to_case_law(row) if row else None
async def get_case_law_by_citation(case_number: str) -> dict | None:
pool = await get_pool()
row = await pool.fetchrow(
"SELECT * FROM case_law WHERE case_number = $1", case_number,
)
return _row_to_case_law(row) if row else None
async def create_external_case_law(
case_number: str,
case_name: str,
full_text: str,
court: str = "",
decision_date: date | None = None,
practice_area: str = "",
appeal_subtype: str = "",
subject_tags: list[str] | None = None,
summary: str = "",
headnote: str = "",
key_quote: str = "",
source_url: str = "",
source_type: str = "",
precedent_level: str = "",
is_binding: bool = True,
document_id: UUID | None = None,
) -> dict:
"""Insert a chair-uploaded external precedent into case_law.
If a row with this ``case_number`` already exists with
source_kind='cited_only' (auto-discovered), promote it to
source_kind='external_upload' and fill in the missing fields.
"""
pool = await get_pool()
tags_json = json.dumps(subject_tags or [], ensure_ascii=False)
async with pool.acquire() as conn:
existing = await conn.fetchrow(
"SELECT id, source_kind FROM case_law WHERE case_number = $1",
case_number,
)
if existing:
row = await conn.fetchrow(
"""
UPDATE case_law SET
case_name = $2,
court = COALESCE(NULLIF($3, ''), court),
date = COALESCE($4, date),
practice_area = $5,
appeal_subtype = $6,
subject_tags = $7,
summary = COALESCE(NULLIF($8, ''), summary),
headnote = $9,
key_quote = COALESCE(NULLIF($10, ''), key_quote),
full_text = $11,
source_url = COALESCE(NULLIF($12, ''), source_url),
source_type = $13,
precedent_level = $14,
is_binding = $15,
document_id = COALESCE($16, document_id),
source_kind = 'external_upload',
extraction_status = 'processing',
halacha_extraction_status = 'pending'
WHERE id = $1
RETURNING *
""",
existing["id"], case_name, court, decision_date,
practice_area, appeal_subtype, tags_json, summary, headnote,
key_quote, full_text, source_url, source_type,
precedent_level, is_binding, document_id,
)
else:
row = await conn.fetchrow(
"""
INSERT INTO case_law (
case_number, case_name, court, date, subject_tags,
summary, key_quote, full_text, source_url,
source_kind, document_id, extraction_status,
halacha_extraction_status, practice_area, appeal_subtype,
headnote, source_type, precedent_level, is_binding
) VALUES (
$1, $2, $3, $4, $5, $6, $7, $8, $9,
'external_upload', $10, 'processing', 'pending',
$11, $12, $13, $14, $15, $16
)
RETURNING *
""",
case_number, case_name, court, decision_date, tags_json,
summary, key_quote, full_text, source_url,
document_id, practice_area, appeal_subtype, headnote,
source_type, precedent_level, is_binding,
)
return _row_to_case_law(row)
async def update_case_law(case_law_id: UUID, **fields) -> dict | None:
"""Patch metadata fields on a case_law row.
Allowed fields: case_name, court, date, practice_area, appeal_subtype,
subject_tags, summary, headnote, key_quote, source_url, source_type,
precedent_level, is_binding.
"""
allowed = {
"case_name", "court", "date", "practice_area", "appeal_subtype",
"subject_tags", "summary", "headnote", "key_quote", "source_url",
"source_type", "precedent_level", "is_binding",
}
updates = {k: v for k, v in fields.items() if k in allowed}
if not updates:
return await get_case_law(case_law_id)
pool = await get_pool()
set_parts = []
params: list = [case_law_id]
for i, (k, v) in enumerate(updates.items(), start=2):
if k == "subject_tags":
v = json.dumps(v or [], ensure_ascii=False)
set_parts.append(f"{k} = ${i}")
params.append(v)
sql = f"UPDATE case_law SET {', '.join(set_parts)} WHERE id = $1 RETURNING *"
row = await pool.fetchrow(sql, *params)
return _row_to_case_law(row) if row else None
async def set_case_law_extraction_status(case_law_id: UUID, status: str) -> None:
pool = await get_pool()
await pool.execute(
"UPDATE case_law SET extraction_status = $2 WHERE id = $1",
case_law_id, status,
)
async def set_case_law_halacha_status(case_law_id: UUID, status: str) -> None:
pool = await get_pool()
await pool.execute(
"UPDATE case_law SET halacha_extraction_status = $2 WHERE id = $1",
case_law_id, status,
)
async def list_external_case_law(
practice_area: str = "",
court: str = "",
precedent_level: str = "",
source_type: str = "",
search: str = "",
limit: int = 100,
offset: int = 0,
) -> list[dict]:
"""List chair-uploaded precedents, with simple filters."""
pool = await get_pool()
conditions = ["source_kind = 'external_upload'"]
params: list = []
idx = 1
if practice_area:
conditions.append(f"practice_area = ${idx}")
params.append(practice_area)
idx += 1
if court:
conditions.append(f"court ILIKE ${idx}")
params.append(f"%{court}%")
idx += 1
if precedent_level:
conditions.append(f"precedent_level = ${idx}")
params.append(precedent_level)
idx += 1
if source_type:
conditions.append(f"source_type = ${idx}")
params.append(source_type)
idx += 1
if search:
conditions.append(
f"(case_number ILIKE ${idx} OR case_name ILIKE ${idx} "
f"OR summary ILIKE ${idx} OR headnote ILIKE ${idx})"
)
params.append(f"%{search}%")
idx += 1
where_sql = " AND ".join(conditions)
params.extend([limit, offset])
sql = f"""
SELECT id, case_number, case_name, court, date, practice_area,
appeal_subtype, source_type, precedent_level, is_binding,
summary, headnote, subject_tags, source_kind,
extraction_status, halacha_extraction_status,
created_at,
(SELECT COUNT(*) FROM halachot h WHERE h.case_law_id = case_law.id) AS halachot_count,
(SELECT COUNT(*) FROM halachot h WHERE h.case_law_id = case_law.id
AND h.review_status IN ('approved', 'published')) AS approved_count
FROM case_law
WHERE {where_sql}
ORDER BY created_at DESC
LIMIT ${idx} OFFSET ${idx + 1}
"""
rows = await pool.fetch(sql, *params)
return [_row_to_case_law(r) for r in rows]
async def delete_case_law(case_law_id: UUID) -> bool:
"""Delete a precedent and cascade chunks + halachot."""
pool = await get_pool()
result = await pool.execute(
"DELETE FROM case_law WHERE id = $1", case_law_id,
)
return result == "DELETE 1"
async def store_precedent_chunks(
case_law_id: UUID, chunks: list[dict],
) -> int:
"""Replace precedent chunks for a case_law row.
Each chunk dict has: chunk_index, content, section_type, page_number,
embedding (list[float] or None).
"""
pool = await get_pool()
async with pool.acquire() as conn:
await conn.execute(
"DELETE FROM precedent_chunks WHERE case_law_id = $1",
case_law_id,
)
for c in chunks:
await conn.execute(
"""INSERT INTO precedent_chunks
(case_law_id, chunk_index, content, section_type,
page_number, embedding)
VALUES ($1, $2, $3, $4, $5, $6)""",
case_law_id,
c["chunk_index"],
c["content"],
c.get("section_type", "other"),
c.get("page_number"),
c.get("embedding"),
)
return len(chunks)
async def list_precedent_chunks(
case_law_id: UUID,
section_types: tuple[str, ...] | None = None,
) -> list[dict]:
pool = await get_pool()
if section_types:
rows = await pool.fetch(
"""SELECT id, chunk_index, content, section_type, page_number
FROM precedent_chunks
WHERE case_law_id = $1 AND section_type = ANY($2::text[])
ORDER BY chunk_index""",
case_law_id, list(section_types),
)
else:
rows = await pool.fetch(
"""SELECT id, chunk_index, content, section_type, page_number
FROM precedent_chunks
WHERE case_law_id = $1
ORDER BY chunk_index""",
case_law_id,
)
return [dict(r) for r in rows]
async def delete_halachot(case_law_id: UUID) -> int:
pool = await get_pool()
result = await pool.execute(
"DELETE FROM halachot WHERE case_law_id = $1", case_law_id,
)
# result is e.g. "DELETE 5" — extract the number.
try:
return int(result.split()[-1])
except (ValueError, IndexError):
return 0
async def store_halachot(case_law_id: UUID, halachot: list[dict]) -> int:
"""Bulk-insert extracted halachot. Always with review_status='pending_review'."""
if not halachot:
return 0
pool = await get_pool()
async with pool.acquire() as conn:
for i, h in enumerate(halachot):
await conn.execute(
"""INSERT INTO halachot
(case_law_id, halacha_index, rule_statement, rule_type,
reasoning_summary, supporting_quote, page_reference,
practice_areas, subject_tags, cites, confidence,
quote_verified, embedding, review_status)
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11,
$12, $13, 'pending_review')""",
case_law_id,
i,
h["rule_statement"],
h.get("rule_type", "binding"),
h.get("reasoning_summary", ""),
h["supporting_quote"],
h.get("page_reference", ""),
h.get("practice_areas", []),
h.get("subject_tags", []),
h.get("cites", []),
h.get("confidence", 0.0),
h.get("quote_verified", False),
h.get("embedding"),
)
return len(halachot)
async def list_halachot(
case_law_id: UUID | None = None,
review_status: str | None = None,
practice_area: str | None = None,
limit: int = 200,
offset: int = 0,
) -> list[dict]:
pool = await get_pool()
conditions = []
params: list = []
idx = 1
if case_law_id is not None:
conditions.append(f"h.case_law_id = ${idx}")
params.append(case_law_id)
idx += 1
if review_status:
conditions.append(f"h.review_status = ${idx}")
params.append(review_status)
idx += 1
if practice_area:
conditions.append(f"${idx} = ANY(h.practice_areas)")
params.append(practice_area)
idx += 1
where_sql = f"WHERE {' AND '.join(conditions)}" if conditions else ""
params.extend([limit, offset])
sql = f"""
SELECT h.id, h.case_law_id, h.halacha_index, h.rule_statement,
h.rule_type, h.reasoning_summary, h.supporting_quote,
h.page_reference, h.practice_areas, h.subject_tags,
h.cites, h.confidence, h.quote_verified, h.review_status,
h.reviewer, h.reviewed_at, h.created_at, h.updated_at,
cl.case_number, cl.case_name, cl.court, cl.date AS decision_date,
cl.precedent_level
FROM halachot h
LEFT JOIN case_law cl ON cl.id = h.case_law_id
{where_sql}
ORDER BY h.case_law_id, h.halacha_index
LIMIT ${idx} OFFSET ${idx + 1}
"""
rows = await pool.fetch(sql, *params)
out = []
for r in rows:
d = dict(r)
if d.get("decision_date") is not None:
d["decision_date"] = d["decision_date"].isoformat()
out.append(d)
return out
async def update_halacha(
halacha_id: UUID,
review_status: str | None = None,
reviewer: str = "",
rule_statement: str | None = None,
reasoning_summary: str | None = None,
subject_tags: list[str] | None = None,
practice_areas: list[str] | None = None,
) -> dict | None:
"""Update a halacha — used by the chair to approve/reject/edit."""
pool = await get_pool()
set_parts: list[str] = []
params: list = [halacha_id]
idx = 2
if review_status is not None:
set_parts.append(f"review_status = ${idx}")
params.append(review_status)
idx += 1
if review_status in ("approved", "rejected", "published"):
set_parts.append(f"reviewed_at = now()")
set_parts.append(f"reviewer = ${idx}")
params.append(reviewer)
idx += 1
if rule_statement is not None:
set_parts.append(f"rule_statement = ${idx}")
params.append(rule_statement)
idx += 1
if reasoning_summary is not None:
set_parts.append(f"reasoning_summary = ${idx}")
params.append(reasoning_summary)
idx += 1
if subject_tags is not None:
set_parts.append(f"subject_tags = ${idx}")
params.append(subject_tags)
idx += 1
if practice_areas is not None:
set_parts.append(f"practice_areas = ${idx}")
params.append(practice_areas)
idx += 1
if not set_parts:
return None
set_parts.append("updated_at = now()")
sql = f"UPDATE halachot SET {', '.join(set_parts)} WHERE id = $1 RETURNING *"
row = await pool.fetchrow(sql, *params)
return dict(row) if row else None
async def search_precedent_library_semantic(
query_embedding: list[float],
practice_area: str = "",
court: str = "",
precedent_level: str = "",
appeal_subtype: str = "",
is_binding: bool | None = None,
subject_tag: str = "",
limit: int = 10,
include_halachot: bool = True,
) -> list[dict]:
"""Semantic search over chair-uploaded precedents.
Returns merged halachot + chunks. Halachot are pre-distilled rules, so
they get a small score boost. Only ``approved`` / ``published`` halachot
are visible (per chair-review policy). Chunks are visible regardless
of halacha review status.
"""
pool = await get_pool()
halacha_filters = ["h.review_status IN ('approved', 'published')"]
chunk_filters = ["cl.source_kind = 'external_upload'"]
h_params: list = [query_embedding, limit]
c_params: list = [query_embedding, limit]
h_idx = 3
c_idx = 3
if practice_area:
halacha_filters.append(f"${h_idx} = ANY(h.practice_areas)")
h_params.append(practice_area)
h_idx += 1
chunk_filters.append(f"cl.practice_area = ${c_idx}")
c_params.append(practice_area)
c_idx += 1
if court:
halacha_filters.append(f"cl.court ILIKE ${h_idx}")
h_params.append(f"%{court}%")
h_idx += 1
chunk_filters.append(f"cl.court ILIKE ${c_idx}")
c_params.append(f"%{court}%")
c_idx += 1
if precedent_level:
halacha_filters.append(f"cl.precedent_level = ${h_idx}")
h_params.append(precedent_level)
h_idx += 1
chunk_filters.append(f"cl.precedent_level = ${c_idx}")
c_params.append(precedent_level)
c_idx += 1
if appeal_subtype:
halacha_filters.append(f"cl.appeal_subtype = ${h_idx}")
h_params.append(appeal_subtype)
h_idx += 1
chunk_filters.append(f"cl.appeal_subtype = ${c_idx}")
c_params.append(appeal_subtype)
c_idx += 1
if is_binding is not None:
halacha_filters.append(f"cl.is_binding = ${h_idx}")
h_params.append(is_binding)
h_idx += 1
chunk_filters.append(f"cl.is_binding = ${c_idx}")
c_params.append(is_binding)
c_idx += 1
if subject_tag:
halacha_filters.append(f"${h_idx} = ANY(h.subject_tags)")
h_params.append(subject_tag)
h_idx += 1
halacha_sql = f"""
SELECT h.id AS halacha_id, h.case_law_id, h.rule_statement,
h.reasoning_summary, h.supporting_quote, h.page_reference,
h.practice_areas, h.subject_tags, h.confidence, h.rule_type,
cl.case_number, cl.case_name, cl.court, cl.date AS decision_date,
cl.precedent_level,
1 - (h.embedding <=> $1) AS score
FROM halachot h
JOIN case_law cl ON cl.id = h.case_law_id
WHERE {' AND '.join(halacha_filters)}
AND h.embedding IS NOT NULL
ORDER BY h.embedding <=> $1
LIMIT $2
"""
chunk_sql = f"""
SELECT pc.id AS chunk_id, pc.case_law_id, pc.content,
pc.section_type, pc.page_number,
cl.case_number, cl.case_name, cl.court, cl.date AS decision_date,
cl.precedent_level, cl.practice_area,
1 - (pc.embedding <=> $1) AS score
FROM precedent_chunks pc
JOIN case_law cl ON cl.id = pc.case_law_id
WHERE {' AND '.join(chunk_filters)}
AND pc.embedding IS NOT NULL
ORDER BY pc.embedding <=> $1
LIMIT $2
"""
results: list[dict] = []
if include_halachot:
rows = await pool.fetch(halacha_sql, *h_params)
for r in rows:
d = dict(r)
if d.get("decision_date") is not None:
d["decision_date"] = d["decision_date"].isoformat()
d["score"] = float(d["score"]) + 0.05 # rule-level boost
d["type"] = "halacha"
results.append(d)
rows = await pool.fetch(chunk_sql, *c_params)
for r in rows:
d = dict(r)
if d.get("decision_date") is not None:
d["decision_date"] = d["decision_date"].isoformat()
d["score"] = float(d["score"])
d["type"] = "passage"
results.append(d)
results.sort(key=lambda x: x["score"], reverse=True)
return results[:limit]
async def precedent_library_stats() -> dict:
"""Aggregate stats for the /precedents stats tab."""
pool = await get_pool()
async with pool.acquire() as conn:
total = await conn.fetchval(
"SELECT COUNT(*) FROM case_law WHERE source_kind = 'external_upload'"
)
by_practice = await conn.fetch(
"""SELECT practice_area, COUNT(*) AS n
FROM case_law
WHERE source_kind = 'external_upload'
GROUP BY practice_area
ORDER BY n DESC"""
)
by_level = await conn.fetch(
"""SELECT precedent_level, COUNT(*) AS n
FROM case_law
WHERE source_kind = 'external_upload'
GROUP BY precedent_level
ORDER BY n DESC"""
)
halachot_total = await conn.fetchval(
"SELECT COUNT(*) FROM halachot"
)
halachot_pending = await conn.fetchval(
"SELECT COUNT(*) FROM halachot WHERE review_status = 'pending_review'"
)
halachot_approved = await conn.fetchval(
"SELECT COUNT(*) FROM halachot WHERE review_status IN ('approved', 'published')"
)
return {
"precedents_total": int(total or 0),
"by_practice_area": [
{"practice_area": r["practice_area"], "count": int(r["n"])}
for r in by_practice
],
"by_precedent_level": [
{"precedent_level": r["precedent_level"], "count": int(r["n"])}
for r in by_level
],
"halachot_total": int(halachot_total or 0),
"halachot_pending": int(halachot_pending or 0),
"halachot_approved": int(halachot_approved or 0),
}

View File

@@ -0,0 +1,326 @@
"""Extract binding legal rules (הלכות) from external court rulings.
Runs Claude (via the local headless ``claude -p`` bridge) over the
legal_analysis / ruling / conclusion chunks of a precedent, returns a
structured list of halachot, validates each one against the source text,
embeds the rule statement, and stores everything as ``pending_review`` in
the ``halachot`` table.
All extraction is idempotent — calling ``extract(case_law_id)`` twice
deletes prior rows for that precedent first.
Trust model:
Per chair decision, NO halacha is auto-published. Every extracted
halacha enters with ``review_status='pending_review'``. The chair
approves/rejects via the UI, and only ``approved`` (or ``published``)
rows are visible to ``search_precedent_library`` and the writing
agents.
"""
from __future__ import annotations
import asyncio
import logging
import re
from uuid import UUID
from legal_mcp import config
from legal_mcp.config import parse_llm_json
from legal_mcp.services import claude_session, db, embeddings, proofreader
logger = logging.getLogger(__name__)
# Concurrency model mirrors claims_extractor — each ``claude -p`` subprocess
# holds ~300 MB RSS, so we cap parallel chunks to keep the box healthy.
CHUNK_CONCURRENCY = 3
CHUNK_RETRY_ATTEMPTS = 1
# Sections from which to extract. facts/intro/appellant_claims/respondent_claims
# never contain holdings, only positions, so we skip them.
EXTRACTABLE_SECTIONS = ("legal_analysis", "ruling", "conclusion")
HALACHA_EXTRACTION_PROMPT = """אתה משפטן בכיר המתמחה בדיני תכנון ובניה (ועדות ערר, היטל השבחה, פיצויים לפי סעיף 197 לחוק התכנון והבניה). תפקידך: לחלץ הלכות מחייבות מתוך פסק דין/החלטה משפטית.
## הגדרות מחייבות
הלכה (binding rule) = כלל משפטי שהפסק קובע או מאמץ ומיישם, באופן שניתן להסתמך עליו בהחלטות עתידיות.
לא-הלכה (אין לחלץ):
- אמרת אגב (obiter dicta) — הערות שאינן הכרחיות להכרעה.
- ממצאים עובדתיים ספציפיים לתיק ("העורר לא הוכיח X").
- ציטוטי הלכות מפסקי דין אחרים שלא אומצו במפורש בפסק זה.
- הצהרות על דין קיים שאינן מיושמות בהכרעה.
הבחנה קריטית: כאשר הפסק מצטט הלכה מפסק קודם, חלץ אותה רק אם בית המשפט בפסק הנוכחי **מאמץ ומחיל** אותה (לא רק מזכיר אותה ברקע).
## תחומים אפשריים (practice_areas) — תחומי ועדת הערר בלבד
- rishuy_uvniya — רישוי ובניה (תיקי 1xxx: היתרים, שימוש חורג, תכניות, קווי בניין, גובה, חניה)
- betterment_levy — היטל השבחה (תיקי 8xxx: שומה, מערכות, תכניות המקנות בה, מועד קובע, סופיות ההחלטה)
- compensation_197 — פיצויים לפי ס' 197 (תיקי 9xxx: פגיעה במקרקעין, ירידת ערך, ס' 200/פטור)
הלכה אחת יכולה לחול על כמה תחומים — practice_areas הוא array ולא string יחיד.
## סוגי הלכה (rule_type)
- binding — הלכה מחייבת שהוחלה על התיק.
- interpretive — פרשנות סעיף חוק/תכנית שאומצה.
- procedural — כלל פרוצדורלי (סמכות, מועדים, הליכי שמיעה).
- obiter — אמרת אגב חשובה (חלץ רק אם משמעותית; סמן confidence נמוך).
## פלט נדרש
החזר JSON array בלבד, ללא markdown, ללא הסברים. דוגמה:
[
{
"rule_statement": "ניסוח הכלל בלשון משפטית מדויקת בגוף שלישי, 1-3 משפטים.",
"rule_type": "binding",
"reasoning_summary": "תמצית ההיגיון: למה בית המשפט הגיע לכלל הזה (1-2 משפטים).",
"supporting_quote": "ציטוט מילולי מדויק מהפסק התומך בכלל. חייב להופיע מילה במילה בטקסט הקלט.",
"page_reference": "פס' 12 / עמ' 8 — ככל שניתן לזהות מהקלט.",
"practice_areas": ["betterment_levy"],
"subject_tags": ["מועד_קביעת_שומה", "סופיות_ההחלטה"],
"cites": ["עע\\"מ 3975/22"],
"confidence": 0.85
}
]
## כללי איכות
1. **נאמנות מוחלטת לציטוט** — supporting_quote חייב להיות הדבקה מדויקת מהקלט. אם אין ציטוט מתאים — אל תמציא הלכה.
2. **מספר הלכות** — פסק רגיל מכיל 1-4 הלכות מחייבות. אל תמתח את הרשימה. אם אין הלכה — החזר [].
3. **לא לפצל יתר על המידה** — אם שני סעיפים מבטאים את אותו עיקרון, אחד את הניסוח.
4. **שפה** — rule_statement בעברית משפטית מקצועית, לא צמצום מילולי של הציטוט.
5. **subject_tags** — 2-5 תגיות בעברית, snake_case (חניה, קווי_בניין, שיקול_דעת, פגם_פרוצדורלי, סמכות, מועדים, פגיעה_במקרקעין, ירידת_ערך).
6. **confidence** — 0..1. מתחת ל-0.7 = ספק לגבי היות זה הלכה מחייבת.
"""
_VALID_PRACTICE_AREAS = {"rishuy_uvniya", "betterment_levy", "compensation_197"}
_VALID_RULE_TYPES = {"binding", "interpretive", "procedural", "obiter"}
def _normalize_for_comparison(text: str) -> str:
"""Normalize Hebrew text for substring matching.
Collapses whitespace and unifies the half-dozen Hebrew quote-mark
variants. Use ``proofreader._fix_hebrew_quotes`` for the quote part
so we stay consistent with the proofreader pipeline.
"""
fixed = proofreader._fix_hebrew_quotes(text)
# Collapse all whitespace (newlines, tabs, multiple spaces) to a single space.
return re.sub(r"\s+", " ", fixed).strip()
def _verify_quote(supporting_quote: str, full_text: str) -> bool:
"""Return True if ``supporting_quote`` appears verbatim in ``full_text``
after Hebrew quote/whitespace normalization.
The LLM occasionally trims a leading/trailing word from the quote;
we accept the quote if at least 90% of its characters match a
contiguous substring of the source.
"""
if not supporting_quote.strip():
return False
normalized_quote = _normalize_for_comparison(supporting_quote)
normalized_text = _normalize_for_comparison(full_text)
if not normalized_quote:
return False
if normalized_quote in normalized_text:
return True
# Fallback: try the inner 90% of the quote (drops boundary trim).
if len(normalized_quote) >= 30:
trim = max(2, len(normalized_quote) // 20)
inner = normalized_quote[trim:-trim]
if inner and inner in normalized_text:
return True
return False
def _coerce_halacha(raw: dict) -> dict | None:
"""Validate and normalize one LLM-returned halacha dict.
Returns ``None`` if the entry is missing required fields.
"""
if not isinstance(raw, dict):
return None
rule_statement = (raw.get("rule_statement") or "").strip()
supporting_quote = (raw.get("supporting_quote") or "").strip()
if not rule_statement or not supporting_quote:
return None
rule_type = (raw.get("rule_type") or "binding").strip().lower()
if rule_type not in _VALID_RULE_TYPES:
rule_type = "binding"
practice_areas_raw = raw.get("practice_areas") or []
if isinstance(practice_areas_raw, str):
practice_areas_raw = [practice_areas_raw]
practice_areas = [p for p in practice_areas_raw if p in _VALID_PRACTICE_AREAS]
subject_tags_raw = raw.get("subject_tags") or []
if isinstance(subject_tags_raw, str):
subject_tags_raw = [subject_tags_raw]
subject_tags = [str(t).strip() for t in subject_tags_raw if str(t).strip()]
cites_raw = raw.get("cites") or []
if isinstance(cites_raw, str):
cites_raw = [cites_raw]
cites = [str(c).strip() for c in cites_raw if str(c).strip()]
try:
confidence = float(raw.get("confidence", 0.0))
except (TypeError, ValueError):
confidence = 0.0
confidence = max(0.0, min(1.0, confidence))
return {
"rule_statement": rule_statement,
"rule_type": rule_type,
"reasoning_summary": (raw.get("reasoning_summary") or "").strip(),
"supporting_quote": supporting_quote,
"page_reference": (raw.get("page_reference") or "").strip(),
"practice_areas": practice_areas,
"subject_tags": subject_tags,
"cites": cites,
"confidence": confidence,
}
async def _extract_chunk(
chunk_text: str,
section_type: str,
chunk_index: int,
chunk_total: int,
context: str,
) -> list[dict]:
"""Run the halacha extractor on one chunk with retry."""
chunk_label = f" (חלק {chunk_index + 1}/{chunk_total})" if chunk_total > 1 else ""
prompt = (
f"{HALACHA_EXTRACTION_PROMPT}\n\n"
f"## הקלט\n"
f"סוג קטע: {section_type}\n"
f"{context}{chunk_label}\n\n"
f"--- תחילת הטקסט ---\n{chunk_text}\n--- סוף הטקסט ---"
)
last_err: Exception | None = None
for attempt in range(CHUNK_RETRY_ATTEMPTS + 1):
try:
result = await claude_session.query_json(prompt)
except Exception as e:
last_err = e
logger.warning(
"halacha_extractor chunk %d/%d attempt %d raised: %s",
chunk_index + 1, chunk_total, attempt + 1, e,
)
continue
if isinstance(result, list):
return result
logger.warning(
"halacha_extractor chunk %d/%d attempt %d returned non-list (%s)",
chunk_index + 1, chunk_total, attempt + 1, type(result).__name__,
)
logger.error(
"halacha_extractor chunk %d/%d failed after %d attempts: %s",
chunk_index + 1, chunk_total, CHUNK_RETRY_ATTEMPTS + 1, last_err,
)
return []
async def extract(case_law_id: UUID | str) -> dict:
"""Extract halachot from an uploaded precedent and store them.
Idempotent: replaces any existing halachot for this case_law_id.
All inserted rows start as ``review_status='pending_review'``.
Returns:
``{"status": "...", "extracted": N, "verified": M, "stored": K, ...}``
"""
if isinstance(case_law_id, str):
case_law_id = UUID(case_law_id)
record = await db.get_case_law(case_law_id)
if not record:
return {"status": "not_found", "extracted": 0, "stored": 0}
chunks = await db.list_precedent_chunks(
case_law_id, section_types=EXTRACTABLE_SECTIONS,
)
if not chunks:
await db.set_case_law_halacha_status(case_law_id, "completed")
return {"status": "no_chunks", "extracted": 0, "stored": 0}
await db.set_case_law_halacha_status(case_law_id, "processing")
await db.delete_halachot(case_law_id)
citation = record.get("case_number", "")
court = record.get("court", "")
date_str = str(record.get("date") or "")
context = f"מקור: {citation}{court}, {date_str}"
sem = asyncio.Semaphore(CHUNK_CONCURRENCY)
async def _bounded(idx: int, chunk_row: dict) -> list[dict]:
async with sem:
return await _extract_chunk(
chunk_row["content"], chunk_row["section_type"],
idx, len(chunks), context,
)
chunk_results = await asyncio.gather(
*[_bounded(i, c) for i, c in enumerate(chunks)]
)
raw_halachot: list[dict] = []
for items in chunk_results:
raw_halachot.extend(items)
if not raw_halachot:
await db.set_case_law_halacha_status(case_law_id, "completed")
return {"status": "no_halachot", "extracted": 0, "stored": 0}
# Validate against the full text of the precedent for the quote check.
full_text = record.get("full_text") or ""
cleaned: list[dict] = []
for raw in raw_halachot:
coerced = _coerce_halacha(raw)
if coerced is None:
continue
coerced["quote_verified"] = _verify_quote(
coerced["supporting_quote"], full_text,
)
cleaned.append(coerced)
if not cleaned:
await db.set_case_law_halacha_status(case_law_id, "completed")
return {"status": "no_valid_halachot", "extracted": len(raw_halachot), "stored": 0}
# Embed rule_statement + reasoning_summary so semantic search hits the
# rule directly rather than the surrounding chunk centroid.
embed_inputs = [
f"{h['rule_statement']}{h['reasoning_summary']}".strip("")
for h in cleaned
]
try:
vectors = await embeddings.embed_texts(embed_inputs, input_type="document")
except Exception as e:
logger.error("halacha_extractor: embeddings failed: %s", e)
vectors = [None] * len(cleaned)
for halacha, vec in zip(cleaned, vectors):
halacha["embedding"] = vec
stored = await db.store_halachot(case_law_id, cleaned)
verified = sum(1 for h in cleaned if h["quote_verified"])
await db.set_case_law_halacha_status(case_law_id, "completed")
logger.info(
"halacha_extractor: case_law=%s extracted=%d cleaned=%d verified=%d stored=%d",
case_law_id, len(raw_halachot), len(cleaned), verified, stored,
)
return {
"status": "completed",
"extracted": len(raw_halachot),
"valid": len(cleaned),
"verified": verified,
"stored": stored,
}

View File

@@ -0,0 +1,309 @@
"""Orchestrator for the External Precedent Library.
Ingest pipeline (one upload):
file → extract_text → proofread → INSERT case_law (source_kind='external_upload')
→ chunk → embed → store precedent_chunks
→ halacha_extractor.extract → embed halachot → store halachot
→ set extraction_status='completed'
Progress is reported via a caller-supplied async callback so the
web layer can pipe updates into the existing Redis ProgressStore /
SSE plumbing without this module knowing about Redis.
"""
from __future__ import annotations
import logging
import re
import shutil
from datetime import date
from pathlib import Path
from typing import Awaitable, Callable
from uuid import UUID, uuid4
from legal_mcp import config
from legal_mcp.services import (
chunker,
db,
embeddings,
extractor,
halacha_extractor,
)
logger = logging.getLogger(__name__)
ProgressCb = Callable[[str, int, str], Awaitable[None]]
PRECEDENT_LIBRARY_DIR = Path(config.DATA_DIR) / "precedent-library"
_VALID_PRACTICE_AREAS = {"", "rishuy_uvniya", "betterment_levy", "compensation_197"}
_VALID_SOURCE_TYPES = {"", "court_ruling", "appeals_committee"}
_VALID_PRECEDENT_LEVELS = {
"", "עליון", "מנהלי", "ועדת_ערר_ארצית", "ועדת_ערר_מחוזית",
"supreme", "administrative", "national_appeals_committee", "district_appeals_committee",
}
async def _noop_progress(_status: str, _percent: int, _msg: str) -> None:
return None
def _safe_filename(name: str) -> str:
"""Strip path separators and unsafe chars from a user-provided name."""
base = Path(name).name
return re.sub(r"[^\w.\-+א-ת ]", "_", base) or f"upload-{uuid4().hex[:8]}"
def _stage_file(src_path: Path, source_type: str) -> Path:
"""Copy the uploaded file into data/precedent-library/<source_type>/.
Returns the destination path. Source file is not deleted (caller decides).
"""
sub = source_type if source_type in {"court_ruling", "appeals_committee"} else "other"
dest_dir = PRECEDENT_LIBRARY_DIR / sub
dest_dir.mkdir(parents=True, exist_ok=True)
safe_name = _safe_filename(src_path.name)
dest = dest_dir / f"{uuid4().hex[:8]}_{safe_name}"
shutil.copy2(src_path, dest)
return dest
def _coerce_date(value) -> date | None:
if value is None or value == "":
return None
if isinstance(value, date):
return value
if isinstance(value, str):
try:
return date.fromisoformat(value[:10])
except ValueError:
return None
return None
async def ingest_precedent(
*,
file_path: str | Path,
citation: str,
case_name: str = "",
court: str = "",
decision_date=None,
source_type: str = "",
precedent_level: str = "",
practice_area: str = "",
appeal_subtype: str = "",
subject_tags: list[str] | None = None,
is_binding: bool = True,
headnote: str = "",
summary: str = "",
document_id: UUID | None = None,
progress: ProgressCb | None = None,
) -> dict:
"""Ingest a single uploaded precedent through the full pipeline.
Required: file_path + citation. Everything else has a sensible default.
Returns:
``{"status": "...", "case_law_id": "...", "chunks": N, "halachot": M}``
"""
progress = progress or _noop_progress
src = Path(file_path)
if not src.is_file():
raise FileNotFoundError(f"file not found: {src}")
if not citation.strip():
raise ValueError("citation is required")
if practice_area not in _VALID_PRACTICE_AREAS:
raise ValueError(f"invalid practice_area: {practice_area!r}")
if source_type not in _VALID_SOURCE_TYPES:
raise ValueError(f"invalid source_type: {source_type!r}")
await progress("staging", 5, "מעתיק את הקובץ לאחסון")
staged = _stage_file(src, source_type)
await progress("extracting", 15, "מחלץ טקסט מהקובץ")
try:
text, page_count = await extractor.extract_text(str(staged))
except Exception as e:
await progress("failed", 100, f"כשל בחילוץ טקסט: {e}")
raise
text = (text or "").strip()
if not text:
await progress("failed", 100, "לא נמצא טקסט בקובץ")
raise ValueError("no extractable text in file")
# Strip any Nevo preamble that might wrap court rulings downloaded from Nevo.
text = extractor.strip_nevo_preamble(text)
await progress("storing_metadata", 25, "שומר את הפסיקה במסד הנתונים")
record = await db.create_external_case_law(
case_number=citation.strip(),
case_name=case_name.strip() or citation.strip(),
full_text=text,
court=court.strip(),
decision_date=_coerce_date(decision_date),
practice_area=practice_area,
appeal_subtype=appeal_subtype.strip(),
subject_tags=list(subject_tags or []),
summary=summary.strip(),
headnote=headnote.strip(),
source_type=source_type,
precedent_level=precedent_level,
is_binding=is_binding,
document_id=document_id,
)
case_law_id = UUID(str(record["id"]))
try:
await progress("chunking", 40, f"מחלק את הטקסט ל-chunks ({page_count} עמ')")
chunks = chunker.chunk_document(text)
if not chunks:
await db.set_case_law_extraction_status(case_law_id, "completed")
await db.set_case_law_halacha_status(case_law_id, "completed")
await progress("completed", 100, "אין טקסט לעיבוד")
return {
"status": "completed",
"case_law_id": str(case_law_id),
"chunks": 0,
"halachot": 0,
}
await progress("embedding", 55, f"מייצר embeddings ל-{len(chunks)} chunks")
chunk_texts = [c.content for c in chunks]
chunk_vectors = await embeddings.embed_texts(chunk_texts, input_type="document")
chunk_dicts = [
{
"chunk_index": c.chunk_index,
"content": c.content,
"section_type": c.section_type,
"page_number": c.page_number,
"embedding": v,
}
for c, v in zip(chunks, chunk_vectors)
]
stored_chunks = await db.store_precedent_chunks(case_law_id, chunk_dicts)
await progress("extracting_halachot", 75, "מחלץ הלכות מחייבות")
await db.set_case_law_extraction_status(case_law_id, "completed")
halacha_result = await halacha_extractor.extract(case_law_id)
await progress(
"completed",
100,
f"הוכנס לספרייה: {stored_chunks} chunks, "
f"{halacha_result.get('stored', 0)} הלכות ממתינות לאישור",
)
return {
"status": "completed",
"case_law_id": str(case_law_id),
"chunks": stored_chunks,
"halachot": halacha_result.get("stored", 0),
"halachot_extracted_raw": halacha_result.get("extracted", 0),
"halachot_verified": halacha_result.get("verified", 0),
"pages": page_count,
}
except Exception as e:
logger.exception("precedent_library.ingest_precedent failed: %s", e)
await db.set_case_law_extraction_status(case_law_id, "failed")
await progress("failed", 100, f"כשל בעיבוד: {e}")
raise
async def reextract_halachot(
case_law_id: UUID | str,
progress: ProgressCb | None = None,
) -> dict:
"""Re-run the halacha extractor on an existing precedent. Idempotent."""
progress = progress or _noop_progress
if isinstance(case_law_id, str):
case_law_id = UUID(case_law_id)
record = await db.get_case_law(case_law_id)
if not record or record.get("source_kind") != "external_upload":
raise ValueError("precedent not found or not chair-uploaded")
await progress("extracting_halachot", 50, "מחלץ הלכות מחדש")
result = await halacha_extractor.extract(case_law_id)
await progress(
"completed",
100,
f"הופקו {result.get('stored', 0)} הלכות (ממתינות לאישור)",
)
return result
async def delete_precedent(case_law_id: UUID | str) -> bool:
"""Delete a precedent and cascade chunks + halachot."""
if isinstance(case_law_id, str):
case_law_id = UUID(case_law_id)
return await db.delete_case_law(case_law_id)
async def get_precedent(case_law_id: UUID | str) -> dict | None:
"""Get a precedent with its halachot attached."""
if isinstance(case_law_id, str):
case_law_id = UUID(case_law_id)
record = await db.get_case_law(case_law_id)
if not record:
return None
record["halachot"] = await db.list_halachot(case_law_id=case_law_id, limit=500)
return record
async def list_precedents(
practice_area: str = "",
court: str = "",
precedent_level: str = "",
source_type: str = "",
search: str = "",
limit: int = 100,
offset: int = 0,
) -> list[dict]:
return await db.list_external_case_law(
practice_area=practice_area,
court=court,
precedent_level=precedent_level,
source_type=source_type,
search=search,
limit=limit,
offset=offset,
)
async def search_library(
query: str,
practice_area: str = "",
court: str = "",
precedent_level: str = "",
appeal_subtype: str = "",
is_binding: bool | None = None,
subject_tag: str = "",
limit: int = 10,
include_halachot: bool = True,
) -> list[dict]:
"""Semantic search merging halachot (rule-level) and chunks (passage-level).
Only ``approved`` / ``published`` halachot are returned, per chair-review
policy. Chunks are returned regardless of halacha review status.
"""
if not query.strip():
return []
query_vec = await embeddings.embed_query(query)
return await db.search_precedent_library_semantic(
query_embedding=query_vec,
practice_area=practice_area,
court=court,
precedent_level=precedent_level,
appeal_subtype=appeal_subtype,
is_binding=is_binding,
subject_tag=subject_tag,
limit=limit,
include_halachot=include_halachot,
)

View File

@@ -0,0 +1,234 @@
"""MCP tools for the External Precedent Library.
This is distinct from:
- ``precedents`` (case_precedents table) — chair-attached quotes scoped to
a specific case section. Use ``precedent_search_library`` for that.
- ``style_corpus`` (Daphna's prior decisions) — searched via
``search_decisions`` for style/voice.
The precedent library is the **authoritative law** corpus: external court
rulings and other appeals committees' decisions, with halachot extracted
and reviewed by the chair.
All halachot enter as ``pending_review`` and are invisible to search until
the chair approves them — per project review policy.
"""
from __future__ import annotations
import json
from uuid import UUID
from legal_mcp.services import db, precedent_library
def _ok(payload) -> str:
return json.dumps(payload, ensure_ascii=False, indent=2, default=str)
def _err(msg: str) -> str:
return json.dumps({"error": msg}, ensure_ascii=False)
async def precedent_library_upload(
file_path: str,
citation: str,
case_name: str = "",
court: str = "",
decision_date: str = "",
source_type: str = "",
precedent_level: str = "",
practice_area: str = "",
appeal_subtype: str = "",
subject_tags: list[str] | None = None,
is_binding: bool = True,
headnote: str = "",
summary: str = "",
) -> str:
"""העלאת פסיקה חיצונית לקורפוס הסמכותי + חילוץ הלכות אוטומטי.
Args:
file_path: נתיב מלא לקובץ PDF/DOCX/RTF/TXT/MD.
citation: מראה המקום ("עע\\"מ 3975/22 ב. קרן-נכסים נ' ועדה מקומית").
case_name: שם קצר.
court: ערכאה (עליון / מנהלי / ועדת ערר ארצית / ועדת ערר מחוזית).
decision_date: ISO date (YYYY-MM-DD), אופציונלי.
source_type: court_ruling / appeals_committee.
precedent_level: עליון / מנהלי / ועדת_ערר_ארצית / ועדת_ערר_מחוזית.
practice_area: rishuy_uvniya / betterment_levy / compensation_197.
subject_tags: תגיות נושא (חניה, קווי_בניין, וכד').
Returns: JSON עם case_law_id, מספר chunks, מספר הלכות שנכנסו לתור אישור.
"""
if not citation.strip():
return _err("citation חובה")
try:
result = await precedent_library.ingest_precedent(
file_path=file_path,
citation=citation,
case_name=case_name,
court=court,
decision_date=decision_date or None,
source_type=source_type,
precedent_level=precedent_level,
practice_area=practice_area,
appeal_subtype=appeal_subtype,
subject_tags=subject_tags or [],
is_binding=is_binding,
headnote=headnote,
summary=summary,
)
except Exception as e:
return _err(str(e))
return _ok(result)
async def precedent_library_list(
practice_area: str = "",
court: str = "",
precedent_level: str = "",
source_type: str = "",
search: str = "",
limit: int = 100,
) -> str:
"""רשימה של פסיקה בקורפוס הסמכותי, עם פילטרים."""
rows = await precedent_library.list_precedents(
practice_area=practice_area,
court=court,
precedent_level=precedent_level,
source_type=source_type,
search=search,
limit=limit,
)
return _ok(rows)
async def precedent_library_get(case_law_id: str) -> str:
"""פסיקה ספציפית עם כל ההלכות שלה (כולל ממתינות לאישור)."""
try:
cid = UUID(case_law_id)
except ValueError:
return _err("case_law_id לא תקין")
record = await precedent_library.get_precedent(cid)
if not record:
return _err("פסיקה לא נמצאה")
return _ok(record)
async def precedent_library_delete(case_law_id: str) -> str:
"""מחיקת פסיקה מהקורפוס. cascade: chunks + halachot."""
try:
cid = UUID(case_law_id)
except ValueError:
return _err("case_law_id לא תקין")
ok = await precedent_library.delete_precedent(cid)
return _ok({"deleted": ok, "case_law_id": case_law_id})
async def precedent_extract_halachot(case_law_id: str) -> str:
"""הרצה מחדש של חילוץ ההלכות לפסיקה קיימת. הלכות קודמות נמחקות."""
try:
cid = UUID(case_law_id)
except ValueError:
return _err("case_law_id לא תקין")
try:
result = await precedent_library.reextract_halachot(cid)
except Exception as e:
return _err(str(e))
return _ok(result)
async def search_precedent_library(
query: str,
practice_area: str = "",
court: str = "",
precedent_level: str = "",
appeal_subtype: str = "",
is_binding: bool | None = None,
subject_tag: str = "",
limit: int = 10,
include_halachot: bool = True,
) -> str:
"""חיפוש סמנטי בקורפוס הפסיקה הסמכותית.
מחזיר תוצאות מעורבות: הלכות (rule-level, מאושרות בלבד) + קטעי טקסט
(passage-level). הלכות מקבלות boost קל בדירוג כי הן מזוקקות מראש.
Args:
query: שאילתת חיפוש בעברית.
practice_area: rishuy_uvniya / betterment_levy / compensation_197.
court: סינון לפי ערכאה (substring).
precedent_level: עליון / מנהלי / ועדת_ערר_ארצית / ועדת_ערר_מחוזית.
appeal_subtype: סינון לתת-סוג.
is_binding: True/False (None = ללא סינון).
subject_tag: סינון לפי תגית נושא (לדוגמה "מועד_קביעת_שומה").
limit: מספר תוצאות מקסימלי.
include_halachot: האם לכלול הלכות (ברירת מחדל: כן).
Returns: רשימה מדורגת. כל פריט הוא {"type": "halacha"|"passage", "score", ...}.
"""
if not query or len(query.strip()) < 2:
return json.dumps([], ensure_ascii=False)
results = await precedent_library.search_library(
query=query.strip(),
practice_area=practice_area,
court=court,
precedent_level=precedent_level,
appeal_subtype=appeal_subtype,
is_binding=is_binding,
subject_tag=subject_tag,
limit=limit,
include_halachot=include_halachot,
)
return _ok(results)
async def halacha_review(
halacha_id: str,
status: str,
reviewer: str = "דפנה",
rule_statement: str = "",
reasoning_summary: str = "",
subject_tags: list[str] | None = None,
practice_areas: list[str] | None = None,
) -> str:
"""אישור / דחייה / עריכה של הלכה שחולצה אוטומטית.
Args:
halacha_id: מזהה ההלכה.
status: pending_review / approved / rejected / published.
reviewer: שם המאשר (ברירת מחדל: דפנה).
rule_statement: עריכת ניסוח הכלל (ריק = ללא שינוי).
reasoning_summary: עריכת תמצית ההיגיון (ריק = ללא שינוי).
subject_tags: עריכת תגיות (None = ללא שינוי).
practice_areas: עריכת תחומים (None = ללא שינוי).
"""
if status not in {"pending_review", "approved", "rejected", "published"}:
return _err(
"status לא חוקי. ערכים תקינים: "
"pending_review / approved / rejected / published"
)
try:
hid = UUID(halacha_id)
except ValueError:
return _err("halacha_id לא תקין")
row = await db.update_halacha(
halacha_id=hid,
review_status=status,
reviewer=reviewer,
rule_statement=rule_statement or None,
reasoning_summary=reasoning_summary or None,
subject_tags=subject_tags,
practice_areas=practice_areas,
)
if row is None:
return _err("הלכה לא נמצאה")
return _ok(row)
async def halachot_pending(limit: int = 100) -> str:
"""תור ההלכות הממתינות לאישור (review_status='pending_review')."""
rows = await db.list_halachot(review_status="pending_review", limit=limit)
return _ok(rows)