Initial commit: MCP server + web upload interface

Ezer Mishpati - AI legal decision drafting system with:
- MCP server (FastMCP) with document processing pipeline
- Web upload interface (FastAPI) for file upload and classification
- pgvector-based semantic search
- Hebrew legal document chunking and embedding
This commit is contained in:
2026-03-23 12:33:07 +00:00
commit 6f515dc2cb
33 changed files with 3297 additions and 0 deletions

27
mcp-server/pyproject.toml Normal file
View File

@@ -0,0 +1,27 @@
# Packaging metadata for the Ezer Mishpati MCP server (src/ layout).
[project]
name = "legal-mcp"
version = "0.1.0"
description = "MCP server for AI-assisted legal decision drafting"
requires-python = ">=3.10"
dependencies = [
    # MCP framework + storage (Postgres with pgvector)
    "mcp[cli]>=1.0.0",
    "asyncpg>=0.29.0",
    "pgvector>=0.3.0",
    # Embeddings (Voyage) and Claude Vision OCR / style analysis
    "voyageai>=0.3.0",
    "anthropic>=0.40.0",
    "python-dotenv>=1.0.0",
    "pydantic>=2.0.0",
    # Document text extraction: PDF, DOCX, RTF
    "pymupdf>=1.25.0",
    "python-docx>=1.1.0",
    "striprtf>=0.0.26",
    # Background job queue
    "redis>=5.0.0",
    "rq>=1.16.0",
    "pillow>=10.0.0",
]
[build-system]
requires = ["setuptools>=68.0"]
build-backend = "setuptools.build_meta"
[tool.setuptools.packages.find]
where = ["src"]

View File

View File

@@ -0,0 +1,4 @@
"""Allow running with: python -m legal_mcp"""
from legal_mcp.server import main
main()

View File

@@ -0,0 +1,40 @@
"""Configuration loaded from central .env file."""
import os
from pathlib import Path
from dotenv import load_dotenv
# Load from central .env or override path
dotenv_path = os.environ.get("DOTENV_PATH", str(Path.home() / ".env"))
load_dotenv(dotenv_path)
# PostgreSQL
POSTGRES_URL = os.environ.get(
"POSTGRES_URL",
f"postgres://{os.environ.get('POSTGRES_USER', 'legal_ai')}:"
f"{os.environ.get('POSTGRES_PASSWORD', '')}@"
f"{os.environ.get('POSTGRES_HOST', '127.0.0.1')}:"
f"{os.environ.get('POSTGRES_PORT', '5433')}/"
f"{os.environ.get('POSTGRES_DB', 'legal_ai')}",
)
# Redis
REDIS_URL = os.environ.get("REDIS_URL", "redis://127.0.0.1:6380/0")
# Voyage AI
VOYAGE_API_KEY = os.environ.get("VOYAGE_API_KEY", "")
VOYAGE_MODEL = "voyage-3-large"
VOYAGE_DIMENSIONS = 1024
# Anthropic (for Claude Vision OCR)
ANTHROPIC_API_KEY = os.environ.get("ANTHROPIC_API_KEY", "")
# Data directory
DATA_DIR = Path(os.environ.get("DATA_DIR", str(Path.home() / "legal-ai" / "data")))
CASES_DIR = DATA_DIR / "cases"
TRAINING_DIR = DATA_DIR / "training"
# Chunking parameters
CHUNK_SIZE_TOKENS = 600
CHUNK_OVERLAP_TOKENS = 100

View File

@@ -0,0 +1,97 @@
"""Pydantic models for cases, documents, and related entities."""
from __future__ import annotations
import enum
from datetime import date, datetime
from uuid import UUID
from pydantic import BaseModel, Field
class CaseStatus(str, enum.Enum):
    """Lifecycle of an appeal case, from intake through final decision."""
    NEW = "new"
    IN_PROGRESS = "in_progress"
    DRAFTED = "drafted"
    REVIEWED = "reviewed"
    FINAL = "final"
class DocType(str, enum.Enum):
    """Kind of uploaded document within a case."""
    APPEAL = "appeal"        # statement of appeal (כתב ערר)
    RESPONSE = "response"    # respondent's answer (תשובה)
    DECISION = "decision"    # decision (החלטה)
    REFERENCE = "reference"  # reference/supporting material (מסמך עזר)
    EXHIBIT = "exhibit"      # exhibit / annex (נספח)
class SectionType(str, enum.Enum):
    """Logical section of a legal decision; assigned by the chunker from
    Hebrew header patterns (see services.chunker.SECTION_PATTERNS)."""
    INTRO = "intro"
    FACTS = "facts"
    APPELLANT_CLAIMS = "appellant_claims"
    RESPONDENT_CLAIMS = "respondent_claims"
    LEGAL_ANALYSIS = "legal_analysis"
    CONCLUSION = "conclusion"
    RULING = "ruling"
    OTHER = "other"
class ExtractionStatus(str, enum.Enum):
    """Progress of the extract→chunk→embed pipeline for one document."""
    PENDING = "pending"
    PROCESSING = "processing"
    COMPLETED = "completed"
    FAILED = "failed"
class CaseCreate(BaseModel):
    """Payload for creating a new appeal case.

    Field descriptions are deliberately in Hebrew: they surface as the
    user-facing schema documentation of the Hebrew-language MCP tools.
    """
    case_number: str = Field(description="מספר תיק הערר (לדוגמה: 123-24)")
    title: str = Field(description="כותרת קצרה של הערר")
    appellants: list[str] = Field(default_factory=list, description="שמות העוררים")
    respondents: list[str] = Field(default_factory=list, description="שמות המשיבים")
    subject: str = Field(default="", description="נושא הערר")
    property_address: str = Field(default="", description="כתובת הנכס")
    permit_number: str = Field(default="", description="מספר היתר")
    committee_type: str = Field(default="ועדה מקומית", description="סוג הוועדה")
    hearing_date: date | None = Field(default=None, description="תאריך דיון")
    notes: str = Field(default="", description="הערות")
class CaseInfo(BaseModel):
    """Full case record, mirroring a row of the `cases` table."""
    id: UUID
    case_number: str
    title: str
    appellants: list[str]
    respondents: list[str]
    subject: str
    property_address: str
    permit_number: str
    committee_type: str
    status: CaseStatus
    hearing_date: date | None
    decision_date: date | None
    tags: list[str]
    notes: str
    created_at: datetime
    updated_at: datetime
class DocumentInfo(BaseModel):
    """One uploaded document (extracted text and chunks live in other tables)."""
    id: UUID
    case_id: UUID
    doc_type: DocType
    title: str
    file_path: str
    extraction_status: ExtractionStatus
    # None until PDF extraction runs; non-PDF formats have no page count.
    page_count: int | None
    created_at: datetime
class SearchResult(BaseModel):
    """Single semantic-search hit.

    score is cosine similarity (1 - pgvector cosine distance, as computed
    in services.db.search_similar), so higher means more similar.
    """
    chunk_content: str
    score: float
    case_number: str
    document_title: str
    section_type: str
    page_number: int | None
    document_id: UUID
    case_id: UUID

View File

@@ -0,0 +1,219 @@
"""Ezer Mishpati - MCP Server entry point.
Run with: python -m legal_mcp.server
"""
from __future__ import annotations
import logging
import sys
from collections.abc import AsyncIterator
from contextlib import asynccontextmanager
from mcp.server.fastmcp import FastMCP
# Configure logging to stderr (stdout is reserved for JSON-RPC framing of
# the stdio MCP transport — writing logs there would corrupt the protocol).
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(name)s] %(levelname)s: %(message)s",
    stream=sys.stderr,
)
logger = logging.getLogger("legal_mcp")


@asynccontextmanager
async def lifespan(server: FastMCP) -> AsyncIterator[None]:
    """Initialize DB schema on startup, close pool on shutdown."""
    # Imported here rather than at module top — presumably to keep module
    # import free of DB dependencies; confirm before hoisting.
    from legal_mcp.services.db import close_pool, init_schema
    logger.info("Initializing database schema...")
    await init_schema()
    logger.info("Ezer Mishpati MCP server ready")
    try:
        yield
    finally:
        # Pool is closed even if the server loop exits with an error.
        await close_pool()
        logger.info("Ezer Mishpati MCP server stopped")
# Create MCP server; name and instructions are user-facing Hebrew strings.
mcp = FastMCP(
    "Ezer Mishpati - עוזר משפטי",
    instructions="מערכת AI לסיוע בניסוח החלטות משפטיות בסגנון דפנה תמיר",
    lifespan=lifespan,
)
# ── Import and register tools ───────────────────────────────────────
# Must run after `mcp` exists, hence the late import (E402 suppressed).
from legal_mcp.tools import cases, documents, search, drafting, workflow  # noqa: E402
# ── Case management tools ───────────────────────────────────────────
# Thin async wrappers: each tool forwards verbatim to legal_mcp.tools.cases.
# Docstrings are the Hebrew tool descriptions shown to the MCP client —
# they are runtime behavior, not inline documentation; do not translate.


# Create a new appeal case.
@mcp.tool()
async def case_create(
    case_number: str,
    title: str,
    appellants: list[str] | None = None,
    respondents: list[str] | None = None,
    subject: str = "",
    property_address: str = "",
    permit_number: str = "",
    committee_type: str = "ועדה מקומית",
    hearing_date: str = "",
    notes: str = "",
) -> str:
    """יצירת תיק ערר חדש."""
    return await cases.case_create(
        case_number, title, appellants, respondents,
        subject, property_address, permit_number, committee_type,
        hearing_date, notes,
    )


# List cases, optionally filtered by status.
@mcp.tool()
async def case_list(status: str = "", limit: int = 50) -> str:
    """רשימת תיקי ערר. סינון אופציונלי לפי סטטוס (new/in_progress/drafted/reviewed/final)."""
    return await cases.case_list(status, limit)


# Full case details including its documents.
@mcp.tool()
async def case_get(case_number: str) -> str:
    """פרטי תיק מלאים כולל רשימת מסמכים."""
    return await cases.case_get(case_number)


# Update case fields; empty strings mean "leave unchanged" downstream.
@mcp.tool()
async def case_update(
    case_number: str,
    status: str = "",
    title: str = "",
    subject: str = "",
    notes: str = "",
    hearing_date: str = "",
    decision_date: str = "",
    tags: list[str] | None = None,
) -> str:
    """עדכון פרטי תיק."""
    return await cases.case_update(
        case_number, status, title, subject, notes,
        hearing_date, decision_date, tags,
    )
# ── Document tools ──────────────────────────────────────────────────
# Wrappers over legal_mcp.tools.documents; Hebrew docstrings are the
# runtime MCP tool descriptions — keep as-is.


# Upload + process a document into a case (extract → chunk → embed).
@mcp.tool()
async def document_upload(
    case_number: str,
    file_path: str,
    doc_type: str = "appeal",
    title: str = "",
) -> str:
    """העלאה ועיבוד מסמך לתיק ערר (PDF/DOCX/RTF/TXT). מחלץ טקסט ויוצר embeddings."""
    return await documents.document_upload(case_number, file_path, doc_type, title)


# Add a past decision to the style-training corpus.
@mcp.tool()
async def document_upload_training(
    file_path: str,
    decision_number: str = "",
    decision_date: str = "",
    subject_categories: list[str] | None = None,
    title: str = "",
) -> str:
    """העלאת החלטה קודמת של דפנה לקורפוס הסגנון. קטגוריות: בנייה, שימוש חורג, תכנית, היתר, הקלה, חלוקה, תמ"א 38, היטל השבחה, פיצויים 197."""
    return await documents.document_upload_training(
        file_path, decision_number, decision_date, subject_categories, title,
    )


# Fetch a document's full extracted text.
@mcp.tool()
async def document_get_text(case_number: str, doc_title: str = "") -> str:
    """קבלת טקסט מלא של מסמך מתוך תיק."""
    return await documents.document_get_text(case_number, doc_title)


# List the documents attached to a case.
@mcp.tool()
async def document_list(case_number: str) -> str:
    """רשימת מסמכים בתיק."""
    return await documents.document_list(case_number)
# ── Search tools ────────────────────────────────────────────────────
# Wrappers over legal_mcp.tools.search (pgvector semantic search).


# Corpus-wide semantic search, optionally restricted to a section type.
@mcp.tool()
async def search_decisions(
    query: str,
    limit: int = 10,
    section_type: str = "",
) -> str:
    """חיפוש סמנטי בהחלטות קודמות ובמסמכים."""
    return await search.search_decisions(query, limit, section_type)


# Semantic search scoped to one case's documents.
@mcp.tool()
async def search_case_documents(
    case_number: str,
    query: str,
    limit: int = 10,
) -> str:
    """חיפוש סמנטי בתוך מסמכי תיק ספציפי."""
    return await search.search_case_documents(case_number, query, limit)


# Find cases similar to a free-text description.
@mcp.tool()
async def find_similar_cases(
    description: str,
    limit: int = 5,
) -> str:
    """מציאת תיקים דומים על בסיס תיאור."""
    return await search.find_similar_cases(description, limit)
# ── Drafting tools ──────────────────────────────────────────────────
# Wrappers over legal_mcp.tools.drafting (style corpus + templates).


# Retrieve extracted style patterns (formulas, phrases, structure).
@mcp.tool()
async def get_style_guide() -> str:
    """שליפת דפוסי הסגנון של דפנה - נוסחאות, ביטויים אופייניים ומבנה."""
    return await drafting.get_style_guide()


# Assemble full drafting context (facts + precedents + style) for a section.
@mcp.tool()
async def draft_section(
    case_number: str,
    section: str,
    instructions: str = "",
) -> str:
    """הרכבת הקשר מלא לניסוח סעיף (עובדות + תקדימים + סגנון)."""
    return await drafting.draft_section(case_number, section, instructions)


# Structural template for a full decision, pre-filled with case details.
@mcp.tool()
async def get_decision_template(case_number: str) -> str:
    """תבנית מבנית להחלטה מלאה עם פרטי התיק."""
    return await drafting.get_decision_template(case_number)


# Run style analysis over the decision corpus and persist the patterns.
@mcp.tool()
async def analyze_style() -> str:
    """ניתוח סגנון על קורפוס ההחלטות של דפנה. מחלץ ושומר דפוסי כתיבה."""
    return await drafting.analyze_style()
# ── Workflow tools ──────────────────────────────────────────────────


# Full workflow status for one case.
@mcp.tool()
async def workflow_status(case_number: str) -> str:
    """סטטוס תהליך עבודה מלא לתיק."""
    return await workflow.workflow_status(case_number)


# Global counters: cases, documents, chunks.
@mcp.tool()
async def processing_status() -> str:
    """סטטוס כללי - מספר תיקים, מסמכים, chunks."""
    return await workflow.processing_status()


def main():
    """Run the MCP server over stdio (stdout carries JSON-RPC)."""
    mcp.run(transport="stdio")


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,130 @@
"""Legal document chunker - splits text into sections and chunks for RAG."""
from __future__ import annotations
import re
from dataclasses import dataclass, field
from legal_mcp import config
# Hebrew legal section headers → SectionType label. Checked in order with
# re.finditer over the full text; more specific patterns come first so the
# generic ones ("דיון", "החלטה") don't claim their positions.
SECTION_PATTERNS = [
    (r"רקע\s*עובדתי|רקע\s*כללי|העובדות|הרקע", "facts"),                                   # factual background
    (r"טענות\s*העוררי[םן]|טענות\s*המערערי[םן]|עיקר\s*טענות\s*העוררי[םן]", "appellant_claims"),   # appellants' claims
    (r"טענות\s*המשיבי[םן]|תשובת\s*המשיבי[םן]|עיקר\s*טענות\s*המשיבי[םן]", "respondent_claims"),  # respondents' claims
    (r"דיון\s*והכרעה|דיון|הכרעה|ניתוח\s*משפטי|המסגרת\s*המשפטית", "legal_analysis"),       # discussion / legal analysis
    (r"מסקנ[הות]|סיכום", "conclusion"),                                                    # conclusion / summary
    (r"החלטה|לפיכך\s*אני\s*מחליט|התוצאה", "ruling"),                                       # operative ruling
    (r"מבוא|פתיחה|לפניי", "intro"),                                                        # introduction
]
@dataclass
class Chunk:
    """One RAG chunk of a document, ready for embedding."""
    content: str
    section_type: str = "other"   # a SECTION_PATTERNS label, or "other"
    page_number: int | None = None  # not populated by chunk_document (text has no page info)
    chunk_index: int = 0          # position within the whole document
def chunk_document(
    text: str,
    chunk_size: int = config.CHUNK_SIZE_TOKENS,
    overlap: int = config.CHUNK_OVERLAP_TOKENS,
) -> list[Chunk]:
    """Split a legal document into chunks, respecting section boundaries.

    Sections are detected first (Hebrew headers), then each section is cut
    into overlapping chunks; chunk_index numbers chunks across the whole
    document.
    """
    if not text.strip():
        return []
    result: list[Chunk] = []
    for label, body in _split_into_sections(text):
        for piece in _split_section(body, chunk_size, overlap):
            result.append(
                Chunk(content=piece, section_type=label, chunk_index=len(result))
            )
    return result
def _split_into_sections(text: str) -> list[tuple[str, str]]:
    """Split text into (section_type, text) pairs based on Hebrew headers."""
    # Collect every header hit as (position, label).
    hits: list[tuple[int, str]] = [
        (m.start(), label)
        for pattern, label in SECTION_PATTERNS
        for m in re.finditer(pattern, text)
    ]
    if not hits:
        # No recognizable headers — keep the document as one block.
        return [("other", text)]
    hits.sort(key=lambda item: item[0])
    result: list[tuple[str, str]] = []
    # Anything before the first header is treated as the intro.
    preamble = text[: hits[0][0]].strip()
    if preamble:
        result.append(("intro", preamble))
    # Each section runs from its header to the next header (or end of text).
    ends = [pos for pos, _ in hits[1:]] + [len(text)]
    for (start, label), stop in zip(hits, ends):
        body = text[start:stop].strip()
        if body:
            result.append((label, body))
    return result
def _split_section(text: str, chunk_size: int, overlap: int) -> list[str]:
    """Split a section into overlapping chunks by paragraphs.

    Token counts come from _estimate_tokens (approximate). When a chunk
    fills up, the trailing paragraphs worth up to `overlap` tokens are
    carried into the next chunk for context continuity.
    """
    if not text.strip():
        return []
    paragraphs = [line.strip() for line in text.split("\n") if line.strip()]
    result: list[str] = []
    buf: list[str] = []
    buf_tokens = 0
    for para in paragraphs:
        size = _estimate_tokens(para)
        if buf and buf_tokens + size > chunk_size:
            result.append("\n".join(buf))
            # Seed the next chunk with the tail of this one (≤ overlap tokens).
            tail: list[str] = []
            tail_tokens = 0
            for prev in reversed(buf):
                prev_size = _estimate_tokens(prev)
                if tail_tokens + prev_size > overlap:
                    break
                tail.insert(0, prev)
                tail_tokens += prev_size
            buf, buf_tokens = tail, tail_tokens
        buf.append(para)
        buf_tokens += size
    if buf:
        result.append("\n".join(buf))
    return result
def _estimate_tokens(text: str) -> int:
"""Rough token estimate for Hebrew text (~1.5 chars per token)."""
return max(1, len(text) // 2)

View File

@@ -0,0 +1,440 @@
"""Database service - asyncpg connection pool and queries."""
from __future__ import annotations
import json
import logging
from datetime import date
from uuid import UUID, uuid4
import asyncpg
from pgvector.asyncpg import register_vector
from legal_mcp import config
logger = logging.getLogger(__name__)
_pool: asyncpg.Pool | None = None
async def get_pool() -> asyncpg.Pool:
    """Return the shared connection pool, creating it lazily on first use.

    NOTE(review): the check-then-create on _pool is not guarded; two
    concurrent first calls could each build a pool. Harmless-looking but
    confirm the server's startup path serializes this.
    """
    global _pool
    if _pool is None:
        # First, ensure pgvector extension exists (before registering type codec):
        # register_vector in _init_connection requires the `vector` type to exist,
        # so the extensions are created on a plain one-off connection.
        conn = await asyncpg.connect(config.POSTGRES_URL)
        await conn.execute('CREATE EXTENSION IF NOT EXISTS vector')
        await conn.execute('CREATE EXTENSION IF NOT EXISTS "uuid-ossp"')
        await conn.close()
        _pool = await asyncpg.create_pool(
            config.POSTGRES_URL,
            min_size=2,
            max_size=10,
            init=_init_connection,
        )
    return _pool
async def _init_connection(conn: asyncpg.Connection) -> None:
    """Pool per-connection init: register the pgvector type codec so Python
    lists/vectors round-trip through `vector` columns."""
    await register_vector(conn)
async def close_pool() -> None:
    """Close the shared pool and reset it so get_pool() can rebuild later."""
    global _pool
    if not _pool:
        return
    await _pool.close()
    _pool = None
# ── Schema ──────────────────────────────────────────────────────────
# Idempotent DDL executed on every startup (init_schema). The embedding
# column is vector(1024), matching config.VOYAGE_DIMENSIONS, and the
# ivfflat index uses cosine ops to match the `<=>` queries in
# search_similar. JSONB list columns default to '[]' and are written as
# json.dumps(...) strings by the CRUD helpers below.
SCHEMA_SQL = """
CREATE TABLE IF NOT EXISTS cases (
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
case_number TEXT UNIQUE NOT NULL,
title TEXT NOT NULL,
appellants JSONB DEFAULT '[]',
respondents JSONB DEFAULT '[]',
subject TEXT DEFAULT '',
property_address TEXT DEFAULT '',
permit_number TEXT DEFAULT '',
committee_type TEXT DEFAULT 'ועדה מקומית',
status TEXT DEFAULT 'new',
hearing_date DATE,
decision_date DATE,
tags JSONB DEFAULT '[]',
notes TEXT DEFAULT '',
created_at TIMESTAMPTZ DEFAULT now(),
updated_at TIMESTAMPTZ DEFAULT now()
);
CREATE TABLE IF NOT EXISTS documents (
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
case_id UUID REFERENCES cases(id) ON DELETE CASCADE,
doc_type TEXT NOT NULL,
title TEXT NOT NULL,
file_path TEXT NOT NULL,
extracted_text TEXT DEFAULT '',
extraction_status TEXT DEFAULT 'pending',
page_count INTEGER,
metadata JSONB DEFAULT '{}',
created_at TIMESTAMPTZ DEFAULT now()
);
CREATE TABLE IF NOT EXISTS document_chunks (
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
document_id UUID REFERENCES documents(id) ON DELETE CASCADE,
case_id UUID REFERENCES cases(id) ON DELETE CASCADE,
chunk_index INTEGER NOT NULL,
content TEXT NOT NULL,
section_type TEXT DEFAULT 'other',
embedding vector(1024),
page_number INTEGER,
created_at TIMESTAMPTZ DEFAULT now()
);
CREATE TABLE IF NOT EXISTS style_corpus (
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
document_id UUID REFERENCES documents(id) ON DELETE SET NULL,
decision_number TEXT,
decision_date DATE,
subject_categories JSONB DEFAULT '[]',
full_text TEXT NOT NULL,
summary TEXT DEFAULT '',
outcome TEXT DEFAULT '',
key_principles JSONB DEFAULT '[]',
created_at TIMESTAMPTZ DEFAULT now()
);
CREATE TABLE IF NOT EXISTS style_patterns (
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
pattern_type TEXT NOT NULL,
pattern_text TEXT NOT NULL,
frequency INTEGER DEFAULT 1,
context TEXT DEFAULT '',
examples JSONB DEFAULT '[]',
created_at TIMESTAMPTZ DEFAULT now()
);
CREATE INDEX IF NOT EXISTS idx_chunks_embedding
ON document_chunks USING ivfflat (embedding vector_cosine_ops)
WITH (lists = 100);
CREATE INDEX IF NOT EXISTS idx_chunks_case ON document_chunks(case_id);
CREATE INDEX IF NOT EXISTS idx_chunks_doc ON document_chunks(document_id);
CREATE INDEX IF NOT EXISTS idx_docs_case ON documents(case_id);
CREATE INDEX IF NOT EXISTS idx_cases_status ON cases(status);
CREATE INDEX IF NOT EXISTS idx_cases_number ON cases(case_number);
"""
async def init_schema() -> None:
    """Apply the idempotent schema DDL (safe to run on every startup)."""
    pool = await get_pool()
    async with pool.acquire() as conn:
        await conn.execute(SCHEMA_SQL)
    logger.info("Database schema initialized")
# ── Case CRUD ───────────────────────────────────────────────────────
async def create_case(
    case_number: str,
    title: str,
    appellants: list[str] | None = None,
    respondents: list[str] | None = None,
    subject: str = "",
    property_address: str = "",
    permit_number: str = "",
    committee_type: str = "ועדה מקומית",
    hearing_date: date | None = None,
    notes: str = "",
) -> dict:
    """Insert a new case row and return it via get_case()."""
    new_id = uuid4()
    args = (
        new_id, case_number, title,
        json.dumps(appellants or []),   # JSONB columns take JSON text
        json.dumps(respondents or []),
        subject, property_address, permit_number, committee_type,
        hearing_date, notes,
    )
    pool = await get_pool()
    async with pool.acquire() as conn:
        await conn.execute(
            """INSERT INTO cases (id, case_number, title, appellants, respondents,
            subject, property_address, permit_number, committee_type,
            hearing_date, notes)
            VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11)""",
            *args,
        )
    return await get_case(new_id)
async def get_case(case_id: UUID) -> dict | None:
    """Fetch one case by primary key; None when no row matches."""
    pool = await get_pool()
    async with pool.acquire() as conn:
        row = await conn.fetchrow("SELECT * FROM cases WHERE id = $1", case_id)
    return None if row is None else _row_to_case(row)
async def get_case_by_number(case_number: str) -> dict | None:
    """Look a case up by its human-facing case number; None when missing."""
    pool = await get_pool()
    async with pool.acquire() as conn:
        record = await conn.fetchrow(
            "SELECT * FROM cases WHERE case_number = $1", case_number
        )
    return _row_to_case(record) if record is not None else None
async def list_cases(status: str | None = None, limit: int = 50) -> list[dict]:
    """List cases newest-updated first, optionally filtered by status."""
    if status:
        sql = "SELECT * FROM cases WHERE status = $1 ORDER BY updated_at DESC LIMIT $2"
        args: tuple = (status, limit)
    else:
        sql = "SELECT * FROM cases ORDER BY updated_at DESC LIMIT $1"
        args = (limit,)
    pool = await get_pool()
    async with pool.acquire() as conn:
        records = await conn.fetch(sql, *args)
    return [_row_to_case(record) for record in records]
async def update_case(case_id: UUID, **fields) -> dict | None:
    """Update the given columns on a case and return the refreshed row.

    Args:
        case_id: Case primary key.
        **fields: Column/value pairs. List-valued columns
            (appellants/respondents/tags) are JSON-encoded for JSONB.

    Raises:
        ValueError: If a field name is not a known `cases` column. Column
            names are interpolated into the SQL text (identifiers cannot be
            bound as parameters), so they must be validated to prevent SQL
            injection via unexpected kwargs.
    """
    if not fields:
        return await get_case(case_id)
    # Allowlist of updatable columns, mirroring the cases schema.
    allowed = {
        "case_number", "title", "appellants", "respondents", "subject",
        "property_address", "permit_number", "committee_type", "status",
        "hearing_date", "decision_date", "tags", "notes",
    }
    unknown = set(fields) - allowed
    if unknown:
        raise ValueError(f"Unknown case fields: {sorted(unknown)}")
    pool = await get_pool()
    set_clauses = []
    values = []
    # $1 is reserved for case_id, so value placeholders start at $2.
    for i, (key, val) in enumerate(fields.items(), start=2):
        if key in ("appellants", "respondents", "tags"):
            val = json.dumps(val)
        set_clauses.append(f"{key} = ${i}")
        values.append(val)
    set_clauses.append("updated_at = now()")
    sql = f"UPDATE cases SET {', '.join(set_clauses)} WHERE id = $1"
    async with pool.acquire() as conn:
        await conn.execute(sql, case_id, *values)
    return await get_case(case_id)
def _row_to_case(row: asyncpg.Record) -> dict:
d = dict(row)
for field in ("appellants", "respondents", "tags"):
if isinstance(d.get(field), str):
d[field] = json.loads(d[field])
d["id"] = str(d["id"])
return d
# ── Document CRUD ───────────────────────────────────────────────────
async def create_document(
    case_id: UUID,
    doc_type: str,
    title: str,
    file_path: str,
    page_count: int | None = None,
) -> dict:
    """Insert a documents row and return it as a plain dict."""
    new_id = uuid4()
    pool = await get_pool()
    async with pool.acquire() as conn:
        await conn.execute(
            """INSERT INTO documents (id, case_id, doc_type, title, file_path, page_count)
            VALUES ($1, $2, $3, $4, $5, $6)""",
            new_id, case_id, doc_type, title, file_path, page_count,
        )
        inserted = await conn.fetchrow("SELECT * FROM documents WHERE id = $1", new_id)
    return _row_to_doc(inserted)
async def update_document(doc_id: UUID, **fields) -> None:
    """Update the given columns on a document (no-op for empty fields).

    Args:
        doc_id: Document primary key.
        **fields: Column/value pairs; `metadata` is JSON-encoded for JSONB.

    Raises:
        ValueError: If a field name is not a known `documents` column —
            column names are interpolated into the SQL text, so they are
            validated against an allowlist (same policy as update_case).
    """
    if not fields:
        return
    allowed = {
        "case_id", "doc_type", "title", "file_path",
        "extracted_text", "extraction_status", "page_count", "metadata",
    }
    unknown = set(fields) - allowed
    if unknown:
        raise ValueError(f"Unknown document fields: {sorted(unknown)}")
    pool = await get_pool()
    set_clauses = []
    values = []
    # $1 is reserved for doc_id, so value placeholders start at $2.
    for i, (key, val) in enumerate(fields.items(), start=2):
        if key == "metadata":
            val = json.dumps(val)
        set_clauses.append(f"{key} = ${i}")
        values.append(val)
    sql = f"UPDATE documents SET {', '.join(set_clauses)} WHERE id = $1"
    async with pool.acquire() as conn:
        await conn.execute(sql, doc_id, *values)
async def get_document(doc_id: UUID) -> dict | None:
    """Fetch one document by id; None when no row matches."""
    pool = await get_pool()
    async with pool.acquire() as conn:
        record = await conn.fetchrow("SELECT * FROM documents WHERE id = $1", doc_id)
    if record is None:
        return None
    return _row_to_doc(record)
async def list_documents(case_id: UUID) -> list[dict]:
    """All documents attached to a case, oldest first."""
    pool = await get_pool()
    async with pool.acquire() as conn:
        records = await conn.fetch(
            "SELECT * FROM documents WHERE case_id = $1 ORDER BY created_at", case_id
        )
    return [_row_to_doc(record) for record in records]
async def get_document_text(doc_id: UUID) -> str:
    """Return a document's extracted text, or "" for an unknown id."""
    pool = await get_pool()
    async with pool.acquire() as conn:
        record = await conn.fetchrow(
            "SELECT extracted_text FROM documents WHERE id = $1", doc_id
        )
    if record is None:
        return ""
    return record["extracted_text"]
def _row_to_doc(row: asyncpg.Record) -> dict:
d = dict(row)
d["id"] = str(d["id"])
d["case_id"] = str(d["case_id"])
if isinstance(d.get("metadata"), str):
d["metadata"] = json.loads(d["metadata"])
return d
# ── Chunks & Vectors ───────────────────────────────────────────────
async def store_chunks(
    document_id: UUID,
    case_id: UUID | None,
    chunks: list[dict],
) -> int:
    """Replace all chunks for a document atomically.

    Each chunk dict has: content, section_type, embedding (list[float]),
    page_number, chunk_index.

    The delete + insert now run inside one transaction so a mid-insert
    failure can no longer leave the document with its old chunks deleted
    and only part of the new ones stored; the batch insert also uses
    executemany instead of one round-trip per chunk.

    Returns the number of chunks stored.
    """
    pool = await get_pool()
    rows = [
        (
            document_id,
            case_id,
            chunk["chunk_index"],
            chunk["content"],
            chunk.get("section_type", "other"),
            chunk["embedding"],
            chunk.get("page_number"),
        )
        for chunk in chunks
    ]
    async with pool.acquire() as conn:
        async with conn.transaction():
            # Delete existing chunks for this document
            await conn.execute(
                "DELETE FROM document_chunks WHERE document_id = $1", document_id
            )
            await conn.executemany(
                """INSERT INTO document_chunks
                (document_id, case_id, chunk_index, content, section_type, embedding, page_number)
                VALUES ($1, $2, $3, $4, $5, $6, $7)""",
                rows,
            )
    return len(chunks)
async def search_similar(
    query_embedding: list[float],
    limit: int = 10,
    case_id: UUID | None = None,
    section_type: str | None = None,
) -> list[dict]:
    """Cosine similarity search on document chunks.

    Positional parameters: $1 = embedding, $2 = LIMIT are fixed; optional
    filters are appended as $3, $4… in the order they are added below.
    score = 1 - cosine distance (pgvector `<=>`), so higher = more similar.
    Note the INNER JOIN on cases: chunks whose case_id is NULL are
    excluded from results.
    """
    pool = await get_pool()
    conditions = []
    params: list = [query_embedding, limit]
    param_idx = 3
    if case_id:
        conditions.append(f"dc.case_id = ${param_idx}")
        params.append(case_id)
        param_idx += 1
    if section_type:
        conditions.append(f"dc.section_type = ${param_idx}")
        params.append(section_type)
        param_idx += 1
    where = f"WHERE {' AND '.join(conditions)}" if conditions else ""
    sql = f"""
    SELECT dc.content, dc.section_type, dc.page_number,
    dc.document_id, dc.case_id,
    d.title AS document_title,
    c.case_number,
    1 - (dc.embedding <=> $1) AS score
    FROM document_chunks dc
    JOIN documents d ON d.id = dc.document_id
    JOIN cases c ON c.id = dc.case_id
    {where}
    ORDER BY dc.embedding <=> $1
    LIMIT $2
    """
    async with pool.acquire() as conn:
        rows = await conn.fetch(sql, *params)
    return [dict(r) for r in rows]
# ── Style corpus ────────────────────────────────────────────────────
async def add_to_style_corpus(
    document_id: UUID | None,
    decision_number: str,
    decision_date: date | None,
    subject_categories: list[str],
    full_text: str,
    summary: str = "",
    outcome: str = "",
    key_principles: list[str] | None = None,
) -> UUID:
    """Store a past decision in the style-training corpus.

    Returns the id of the new style_corpus row.
    """
    corpus_id = uuid4()
    args = (
        corpus_id, document_id, decision_number, decision_date,
        json.dumps(subject_categories), full_text, summary, outcome,
        json.dumps(key_principles or []),
    )
    pool = await get_pool()
    async with pool.acquire() as conn:
        await conn.execute(
            """INSERT INTO style_corpus
            (id, document_id, decision_number, decision_date,
            subject_categories, full_text, summary, outcome, key_principles)
            VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9)""",
            *args,
        )
    return corpus_id
async def get_style_patterns(pattern_type: str | None = None) -> list[dict]:
    """Fetch stored style patterns, most frequent first; optionally only
    those of one pattern type."""
    if pattern_type:
        sql = "SELECT * FROM style_patterns WHERE pattern_type = $1 ORDER BY frequency DESC"
        args: tuple = (pattern_type,)
    else:
        sql = "SELECT * FROM style_patterns ORDER BY pattern_type, frequency DESC"
        args = ()
    pool = await get_pool()
    async with pool.acquire() as conn:
        records = await conn.fetch(sql, *args)
    return [dict(record) for record in records]
async def upsert_style_pattern(
    pattern_type: str,
    pattern_text: str,
    context: str = "",
    examples: list[str] | None = None,
) -> None:
    """Insert a style pattern, or bump its frequency when already recorded.

    NOTE(review): the SELECT-then-INSERT pair is not atomic; concurrent
    callers could create duplicate rows — confirm before parallelizing
    the analysis job.
    """
    pool = await get_pool()
    async with pool.acquire() as conn:
        found = await conn.fetchrow(
            "SELECT id, frequency FROM style_patterns WHERE pattern_type = $1 AND pattern_text = $2",
            pattern_type, pattern_text,
        )
        if not found:
            await conn.execute(
                """INSERT INTO style_patterns (pattern_type, pattern_text, context, examples)
                VALUES ($1, $2, $3, $4)""",
                pattern_type, pattern_text, context,
                json.dumps(examples or []),
            )
            return
        await conn.execute(
            "UPDATE style_patterns SET frequency = frequency + 1 WHERE id = $1",
            found["id"],
        )

View File

@@ -0,0 +1,55 @@
"""Embedding service using Voyage AI API."""
from __future__ import annotations
import logging
import voyageai
from legal_mcp import config
logger = logging.getLogger(__name__)
_client: voyageai.Client | None = None
def _get_client() -> voyageai.Client:
    """Return the lazily created, module-wide Voyage AI client."""
    global _client
    if _client is None:
        _client = voyageai.Client(api_key=config.VOYAGE_API_KEY)
    return _client
async def embed_texts(texts: list[str], input_type: str = "document") -> list[list[float]]:
    """Embed a batch of texts using Voyage AI.

    The Voyage client is synchronous; each HTTP call is dispatched with
    asyncio.to_thread so it no longer blocks the event loop (this module
    is called from async MCP tools).

    Args:
        texts: List of texts to embed (batched internally).
        input_type: "document" for indexing, "query" for search queries.

    Returns:
        List of embedding vectors (1024 dimensions each), in input order.
    """
    import asyncio  # local import keeps the module header untouched

    if not texts:
        return []
    client = _get_client()
    batch_size = 128  # Voyage AI supports up to 128 texts per batch
    all_embeddings: list[list[float]] = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start : start + batch_size]
        result = await asyncio.to_thread(
            client.embed,
            batch,
            model=config.VOYAGE_MODEL,
            input_type=input_type,
        )
        all_embeddings.extend(result.embeddings)
    return all_embeddings
async def embed_query(query: str) -> list[float]:
    """Embed one search query and return its vector."""
    vectors = await embed_texts([query], input_type="query")
    return vectors[0]

View File

@@ -0,0 +1,126 @@
"""Text extraction from PDF, DOCX, and RTF files.
Primary PDF extraction: Claude Vision API (for scanned documents).
Fallback: PyMuPDF direct text extraction (for born-digital PDFs).
"""
from __future__ import annotations
import base64
import logging
from pathlib import Path
import anthropic
import fitz # PyMuPDF
from docx import Document as DocxDocument
from striprtf.striprtf import rtf_to_text
from legal_mcp import config
logger = logging.getLogger(__name__)
_anthropic_client: anthropic.Anthropic | None = None
def _get_anthropic() -> anthropic.Anthropic:
    """Return the lazily created, module-wide Anthropic client (Vision OCR)."""
    global _anthropic_client
    if _anthropic_client is None:
        _anthropic_client = anthropic.Anthropic(api_key=config.ANTHROPIC_API_KEY)
    return _anthropic_client
async def extract_text(file_path: str) -> tuple[str, int]:
"""Extract text from a document file.
Returns:
Tuple of (extracted_text, page_count).
page_count is 0 for non-PDF files.
"""
path = Path(file_path)
suffix = path.suffix.lower()
if suffix == ".pdf":
return await _extract_pdf(path)
elif suffix == ".docx":
return _extract_docx(path), 0
elif suffix == ".rtf":
return _extract_rtf(path), 0
elif suffix == ".txt":
return path.read_text(encoding="utf-8"), 0
else:
raise ValueError(f"Unsupported file type: {suffix}")
async def _extract_pdf(path: Path) -> tuple[str, int]:
    """Extract text from PDF. Try direct text first, fall back to Claude Vision for scanned pages.

    A page with more than 50 chars of embedded text is treated as
    born-digital; otherwise it is rasterized at 200 dpi and OCRed.
    The document is opened via context manager so the handle is released
    even when extraction/OCR raises (the original leaked it on error).
    """
    pages_text: list[str] = []
    with fitz.open(str(path)) as doc:
        page_count = len(doc)
        for page_num in range(page_count):
            page = doc[page_num]
            # Try direct text extraction first
            text = page.get_text().strip()
            if len(text) > 50:
                # Sufficient text found - born-digital page
                pages_text.append(text)
                logger.debug("Page %d: direct text extraction (%d chars)", page_num + 1, len(text))
            else:
                # Likely scanned - use Claude Vision
                logger.info("Page %d: using Claude Vision OCR", page_num + 1)
                pix = page.get_pixmap(dpi=200)
                img_bytes = pix.tobytes("png")
                ocr_text = await _ocr_with_claude(img_bytes, page_num + 1)
                pages_text.append(ocr_text)
    return "\n\n".join(pages_text), page_count
async def _ocr_with_claude(image_bytes: bytes, page_num: int) -> str:
    """OCR a single page image using Claude Vision API.

    The Anthropic SDK client is synchronous, so the long Vision call runs
    on a worker thread (asyncio.to_thread) instead of blocking the event
    loop — consistent with embed_texts in the embeddings service.
    NOTE(review): page_num is currently unused in the body; kept for the
    call-site interface (presumably intended for logging).
    """
    import asyncio  # local import keeps the module header untouched

    client = _get_anthropic()
    b64_image = base64.b64encode(image_bytes).decode("utf-8")
    message = await asyncio.to_thread(
        client.messages.create,
        model="claude-sonnet-4-20250514",
        max_tokens=4096,
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "image",
                        "source": {
                            "type": "base64",
                            "media_type": "image/png",
                            "data": b64_image,
                        },
                    },
                    {
                        "type": "text",
                        "text": (
                            "חלץ את כל הטקסט מהתמונה הזו. זהו מסמך משפטי בעברית. "
                            "שמור על מבנה הפסקאות המקורי. "
                            "החזר רק את הטקסט המחולץ, ללא הערות נוספות."
                        ),
                    },
                ],
            }
        ],
    )
    return message.content[0].text
def _extract_docx(path: Path) -> str:
    """Extract text from a DOCX file, skipping blank paragraphs."""
    document = DocxDocument(str(path))
    return "\n\n".join(p.text for p in document.paragraphs if p.text.strip())
def _extract_rtf(path: Path) -> str:
    """Extract plain text from an RTF file (undecodable bytes replaced)."""
    raw = path.read_text(encoding="utf-8", errors="replace")
    return rtf_to_text(raw)

View File

@@ -0,0 +1,79 @@
"""Document processing pipeline: extract → chunk → embed → store."""
from __future__ import annotations
import logging
from uuid import UUID
from legal_mcp.services import chunker, db, embeddings, extractor
logger = logging.getLogger(__name__)
async def process_document(document_id: UUID, case_id: UUID) -> dict:
    """Full processing pipeline for a document.

    1. Extract text from file
    2. Split into chunks
    3. Generate embeddings
    4. Store chunks + embeddings in DB

    The document's extraction_status is driven through
    processing → completed/failed as a side effect.

    Returns:
        Processing summary dict: {"status", "chunks", ...} on success,
        {"status": "failed", "error": ...} on failure.
    """
    doc = await db.get_document(document_id)
    if not doc:
        raise ValueError(f"Document {document_id} not found")
    await db.update_document(document_id, extraction_status="processing")
    try:
        # Step 1: Extract text
        logger.info("Extracting text from %s", doc["file_path"])
        text, page_count = await extractor.extract_text(doc["file_path"])
        # Persist raw text/page count immediately so it survives even if
        # a later step fails.
        await db.update_document(
            document_id,
            extracted_text=text,
            page_count=page_count,
        )
        # Step 2: Chunk
        logger.info("Chunking document (%d chars)", len(text))
        chunks = chunker.chunk_document(text)
        if not chunks:
            # Empty/whitespace-only documents still count as completed.
            await db.update_document(document_id, extraction_status="completed")
            return {"status": "completed", "chunks": 0, "message": "No text to chunk"}
        # Step 3: Embed
        logger.info("Generating embeddings for %d chunks", len(chunks))
        texts = [c.content for c in chunks]
        embs = await embeddings.embed_texts(texts, input_type="document")
        # Step 4: Store (chunk i pairs with embedding i via zip)
        chunk_dicts = [
            {
                "content": c.content,
                "section_type": c.section_type,
                "embedding": emb,
                "page_number": c.page_number,
                "chunk_index": c.chunk_index,
            }
            for c, emb in zip(chunks, embs)
        ]
        stored = await db.store_chunks(document_id, case_id, chunk_dicts)
        await db.update_document(document_id, extraction_status="completed")
        logger.info("Document processed: %d chunks stored", stored)
        return {
            "status": "completed",
            "chunks": stored,
            "pages": page_count,
            "text_length": len(text),
        }
    except Exception as e:
        # Deliberate broad catch: the failure is recorded on the document
        # row and surfaced to the caller as a result dict; the traceback
        # goes to the log via logger.exception.
        logger.exception("Document processing failed: %s", e)
        await db.update_document(document_id, extraction_status="failed")
        return {"status": "failed", "error": str(e)}

View File

@@ -0,0 +1,121 @@
"""Style analyzer - extracts writing patterns from Dafna's decision corpus."""
from __future__ import annotations
import logging
import re
import anthropic
from legal_mcp import config
from legal_mcp.services import db
logger = logging.getLogger(__name__)
ANALYSIS_PROMPT = """\
אתה מנתח סגנון כתיבה משפטית. לפניך החלטות משפטיות שנכתבו על ידי אותה יושבת ראש של ועדת ערר.
נתח את ההחלטות וחלץ את דפוסי הכתיבה הבאים:
1. **נוסחאות פתיחה** (opening_formula) - איך מתחילות ההחלטות
2. **ביטויי מעבר** (transition) - ביטויים שמחברים בין חלקי ההחלטה
3. **סגנון ציטוט** (citation_style) - איך מצטטים חקיקה ופסיקה
4. **מבנה ניתוח** (analysis_structure) - איך בנוי הניתוח המשפטי
5. **נוסחאות סיום** (closing_formula) - איך מסתיימות ההחלטות
6. **ביטויים אופייניים** (characteristic_phrase) - ביטויים ייחודיים שחוזרים
לכל דפוס, תן:
- הטקסט המדויק של הדפוס
- הקשר (באיזה חלק של ההחלטה הוא מופיע)
- דוגמה מתוך הטקסט
החזר את התוצאות בפורמט הבא (JSON array):
```json
[
{{
"type": "opening_formula",
"text": "לפניי ערר על החלטת...",
"context": "פתיחת ההחלטה",
"example": "לפניי ערר על החלטת הוועדה המקומית לתכנון ובניה ירושלים"
}}
]
```
ההחלטות:
{decisions}
"""
async def analyze_corpus() -> dict:
    """Analyze the style corpus and extract/update writing patterns.

    Pulls up to 20 recent decisions from ``style_corpus``, asks Claude to
    extract recurring stylistic patterns as a JSON array, and upserts each
    pattern into the database.

    Returns:
        A summary dict (patterns_found, decisions_analyzed, pattern_types),
        or a dict with an ``error`` key when the corpus is empty or the
        model response cannot be parsed.
    """
    import json

    pool = await db.get_pool()
    async with pool.acquire() as conn:
        rows = await conn.fetch(
            "SELECT full_text, decision_number FROM style_corpus ORDER BY decision_date DESC LIMIT 20"
        )
    if not rows:
        return {"error": "אין החלטות בקורפוס. העלה החלטות קודמות תחילה."}
    # Prepare corpus text for the prompt.
    decisions_text = ""
    for row in rows:
        decisions_text += f"\n\n--- החלטה {row['decision_number'] or 'ללא מספר'} ---\n"
        # Limit each decision to ~3000 chars (head + tail) to fit context.
        text = row["full_text"]
        if len(text) > 3000:
            text = text[:1500] + "\n...\n" + text[-1500:]
        decisions_text += text
    # Call Claude to analyze patterns.
    # NOTE(review): this is a synchronous API call inside an async function;
    # it blocks the event loop for the duration of the request. Consider
    # anthropic.AsyncAnthropic if this runs inside a busy loop.
    client = anthropic.Anthropic(api_key=config.ANTHROPIC_API_KEY)
    message = client.messages.create(
        model="claude-sonnet-4-6",
        max_tokens=16384,
        messages=[
            {
                "role": "user",
                "content": ANALYSIS_PROMPT.format(decisions=decisions_text),
            }
        ],
    )
    response_text = message.content[0].text
    # Extract JSON from the response - prefer code-block fenced JSON.
    code_block = re.search(r"```(?:json)?\s*(\[[\s\S]*?\])\s*```", response_text)
    if code_block:
        json_str = code_block.group(1)
    else:
        # Fallback: take the last JSON array in the text (skips prose brackets).
        all_arrays = list(re.finditer(r"\[[\s\S]*?\]", response_text))
        if not all_arrays:
            return {"error": "Could not parse analysis results", "raw": response_text}
        json_str = all_arrays[-1].group()
    try:
        patterns = json.loads(json_str)
    except json.JSONDecodeError as e:
        return {"error": f"JSON parse error: {e}", "raw": response_text}
    # Defend against a malformed shape: a bare JSON object (or an array of
    # non-objects) would otherwise raise AttributeError on pattern.get()
    # instead of returning a structured error like the parse failures above.
    if not isinstance(patterns, list):
        return {"error": "Analysis result is not a JSON array", "raw": response_text}
    patterns = [p for p in patterns if isinstance(p, dict)]
    # Store patterns.
    for pattern in patterns:
        await db.upsert_style_pattern(
            pattern_type=pattern.get("type", "other"),
            pattern_text=pattern.get("text", ""),
            context=pattern.get("context", ""),
            examples=[pattern.get("example", "")],
        )
    return {
        "patterns_found": len(patterns),
        "decisions_analyzed": len(rows),
        "pattern_types": list({p.get("type") for p in patterns}),
    }

View File

@@ -0,0 +1,177 @@
"""MCP tools for case management."""
from __future__ import annotations
import json
import subprocess
from pathlib import Path
from uuid import UUID
from legal_mcp import config
from legal_mcp.services import db
async def case_create(
    case_number: str,
    title: str,
    appellants: list[str] | None = None,
    respondents: list[str] | None = None,
    subject: str = "",
    property_address: str = "",
    permit_number: str = "",
    committee_type: str = "ועדה מקומית",
    hearing_date: str = "",
    notes: str = "",
) -> str:
    """Create a new appeal case (DB record + git-tracked case directory).

    Args:
        case_number: Appeal case number (e.g. 123-24).
        title: Short title of the appeal.
        appellants: Names of the appellants.
        respondents: Names of the respondents.
        subject: Subject of the appeal.
        property_address: Address of the property.
        permit_number: Permit number.
        committee_type: Committee type (default: local committee, in Hebrew).
        hearing_date: Hearing date (YYYY-MM-DD).
        notes: Free-form notes.
    """
    from datetime import date as date_type

    parsed_hearing = date_type.fromisoformat(hearing_date) if hearing_date else None
    case = await db.create_case(
        case_number=case_number,
        title=title,
        appellants=appellants,
        respondents=respondents,
        subject=subject,
        property_address=property_address,
        permit_number=permit_number,
        committee_type=committee_type,
        hearing_date=parsed_hearing,
        notes=notes,
    )
    # Lay out the on-disk case directory and seed it with metadata files.
    case_dir = config.CASES_DIR / case_number
    for sub in ("documents", "drafts"):
        (case_dir / sub).mkdir(parents=True, exist_ok=True)
    (case_dir / "case.json").write_text(
        json.dumps(case, default=str, ensure_ascii=False, indent=2)
    )
    (case_dir / "notes.md").write_text(f"# הערות - תיק {case_number}\n\n{notes}\n")
    # Initialize a git repository for the case and make the first commit.
    git_identity = {
        "GIT_AUTHOR_NAME": "Ezer Mishpati", "GIT_AUTHOR_EMAIL": "legal@local",
        "GIT_COMMITTER_NAME": "Ezer Mishpati", "GIT_COMMITTER_EMAIL": "legal@local",
        "PATH": "/usr/bin:/bin",
    }
    subprocess.run(["git", "init"], cwd=case_dir, capture_output=True)
    subprocess.run(["git", "add", "."], cwd=case_dir, capture_output=True)
    subprocess.run(
        ["git", "commit", "-m", f"אתחול תיק {case_number}: {title}"],
        cwd=case_dir,
        capture_output=True,
        env=git_identity,
    )
    return json.dumps(case, default=str, ensure_ascii=False, indent=2)
async def case_list(status: str = "", limit: int = 50) -> str:
    """List appeal cases, optionally filtered by status.

    Args:
        status: Filter by status (new, in_progress, drafted, reviewed, final).
            Empty string means no filter.
        limit: Maximum number of results.
    """
    rows = await db.list_cases(status=status or None, limit=limit)
    if rows:
        return json.dumps(rows, default=str, ensure_ascii=False, indent=2)
    return "אין תיקים."
async def case_get(case_number: str) -> str:
    """Return full case details, including the case's document list.

    Args:
        case_number: Appeal case number.
    """
    case = await db.get_case_by_number(case_number)
    if not case:
        return f"תיק {case_number} לא נמצא."
    case["documents"] = await db.list_documents(UUID(case["id"]))
    return json.dumps(case, default=str, ensure_ascii=False, indent=2)
async def case_update(
    case_number: str,
    status: str = "",
    title: str = "",
    subject: str = "",
    notes: str = "",
    hearing_date: str = "",
    decision_date: str = "",
    tags: list[str] | None = None,
) -> str:
    """Update case details.

    Only non-empty arguments are applied; an empty string leaves the
    corresponding field untouched.

    Args:
        case_number: Appeal case number.
        status: New status (new, in_progress, drafted, reviewed, final).
        title: New title.
        subject: New subject.
        notes: New notes.
        hearing_date: Hearing date (YYYY-MM-DD).
        decision_date: Decision date (YYYY-MM-DD).
        tags: Tags to set.
    """
    from datetime import date as date_type

    case = await db.get_case_by_number(case_number)
    if not case:
        return f"תיק {case_number} לא נמצא."
    fields: dict = {}
    for name, value in (
        ("status", status),
        ("title", title),
        ("subject", subject),
        ("notes", notes),
    ):
        if value:
            fields[name] = value
    if hearing_date:
        fields["hearing_date"] = date_type.fromisoformat(hearing_date)
    if decision_date:
        fields["decision_date"] = date_type.fromisoformat(decision_date)
    if tags is not None:
        fields["tags"] = tags
    updated = await db.update_case(UUID(case["id"]), **fields)
    # Mirror the update into the case's git-tracked metadata file.
    case_dir = config.CASES_DIR / case_number
    if case_dir.exists():
        (case_dir / "case.json").write_text(
            json.dumps(updated, default=str, ensure_ascii=False, indent=2)
        )
        subprocess.run(["git", "add", "case.json"], cwd=case_dir, capture_output=True)
        subprocess.run(
            ["git", "commit", "-m", f"עדכון תיק: {', '.join(fields.keys())}"],
            cwd=case_dir,
            capture_output=True,
            env={"GIT_AUTHOR_NAME": "Ezer Mishpati", "GIT_AUTHOR_EMAIL": "legal@local",
                 "GIT_COMMITTER_NAME": "Ezer Mishpati", "GIT_COMMITTER_EMAIL": "legal@local",
                 "PATH": "/usr/bin:/bin"},
        )
    return json.dumps(updated, default=str, ensure_ascii=False, indent=2)

View File

@@ -0,0 +1,218 @@
"""MCP tools for document management and processing."""
from __future__ import annotations
import json
import shutil
import subprocess
from pathlib import Path
from uuid import UUID
from legal_mcp import config
from legal_mcp.services import db, processor
async def document_upload(
    case_number: str,
    file_path: str,
    doc_type: str = "appeal",
    title: str = "",
) -> str:
    """Upload and process a document into an appeal case.

    Copies the file into the case directory, creates a DB record, runs the
    extract -> chunk -> embed -> store pipeline, and commits the result to
    the case's git repository.

    Args:
        case_number: Appeal case number.
        file_path: Absolute path to the file (PDF, DOCX, RTF, TXT).
        doc_type: Document type (appeal, response, decision, reference, exhibit).
        title: Document title (defaults to the file name stem).
    """
    case = await db.get_case_by_number(case_number)
    if not case:
        return f"תיק {case_number} לא נמצא."
    source = Path(file_path)
    if not source.exists():
        return f"קובץ לא נמצא: {file_path}"
    case_id = UUID(case["id"])
    doc_title = title or source.stem
    # Keep a copy of the original file inside the case directory.
    docs_dir = config.CASES_DIR / case_number / "documents"
    docs_dir.mkdir(parents=True, exist_ok=True)
    dest = docs_dir / source.name
    shutil.copy2(str(source), str(dest))
    doc = await db.create_document(
        case_id=case_id,
        doc_type=doc_type,
        title=doc_title,
        file_path=str(dest),
    )
    # Run the full pipeline: extract -> chunk -> embed -> store.
    result = await processor.process_document(UUID(doc["id"]), case_id)
    # Commit the new document to the case's git repo (best-effort).
    repo_dir = config.CASES_DIR / case_number
    if repo_dir.exists():
        hebrew_names = {
            "appeal": "כתב ערר",
            "response": "תשובה",
            "decision": "החלטה",
            "reference": "מסמך עזר",
            "exhibit": "נספח",
        }
        label = hebrew_names.get(doc_type, doc_type)
        subprocess.run(["git", "add", "."], cwd=repo_dir, capture_output=True)
        subprocess.run(
            ["git", "commit", "-m", f"הוספת {label}: {doc_title}"],
            cwd=repo_dir,
            capture_output=True,
            env={"GIT_AUTHOR_NAME": "Ezer Mishpati", "GIT_AUTHOR_EMAIL": "legal@local",
                 "GIT_COMMITTER_NAME": "Ezer Mishpati", "GIT_COMMITTER_EMAIL": "legal@local",
                 "PATH": "/usr/bin:/bin"},
        )
    return json.dumps(
        {"document": doc, "processing": result},
        default=str, ensure_ascii=False, indent=2,
    )
async def document_upload_training(
    file_path: str,
    decision_number: str = "",
    decision_date: str = "",
    subject_categories: list[str] | None = None,
    title: str = "",
) -> str:
    """Upload one of Dafna's past decisions into the style (training) corpus.

    Copies the file into the training directory, extracts its text, stores
    it in the style corpus, and additionally chunks + embeds it so the
    corpus is searchable through RAG.

    Args:
        file_path: Absolute path to the decision file.
        decision_number: Decision number.
        decision_date: Decision date (YYYY-MM-DD).
        subject_categories: Subject categories (multiple allowed).
        title: Document title (defaults to the file name stem).
    """
    from datetime import date as date_type
    from legal_mcp.services import extractor, embeddings, chunker

    source = Path(file_path)
    if not source.exists():
        return f"קובץ לא נמצא: {file_path}"
    doc_title = title or source.stem
    # Copy into the training directory unless the file already lives there.
    config.TRAINING_DIR.mkdir(parents=True, exist_ok=True)
    dest = config.TRAINING_DIR / source.name
    if source.resolve() != dest.resolve():
        shutil.copy2(str(source), str(dest))
    text, page_count = await extractor.extract_text(str(dest))
    parsed_date = date_type.fromisoformat(decision_date) if decision_date else None
    corpus_id = await db.add_to_style_corpus(
        document_id=None,
        decision_number=decision_number,
        decision_date=parsed_date,
        subject_categories=subject_categories or [],
        full_text=text,
    )
    # Chunk and embed so the training corpus participates in RAG search.
    chunks = chunker.chunk_document(text)
    if chunks:
        # Corpus documents carry no case association.
        doc = await db.create_document(
            case_id=None,
            doc_type="decision",
            title=f"[קורפוס] {doc_title}",
            file_path=str(dest),
            page_count=page_count,
        )
        doc_id = UUID(doc["id"])
        await db.update_document(doc_id, extracted_text=text, extraction_status="completed")
        vectors = await embeddings.embed_texts(
            [c.content for c in chunks], input_type="document"
        )
        payload = [
            {
                "content": c.content,
                "section_type": c.section_type,
                "embedding": vector,
                "page_number": c.page_number,
                "chunk_index": c.chunk_index,
            }
            for c, vector in zip(chunks, vectors)
        ]
        await db.store_chunks(doc_id, None, payload)
    return json.dumps({
        "corpus_id": str(corpus_id),
        "title": doc_title,
        "pages": page_count,
        "text_length": len(text),
        "chunks": len(chunks) if chunks else 0,
    }, default=str, ensure_ascii=False, indent=2)
async def document_get_text(case_number: str, doc_title: str = "") -> str:
    """Return the full extracted text of one or all documents in a case.

    Args:
        case_number: Appeal case number.
        doc_title: Title filter (case-insensitive substring match); empty
            returns every document in the case.
    """
    case = await db.get_case_by_number(case_number)
    if not case:
        return f"תיק {case_number} לא נמצא."
    docs = await db.list_documents(UUID(case["id"]))
    if not docs:
        return f"אין מסמכים בתיק {case_number}."
    if doc_title:
        needle = doc_title.lower()
        docs = [d for d in docs if needle in d["title"].lower()]
        if not docs:
            return f"מסמך '{doc_title}' לא נמצא בתיק."
    results = []
    for d in docs:
        text = await db.get_document_text(UUID(d["id"]))
        results.append({
            "title": d["title"],
            "doc_type": d["doc_type"],
            # Cap each document at 10k chars to keep the tool output manageable.
            "text": text[:10000] if text else "(ללא טקסט)",
        })
    return json.dumps(results, ensure_ascii=False, indent=2)
async def document_list(case_number: str) -> str:
    """List the documents attached to a case.

    Args:
        case_number: Appeal case number.
    """
    case = await db.get_case_by_number(case_number)
    if not case:
        return f"תיק {case_number} לא נמצא."
    docs = await db.list_documents(UUID(case["id"]))
    if docs:
        return json.dumps(docs, default=str, ensure_ascii=False, indent=2)
    return f"אין מסמכים בתיק {case_number}."

View File

@@ -0,0 +1,202 @@
"""MCP tools for decision drafting support."""
from __future__ import annotations
import json
from uuid import UUID
from legal_mcp.services import db, embeddings
# Markdown skeleton for a full decision. The placeholders ({case_number},
# {subject}, {appellants}, {respondents}, {property_address}, {date}) are
# filled via str.format by get_decision_template. The section headers
# (Hebrew letters א-ו) are the canonical decision structure: factual
# background, appellants' claims, respondents' claims, discussion, conclusion,
# and the final ruling.
DECISION_TEMPLATE = """# החלטה
## בפני: דפנה תמיר, יו"ר ועדת הערר מחוז ירושלים
**ערר מספר:** {case_number}
**נושא:** {subject}
**העוררים:** {appellants}
**המשיבים:** {respondents}
**כתובת הנכס:** {property_address}
---
## א. רקע עובדתי
[תיאור הרקע העובדתי של הערר]
## ב. טענות העוררים
[סיכום טענות העוררים]
## ג. טענות המשיבים
[סיכום טענות המשיבים]
## ד. דיון והכרעה
[ניתוח משפטי]
## ה. מסקנה
[מסקנת הוועדה]
## ו. החלטה
[ההחלטה הסופית]
---
ניתנה היום, {date}
דפנה תמיר, יו"ר ועדת הערר
"""
async def get_style_guide() -> str:
    """Return Dafna's style guide: formulas, characteristic phrases, structure.

    Reads the stored style patterns and renders them as a Markdown document
    grouped by pattern type. Returns a Hebrew hint message when no patterns
    have been extracted yet.
    """
    patterns = await db.get_style_patterns()
    if not patterns:
        return "לא נמצאו דפוסי סגנון. יש להעלות החלטות קודמות ולהריץ ניתוח סגנון (/style-report)."
    # Group patterns by type, preserving first-seen order of types
    # (dict.setdefault replaces the manual key-presence check).
    grouped: dict[str, list] = {}
    for p in patterns:
        grouped.setdefault(p["pattern_type"], []).append({
            "text": p["pattern_text"],
            "context": p["context"],
            "frequency": p["frequency"],
        })
    type_names = {
        "opening_formula": "נוסחאות פתיחה",
        "transition": "ביטויי מעבר",
        "citation_style": "סגנון ציטוט",
        "analysis_structure": "מבנה ניתוח",
        "closing_formula": "נוסחאות סיום",
        "characteristic_phrase": "ביטויים אופייניים",
    }
    # Build the Markdown via a list + join instead of repeated string
    # concatenation (avoids quadratic behavior; output is byte-identical).
    lines = ["# מדריך סגנון - דפנה תמיר", ""]
    for ptype, items in grouped.items():
        lines.append(f"## {type_names.get(ptype, ptype)}")
        lines.append("")
        for item in items:
            lines.append(f"- **{item['text']}** ({item['context']}, תדירות: {item['frequency']})")
        lines.append("")
    return "\n".join(lines) + "\n"
async def draft_section(
    case_number: str,
    section: str,
    instructions: str = "",
) -> str:
    """Assemble the full drafting context for one section of a decision.

    Bundles (1) relevant facts from the case's own documents, (2) similar
    sections from precedent decisions, and (3) stored style patterns into a
    single JSON payload for the drafting model.

    Args:
        case_number: Appeal case number.
        section: Section kind (facts, appellant_claims, respondent_claims,
            legal_analysis, conclusion, ruling).
        instructions: Extra drafting instructions.
    """
    case = await db.get_case_by_number(case_number)
    if not case:
        return f"תיק {case_number} לא נמצא."
    case_id = UUID(case["id"])
    # Hebrew query text per section kind; unknown kinds are used verbatim.
    section_queries = {
        "facts": "רקע עובדתי של התיק",
        "appellant_claims": "טענות העוררים",
        "respondent_claims": "טענות המשיבים",
        "legal_analysis": "ניתוח משפטי ודיון",
        "conclusion": "מסקנות",
        "ruling": "החלטה",
    }
    query_emb = await embeddings.embed_query(section_queries.get(section, section))
    # 1) Relevant chunks from this case's own documents.
    case_chunks = await db.search_similar(
        query_embedding=query_emb, limit=10, case_id=case_id
    )
    # 2) Similar sections from precedents, excluding this case's own hits.
    precedent_chunks = [
        chunk
        for chunk in await db.search_similar(
            query_embedding=query_emb, limit=5, section_type=section
        )
        if str(chunk["case_id"]) != case["id"]
    ]
    # 3) Stored style patterns.
    style_patterns = await db.get_style_patterns()
    context = {
        "case": {
            "case_number": case["case_number"],
            "title": case["title"],
            "appellants": case["appellants"],
            "respondents": case["respondents"],
            "subject": case["subject"],
            "property_address": case["property_address"],
        },
        "section": section,
        "instructions": instructions,
        "case_documents": [
            {
                "document": chunk["document_title"],
                "section_type": chunk["section_type"],
                "content": chunk["content"],
            }
            for chunk in case_chunks
        ],
        "precedents": [
            {
                "case_number": chunk["case_number"],
                "document": chunk["document_title"],
                "content": chunk["content"][:500],
            }
            for chunk in precedent_chunks[:3]
        ],
        "style_patterns": [
            {"type": p["pattern_type"], "text": p["pattern_text"]}
            for p in style_patterns[:15]
        ],
    }
    return json.dumps(context, ensure_ascii=False, indent=2)
async def get_decision_template(case_number: str) -> str:
    """Return the structural decision template filled with case details.

    Args:
        case_number: Appeal case number.
    """
    from datetime import date

    case = await db.get_case_by_number(case_number)
    if not case:
        return f"תיק {case_number} לא נמצא."
    # `or []` / `or ""` guard against a DB NULL surfacing as an explicit
    # None value, which `.get(key, default)` does not catch -- previously
    # ", ".join(None) would raise TypeError.
    return DECISION_TEMPLATE.format(
        case_number=case["case_number"],
        subject=case["subject"],
        appellants=", ".join(case.get("appellants") or []),
        respondents=", ".join(case.get("respondents") or []),
        property_address=case.get("property_address") or "",
        date=date.today().strftime("%d.%m.%Y"),
    )
async def analyze_style() -> str:
    """Run style analysis on the decision corpus; extracts and stores writing patterns."""
    from legal_mcp.services.style_analyzer import analyze_corpus

    return json.dumps(await analyze_corpus(), ensure_ascii=False, indent=2)

View File

@@ -0,0 +1,124 @@
"""MCP tools for RAG search over legal documents and decisions."""
from __future__ import annotations
import json
from uuid import UUID
from legal_mcp.services import db, embeddings
async def search_decisions(
    query: str,
    limit: int = 10,
    section_type: str = "",
) -> str:
    """Semantic search over past decisions and documents.

    Args:
        query: Hebrew search query (e.g. non-conforming commercial use in a
            residential zone).
        limit: Maximum number of results.
        section_type: Filter by section kind (facts, legal_analysis,
            conclusion, ruling, ...). Empty = no filter.
    """
    query_emb = await embeddings.embed_query(query)
    hits = await db.search_similar(
        query_embedding=query_emb,
        limit=limit,
        section_type=section_type or None,
    )
    if not hits:
        return "לא נמצאו תוצאות."
    formatted = [
        {
            "score": round(float(hit["score"]), 4),
            "case_number": hit["case_number"],
            "document": hit["document_title"],
            "section": hit["section_type"],
            "page": hit["page_number"],
            "content": hit["content"],
        }
        for hit in hits
    ]
    return json.dumps(formatted, ensure_ascii=False, indent=2)
async def search_case_documents(
    case_number: str,
    query: str,
    limit: int = 10,
) -> str:
    """Semantic search restricted to one case's documents.

    Args:
        case_number: Appeal case number.
        query: Search query.
        limit: Maximum number of results.
    """
    case = await db.get_case_by_number(case_number)
    if not case:
        return f"תיק {case_number} לא נמצא."
    query_emb = await embeddings.embed_query(query)
    hits = await db.search_similar(
        query_embedding=query_emb,
        limit=limit,
        case_id=UUID(case["id"]),
    )
    if not hits:
        return f"לא נמצאו תוצאות בתיק {case_number}."
    formatted = [
        {
            "score": round(float(hit["score"]), 4),
            "document": hit["document_title"],
            "section": hit["section_type"],
            "page": hit["page_number"],
            "content": hit["content"],
        }
        for hit in hits
    ]
    return json.dumps(formatted, ensure_ascii=False, indent=2)
async def find_similar_cases(
    description: str,
    limit: int = 5,
) -> str:
    """Find similar cases from a free-text description.

    Args:
        description: Description of the case or topic.
        limit: Maximum number of results.
    """
    query_emb = await embeddings.embed_query(description)
    # Over-fetch so deduplication by case still leaves enough results.
    hits = await db.search_similar(query_embedding=query_emb, limit=limit * 3)
    if not hits:
        return "לא נמצאו תיקים דומים."
    # Keep only the best-scoring chunk per case number.
    best_per_case: dict = {}
    for hit in hits:
        key = hit["case_number"]
        current = best_per_case.get(key)
        if current is None or hit["score"] > current["score"]:
            best_per_case[key] = hit
    ranked = sorted(best_per_case.values(), key=lambda h: h["score"], reverse=True)
    formatted = [
        {
            "score": round(float(hit["score"]), 4),
            "case_number": hit["case_number"],
            "document": hit["document_title"],
            "relevant_section": hit["content"][:500],
        }
        for hit in ranked[:limit]
    ]
    return json.dumps(formatted, ensure_ascii=False, indent=2)

View File

@@ -0,0 +1,118 @@
"""MCP tools for workflow status tracking."""
from __future__ import annotations
import json
from uuid import UUID
from legal_mcp.services import db
async def workflow_status(case_number: str) -> str:
    """Full workflow status for a case: documents, processing, drafts.

    Args:
        case_number: Appeal case number.
    """
    case = await db.get_case_by_number(case_number)
    if not case:
        return f"תיק {case_number} לא נמצא."
    case_id = UUID(case["id"])
    docs = await db.list_documents(case_id)
    # Fetch per-document chunk counts in a single grouped query.
    pool = await db.get_pool()
    async with pool.acquire() as conn:
        rows = await conn.fetch(
            "SELECT document_id, COUNT(*) as count FROM document_chunks WHERE case_id = $1 GROUP BY document_id",
            case_id,
        )
    chunk_map = {str(r["document_id"]): r["count"] for r in rows}
    doc_status = [
        {
            "title": d["title"],
            "type": d["doc_type"],
            "extraction": d["extraction_status"],
            "chunks": chunk_map.get(d["id"], 0),
            "pages": d.get("page_count"),
        }
        for d in docs
    ]
    # Check for a decision draft on disk.
    from legal_mcp import config

    draft_path = config.CASES_DIR / case_number / "drafts" / "decision.md"
    has_draft = draft_path.exists()
    status = {
        "case_number": case["case_number"],
        "title": case["title"],
        "status": case["status"],
        "documents": doc_status,
        "total_documents": len(docs),
        "total_chunks": sum(chunk_map.values()),
        "has_draft": has_draft,
        "draft_size_bytes": draft_path.stat().st_size if has_draft else 0,
        "next_steps": _suggest_next_steps(case, docs, has_draft),
    }
    return json.dumps(status, ensure_ascii=False, indent=2)
def _suggest_next_steps(case: dict, docs: list, has_draft: bool) -> list[str]:
"""Suggest next steps based on case state."""
steps = []
doc_types = {d["doc_type"] for d in docs}
if not docs:
steps.append("העלה מסמכים לתיק (כתב ערר, תשובת ועדה)")
else:
if "appeal" not in doc_types:
steps.append("העלה כתב ערר")
if "response" not in doc_types:
steps.append("העלה תשובת ועדה/משיבים")
pending = [d for d in docs if d["extraction_status"] == "pending"]
if pending:
steps.append(f"עיבוד {len(pending)} מסמכים ממתינים")
if docs and not has_draft:
steps.append("התחל ניסוח טיוטת החלטה (/draft-decision)")
elif has_draft and case["status"] in ("new", "in_progress"):
steps.append("סקור ועדכן את הטיוטה")
steps.append("עדכן סטטוס ל-drafted")
if case["status"] == "drafted":
steps.append("סקירה סופית ועדכון סטטוס ל-reviewed")
elif case["status"] == "reviewed":
steps.append("אישור סופי ועדכון סטטוס ל-final")
return steps
async def processing_status() -> str:
    """Global status: case/document counts, pending processing, corpus sizes."""
    pool = await db.get_pool()
    async with pool.acquire() as conn:
        summary = {
            "cases": await conn.fetchval("SELECT COUNT(*) FROM cases"),
            "documents": await conn.fetchval("SELECT COUNT(*) FROM documents"),
            "pending_processing": await conn.fetchval(
                "SELECT COUNT(*) FROM documents WHERE extraction_status = 'pending'"
            ),
            "chunks": await conn.fetchval("SELECT COUNT(*) FROM document_chunks"),
            "style_corpus_entries": await conn.fetchval("SELECT COUNT(*) FROM style_corpus"),
            "style_patterns": await conn.fetchval("SELECT COUNT(*) FROM style_patterns"),
        }
    return json.dumps(summary, ensure_ascii=False, indent=2)