commit 6f515dc2cbc382335d86d4d2a5952236088f7c81 Author: Chaim Date: Mon Mar 23 12:33:07 2026 +0000 Initial commit: MCP server + web upload interface Ezer Mishpati - AI legal decision drafting system with: - MCP server (FastMCP) with document processing pipeline - Web upload interface (FastAPI) for file upload and classification - pgvector-based semantic search - Hebrew legal document chunking and embedding diff --git a/.claude/commands/case-status.md b/.claude/commands/case-status.md new file mode 100644 index 0000000..daf3f5e --- /dev/null +++ b/.claude/commands/case-status.md @@ -0,0 +1,7 @@ +הצגת סטטוס מלא של תיק ערר. + +שאל את המשתמש על מספר התיק, ואז השתמש ב-workflow_status להצגת: +- פרטי התיק +- סטטוס עיבוד מסמכים +- מצב הטיוטה +- הצעדים הבאים diff --git a/.claude/commands/draft-decision.md b/.claude/commands/draft-decision.md new file mode 100644 index 0000000..348e0db --- /dev/null +++ b/.claude/commands/draft-decision.md @@ -0,0 +1,24 @@ +ניסוח החלטה מלאה לתיק ערר - סעיף אחר סעיף. + +שאל את המשתמש על מספר התיק. + +תהליך הניסוח: + +1. שלוף את פרטי התיק עם case_get +2. שלוף את מדריך הסגנון עם get_style_guide +3. שלוף את תבנית ההחלטה עם get_decision_template + +לכל סעיף: +4. השתמש ב-draft_section כדי לקבל הקשר מלא (מסמכי התיק + תקדימים + סגנון) +5. נסח את הסעיף בסגנון דפנה על בסיס ההקשר +6. הצג למשתמש ובקש אישור/עריכה לפני המשך לסעיף הבא + +סדר הסעיפים: +- א. רקע עובדתי (facts) +- ב. טענות העוררים (appellant_claims) +- ג. טענות המשיבים (respondent_claims) +- ד. דיון והכרעה (legal_analysis) +- ה. מסקנה (conclusion) +- ו. החלטה (ruling) + +בסיום, שמור את הטיוטה המלאה בקובץ drafts/decision.md בתיקיית התיק. diff --git a/.claude/commands/new-case.md b/.claude/commands/new-case.md new file mode 100644 index 0000000..d56c196 --- /dev/null +++ b/.claude/commands/new-case.md @@ -0,0 +1,13 @@ +יצירת תיק ערר חדש. + +שאל את המשתמש על הפרטים הבאים ואז צור את התיק עם כלי case_create: + +1. מספר תיק (לדוגמה: 123-24) +2. כותרת קצרה +3. שמות העוררים +4. שמות המשיבים +5. 
נושא הערר +6. כתובת הנכס (אם רלוונטי) +7. תאריך דיון (אם ידוע) + +אחרי יצירת התיק, הצע למשתמש להעלות מסמכים עם /upload-doc. diff --git a/.claude/commands/search.md b/.claude/commands/search.md new file mode 100644 index 0000000..344de53 --- /dev/null +++ b/.claude/commands/search.md @@ -0,0 +1,12 @@ +חיפוש סמנטי בהחלטות קודמות ובמסמכים. + +שאל את המשתמש מה הוא מחפש (בעברית), ואז השתמש ב-search_decisions. + +הצג את התוצאות בצורה מסודרת: +- ציון רלוונטיות +- מספר תיק +- שם מסמך +- סוג סעיף +- תוכן רלוונטי + +אם המשתמש רוצה לחפש בתיק ספציפי, השתמש ב-search_case_documents. diff --git a/.claude/commands/style-report.md b/.claude/commands/style-report.md new file mode 100644 index 0000000..c131bc3 --- /dev/null +++ b/.claude/commands/style-report.md @@ -0,0 +1,13 @@ +הרצת ניתוח סגנון על קורפוס ההחלטות של דפנה. + +השתמש ב-analyze_style כדי לחלץ דפוסי כתיבה מההחלטות שהועלו. + +הצג את הדפוסים שנמצאו: +- נוסחאות פתיחה +- ביטויי מעבר +- סגנון ציטוט +- מבנה ניתוח +- נוסחאות סיום +- ביטויים אופייניים + +אם אין מספיק החלטות בקורפוס, הצע למשתמש להעלות עוד עם /upload-training. diff --git a/.claude/commands/upload-doc.md b/.claude/commands/upload-doc.md new file mode 100644 index 0000000..5b7347a --- /dev/null +++ b/.claude/commands/upload-doc.md @@ -0,0 +1,16 @@ +העלאת מסמך לתיק ערר. + +שאל את המשתמש: +1. מספר תיק +2. נתיב לקובץ (PDF, DOCX, RTF, או TXT) +3. סוג המסמך: + - appeal = כתב ערר + - response = תשובת ועדה/משיבים + - decision = החלטה + - reference = מסמך עזר + - exhibit = נספח +4. שם המסמך (אופציונלי) + +השתמש בכלי document_upload להעלאה. המסמך יעובד אוטומטית (חילוץ טקסט, חיתוך, ויצירת embeddings). + +הצג למשתמש את תוצאות העיבוד (מספר chunks, עמודים). diff --git a/.claude/commands/upload-training.md b/.claude/commands/upload-training.md new file mode 100644 index 0000000..c4c7c97 --- /dev/null +++ b/.claude/commands/upload-training.md @@ -0,0 +1,20 @@ +העלאת החלטה קודמת של דפנה לקורפוס הסגנון. + +שאל את המשתמש: +1. נתיב לקובץ ההחלטה +2. מספר ההחלטה (אם ידוע) +3. 
תאריך ההחלטה (YYYY-MM-DD) +4. קטגוריות (אפשר לבחור כמה): + - בנייה + - שימוש חורג + - תכנית + - היתר + - הקלה + - חלוקה + - תמ"א 38 + - היטל השבחה + - פיצויים לפי סעיף 197 + +השתמש בכלי document_upload_training עם subject_categories כרשימה. + +אחרי העלאת מספר החלטות, הצע להריץ /style-report לניתוח דפוסי הסגנון. diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..9a524eb --- /dev/null +++ b/.dockerignore @@ -0,0 +1,6 @@ +data/ +.claude/ +mcp-server/.venv/ +**/__pycache__/ +*.pyc +.git/ diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..ea29a9b --- /dev/null +++ b/.gitignore @@ -0,0 +1,8 @@ +data/uploads/ +data/cases/ +mcp-server/.venv/ +__pycache__/ +*.pyc +.env +data/training/ +*.egg-info/ diff --git a/.mcp.json b/.mcp.json new file mode 100644 index 0000000..3c27be3 --- /dev/null +++ b/.mcp.json @@ -0,0 +1,14 @@ +{ + "mcpServers": { + "legal-ai": { + "type": "stdio", + "command": "/home/chaim/legal-ai/mcp-server/.venv/bin/python", + "args": ["-m", "legal_mcp.server"], + "cwd": "/home/chaim/legal-ai/mcp-server", + "env": { + "DOTENV_PATH": "/home/chaim/.env", + "DATA_DIR": "/home/chaim/legal-ai/data" + } + } + } +} diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 0000000..f353943 --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,48 @@ +# עוזר משפטי (Ezer Mishpati) + +מערכת AI לסיוע בניסוח החלטות משפטיות בסגנון דפנה תמיר, יו"ר ועדת הערר מחוז ירושלים. 
+ +## כלי MCP זמינים + +### ניהול תיקים +- `case_create` - יצירת תיק ערר חדש +- `case_list` - רשימת תיקים (סינון אופציונלי לפי סטטוס) +- `case_get` - פרטי תיק מלאים כולל מסמכים +- `case_update` - עדכון פרטי תיק וסטטוס + +### מסמכים +- `document_upload` - העלאה ועיבוד מסמך (חילוץ טקסט → chunks → embeddings) +- `document_upload_training` - העלאת החלטה קודמת של דפנה לקורפוס +- `document_get_text` - קבלת טקסט מחולץ +- `document_list` - רשימת מסמכים בתיק + +### חיפוש +- `search_decisions` - חיפוש סמנטי בהחלטות ומסמכים +- `search_case_documents` - חיפוש בתוך תיק ספציפי +- `find_similar_cases` - מציאת תיקים דומים + +### ניסוח +- `get_style_guide` - דפוסי הסגנון של דפנה +- `draft_section` - הרכבת הקשר לניסוח סעיף (עובדות + תקדימים + סגנון) +- `get_decision_template` - תבנית מבנית להחלטה +- `analyze_style` - ניתוח סגנון על הקורפוס + +### תהליך עבודה +- `workflow_status` - סטטוס מלא לתיק +- `processing_status` - סטטוס כללי של המערכת + +## תהליך עבודה טיפוסי + +1. `/new-case` → יצירת תיק חדש +2. `/upload-doc` → העלאת כתב ערר ותשובת ועדה +3. חיפוש תיקים דומים +4. `/draft-decision` → ניסוח סעיף אחר סעיף +5. עריכה ושיפור עם Claude +6. 
עדכון סטטוס → final + +## הנחיות ניסוח + +- כל ההחלטות בעברית +- שמור על סגנון דפנה (השתמש ב-`get_style_guide` לפני ניסוח) +- הפנה לתקדימים מהקורפוס +- המבנה: רקע → טענות עוררים → טענות משיבים → דיון → מסקנה → החלטה diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..cf72610 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,26 @@ +FROM python:3.12-slim + +WORKDIR /app + +# System deps for PyMuPDF and document processing +RUN apt-get update && apt-get install -y --no-install-recommends \ + gcc libmupdf-dev libfreetype6-dev libharfbuzz-dev libjpeg62-turbo-dev \ + libopenjp2-7-dev && rm -rf /var/lib/apt/lists/* + +# Copy MCP server source (for importing services) +COPY mcp-server/pyproject.toml /app/mcp-server/pyproject.toml +COPY mcp-server/src/ /app/mcp-server/src/ + +# Install MCP server dependencies + web deps +RUN pip install --no-cache-dir /app/mcp-server && \ + pip install --no-cache-dir fastapi uvicorn python-multipart + +# Copy web app +COPY web/ /app/web/ + +ENV PYTHONPATH=/app/mcp-server/src +ENV DOTENV_PATH=/home/chaim/.env + +EXPOSE 8080 + +CMD ["uvicorn", "web.app:app", "--host", "0.0.0.0", "--port", "8080"] diff --git a/mcp-server/pyproject.toml b/mcp-server/pyproject.toml new file mode 100644 index 0000000..00fa7fd --- /dev/null +++ b/mcp-server/pyproject.toml @@ -0,0 +1,27 @@ +[project] +name = "legal-mcp" +version = "0.1.0" +description = "MCP server for AI-assisted legal decision drafting" +requires-python = ">=3.10" +dependencies = [ + "mcp[cli]>=1.0.0", + "asyncpg>=0.29.0", + "pgvector>=0.3.0", + "voyageai>=0.3.0", + "anthropic>=0.40.0", + "python-dotenv>=1.0.0", + "pydantic>=2.0.0", + "pymupdf>=1.25.0", + "python-docx>=1.1.0", + "striprtf>=0.0.26", + "redis>=5.0.0", + "rq>=1.16.0", + "pillow>=10.0.0", +] + +[build-system] +requires = ["setuptools>=68.0"] +build-backend = "setuptools.build_meta" + +[tool.setuptools.packages.find] +where = ["src"] diff --git a/mcp-server/src/legal_mcp/__init__.py b/mcp-server/src/legal_mcp/__init__.py 
new file mode 100644 index 0000000..e69de29 diff --git a/mcp-server/src/legal_mcp/__main__.py b/mcp-server/src/legal_mcp/__main__.py new file mode 100644 index 0000000..9ba674b --- /dev/null +++ b/mcp-server/src/legal_mcp/__main__.py @@ -0,0 +1,4 @@ +"""Allow running with: python -m legal_mcp""" +from legal_mcp.server import main + +main() diff --git a/mcp-server/src/legal_mcp/config.py b/mcp-server/src/legal_mcp/config.py new file mode 100644 index 0000000..582ecb1 --- /dev/null +++ b/mcp-server/src/legal_mcp/config.py @@ -0,0 +1,40 @@ +"""Configuration loaded from central .env file.""" + +import os +from pathlib import Path + +from dotenv import load_dotenv + +# Load from central .env or override path +dotenv_path = os.environ.get("DOTENV_PATH", str(Path.home() / ".env")) +load_dotenv(dotenv_path) + +# PostgreSQL +POSTGRES_URL = os.environ.get( + "POSTGRES_URL", + f"postgres://{os.environ.get('POSTGRES_USER', 'legal_ai')}:" + f"{os.environ.get('POSTGRES_PASSWORD', '')}@" + f"{os.environ.get('POSTGRES_HOST', '127.0.0.1')}:" + f"{os.environ.get('POSTGRES_PORT', '5433')}/" + f"{os.environ.get('POSTGRES_DB', 'legal_ai')}", +) + +# Redis +REDIS_URL = os.environ.get("REDIS_URL", "redis://127.0.0.1:6380/0") + +# Voyage AI +VOYAGE_API_KEY = os.environ.get("VOYAGE_API_KEY", "") +VOYAGE_MODEL = "voyage-3-large" +VOYAGE_DIMENSIONS = 1024 + +# Anthropic (for Claude Vision OCR) +ANTHROPIC_API_KEY = os.environ.get("ANTHROPIC_API_KEY", "") + +# Data directory +DATA_DIR = Path(os.environ.get("DATA_DIR", str(Path.home() / "legal-ai" / "data"))) +CASES_DIR = DATA_DIR / "cases" +TRAINING_DIR = DATA_DIR / "training" + +# Chunking parameters +CHUNK_SIZE_TOKENS = 600 +CHUNK_OVERLAP_TOKENS = 100 diff --git a/mcp-server/src/legal_mcp/models.py b/mcp-server/src/legal_mcp/models.py new file mode 100644 index 0000000..bfd1315 --- /dev/null +++ b/mcp-server/src/legal_mcp/models.py @@ -0,0 +1,97 @@ +"""Pydantic models for cases, documents, and related entities.""" + +from __future__ import 
annotations + +import enum +from datetime import date, datetime +from uuid import UUID + +from pydantic import BaseModel, Field + + +class CaseStatus(str, enum.Enum): + NEW = "new" + IN_PROGRESS = "in_progress" + DRAFTED = "drafted" + REVIEWED = "reviewed" + FINAL = "final" + + +class DocType(str, enum.Enum): + APPEAL = "appeal" # כתב ערר + RESPONSE = "response" # תשובה + DECISION = "decision" # החלטה + REFERENCE = "reference" # מסמך עזר + EXHIBIT = "exhibit" # נספח + + +class SectionType(str, enum.Enum): + INTRO = "intro" + FACTS = "facts" + APPELLANT_CLAIMS = "appellant_claims" + RESPONDENT_CLAIMS = "respondent_claims" + LEGAL_ANALYSIS = "legal_analysis" + CONCLUSION = "conclusion" + RULING = "ruling" + OTHER = "other" + + +class ExtractionStatus(str, enum.Enum): + PENDING = "pending" + PROCESSING = "processing" + COMPLETED = "completed" + FAILED = "failed" + + +class CaseCreate(BaseModel): + case_number: str = Field(description="מספר תיק הערר (לדוגמה: 123-24)") + title: str = Field(description="כותרת קצרה של הערר") + appellants: list[str] = Field(default_factory=list, description="שמות העוררים") + respondents: list[str] = Field(default_factory=list, description="שמות המשיבים") + subject: str = Field(default="", description="נושא הערר") + property_address: str = Field(default="", description="כתובת הנכס") + permit_number: str = Field(default="", description="מספר היתר") + committee_type: str = Field(default="ועדה מקומית", description="סוג הוועדה") + hearing_date: date | None = Field(default=None, description="תאריך דיון") + notes: str = Field(default="", description="הערות") + + +class CaseInfo(BaseModel): + id: UUID + case_number: str + title: str + appellants: list[str] + respondents: list[str] + subject: str + property_address: str + permit_number: str + committee_type: str + status: CaseStatus + hearing_date: date | None + decision_date: date | None + tags: list[str] + notes: str + created_at: datetime + updated_at: datetime + + +class 
DocumentInfo(BaseModel): + id: UUID + case_id: UUID + doc_type: DocType + title: str + file_path: str + extraction_status: ExtractionStatus + page_count: int | None + created_at: datetime + + +class SearchResult(BaseModel): + chunk_content: str + score: float + case_number: str + document_title: str + section_type: str + page_number: int | None + document_id: UUID + case_id: UUID diff --git a/mcp-server/src/legal_mcp/server.py b/mcp-server/src/legal_mcp/server.py new file mode 100644 index 0000000..b93c172 --- /dev/null +++ b/mcp-server/src/legal_mcp/server.py @@ -0,0 +1,219 @@ +"""Ezer Mishpati - MCP Server entry point. + +Run with: python -m legal_mcp.server +""" + +from __future__ import annotations + +import logging +import sys +from collections.abc import AsyncIterator +from contextlib import asynccontextmanager + +from mcp.server.fastmcp import FastMCP + +# Configure logging to stderr (stdout is reserved for JSON-RPC) +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s [%(name)s] %(levelname)s: %(message)s", + stream=sys.stderr, +) +logger = logging.getLogger("legal_mcp") + + +@asynccontextmanager +async def lifespan(server: FastMCP) -> AsyncIterator[None]: + """Initialize DB schema on startup, close pool on shutdown.""" + from legal_mcp.services.db import close_pool, init_schema + + logger.info("Initializing database schema...") + await init_schema() + logger.info("Ezer Mishpati MCP server ready") + try: + yield + finally: + await close_pool() + logger.info("Ezer Mishpati MCP server stopped") + + +# Create MCP server +mcp = FastMCP( + "Ezer Mishpati - עוזר משפטי", + instructions="מערכת AI לסיוע בניסוח החלטות משפטיות בסגנון דפנה תמיר", + lifespan=lifespan, +) + +# ── Import and register tools ─────────────────────────────────────── + +from legal_mcp.tools import cases, documents, search, drafting, workflow # noqa: E402 + + +# Case management +@mcp.tool() +async def case_create( + case_number: str, + title: str, + appellants: list[str] | None = 
None, + respondents: list[str] | None = None, + subject: str = "", + property_address: str = "", + permit_number: str = "", + committee_type: str = "ועדה מקומית", + hearing_date: str = "", + notes: str = "", +) -> str: + """יצירת תיק ערר חדש.""" + return await cases.case_create( + case_number, title, appellants, respondents, + subject, property_address, permit_number, committee_type, + hearing_date, notes, + ) + + +@mcp.tool() +async def case_list(status: str = "", limit: int = 50) -> str: + """רשימת תיקי ערר. סינון אופציונלי לפי סטטוס (new/in_progress/drafted/reviewed/final).""" + return await cases.case_list(status, limit) + + +@mcp.tool() +async def case_get(case_number: str) -> str: + """פרטי תיק מלאים כולל רשימת מסמכים.""" + return await cases.case_get(case_number) + + +@mcp.tool() +async def case_update( + case_number: str, + status: str = "", + title: str = "", + subject: str = "", + notes: str = "", + hearing_date: str = "", + decision_date: str = "", + tags: list[str] | None = None, +) -> str: + """עדכון פרטי תיק.""" + return await cases.case_update( + case_number, status, title, subject, notes, + hearing_date, decision_date, tags, + ) + + +# Documents +@mcp.tool() +async def document_upload( + case_number: str, + file_path: str, + doc_type: str = "appeal", + title: str = "", +) -> str: + """העלאה ועיבוד מסמך לתיק ערר (PDF/DOCX/RTF/TXT). מחלץ טקסט ויוצר embeddings.""" + return await documents.document_upload(case_number, file_path, doc_type, title) + + +@mcp.tool() +async def document_upload_training( + file_path: str, + decision_number: str = "", + decision_date: str = "", + subject_categories: list[str] | None = None, + title: str = "", +) -> str: + """העלאת החלטה קודמת של דפנה לקורפוס הסגנון. 
קטגוריות: בנייה, שימוש חורג, תכנית, היתר, הקלה, חלוקה, תמ"א 38, היטל השבחה, פיצויים 197.""" + return await documents.document_upload_training( + file_path, decision_number, decision_date, subject_categories, title, + ) + + +@mcp.tool() +async def document_get_text(case_number: str, doc_title: str = "") -> str: + """קבלת טקסט מלא של מסמך מתוך תיק.""" + return await documents.document_get_text(case_number, doc_title) + + +@mcp.tool() +async def document_list(case_number: str) -> str: + """רשימת מסמכים בתיק.""" + return await documents.document_list(case_number) + + +# Search +@mcp.tool() +async def search_decisions( + query: str, + limit: int = 10, + section_type: str = "", +) -> str: + """חיפוש סמנטי בהחלטות קודמות ובמסמכים.""" + return await search.search_decisions(query, limit, section_type) + + +@mcp.tool() +async def search_case_documents( + case_number: str, + query: str, + limit: int = 10, +) -> str: + """חיפוש סמנטי בתוך מסמכי תיק ספציפי.""" + return await search.search_case_documents(case_number, query, limit) + + +@mcp.tool() +async def find_similar_cases( + description: str, + limit: int = 5, +) -> str: + """מציאת תיקים דומים על בסיס תיאור.""" + return await search.find_similar_cases(description, limit) + + +# Drafting +@mcp.tool() +async def get_style_guide() -> str: + """שליפת דפוסי הסגנון של דפנה - נוסחאות, ביטויים אופייניים ומבנה.""" + return await drafting.get_style_guide() + + +@mcp.tool() +async def draft_section( + case_number: str, + section: str, + instructions: str = "", +) -> str: + """הרכבת הקשר מלא לניסוח סעיף (עובדות + תקדימים + סגנון).""" + return await drafting.draft_section(case_number, section, instructions) + + +@mcp.tool() +async def get_decision_template(case_number: str) -> str: + """תבנית מבנית להחלטה מלאה עם פרטי התיק.""" + return await drafting.get_decision_template(case_number) + + +@mcp.tool() +async def analyze_style() -> str: + """ניתוח סגנון על קורפוס ההחלטות של דפנה. 
מחלץ ושומר דפוסי כתיבה.""" + return await drafting.analyze_style() + + +# Workflow +@mcp.tool() +async def workflow_status(case_number: str) -> str: + """סטטוס תהליך עבודה מלא לתיק.""" + return await workflow.workflow_status(case_number) + + +@mcp.tool() +async def processing_status() -> str: + """סטטוס כללי - מספר תיקים, מסמכים, chunks.""" + return await workflow.processing_status() + + + +def main(): + mcp.run(transport="stdio") + + +if __name__ == "__main__": + main() diff --git a/mcp-server/src/legal_mcp/services/__init__.py b/mcp-server/src/legal_mcp/services/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/mcp-server/src/legal_mcp/services/chunker.py b/mcp-server/src/legal_mcp/services/chunker.py new file mode 100644 index 0000000..f9da9a0 --- /dev/null +++ b/mcp-server/src/legal_mcp/services/chunker.py @@ -0,0 +1,130 @@ +"""Legal document chunker - splits text into sections and chunks for RAG.""" + +from __future__ import annotations + +import re +from dataclasses import dataclass, field + +from legal_mcp import config + +# Hebrew legal section headers +SECTION_PATTERNS = [ + (r"רקע\s*עובדתי|רקע\s*כללי|העובדות|הרקע", "facts"), + (r"טענות\s*העוררי[םן]|טענות\s*המערערי[םן]|עיקר\s*טענות\s*העוררי[םן]", "appellant_claims"), + (r"טענות\s*המשיבי[םן]|תשובת\s*המשיבי[םן]|עיקר\s*טענות\s*המשיבי[םן]", "respondent_claims"), + (r"דיון\s*והכרעה|דיון|הכרעה|ניתוח\s*משפטי|המסגרת\s*המשפטית", "legal_analysis"), + (r"מסקנ[הות]|סיכום", "conclusion"), + (r"החלטה|לפיכך\s*אני\s*מחליט|התוצאה", "ruling"), + (r"מבוא|פתיחה|לפניי", "intro"), +] + + +@dataclass +class Chunk: + content: str + section_type: str = "other" + page_number: int | None = None + chunk_index: int = 0 + + +def chunk_document( + text: str, + chunk_size: int = config.CHUNK_SIZE_TOKENS, + overlap: int = config.CHUNK_OVERLAP_TOKENS, +) -> list[Chunk]: + """Split a legal document into chunks, respecting section boundaries.""" + if not text.strip(): + return [] + + sections = _split_into_sections(text) + 
chunks: list[Chunk] = [] + idx = 0 + + for section_type, section_text in sections: + section_chunks = _split_section(section_text, chunk_size, overlap) + for chunk_text in section_chunks: + chunks.append(Chunk( + content=chunk_text, + section_type=section_type, + chunk_index=idx, + )) + idx += 1 + + return chunks + + +def _split_into_sections(text: str) -> list[tuple[str, str]]: + """Split text into (section_type, text) pairs based on Hebrew headers.""" + # Find all section headers and their positions + markers: list[tuple[int, str]] = [] + + for pattern, section_type in SECTION_PATTERNS: + for match in re.finditer(pattern, text): + markers.append((match.start(), section_type)) + + if not markers: + # No sections found - treat as single block + return [("other", text)] + + markers.sort(key=lambda x: x[0]) + + sections: list[tuple[str, str]] = [] + + # Text before first section + if markers[0][0] > 0: + intro_text = text[: markers[0][0]].strip() + if intro_text: + sections.append(("intro", intro_text)) + + # Each section + for i, (pos, section_type) in enumerate(markers): + end = markers[i + 1][0] if i + 1 < len(markers) else len(text) + section_text = text[pos:end].strip() + if section_text: + sections.append((section_type, section_text)) + + return sections + + +def _split_section(text: str, chunk_size: int, overlap: int) -> list[str]: + """Split a section into overlapping chunks by paragraphs. + + Uses approximate token counting (Hebrew ~1.5 chars per token). 
+ """ + if not text.strip(): + return [] + + paragraphs = [p.strip() for p in text.split("\n") if p.strip()] + chunks: list[str] = [] + current: list[str] = [] + current_tokens = 0 + + for para in paragraphs: + para_tokens = _estimate_tokens(para) + + if current_tokens + para_tokens > chunk_size and current: + chunks.append("\n".join(current)) + # Keep overlap + overlap_paras: list[str] = [] + overlap_tokens = 0 + for p in reversed(current): + pt = _estimate_tokens(p) + if overlap_tokens + pt > overlap: + break + overlap_paras.insert(0, p) + overlap_tokens += pt + current = overlap_paras + current_tokens = overlap_tokens + + current.append(para) + current_tokens += para_tokens + + if current: + chunks.append("\n".join(current)) + + return chunks + + +def _estimate_tokens(text: str) -> int: + """Rough token estimate for Hebrew text (~1.5 chars per token).""" + return max(1, len(text) // 2) diff --git a/mcp-server/src/legal_mcp/services/db.py b/mcp-server/src/legal_mcp/services/db.py new file mode 100644 index 0000000..b6ac6fb --- /dev/null +++ b/mcp-server/src/legal_mcp/services/db.py @@ -0,0 +1,440 @@ +"""Database service - asyncpg connection pool and queries.""" + +from __future__ import annotations + +import json +import logging +from datetime import date +from uuid import UUID, uuid4 + +import asyncpg +from pgvector.asyncpg import register_vector + +from legal_mcp import config + +logger = logging.getLogger(__name__) + +_pool: asyncpg.Pool | None = None + + +async def get_pool() -> asyncpg.Pool: + global _pool + if _pool is None: + # First, ensure pgvector extension exists (before registering type codec) + conn = await asyncpg.connect(config.POSTGRES_URL) + await conn.execute('CREATE EXTENSION IF NOT EXISTS vector') + await conn.execute('CREATE EXTENSION IF NOT EXISTS "uuid-ossp"') + await conn.close() + + _pool = await asyncpg.create_pool( + config.POSTGRES_URL, + min_size=2, + max_size=10, + init=_init_connection, + ) + return _pool + + +async def 
_init_connection(conn: asyncpg.Connection) -> None: + await register_vector(conn) + + +async def close_pool() -> None: + global _pool + if _pool: + await _pool.close() + _pool = None + + +# ── Schema ────────────────────────────────────────────────────────── + +SCHEMA_SQL = """ + +CREATE TABLE IF NOT EXISTS cases ( + id UUID PRIMARY KEY DEFAULT uuid_generate_v4(), + case_number TEXT UNIQUE NOT NULL, + title TEXT NOT NULL, + appellants JSONB DEFAULT '[]', + respondents JSONB DEFAULT '[]', + subject TEXT DEFAULT '', + property_address TEXT DEFAULT '', + permit_number TEXT DEFAULT '', + committee_type TEXT DEFAULT 'ועדה מקומית', + status TEXT DEFAULT 'new', + hearing_date DATE, + decision_date DATE, + tags JSONB DEFAULT '[]', + notes TEXT DEFAULT '', + created_at TIMESTAMPTZ DEFAULT now(), + updated_at TIMESTAMPTZ DEFAULT now() +); + +CREATE TABLE IF NOT EXISTS documents ( + id UUID PRIMARY KEY DEFAULT uuid_generate_v4(), + case_id UUID REFERENCES cases(id) ON DELETE CASCADE, + doc_type TEXT NOT NULL, + title TEXT NOT NULL, + file_path TEXT NOT NULL, + extracted_text TEXT DEFAULT '', + extraction_status TEXT DEFAULT 'pending', + page_count INTEGER, + metadata JSONB DEFAULT '{}', + created_at TIMESTAMPTZ DEFAULT now() +); + +CREATE TABLE IF NOT EXISTS document_chunks ( + id UUID PRIMARY KEY DEFAULT uuid_generate_v4(), + document_id UUID REFERENCES documents(id) ON DELETE CASCADE, + case_id UUID REFERENCES cases(id) ON DELETE CASCADE, + chunk_index INTEGER NOT NULL, + content TEXT NOT NULL, + section_type TEXT DEFAULT 'other', + embedding vector(1024), + page_number INTEGER, + created_at TIMESTAMPTZ DEFAULT now() +); + +CREATE TABLE IF NOT EXISTS style_corpus ( + id UUID PRIMARY KEY DEFAULT uuid_generate_v4(), + document_id UUID REFERENCES documents(id) ON DELETE SET NULL, + decision_number TEXT, + decision_date DATE, + subject_categories JSONB DEFAULT '[]', + full_text TEXT NOT NULL, + summary TEXT DEFAULT '', + outcome TEXT DEFAULT '', + key_principles JSONB DEFAULT 
'[]', + created_at TIMESTAMPTZ DEFAULT now() +); + +CREATE TABLE IF NOT EXISTS style_patterns ( + id UUID PRIMARY KEY DEFAULT uuid_generate_v4(), + pattern_type TEXT NOT NULL, + pattern_text TEXT NOT NULL, + frequency INTEGER DEFAULT 1, + context TEXT DEFAULT '', + examples JSONB DEFAULT '[]', + created_at TIMESTAMPTZ DEFAULT now() +); + +CREATE INDEX IF NOT EXISTS idx_chunks_embedding + ON document_chunks USING ivfflat (embedding vector_cosine_ops) + WITH (lists = 100); + +CREATE INDEX IF NOT EXISTS idx_chunks_case ON document_chunks(case_id); +CREATE INDEX IF NOT EXISTS idx_chunks_doc ON document_chunks(document_id); +CREATE INDEX IF NOT EXISTS idx_docs_case ON documents(case_id); +CREATE INDEX IF NOT EXISTS idx_cases_status ON cases(status); +CREATE INDEX IF NOT EXISTS idx_cases_number ON cases(case_number); +""" + + +async def init_schema() -> None: + pool = await get_pool() + async with pool.acquire() as conn: + await conn.execute(SCHEMA_SQL) + logger.info("Database schema initialized") + + +# ── Case CRUD ─────────────────────────────────────────────────────── + +async def create_case( + case_number: str, + title: str, + appellants: list[str] | None = None, + respondents: list[str] | None = None, + subject: str = "", + property_address: str = "", + permit_number: str = "", + committee_type: str = "ועדה מקומית", + hearing_date: date | None = None, + notes: str = "", +) -> dict: + pool = await get_pool() + case_id = uuid4() + async with pool.acquire() as conn: + await conn.execute( + """INSERT INTO cases (id, case_number, title, appellants, respondents, + subject, property_address, permit_number, committee_type, + hearing_date, notes) + VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11)""", + case_id, case_number, title, + json.dumps(appellants or []), + json.dumps(respondents or []), + subject, property_address, permit_number, committee_type, + hearing_date, notes, + ) + return await get_case(case_id) + + +async def get_case(case_id: UUID) -> dict | None: + 
pool = await get_pool() + async with pool.acquire() as conn: + row = await conn.fetchrow("SELECT * FROM cases WHERE id = $1", case_id) + if row is None: + return None + return _row_to_case(row) + + +async def get_case_by_number(case_number: str) -> dict | None: + pool = await get_pool() + async with pool.acquire() as conn: + row = await conn.fetchrow( + "SELECT * FROM cases WHERE case_number = $1", case_number + ) + if row is None: + return None + return _row_to_case(row) + + +async def list_cases(status: str | None = None, limit: int = 50) -> list[dict]: + pool = await get_pool() + async with pool.acquire() as conn: + if status: + rows = await conn.fetch( + "SELECT * FROM cases WHERE status = $1 ORDER BY updated_at DESC LIMIT $2", + status, limit, + ) + else: + rows = await conn.fetch( + "SELECT * FROM cases ORDER BY updated_at DESC LIMIT $1", limit + ) + return [_row_to_case(r) for r in rows] + + +async def update_case(case_id: UUID, **fields) -> dict | None: + if not fields: + return await get_case(case_id) + pool = await get_pool() + set_clauses = [] + values = [] + for i, (key, val) in enumerate(fields.items(), start=2): + if key in ("appellants", "respondents", "tags"): + val = json.dumps(val) + set_clauses.append(f"{key} = ${i}") + values.append(val) + set_clauses.append("updated_at = now()") + sql = f"UPDATE cases SET {', '.join(set_clauses)} WHERE id = $1" + async with pool.acquire() as conn: + await conn.execute(sql, case_id, *values) + return await get_case(case_id) + + +def _row_to_case(row: asyncpg.Record) -> dict: + d = dict(row) + for field in ("appellants", "respondents", "tags"): + if isinstance(d.get(field), str): + d[field] = json.loads(d[field]) + d["id"] = str(d["id"]) + return d + + +# ── Document CRUD ─────────────────────────────────────────────────── + +async def create_document( + case_id: UUID, + doc_type: str, + title: str, + file_path: str, + page_count: int | None = None, +) -> dict: + pool = await get_pool() + doc_id = uuid4() + async 
with pool.acquire() as conn: + await conn.execute( + """INSERT INTO documents (id, case_id, doc_type, title, file_path, page_count) + VALUES ($1, $2, $3, $4, $5, $6)""", + doc_id, case_id, doc_type, title, file_path, page_count, + ) + row = await conn.fetchrow("SELECT * FROM documents WHERE id = $1", doc_id) + return _row_to_doc(row) + + +async def update_document(doc_id: UUID, **fields) -> None: + if not fields: + return + pool = await get_pool() + set_clauses = [] + values = [] + for i, (key, val) in enumerate(fields.items(), start=2): + if key == "metadata": + val = json.dumps(val) + set_clauses.append(f"{key} = ${i}") + values.append(val) + sql = f"UPDATE documents SET {', '.join(set_clauses)} WHERE id = $1" + async with pool.acquire() as conn: + await conn.execute(sql, doc_id, *values) + + +async def get_document(doc_id: UUID) -> dict | None: + pool = await get_pool() + async with pool.acquire() as conn: + row = await conn.fetchrow("SELECT * FROM documents WHERE id = $1", doc_id) + return _row_to_doc(row) if row else None + + +async def list_documents(case_id: UUID) -> list[dict]: + pool = await get_pool() + async with pool.acquire() as conn: + rows = await conn.fetch( + "SELECT * FROM documents WHERE case_id = $1 ORDER BY created_at", case_id + ) + return [_row_to_doc(r) for r in rows] + + +async def get_document_text(doc_id: UUID) -> str: + pool = await get_pool() + async with pool.acquire() as conn: + row = await conn.fetchrow( + "SELECT extracted_text FROM documents WHERE id = $1", doc_id + ) + return row["extracted_text"] if row else "" + + +def _row_to_doc(row: asyncpg.Record) -> dict: + d = dict(row) + d["id"] = str(d["id"]) + d["case_id"] = str(d["case_id"]) + if isinstance(d.get("metadata"), str): + d["metadata"] = json.loads(d["metadata"]) + return d + + +# ── Chunks & Vectors ─────────────────────────────────────────────── + +async def store_chunks( + document_id: UUID, + case_id: UUID | None, + chunks: list[dict], +) -> int: + """Store document 
chunks with embeddings. Each chunk dict has: + content, section_type, embedding (list[float]), page_number, chunk_index + """ + pool = await get_pool() + async with pool.acquire() as conn: + # Delete existing chunks for this document + await conn.execute( + "DELETE FROM document_chunks WHERE document_id = $1", document_id + ) + for chunk in chunks: + await conn.execute( + """INSERT INTO document_chunks + (document_id, case_id, chunk_index, content, section_type, embedding, page_number) + VALUES ($1, $2, $3, $4, $5, $6, $7)""", + document_id, case_id, + chunk["chunk_index"], + chunk["content"], + chunk.get("section_type", "other"), + chunk["embedding"], + chunk.get("page_number"), + ) + return len(chunks) + + +async def search_similar( + query_embedding: list[float], + limit: int = 10, + case_id: UUID | None = None, + section_type: str | None = None, +) -> list[dict]: + """Cosine similarity search on document chunks.""" + pool = await get_pool() + conditions = [] + params: list = [query_embedding, limit] + param_idx = 3 + + if case_id: + conditions.append(f"dc.case_id = ${param_idx}") + params.append(case_id) + param_idx += 1 + if section_type: + conditions.append(f"dc.section_type = ${param_idx}") + params.append(section_type) + param_idx += 1 + + where = f"WHERE {' AND '.join(conditions)}" if conditions else "" + + sql = f""" + SELECT dc.content, dc.section_type, dc.page_number, + dc.document_id, dc.case_id, + d.title AS document_title, + c.case_number, + 1 - (dc.embedding <=> $1) AS score + FROM document_chunks dc + JOIN documents d ON d.id = dc.document_id + JOIN cases c ON c.id = dc.case_id + {where} + ORDER BY dc.embedding <=> $1 + LIMIT $2 + """ + async with pool.acquire() as conn: + rows = await conn.fetch(sql, *params) + return [dict(r) for r in rows] + + +# ── Style corpus ──────────────────────────────────────────────────── + +async def add_to_style_corpus( + document_id: UUID | None, + decision_number: str, + decision_date: date | None, + 
async def add_to_style_corpus(
    document_id: UUID | None,
    decision_number: str,
    decision_date: date | None,
    subject_categories: list[str],
    full_text: str,
    summary: str = "",
    outcome: str = "",
    key_principles: list[str] | None = None,
) -> UUID:
    """Insert one past decision into the style corpus.

    Args:
        document_id: Optional link to a documents row (None for
            standalone training uploads).
        decision_number: Human-readable decision identifier.
        decision_date: Date the decision was issued, if known.
        subject_categories: Topic tags; stored as JSON text.
        full_text: Complete decision text.
        summary: Optional short summary.
        outcome: Optional outcome label.
        key_principles: Optional legal principles; stored as JSON text.

    Returns:
        The UUID assigned to the new corpus row.
    """
    pool = await get_pool()
    corpus_id = uuid4()
    async with pool.acquire() as conn:
        await conn.execute(
            """INSERT INTO style_corpus
               (id, document_id, decision_number, decision_date,
                subject_categories, full_text, summary, outcome, key_principles)
               VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9)""",
            corpus_id, document_id, decision_number, decision_date,
            json.dumps(subject_categories), full_text, summary, outcome,
            json.dumps(key_principles or []),
        )
    return corpus_id


async def get_style_patterns(pattern_type: str | None = None) -> list[dict]:
    """Return stored style patterns, most frequent first.

    Args:
        pattern_type: Restrict to one pattern type; None returns all,
            grouped by type then frequency.
    """
    pool = await get_pool()
    async with pool.acquire() as conn:
        if pattern_type:
            rows = await conn.fetch(
                "SELECT * FROM style_patterns WHERE pattern_type = $1 ORDER BY frequency DESC",
                pattern_type,
            )
        else:
            rows = await conn.fetch(
                "SELECT * FROM style_patterns ORDER BY pattern_type, frequency DESC"
            )
        return [dict(r) for r in rows]


async def upsert_style_pattern(
    pattern_type: str,
    pattern_text: str,
    context: str = "",
    examples: list[str] | None = None,
) -> None:
    """Insert a style pattern, or bump its frequency if already present.

    NOTE(review): non-atomic check-then-insert; two concurrent callers
    could create duplicate rows. Fine for the current single-writer
    analysis flow — revisit if analysis ever runs concurrently.
    """
    pool = await get_pool()
    async with pool.acquire() as conn:
        existing = await conn.fetchrow(
            "SELECT id, frequency FROM style_patterns WHERE pattern_type = $1 AND pattern_text = $2",
            pattern_type, pattern_text,
        )
        if existing:
            # Seen before: only increment the occurrence counter.
            await conn.execute(
                "UPDATE style_patterns SET frequency = frequency + 1 WHERE id = $1",
                existing["id"],
            )
        else:
            await conn.execute(
                """INSERT INTO style_patterns (pattern_type, pattern_text, context, examples)
               VALUES ($1, $2, $3, $4)""",
                pattern_type, pattern_text, context,
                json.dumps(examples or []),
            )
"""Embedding service using Voyage AI API."""

from __future__ import annotations

import asyncio
import logging

import voyageai

from legal_mcp import config

logger = logging.getLogger(__name__)

# Lazily-constructed singleton client: created on first use so that
# importing this module does not require credentials.
_client: voyageai.Client | None = None


def _get_client() -> voyageai.Client:
    """Return the shared Voyage AI client, creating it on first call."""
    global _client
    if _client is None:
        _client = voyageai.Client(api_key=config.VOYAGE_API_KEY)
    return _client


async def embed_texts(texts: list[str], input_type: str = "document") -> list[list[float]]:
    """Embed a batch of texts using Voyage AI.

    Args:
        texts: List of texts to embed (max 128 per call).
        input_type: "document" for indexing, "query" for search queries.

    Returns:
        List of embedding vectors (1024 dimensions each).
    """
    if not texts:
        return []

    client = _get_client()
    all_embeddings: list[list[float]] = []

    # Voyage AI supports up to 128 texts per batch
    for i in range(0, len(texts), 128):
        batch = texts[i : i + 128]
        # The SDK call is blocking network I/O; run it in a worker
        # thread so it does not stall the asyncio event loop.
        result = await asyncio.to_thread(
            client.embed,
            batch,
            model=config.VOYAGE_MODEL,
            input_type=input_type,
        )
        all_embeddings.extend(result.embeddings)

    return all_embeddings


async def embed_query(query: str) -> list[float]:
    """Embed a single search query."""
    results = await embed_texts([query], input_type="query")
    return results[0]
+""" + +from __future__ import annotations + +import base64 +import logging +from pathlib import Path + +import anthropic +import fitz # PyMuPDF +from docx import Document as DocxDocument +from striprtf.striprtf import rtf_to_text + +from legal_mcp import config + +logger = logging.getLogger(__name__) + +_anthropic_client: anthropic.Anthropic | None = None + + +def _get_anthropic() -> anthropic.Anthropic: + global _anthropic_client + if _anthropic_client is None: + _anthropic_client = anthropic.Anthropic(api_key=config.ANTHROPIC_API_KEY) + return _anthropic_client + + +async def extract_text(file_path: str) -> tuple[str, int]: + """Extract text from a document file. + + Returns: + Tuple of (extracted_text, page_count). + page_count is 0 for non-PDF files. + """ + path = Path(file_path) + suffix = path.suffix.lower() + + if suffix == ".pdf": + return await _extract_pdf(path) + elif suffix == ".docx": + return _extract_docx(path), 0 + elif suffix == ".rtf": + return _extract_rtf(path), 0 + elif suffix == ".txt": + return path.read_text(encoding="utf-8"), 0 + else: + raise ValueError(f"Unsupported file type: {suffix}") + + +async def _extract_pdf(path: Path) -> tuple[str, int]: + """Extract text from PDF. 
Try direct text first, fall back to Claude Vision for scanned pages.""" + doc = fitz.open(str(path)) + page_count = len(doc) + pages_text: list[str] = [] + + for page_num in range(page_count): + page = doc[page_num] + # Try direct text extraction first + text = page.get_text().strip() + + if len(text) > 50: + # Sufficient text found - born-digital page + pages_text.append(text) + logger.debug("Page %d: direct text extraction (%d chars)", page_num + 1, len(text)) + else: + # Likely scanned - use Claude Vision + logger.info("Page %d: using Claude Vision OCR", page_num + 1) + pix = page.get_pixmap(dpi=200) + img_bytes = pix.tobytes("png") + ocr_text = await _ocr_with_claude(img_bytes, page_num + 1) + pages_text.append(ocr_text) + + doc.close() + return "\n\n".join(pages_text), page_count + + +async def _ocr_with_claude(image_bytes: bytes, page_num: int) -> str: + """OCR a single page image using Claude Vision API.""" + client = _get_anthropic() + b64_image = base64.b64encode(image_bytes).decode("utf-8") + + message = client.messages.create( + model="claude-sonnet-4-20250514", + max_tokens=4096, + messages=[ + { + "role": "user", + "content": [ + { + "type": "image", + "source": { + "type": "base64", + "media_type": "image/png", + "data": b64_image, + }, + }, + { + "type": "text", + "text": ( + "חלץ את כל הטקסט מהתמונה הזו. זהו מסמך משפטי בעברית. " + "שמור על מבנה הפסקאות המקורי. " + "החזר רק את הטקסט המחולץ, ללא הערות נוספות." 
+ ), + }, + ], + } + ], + ) + return message.content[0].text + + +def _extract_docx(path: Path) -> str: + """Extract text from DOCX file.""" + doc = DocxDocument(str(path)) + paragraphs = [p.text for p in doc.paragraphs if p.text.strip()] + return "\n\n".join(paragraphs) + + +def _extract_rtf(path: Path) -> str: + """Extract text from RTF file.""" + rtf_content = path.read_text(encoding="utf-8", errors="replace") + return rtf_to_text(rtf_content) diff --git a/mcp-server/src/legal_mcp/services/processor.py b/mcp-server/src/legal_mcp/services/processor.py new file mode 100644 index 0000000..4598760 --- /dev/null +++ b/mcp-server/src/legal_mcp/services/processor.py @@ -0,0 +1,79 @@ +"""Document processing pipeline: extract → chunk → embed → store.""" + +from __future__ import annotations + +import logging +from uuid import UUID + +from legal_mcp.services import chunker, db, embeddings, extractor + +logger = logging.getLogger(__name__) + + +async def process_document(document_id: UUID, case_id: UUID) -> dict: + """Full processing pipeline for a document. + + 1. Extract text from file + 2. Split into chunks + 3. Generate embeddings + 4. Store chunks + embeddings in DB + + Returns processing summary. 
+ """ + doc = await db.get_document(document_id) + if not doc: + raise ValueError(f"Document {document_id} not found") + + await db.update_document(document_id, extraction_status="processing") + + try: + # Step 1: Extract text + logger.info("Extracting text from %s", doc["file_path"]) + text, page_count = await extractor.extract_text(doc["file_path"]) + + await db.update_document( + document_id, + extracted_text=text, + page_count=page_count, + ) + + # Step 2: Chunk + logger.info("Chunking document (%d chars)", len(text)) + chunks = chunker.chunk_document(text) + + if not chunks: + await db.update_document(document_id, extraction_status="completed") + return {"status": "completed", "chunks": 0, "message": "No text to chunk"} + + # Step 3: Embed + logger.info("Generating embeddings for %d chunks", len(chunks)) + texts = [c.content for c in chunks] + embs = await embeddings.embed_texts(texts, input_type="document") + + # Step 4: Store + chunk_dicts = [ + { + "content": c.content, + "section_type": c.section_type, + "embedding": emb, + "page_number": c.page_number, + "chunk_index": c.chunk_index, + } + for c, emb in zip(chunks, embs) + ] + + stored = await db.store_chunks(document_id, case_id, chunk_dicts) + await db.update_document(document_id, extraction_status="completed") + + logger.info("Document processed: %d chunks stored", stored) + return { + "status": "completed", + "chunks": stored, + "pages": page_count, + "text_length": len(text), + } + + except Exception as e: + logger.exception("Document processing failed: %s", e) + await db.update_document(document_id, extraction_status="failed") + return {"status": "failed", "error": str(e)} diff --git a/mcp-server/src/legal_mcp/services/style_analyzer.py b/mcp-server/src/legal_mcp/services/style_analyzer.py new file mode 100644 index 0000000..2a3c43a --- /dev/null +++ b/mcp-server/src/legal_mcp/services/style_analyzer.py @@ -0,0 +1,121 @@ +"""Style analyzer - extracts writing patterns from Dafna's decision corpus.""" 
from __future__ import annotations

import asyncio
import json
import logging
import re

import anthropic

from legal_mcp import config
from legal_mcp.services import db

logger = logging.getLogger(__name__)


ANALYSIS_PROMPT = """\
אתה מנתח סגנון כתיבה משפטית. לפניך החלטות משפטיות שנכתבו על ידי אותה יושבת ראש של ועדת ערר.

נתח את ההחלטות וחלץ את דפוסי הכתיבה הבאים:

1. **נוסחאות פתיחה** (opening_formula) - איך מתחילות ההחלטות
2. **ביטויי מעבר** (transition) - ביטויים שמחברים בין חלקי ההחלטה
3. **סגנון ציטוט** (citation_style) - איך מצטטים חקיקה ופסיקה
4. **מבנה ניתוח** (analysis_structure) - איך בנוי הניתוח המשפטי
5. **נוסחאות סיום** (closing_formula) - איך מסתיימות ההחלטות
6. **ביטויים אופייניים** (characteristic_phrase) - ביטויים ייחודיים שחוזרים

לכל דפוס, תן:
- הטקסט המדויק של הדפוס
- הקשר (באיזה חלק של ההחלטה הוא מופיע)
- דוגמה מתוך הטקסט

החזר את התוצאות בפורמט הבא (JSON array):
```json
[
  {{
    "type": "opening_formula",
    "text": "לפניי ערר על החלטת...",
    "context": "פתיחת ההחלטה",
    "example": "לפניי ערר על החלטת הוועדה המקומית לתכנון ובניה ירושלים"
  }}
]
```

ההחלטות:
{decisions}
"""


async def analyze_corpus() -> dict:
    """Analyze the style corpus and extract/update patterns.

    Returns summary of patterns found.
    """
    pool = await db.get_pool()
    async with pool.acquire() as conn:
        rows = await conn.fetch(
            "SELECT full_text, decision_number FROM style_corpus ORDER BY decision_date DESC LIMIT 20"
        )

    if not rows:
        return {"error": "אין החלטות בקורפוס. העלה החלטות קודמות תחילה."}

    # Concatenate the (truncated) decisions into one analysis prompt.
    decisions_text = ""
    for row in rows:
        decisions_text += f"\n\n--- החלטה {row['decision_number'] or 'ללא מספר'} ---\n"
        # Limit each decision to ~3000 chars to fit context
        text = row["full_text"]
        if len(text) > 3000:
            # Keep head and tail: openings and closings carry most of
            # the stylistic signal being analyzed.
            text = text[:1500] + "\n...\n" + text[-1500:]
        decisions_text += text

    # Call Claude to analyze patterns. The SDK call is blocking network
    # I/O, so run it in a worker thread to keep the event loop free.
    client = anthropic.Anthropic(api_key=config.ANTHROPIC_API_KEY)
    message = await asyncio.to_thread(
        client.messages.create,
        # NOTE(review): extractor.py pins "claude-sonnet-4-20250514" —
        # confirm which model id is actually intended here.
        model="claude-sonnet-4-6",
        max_tokens=16384,
        messages=[
            {
                "role": "user",
                "content": ANALYSIS_PROMPT.format(decisions=decisions_text),
            }
        ],
    )

    response_text = message.content[0].text

    # Extract JSON from response - prefer code-block fenced JSON
    code_block = re.search(r"```(?:json)?\s*(\[[\s\S]*?\])\s*```", response_text)
    if code_block:
        json_str = code_block.group(1)
    else:
        # Fallback: find the last JSON array (skip prose brackets)
        all_arrays = list(re.finditer(r"\[[\s\S]*?\]", response_text))
        if not all_arrays:
            return {"error": "Could not parse analysis results", "raw": response_text}
        json_str = all_arrays[-1].group()

    try:
        patterns = json.loads(json_str)
    except json.JSONDecodeError as e:
        return {"error": f"JSON parse error: {e}", "raw": response_text}

    # Store patterns (insert new ones, bump frequency on repeats).
    count = 0
    for pattern in patterns:
        await db.upsert_style_pattern(
            pattern_type=pattern.get("type", "other"),
            pattern_text=pattern.get("text", ""),
            context=pattern.get("context", ""),
            examples=[pattern.get("example", "")],
        )
        count += 1

    return {
        "patterns_found": count,
        "decisions_analyzed": len(rows),
        "pattern_types": list({p.get("type") for p in patterns}),
    }
"""MCP tools for case management."""

from __future__ import annotations

import json
import subprocess
from datetime import date as date_type
from pathlib import Path
from uuid import UUID

from legal_mcp import config
from legal_mcp.services import db

# Fixed identity for automated commits in case repositories, so commit
# authorship does not depend on the server process environment.
_GIT_ENV = {
    "GIT_AUTHOR_NAME": "Ezer Mishpati",
    "GIT_AUTHOR_EMAIL": "legal@local",
    "GIT_COMMITTER_NAME": "Ezer Mishpati",
    "GIT_COMMITTER_EMAIL": "legal@local",
    "PATH": "/usr/bin:/bin",
}


def _git(case_dir: Path, *args: str, identity: bool = False) -> None:
    """Run a git command in case_dir, best-effort (output captured,
    failures ignored). identity=True supplies the fixed commit identity
    environment — used for `git commit` only, matching prior behavior."""
    subprocess.run(
        ["git", *args],
        cwd=case_dir,
        capture_output=True,
        env=_GIT_ENV if identity else None,
    )


async def case_create(
    case_number: str,
    title: str,
    appellants: list[str] | None = None,
    respondents: list[str] | None = None,
    subject: str = "",
    property_address: str = "",
    permit_number: str = "",
    committee_type: str = "ועדה מקומית",
    hearing_date: str = "",
    notes: str = "",
) -> str:
    """יצירת תיק ערר חדש.

    Args:
        case_number: מספר תיק הערר (לדוגמה: 123-24)
        title: כותרת קצרה של הערר
        appellants: שמות העוררים
        respondents: שמות המשיבים
        subject: נושא הערר
        property_address: כתובת הנכס
        permit_number: מספר היתר
        committee_type: סוג הוועדה (ברירת מחדל: ועדה מקומית)
        hearing_date: תאריך דיון (YYYY-MM-DD)
        notes: הערות
    """
    h_date = None
    if hearing_date:
        try:
            h_date = date_type.fromisoformat(hearing_date)
        except ValueError:
            # Friendly error string (matches the tools' error style)
            # instead of an unhandled traceback.
            return f"תאריך דיון לא תקין: {hearing_date} (נדרש פורמט YYYY-MM-DD)"

    case = await db.create_case(
        case_number=case_number,
        title=title,
        appellants=appellants,
        respondents=respondents,
        subject=subject,
        property_address=property_address,
        permit_number=permit_number,
        committee_type=committee_type,
        hearing_date=h_date,
        notes=notes,
    )

    # Per-case working directory with a git repo for draft history.
    case_dir = config.CASES_DIR / case_number
    case_dir.mkdir(parents=True, exist_ok=True)
    (case_dir / "documents").mkdir(exist_ok=True)
    (case_dir / "drafts").mkdir(exist_ok=True)

    # Snapshot of the DB record, useful for offline inspection.
    case_json = case_dir / "case.json"
    case_json.write_text(json.dumps(case, default=str, ensure_ascii=False, indent=2))

    # Create notes file
    notes_file = case_dir / "notes.md"
    notes_file.write_text(f"# הערות - תיק {case_number}\n\n{notes}\n")

    # Initialize git repo and commit the initial state.
    _git(case_dir, "init")
    _git(case_dir, "add", ".")
    _git(case_dir, "commit", "-m", f"אתחול תיק {case_number}: {title}", identity=True)

    return json.dumps(case, default=str, ensure_ascii=False, indent=2)


async def case_list(status: str = "", limit: int = 50) -> str:
    """רשימת תיקי ערר עם אפשרות סינון לפי סטטוס.

    Args:
        status: סינון לפי סטטוס (new, in_progress, drafted, reviewed, final). ריק = הכל
        limit: מספר תוצאות מקסימלי
    """
    cases = await db.list_cases(status=status or None, limit=limit)
    if not cases:
        return "אין תיקים."
    return json.dumps(cases, default=str, ensure_ascii=False, indent=2)


async def case_get(case_number: str) -> str:
    """קבלת פרטי תיק מלאים כולל רשימת מסמכים.

    Args:
        case_number: מספר תיק הערר
    """
    case = await db.get_case_by_number(case_number)
    if not case:
        return f"תיק {case_number} לא נמצא."

    docs = await db.list_documents(UUID(case["id"]))
    case["documents"] = docs
    return json.dumps(case, default=str, ensure_ascii=False, indent=2)


async def case_update(
    case_number: str,
    status: str = "",
    title: str = "",
    subject: str = "",
    notes: str = "",
    hearing_date: str = "",
    decision_date: str = "",
    tags: list[str] | None = None,
) -> str:
    """עדכון פרטי תיק.

    Args:
        case_number: מספר תיק הערר
        status: סטטוס חדש (new, in_progress, drafted, reviewed, final)
        title: כותרת חדשה
        subject: נושא חדש
        notes: הערות חדשות
        hearing_date: תאריך דיון (YYYY-MM-DD)
        decision_date: תאריך החלטה (YYYY-MM-DD)
        tags: תגיות
    """
    case = await db.get_case_by_number(case_number)
    if not case:
        return f"תיק {case_number} לא נמצא."

    # Empty strings mean "leave unchanged"; only non-empty values are
    # collected into the update.
    fields = {}
    if status:
        fields["status"] = status
    if title:
        fields["title"] = title
    if subject:
        fields["subject"] = subject
    if notes:
        fields["notes"] = notes
    if hearing_date:
        try:
            fields["hearing_date"] = date_type.fromisoformat(hearing_date)
        except ValueError:
            return f"תאריך דיון לא תקין: {hearing_date} (נדרש פורמט YYYY-MM-DD)"
    if decision_date:
        try:
            fields["decision_date"] = date_type.fromisoformat(decision_date)
        except ValueError:
            return f"תאריך החלטה לא תקין: {decision_date} (נדרש פורמט YYYY-MM-DD)"
    if tags is not None:
        fields["tags"] = tags

    updated = await db.update_case(UUID(case["id"]), **fields)

    # Git commit the update (best-effort).
    case_dir = config.CASES_DIR / case_number
    if case_dir.exists():
        case_json = case_dir / "case.json"
        case_json.write_text(json.dumps(updated, default=str, ensure_ascii=False, indent=2))
        _git(case_dir, "add", "case.json")
        _git(case_dir, "commit", "-m", f"עדכון תיק: {', '.join(fields.keys())}", identity=True)

    return json.dumps(updated, default=str, ensure_ascii=False, indent=2)
async def document_upload(
    case_number: str,
    file_path: str,
    doc_type: str = "appeal",
    title: str = "",
) -> str:
    """העלאה ועיבוד מסמך לתיק ערר. מחלץ טקסט, יוצר chunks ו-embeddings.

    Args:
        case_number: מספר תיק הערר
        file_path: נתיב מלא לקובץ (PDF, DOCX, RTF, TXT)
        doc_type: סוג מסמך (appeal=כתב ערר, response=תשובה, decision=החלטה, reference=מסמך עזר, exhibit=נספח)
        title: שם המסמך (אם ריק, ייקח משם הקובץ)
    """
    case = await db.get_case_by_number(case_number)
    if not case:
        return f"תיק {case_number} לא נמצא."

    source = Path(file_path)
    if not source.exists():
        return f"קובץ לא נמצא: {file_path}"

    case_id = UUID(case["id"])
    if not title:
        title = source.stem

    # Copy file to case directory
    case_dir = config.CASES_DIR / case_number / "documents"
    case_dir.mkdir(parents=True, exist_ok=True)
    dest = case_dir / source.name
    shutil.copy2(str(source), str(dest))

    # Create document record
    doc = await db.create_document(
        case_id=case_id,
        doc_type=doc_type,
        title=title,
        file_path=str(dest),
    )

    # Process document (extract → chunk → embed → store)
    result = await processor.process_document(UUID(doc["id"]), case_id)

    # Git commit (best-effort: return codes are not checked).
    repo_dir = config.CASES_DIR / case_number
    if repo_dir.exists():
        subprocess.run(["git", "add", "."], cwd=repo_dir, capture_output=True)
        # Map the doc_type code to a Hebrew label for the commit message.
        doc_type_hebrew = {
            "appeal": "כתב ערר",
            "response": "תשובה",
            "decision": "החלטה",
            "reference": "מסמך עזר",
            "exhibit": "נספח",
        }.get(doc_type, doc_type)
        subprocess.run(
            ["git", "commit", "-m", f"הוספת {doc_type_hebrew}: {title}"],
            cwd=repo_dir,
            capture_output=True,
            env={"GIT_AUTHOR_NAME": "Ezer Mishpati", "GIT_AUTHOR_EMAIL": "legal@local",
                 "GIT_COMMITTER_NAME": "Ezer Mishpati", "GIT_COMMITTER_EMAIL": "legal@local",
                 "PATH": "/usr/bin:/bin"},
        )

    return json.dumps({
        "document": doc,
        "processing": result,
    }, default=str, ensure_ascii=False, indent=2)


async def document_upload_training(
    file_path: str,
    decision_number: str = "",
    decision_date: str = "",
    subject_categories: list[str] | None = None,
    title: str = "",
) -> str:
    """העלאת החלטה קודמת של דפנה לקורפוס הסגנון (training).

    Args:
        file_path: נתיב מלא לקובץ ההחלטה
        decision_number: מספר ההחלטה
        decision_date: תאריך ההחלטה (YYYY-MM-DD)
        subject_categories: קטגוריות - אפשר לבחור כמה (בנייה, שימוש חורג, תכנית, היתר, הקלה, חלוקה, תמ"א 38, היטל השבחה, פיצויים 197)
        title: שם המסמך
    """
    from datetime import date as date_type

    from legal_mcp.services import extractor, embeddings, chunker

    source = Path(file_path)
    if not source.exists():
        return f"קובץ לא נמצא: {file_path}"

    if not title:
        title = source.stem

    # Copy to training directory (skip if already there)
    config.TRAINING_DIR.mkdir(parents=True, exist_ok=True)
    dest = config.TRAINING_DIR / source.name
    if source.resolve() != dest.resolve():
        shutil.copy2(str(source), str(dest))

    # Extract text
    text, page_count = await extractor.extract_text(str(dest))

    # Parse date
    # NOTE(review): an invalid date string raises ValueError to the
    # caller rather than returning a friendly message — confirm intended.
    d_date = None
    if decision_date:
        d_date = date_type.fromisoformat(decision_date)

    # Add to style corpus
    corpus_id = await db.add_to_style_corpus(
        document_id=None,
        decision_number=decision_number,
        decision_date=d_date,
        subject_categories=subject_categories or [],
        full_text=text,
    )

    # Chunk and embed for RAG search over training corpus
    chunks = chunker.chunk_document(text)
    if chunks:
        # Create a document record (no case association: case_id=None
        # marks the chunks as style-corpus rather than case material).
        doc = await db.create_document(
            case_id=None,
            doc_type="decision",
            title=f"[קורפוס] {title}",
            file_path=str(dest),
            page_count=page_count,
        )
        doc_id = UUID(doc["id"])
        await db.update_document(doc_id, extracted_text=text, extraction_status="completed")

        # Generate embeddings and store chunks
        texts = [c.content for c in chunks]
        embs = await embeddings.embed_texts(texts, input_type="document")
        chunk_dicts = [
            {
                "content": c.content,
                "section_type": c.section_type,
                "embedding": emb,
                "page_number": c.page_number,
                "chunk_index": c.chunk_index,
            }
            for c, emb in zip(chunks, embs)
        ]
        await db.store_chunks(doc_id, None, chunk_dicts)

    return json.dumps({
        "corpus_id": str(corpus_id),
        "title": title,
        "pages": page_count,
        "text_length": len(text),
        "chunks": len(chunks) if chunks else 0,
    }, default=str, ensure_ascii=False, indent=2)


async def document_get_text(case_number: str, doc_title: str = "") -> str:
    """קבלת טקסט מלא של מסמך מתוך תיק.

    Args:
        case_number: מספר תיק הערר
        doc_title: שם המסמך (אם ריק, מחזיר את כל המסמכים)
    """
    case = await db.get_case_by_number(case_number)
    if not case:
        return f"תיק {case_number} לא נמצא."

    docs = await db.list_documents(UUID(case["id"]))
    if not docs:
        return f"אין מסמכים בתיק {case_number}."

    if doc_title:
        # Case-insensitive substring match on the document title.
        docs = [d for d in docs if doc_title.lower() in d["title"].lower()]
        if not docs:
            return f"מסמך '{doc_title}' לא נמצא בתיק."

    results = []
    for doc in docs:
        text = await db.get_document_text(UUID(doc["id"]))
        results.append({
            "title": doc["title"],
            "doc_type": doc["doc_type"],
            # Silently capped at 10,000 chars to bound the tool response.
            "text": text[:10000] if text else "(ללא טקסט)",
        })

    return json.dumps(results, ensure_ascii=False, indent=2)


async def document_list(case_number: str) -> str:
    """רשימת מסמכים בתיק.

    Args:
        case_number: מספר תיק הערר
    """
    case = await db.get_case_by_number(case_number)
    if not case:
        return f"תיק {case_number} לא נמצא."

    docs = await db.list_documents(UUID(case["id"]))
    if not docs:
        return f"אין מסמכים בתיק {case_number}."

    return json.dumps(docs, default=str, ensure_ascii=False, indent=2)
"""MCP tools for decision drafting support."""

from __future__ import annotations

import json
from uuid import UUID

from legal_mcp.services import db, embeddings


DECISION_TEMPLATE = """# החלטה

## בפני: דפנה תמיר, יו"ר ועדת הערר מחוז ירושלים

**ערר מספר:** {case_number}
**נושא:** {subject}
**העוררים:** {appellants}
**המשיבים:** {respondents}
**כתובת הנכס:** {property_address}

---

## א. רקע עובדתי

[תיאור הרקע העובדתי של הערר]

## ב. טענות העוררים

[סיכום טענות העוררים]

## ג. טענות המשיבים

[סיכום טענות המשיבים]

## ד. דיון והכרעה

[ניתוח משפטי]

## ה. מסקנה

[מסקנת הוועדה]

## ו. החלטה

[ההחלטה הסופית]

---
ניתנה היום, {date}
דפנה תמיר, יו"ר ועדת הערר
"""


async def get_style_guide() -> str:
    """שליפת דפוסי הסגנון של דפנה - נוסחאות, ביטויים אופייניים ומבנה."""
    patterns = await db.get_style_patterns()

    if not patterns:
        return "לא נמצאו דפוסי סגנון. יש להעלות החלטות קודמות ולהריץ ניתוח סגנון (/style-report)."

    # Group patterns by type, preserving per-type frequency order.
    grouped: dict[str, list] = {}
    for p in patterns:
        grouped.setdefault(p["pattern_type"], []).append({
            "text": p["pattern_text"],
            "context": p["context"],
            "frequency": p["frequency"],
        })

    # Hebrew section headings for each machine-readable pattern type.
    type_names = {
        "opening_formula": "נוסחאות פתיחה",
        "transition": "ביטויי מעבר",
        "citation_style": "סגנון ציטוט",
        "analysis_structure": "מבנה ניתוח",
        "closing_formula": "נוסחאות סיום",
        "characteristic_phrase": "ביטויים אופייניים",
    }

    result = "# מדריך סגנון - דפנה תמיר\n\n"
    for ptype, items in grouped.items():
        result += f"## {type_names.get(ptype, ptype)}\n\n"
        for item in items:
            result += f"- **{item['text']}** ({item['context']}, תדירות: {item['frequency']})\n"
        result += "\n"

    return result


async def draft_section(
    case_number: str,
    section: str,
    instructions: str = "",
) -> str:
    """הרכבת הקשר מלא לניסוח סעיף בהחלטה - כולל עובדות מהמסמכים, תקדימים רלוונטיים ודפוסי סגנון.

    Args:
        case_number: מספר תיק הערר
        section: סוג הסעיף (facts, appellant_claims, respondent_claims, legal_analysis, conclusion, ruling)
        instructions: הנחיות נוספות לניסוח
    """
    case = await db.get_case_by_number(case_number)
    if not case:
        return f"תיק {case_number} לא נמצא."

    case_id = UUID(case["id"])

    # 1. Get relevant chunks from case documents. The section name is
    # mapped to a Hebrew query phrase before embedding; unknown section
    # names are embedded verbatim.
    section_query = {
        "facts": "רקע עובדתי של התיק",
        "appellant_claims": "טענות העוררים",
        "respondent_claims": "טענות המשיבים",
        "legal_analysis": "ניתוח משפטי ודיון",
        "conclusion": "מסקנות",
        "ruling": "החלטה",
    }.get(section, section)

    query_emb = await embeddings.embed_query(section_query)
    case_chunks = await db.search_similar(
        query_embedding=query_emb, limit=10, case_id=case_id
    )

    # 2. Get similar sections from precedents
    precedent_chunks = await db.search_similar(
        query_embedding=query_emb, limit=5, section_type=section
    )
    # Filter out chunks from the same case
    precedent_chunks = [c for c in precedent_chunks if str(c["case_id"]) != case["id"]]

    # 3. Get style patterns
    style_patterns = await db.get_style_patterns()

    # Build the drafting context handed back to the model.
    context = {
        "case": {
            "case_number": case["case_number"],
            "title": case["title"],
            "appellants": case["appellants"],
            "respondents": case["respondents"],
            "subject": case["subject"],
            "property_address": case["property_address"],
        },
        "section": section,
        "instructions": instructions,
        "case_documents": [
            {
                "document": c["document_title"],
                "section_type": c["section_type"],
                "content": c["content"],
            }
            for c in case_chunks
        ],
        "precedents": [
            {
                "case_number": c["case_number"],
                "document": c["document_title"],
                # Truncated: precedents are illustrative, not exhaustive.
                "content": c["content"][:500],
            }
            for c in precedent_chunks[:3]
        ],
        "style_patterns": [
            {
                "type": p["pattern_type"],
                "text": p["pattern_text"],
            }
            for p in style_patterns[:15]
        ],
    }

    return json.dumps(context, ensure_ascii=False, indent=2)


async def get_decision_template(case_number: str) -> str:
    """קבלת תבנית מבנית להחלטה מלאה עם פרטי התיק.

    Args:
        case_number: מספר תיק הערר
    """
    from datetime import date

    case = await db.get_case_by_number(case_number)
    if not case:
        return f"תיק {case_number} לא נמצא."

    # `or`-guards: list/str fields may come back from the DB as None;
    # ", ".join(None) would raise TypeError and str.format would render
    # a literal "None" into the decision header.
    template = DECISION_TEMPLATE.format(
        case_number=case["case_number"],
        subject=case.get("subject") or "",
        appellants=", ".join(case.get("appellants") or []),
        respondents=", ".join(case.get("respondents") or []),
        property_address=case.get("property_address") or "",
        date=date.today().strftime("%d.%m.%Y"),
    )

    return template
"""MCP tools for RAG search over legal documents and decisions."""

from __future__ import annotations

import json
from uuid import UUID

from legal_mcp.services import db, embeddings


async def search_decisions(
    query: str,
    limit: int = 10,
    section_type: str = "",
) -> str:
    """חיפוש סמנטי בהחלטות קודמות ובמסמכים.

    Args:
        query: שאילתת חיפוש בעברית (לדוגמה: "שימוש חורג למסחר באזור מגורים")
        limit: מספר תוצאות מקסימלי
        section_type: סינון לפי סוג סעיף (facts, legal_analysis, conclusion, ruling, וכו'). ריק = הכל
    """
    # Embed the query once, then run a cosine-similarity search.
    vector = await embeddings.embed_query(query)
    hits = await db.search_similar(
        query_embedding=vector,
        limit=limit,
        section_type=section_type or None,
    )

    if not hits:
        return "לא נמצאו תוצאות."

    payload = [
        {
            "score": round(float(hit["score"]), 4),
            "case_number": hit["case_number"],
            "document": hit["document_title"],
            "section": hit["section_type"],
            "page": hit["page_number"],
            "content": hit["content"],
        }
        for hit in hits
    ]
    return json.dumps(payload, ensure_ascii=False, indent=2)


async def search_case_documents(
    case_number: str,
    query: str,
    limit: int = 10,
) -> str:
    """חיפוש סמנטי בתוך מסמכי תיק ספציפי.

    Args:
        case_number: מספר תיק הערר
        query: שאילתת חיפוש
        limit: מספר תוצאות מקסימלי
    """
    case = await db.get_case_by_number(case_number)
    if not case:
        return f"תיק {case_number} לא נמצא."

    vector = await embeddings.embed_query(query)
    hits = await db.search_similar(
        query_embedding=vector,
        limit=limit,
        case_id=UUID(case["id"]),
    )

    if not hits:
        return f"לא נמצאו תוצאות בתיק {case_number}."

    payload = [
        {
            "score": round(float(hit["score"]), 4),
            "document": hit["document_title"],
            "section": hit["section_type"],
            "page": hit["page_number"],
            "content": hit["content"],
        }
        for hit in hits
    ]
    return json.dumps(payload, ensure_ascii=False, indent=2)


async def find_similar_cases(
    description: str,
    limit: int = 5,
) -> str:
    """מציאת תיקים דומים על בסיס תיאור.

    Args:
        description: תיאור התיק או הנושא (לדוגמה: "ערר על סירוב להיתר בנייה לתוספת קומה")
        limit: מספר תוצאות מקסימלי
    """
    vector = await embeddings.embed_query(description)
    # Over-fetch: several chunks may belong to the same case; we then
    # deduplicate down to one best-scoring hit per case.
    hits = await db.search_similar(query_embedding=vector, limit=limit * 3)

    if not hits:
        return "לא נמצאו תיקים דומים."

    best_per_case: dict = {}
    for hit in hits:
        key = hit["case_number"]
        if key not in best_per_case or hit["score"] > best_per_case[key]["score"]:
            best_per_case[key] = hit

    # Rank the surviving cases by score and keep the requested number.
    ranked = sorted(best_per_case.values(), key=lambda h: h["score"], reverse=True)[:limit]

    payload = [
        {
            "score": round(float(hit["score"]), 4),
            "case_number": hit["case_number"],
            "document": hit["document_title"],
            "relevant_section": hit["content"][:500],
        }
        for hit in ranked
    ]
    return json.dumps(payload, ensure_ascii=False, indent=2)
+ + case_id = UUID(case["id"]) + docs = await db.list_documents(case_id) + + # Count chunks per document + pool = await db.get_pool() + async with pool.acquire() as conn: + chunk_counts = await conn.fetch( + "SELECT document_id, COUNT(*) as count FROM document_chunks WHERE case_id = $1 GROUP BY document_id", + case_id, + ) + chunk_map = {str(r["document_id"]): r["count"] for r in chunk_counts} + + doc_status = [] + for doc in docs: + doc_status.append({ + "title": doc["title"], + "type": doc["doc_type"], + "extraction": doc["extraction_status"], + "chunks": chunk_map.get(doc["id"], 0), + "pages": doc.get("page_count"), + }) + + # Check draft status + from pathlib import Path + from legal_mcp import config + + case_dir = config.CASES_DIR / case_number + draft_path = case_dir / "drafts" / "decision.md" + has_draft = draft_path.exists() + draft_size = draft_path.stat().st_size if has_draft else 0 + + status = { + "case_number": case["case_number"], + "title": case["title"], + "status": case["status"], + "documents": doc_status, + "total_documents": len(docs), + "total_chunks": sum(chunk_map.values()), + "has_draft": has_draft, + "draft_size_bytes": draft_size, + "next_steps": _suggest_next_steps(case, docs, has_draft), + } + + return json.dumps(status, ensure_ascii=False, indent=2) + + +def _suggest_next_steps(case: dict, docs: list, has_draft: bool) -> list[str]: + """Suggest next steps based on case state.""" + steps = [] + doc_types = {d["doc_type"] for d in docs} + + if not docs: + steps.append("העלה מסמכים לתיק (כתב ערר, תשובת ועדה)") + else: + if "appeal" not in doc_types: + steps.append("העלה כתב ערר") + if "response" not in doc_types: + steps.append("העלה תשובת ועדה/משיבים") + + pending = [d for d in docs if d["extraction_status"] == "pending"] + if pending: + steps.append(f"עיבוד {len(pending)} מסמכים ממתינים") + + if docs and not has_draft: + steps.append("התחל ניסוח טיוטת החלטה (/draft-decision)") + elif has_draft and case["status"] in ("new", 
"in_progress"): + steps.append("סקור ועדכן את הטיוטה") + steps.append("עדכן סטטוס ל-drafted") + + if case["status"] == "drafted": + steps.append("סקירה סופית ועדכון סטטוס ל-reviewed") + elif case["status"] == "reviewed": + steps.append("אישור סופי ועדכון סטטוס ל-final") + + return steps + + +async def processing_status() -> str: + """סטטוס כללי - מספר תיקים, מסמכים ממתינים לעיבוד.""" + pool = await db.get_pool() + async with pool.acquire() as conn: + case_count = await conn.fetchval("SELECT COUNT(*) FROM cases") + doc_count = await conn.fetchval("SELECT COUNT(*) FROM documents") + pending_count = await conn.fetchval( + "SELECT COUNT(*) FROM documents WHERE extraction_status = 'pending'" + ) + chunk_count = await conn.fetchval("SELECT COUNT(*) FROM document_chunks") + corpus_count = await conn.fetchval("SELECT COUNT(*) FROM style_corpus") + pattern_count = await conn.fetchval("SELECT COUNT(*) FROM style_patterns") + + return json.dumps({ + "cases": case_count, + "documents": doc_count, + "pending_processing": pending_count, + "chunks": chunk_count, + "style_corpus_entries": corpus_count, + "style_patterns": pattern_count, + }, ensure_ascii=False, indent=2) diff --git a/web/app.py b/web/app.py new file mode 100644 index 0000000..02a8e57 --- /dev/null +++ b/web/app.py @@ -0,0 +1,342 @@ +"""Ezer Mishpati — Web upload interface for legal documents.""" + +from __future__ import annotations + +import asyncio +import json +import logging +import re +import shutil +import subprocess +import sys +import time +from contextlib import asynccontextmanager +from pathlib import Path +from uuid import UUID, uuid4 + +# Allow importing legal_mcp from the MCP server source +sys.path.insert(0, str(Path(__file__).resolve().parent.parent / "mcp-server" / "src")) + +from fastapi import FastAPI, File, HTTPException, UploadFile +from fastapi.responses import FileResponse, StreamingResponse +from fastapi.staticfiles import StaticFiles +from pydantic import BaseModel + +from legal_mcp import 
from legal_mcp import config
from legal_mcp.services import chunker, db, embeddings, extractor, processor

logger = logging.getLogger(__name__)

UPLOAD_DIR = config.DATA_DIR / "uploads"
ALLOWED_EXTENSIONS = {".pdf", ".docx", ".rtf", ".txt"}
MAX_FILE_SIZE = 50 * 1024 * 1024  # 50MB

# In-memory progress tracking, keyed by task id. Single-process only:
# progress is lost on restart and not shared between workers.
_progress: dict[str, dict] = {}

# FIX: asyncio.create_task() keeps only a weak reference to the task, so a
# fire-and-forget task can be garbage-collected mid-run (see asyncio docs).
# Hold strong references here until each task completes.
_background_tasks: set = set()


@asynccontextmanager
async def lifespan(app: FastAPI):
    """Create the upload dir and DB schema on startup; close the pool on shutdown."""
    UPLOAD_DIR.mkdir(parents=True, exist_ok=True)
    await db.init_schema()
    yield
    await db.close_pool()


app = FastAPI(title="Ezer Mishpati — Upload", lifespan=lifespan)

STATIC_DIR = Path(__file__).parent / "static"


# ── API Endpoints ────────────────────────────────────────────────── 


@app.get("/")
async def index():
    """Serve the single-page upload UI."""
    return FileResponse(STATIC_DIR / "index.html")


@app.post("/api/upload")
async def upload_file(file: UploadFile = File(...)):
    """Upload a file to the temporary uploads directory.

    Validates extension and size, sanitizes the name (keeping Hebrew letters,
    \u0590-\u05FF), and prefixes a timestamp to avoid collisions.
    """
    if not file.filename:
        raise HTTPException(400, "No filename provided")

    # Validate extension
    ext = Path(file.filename).suffix.lower()
    if ext not in ALLOWED_EXTENSIONS:
        raise HTTPException(400, f"Unsupported file type: {ext}. Allowed: {', '.join(ALLOWED_EXTENSIONS)}")

    # Sanitize filename; fall back to a generic name if nothing survives.
    safe_name = re.sub(r"[^\w\u0590-\u05FF\s.\-()]", "", Path(file.filename).stem)
    if not safe_name:
        safe_name = "document"
    timestamp = int(time.time())
    filename = f"{timestamp}_{safe_name}{ext}"

    # Read fully into memory to enforce the size cap (bounded at 50MB).
    content = await file.read()
    if len(content) > MAX_FILE_SIZE:
        raise HTTPException(400, f"File too large. Max: {MAX_FILE_SIZE // (1024*1024)}MB")

    dest = UPLOAD_DIR / filename
    dest.write_bytes(content)

    return {
        "filename": filename,
        "original_name": file.filename,
        "size": len(content),
    }


@app.get("/api/uploads")
async def list_uploads():
    """List files in the uploads (pending) directory, newest first."""
    if not UPLOAD_DIR.exists():
        return []
    files = []
    for f in sorted(UPLOAD_DIR.iterdir(), key=lambda p: p.stat().st_mtime, reverse=True):
        if f.is_file() and f.suffix.lower() in ALLOWED_EXTENSIONS:
            stat = f.stat()
            files.append({
                "filename": f.name,
                "size": stat.st_size,
                "uploaded_at": stat.st_mtime,
            })
    return files


# FIX: the path parameter was garbled to "(unknown)" in the dump; the
# decorator must declare {filename} for the `filename: str` argument to bind.
@app.delete("/api/uploads/{filename}")
async def delete_upload(filename: str):
    """Remove a file from the uploads directory.

    The samefile() check on the parent rejects path traversal (e.g. "../x")
    by requiring the resolved parent to be the uploads directory itself.
    """
    path = UPLOAD_DIR / filename
    if not path.exists() or not path.parent.samefile(UPLOAD_DIR):
        raise HTTPException(404, "File not found")
    path.unlink()
    return {"deleted": filename}


class ClassifyRequest(BaseModel):
    """Payload for /api/classify: routes a pending upload to its pipeline."""

    filename: str
    category: str  # "training" or "case"
    # For case documents
    case_number: str = ""
    doc_type: str = "appeal"
    title: str = ""
    # For training documents
    decision_number: str = ""
    decision_date: str = ""
    subject_categories: list[str] = []  # pydantic copies mutable defaults per-instance


@app.post("/api/classify")
async def classify_file(req: ClassifyRequest):
    """Classify a pending file and start background processing."""
    source = UPLOAD_DIR / req.filename
    if not source.exists() or not source.parent.samefile(UPLOAD_DIR):
        raise HTTPException(404, "File not found in uploads")

    if req.category not in ("training", "case"):
        raise HTTPException(400, "Category must be 'training' or 'case'")

    if req.category == "case" and not req.case_number:
        raise HTTPException(400, "case_number required for case documents")

    task_id = str(uuid4())
    _progress[task_id] = {"status": "queued", "filename": req.filename}

    # FIX: keep a strong reference so the task is not GC'd before it finishes.
    task = asyncio.create_task(_process_file(task_id, source, req))
    _background_tasks.add(task)
    task.add_done_callback(_background_tasks.discard)

    return {"task_id": task_id}


@app.get("/api/progress/{task_id}")
async def progress_stream(task_id: str):
    """SSE stream of processing progress for a single task."""
    if task_id not in _progress:
        raise HTTPException(404, "Task not found")

    async def event_stream():
        while True:
            data = _progress.get(task_id, {})
            yield f"data: {json.dumps(data, ensure_ascii=False)}\n\n"
            if data.get("status") in ("completed", "failed"):
                break
            await asyncio.sleep(1)
        # Keep the final state around briefly so late pollers can still read it.
        await asyncio.sleep(30)
        _progress.pop(task_id, None)

    return StreamingResponse(event_stream(), media_type="text/event-stream")


@app.get("/api/cases")
async def list_cases():
    """List existing cases for the dropdown."""
    cases = await db.list_cases()
    return [
        {
            "case_number": c["case_number"],
            "title": c["title"],
            "status": c["status"],
        }
        for c in cases
    ]


# ── Background Processing ───────────────────────────────────────── 


async def _process_file(task_id: str, source: Path, req: ClassifyRequest):
    """Process a classified file in the background; failures land in _progress."""
    try:
        if req.category == "case":
            await _process_case_document(task_id, source, req)
        else:
            await _process_training_document(task_id, source, req)
    except Exception as e:
        logger.exception("Processing failed for %s", req.filename)
        _progress[task_id] = {"status": "failed", "error": str(e), "filename": req.filename}


async def _process_case_document(task_id: str, source: Path, req: ClassifyRequest):
    """Process a case document (mirrors documents.document_upload logic)."""
    _progress[task_id] = {"status": "validating", "filename": req.filename}

    case = await db.get_case_by_number(req.case_number)
    if not case:
        _progress[task_id] = {"status": "failed", "error": f"Case {req.case_number} not found"}
        return

    case_id = UUID(case["id"])
    title = req.title or source.stem.split("_", 1)[-1]  # Remove timestamp prefix

    # Copy to case directory
    _progress[task_id] = {"status": "copying", "filename": req.filename}
    case_dir = config.CASES_DIR / req.case_number / "documents"
    case_dir.mkdir(parents=True, exist_ok=True)
    # Use original name without timestamp prefix
    original_name = re.sub(r"^\d+_", "", source.name)
    dest = case_dir / original_name
    shutil.copy2(str(source), str(dest))

    # Create document record
    _progress[task_id] = {"status": "registering", "filename": req.filename}
    doc = await db.create_document(
        case_id=case_id,
        doc_type=req.doc_type,
        title=title,
        file_path=str(dest),
    )

    # Process (extract → chunk → embed → store)
    _progress[task_id] = {"status": "processing", "filename": req.filename, "step": "extracting"}
    result = await processor.process_document(UUID(doc["id"]), case_id)

    # Best-effort git commit of the case directory; failures are ignored.
    repo_dir = config.CASES_DIR / req.case_number
    if repo_dir.exists():
        subprocess.run(["git", "add", "."], cwd=repo_dir, capture_output=True)
        doc_type_hebrew = {
            "appeal": "כתב ערר", "response": "תשובה", "decision": "החלטה",
            "reference": "מסמך עזר", "exhibit": "נספח",
        }.get(req.doc_type, req.doc_type)
        # NOTE(review): env replaces the inherited environment entirely; git may
        # need HOME (for global config) in some deployments — confirm.
        subprocess.run(
            ["git", "commit", "-m", f"הוספת {doc_type_hebrew}: {title}"],
            cwd=repo_dir, capture_output=True,
            env={"GIT_AUTHOR_NAME": "Ezer Mishpati", "GIT_AUTHOR_EMAIL": "legal@local",
                 "GIT_COMMITTER_NAME": "Ezer Mishpati", "GIT_COMMITTER_EMAIL": "legal@local",
                 "PATH": "/usr/bin:/bin"},
        )

    # Remove from uploads
    source.unlink(missing_ok=True)

    _progress[task_id] = {
        "status": "completed",
        "filename": req.filename,
        "result": result,
        "case_number": req.case_number,
        "doc_type": req.doc_type,
    }


async def _process_training_document(task_id: str, source: Path, req: ClassifyRequest):
    """Process a training document (mirrors documents.document_upload_training logic)."""
    from datetime import date as date_type

    title = req.title or source.stem.split("_", 1)[-1]

    # Copy to training directory
    _progress[task_id] = {"status": "copying", "filename": req.filename}
    config.TRAINING_DIR.mkdir(parents=True, exist_ok=True)
    original_name = re.sub(r"^\d+_", "", source.name)
    dest = config.TRAINING_DIR / original_name
    shutil.copy2(str(source), str(dest))

    # Extract text
    _progress[task_id] = {"status": "processing", "filename": req.filename, "step": "extracting"}
    text, page_count = await extractor.extract_text(str(dest))

    # Parse date (ISO format expected from the form)
    d_date = None
    if req.decision_date:
        d_date = date_type.fromisoformat(req.decision_date)

    # Add to style corpus
    _progress[task_id] = {"status": "processing", "filename": req.filename, "step": "corpus"}
    corpus_id = await db.add_to_style_corpus(
        document_id=None,
        decision_number=req.decision_number,
        decision_date=d_date,
        subject_categories=req.subject_categories,
        full_text=text,
    )

    # Chunk and embed
    _progress[task_id] = {"status": "processing", "filename": req.filename, "step": "chunking"}
    chunks = chunker.chunk_document(text)

    chunk_count = 0
    if chunks:
        # Corpus documents are stored caseless (case_id=None).
        doc = await db.create_document(
            case_id=None,
            doc_type="decision",
            title=f"[קורפוס] {title}",
            file_path=str(dest),
            page_count=page_count,
        )
        doc_id = UUID(doc["id"])
        await db.update_document(doc_id, extracted_text=text, extraction_status="completed")

        _progress[task_id] = {"status": "processing", "filename": req.filename, "step": "embedding"}
        texts = [c.content for c in chunks]
        embs = await embeddings.embed_texts(texts, input_type="document")

        chunk_dicts = [
            {
                "content": c.content,
                "section_type": c.section_type,
                "embedding": emb,
                "page_number": c.page_number,
                "chunk_index": c.chunk_index,
            }
            for c, emb in zip(chunks, embs)
        ]
        await db.store_chunks(doc_id, None, chunk_dicts)
        chunk_count = len(chunks)

    # Remove from uploads
    source.unlink(missing_ok=True)

    _progress[task_id] = {
        "status": "completed",
        "filename": req.filename,
        "result": {
            "corpus_id": str(corpus_id),
            "title": title,
            "pages": page_count,
            "text_length": len(text),
            "chunks": chunk_count,
        },
    }
+
+

עוזר משפטי — העלאת מסמכים

+

העלאה, סיווג ועיבוד מסמכים משפטיים

+
+ + +
+

גרור קבצים לכאן או לחץ לבחירה

+

PDF, DOCX, RTF, TXT — עד 50MB

+ +
+
+
+
+ + + + + + +
+ +
+ + + +