Initial commit: MCP server + web upload interface

Ezer Mishpati - AI legal decision drafting system with:
- MCP server (FastMCP) with document processing pipeline
- Web upload interface (FastAPI) for file upload and classification
- pgvector-based semantic search
- Hebrew legal document chunking and embedding
This commit is contained in:
2026-03-23 12:33:07 +00:00
commit 6f515dc2cb
33 changed files with 3297 additions and 0 deletions

View File

@@ -0,0 +1,218 @@
"""MCP tools for document management and processing."""
from __future__ import annotations
import json
import shutil
import subprocess
from pathlib import Path
from uuid import UUID
from legal_mcp import config
from legal_mcp.services import db, processor
async def document_upload(
    case_number: str,
    file_path: str,
    doc_type: str = "appeal",
    title: str = "",
) -> str:
    """Upload a document into an appeal case and run the processing pipeline.

    The file is copied into the case's ``documents`` directory, a document
    record is created, ``processor.process_document`` is invoked for it, and
    the case directory is committed to git on a best-effort basis.

    Args:
        case_number: Appeal case number.
        file_path: Full path to the file (PDF, DOCX, RTF, TXT).
        doc_type: Document type: ``appeal`` (statement of appeal),
            ``response``, ``decision``, ``reference`` (supporting document)
            or ``exhibit``.
        title: Document title; when empty, the file name without its
            extension is used.

    Returns:
        A JSON string with the created document record (``document``) and the
        processing result (``processing``), or a plain Hebrew message when
        the case or the file cannot be found.
    """
    case = await db.get_case_by_number(case_number)
    if not case:
        return f"תיק {case_number} לא נמצא."

    source = Path(file_path)
    # is_file() instead of exists(): a directory path passes exists() and
    # would only fail later inside shutil.copy2.
    if not source.is_file():
        return f"קובץ לא נמצא: {file_path}"

    case_id = UUID(case["id"])
    if not title:
        title = source.stem

    # Copy file to case directory
    case_dir = config.CASES_DIR / case_number / "documents"
    case_dir.mkdir(parents=True, exist_ok=True)
    dest = case_dir / source.name
    # Skip the copy when the file already lives at the destination;
    # shutil.copy2 raises SameFileError if both paths are the same file.
    if source.resolve() != dest.resolve():
        shutil.copy2(str(source), str(dest))

    # Create document record
    doc = await db.create_document(
        case_id=case_id,
        doc_type=doc_type,
        title=title,
        file_path=str(dest),
    )

    # Process document (extract -> chunk -> embed -> store)
    result = await processor.process_document(UUID(doc["id"]), case_id)

    # Git commit (best effort: output is captured and failures are ignored).
    # NOTE: repo_dir is a parent of case_dir, which was created above, so
    # this existence check always passes at this point.
    repo_dir = config.CASES_DIR / case_number
    if repo_dir.exists():
        subprocess.run(["git", "add", "."], cwd=repo_dir, capture_output=True)
        # Hebrew label for the commit message; unknown types fall back to
        # the raw doc_type value.
        doc_type_hebrew = {
            "appeal": "כתב ערר",
            "response": "תשובה",
            "decision": "החלטה",
            "reference": "מסמך עזר",
            "exhibit": "נספח",
        }.get(doc_type, doc_type)
        subprocess.run(
            ["git", "commit", "-m", f"הוספת {doc_type_hebrew}: {title}"],
            cwd=repo_dir,
            capture_output=True,
            # Passing env replaces the whole environment, so the commit
            # identity and PATH are fixed here.
            env={
                "GIT_AUTHOR_NAME": "Ezer Mishpati",
                "GIT_AUTHOR_EMAIL": "legal@local",
                "GIT_COMMITTER_NAME": "Ezer Mishpati",
                "GIT_COMMITTER_EMAIL": "legal@local",
                "PATH": "/usr/bin:/bin",
            },
        )

    return json.dumps({
        "document": doc,
        "processing": result,
    }, default=str, ensure_ascii=False, indent=2)
async def document_upload_training(
    file_path: str,
    decision_number: str = "",
    decision_date: str = "",
    subject_categories: list[str] | None = None,
    title: str = "",
) -> str:
    """Upload a previous decision by Dafna into the style corpus (training).

    The file is copied into the training directory, its text is extracted and
    added to the style corpus, and, when chunking yields any chunks, a
    case-less document record is created and the embedded chunks are stored
    for search over the training corpus.

    Args:
        file_path: Full path to the decision file.
        decision_number: Decision number.
        decision_date: Decision date in ``YYYY-MM-DD`` form; empty means no
            date.
        subject_categories: One or more subject categories (e.g.
            construction, non-conforming use, plan, permit, variance,
            subdivision, TAMA 38, betterment levy, section 197
            compensation).
        title: Document title; when empty, the file name without its
            extension is used.

    Returns:
        A JSON string with ``corpus_id``, ``title``, ``pages``,
        ``text_length`` and ``chunks``, or a plain Hebrew message when the
        file cannot be found or the date is malformed.
    """
    from datetime import date as date_type
    from legal_mcp.services import extractor, embeddings, chunker

    source = Path(file_path)
    # is_file() instead of exists(): a directory path passes exists() and
    # would only fail later inside shutil.copy2.
    if not source.is_file():
        return f"קובץ לא נמצא: {file_path}"

    # Parse the date before any copy or database write, so a malformed value
    # is reported like the other input errors instead of raising after the
    # file has already been copied and extracted.
    d_date = None
    if decision_date:
        try:
            d_date = date_type.fromisoformat(decision_date)
        except ValueError:
            return f"תאריך לא תקין: {decision_date} (נדרש YYYY-MM-DD)"

    if not title:
        title = source.stem

    # Copy to training directory (skip if already there)
    config.TRAINING_DIR.mkdir(parents=True, exist_ok=True)
    dest = config.TRAINING_DIR / source.name
    if source.resolve() != dest.resolve():
        shutil.copy2(str(source), str(dest))

    # Extract text
    text, page_count = await extractor.extract_text(str(dest))

    # Add to style corpus
    corpus_id = await db.add_to_style_corpus(
        document_id=None,
        decision_number=decision_number,
        decision_date=d_date,
        subject_categories=subject_categories or [],
        full_text=text,
    )

    # Chunk and embed for RAG search over training corpus
    chunks = chunker.chunk_document(text)
    if chunks:
        # Create a document record (no case association)
        doc = await db.create_document(
            case_id=None,
            doc_type="decision",
            title=f"[קורפוס] {title}",
            file_path=str(dest),
            page_count=page_count,
        )
        doc_id = UUID(doc["id"])
        await db.update_document(doc_id, extracted_text=text, extraction_status="completed")

        # Generate embeddings and store chunks
        texts = [c.content for c in chunks]
        embs = await embeddings.embed_texts(texts, input_type="document")
        chunk_dicts = [
            {
                "content": c.content,
                "section_type": c.section_type,
                "embedding": emb,
                "page_number": c.page_number,
                "chunk_index": c.chunk_index,
            }
            for c, emb in zip(chunks, embs)
        ]
        await db.store_chunks(doc_id, None, chunk_dicts)

    return json.dumps({
        "corpus_id": str(corpus_id),
        "title": title,
        "pages": page_count,
        "text_length": len(text),
        "chunks": len(chunks) if chunks else 0,
    }, default=str, ensure_ascii=False, indent=2)
async def document_get_text(case_number: str, doc_title: str = "") -> str:
    """Return the stored text of documents in a case as a JSON string.

    Args:
        case_number: Appeal case number.
        doc_title: Title filter, matched as a lowercase substring of each
            document title; when empty, every document in the case is
            returned.

    Returns:
        A JSON array of ``{"title", "doc_type", "text"}`` objects, with each
        text cut to its first 10000 characters, or a plain Hebrew message
        when the case, its documents, or a matching title cannot be found.
    """
    case_row = await db.get_case_by_number(case_number)
    if not case_row:
        return f"תיק {case_number} לא נמצא."

    candidates = await db.list_documents(UUID(case_row["id"]))
    if not candidates:
        return f"אין מסמכים בתיק {case_number}."

    if doc_title:
        needle = doc_title.lower()
        candidates = [
            entry for entry in candidates if needle in entry["title"].lower()
        ]
        if not candidates:
            return f"מסמך '{doc_title}' לא נמצא בתיק."

    payload = []
    for entry in candidates:
        body = await db.get_document_text(UUID(entry["id"]))
        # Documents without text get a Hebrew placeholder instead.
        excerpt = body[:10000] if body else "(ללא טקסט)"
        payload.append({
            "title": entry["title"],
            "doc_type": entry["doc_type"],
            "text": excerpt,
        })

    return json.dumps(payload, ensure_ascii=False, indent=2)
async def document_list(case_number: str) -> str:
    """List the documents that belong to a case.

    Args:
        case_number: Appeal case number.

    Returns:
        A JSON string with the document records returned by
        ``db.list_documents``, or a plain Hebrew message when the case does
        not exist or has no documents.
    """
    case_row = await db.get_case_by_number(case_number)
    if case_row:
        documents = await db.list_documents(UUID(case_row["id"]))
        if documents:
            # default=str stringifies any value json cannot encode natively.
            return json.dumps(documents, default=str, ensure_ascii=False, indent=2)
        return f"אין מסמכים בתיק {case_number}."
    return f"תיק {case_number} לא נמצא."