Initial commit: MCP server + web upload interface
Ezer Mishpati - AI legal decision drafting system with:
- MCP server (FastMCP) with a document processing pipeline
- Web upload interface (FastAPI) for file upload and classification
- pgvector-based semantic search
- Hebrew legal document chunking and embedding
This commit is contained in:
218
mcp-server/src/legal_mcp/tools/documents.py
Normal file
218
mcp-server/src/legal_mcp/tools/documents.py
Normal file
@@ -0,0 +1,218 @@
|
||||
"""MCP tools for document management and processing."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import shutil
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
from uuid import UUID
|
||||
|
||||
from legal_mcp import config
|
||||
from legal_mcp.services import db, processor
|
||||
|
||||
|
||||
def _git_commit_all(repo_dir: Path, message: str) -> None:
    """Stage everything under *repo_dir* and commit it with *message*.

    Best effort: the output and exit status of both git commands are captured
    and ignored, so a failing ``git add`` / ``git commit`` never fails the
    caller.
    """
    subprocess.run(["git", "add", "."], cwd=repo_dir, capture_output=True)
    # The commit runs with a fixed author/committer identity and PATH; no
    # other variable of the server process is passed on to git.
    commit_env = {
        "GIT_AUTHOR_NAME": "Ezer Mishpati",
        "GIT_AUTHOR_EMAIL": "legal@local",
        "GIT_COMMITTER_NAME": "Ezer Mishpati",
        "GIT_COMMITTER_EMAIL": "legal@local",
        "PATH": "/usr/bin:/bin",
    }
    subprocess.run(
        ["git", "commit", "-m", message],
        cwd=repo_dir,
        capture_output=True,
        env=commit_env,
    )


async def document_upload(
    case_number: str,
    file_path: str,
    doc_type: str = "appeal",
    title: str = "",
) -> str:
    """Upload a document into an appeal case and process it.

    The file is copied into ``config.CASES_DIR / case_number / "documents"``,
    a document record is created, the record is handed to
    ``processor.process_document`` (extract -> chunk -> embed -> store) and
    the case directory is committed to git on a best-effort basis.

    Args:
        case_number: Number of the appeal case.
        file_path: Full path to the file (PDF, DOCX, RTF, TXT).
        doc_type: Document type: ``appeal`` (statement of appeal),
            ``response``, ``decision``, ``reference`` (auxiliary document)
            or ``exhibit`` (appendix).
        title: Document title; when empty, the file name without its
            extension is used.

    Returns:
        A JSON string with the keys ``document`` (the created record) and
        ``processing`` (the processor result), or a plain Hebrew error
        message when the case or the file cannot be found.
    """
    case = await db.get_case_by_number(case_number)
    if not case:
        return f"תיק {case_number} לא נמצא."

    source = Path(file_path)
    # is_file() rather than exists(): a directory path would pass exists()
    # and then fail inside shutil.copy2.
    if not source.is_file():
        return f"קובץ לא נמצא: {file_path}"

    case_id = UUID(case["id"])
    if not title:
        title = source.stem

    # Copy the file into the case's documents directory.
    case_dir = config.CASES_DIR / case_number / "documents"
    case_dir.mkdir(parents=True, exist_ok=True)
    dest = case_dir / source.name
    # Skip the copy when the file already lives at its destination;
    # shutil.copy2 raises SameFileError when source and dest are the same.
    if source.resolve() != dest.resolve():
        shutil.copy2(str(source), str(dest))

    # Create the document record.
    doc = await db.create_document(
        case_id=case_id,
        doc_type=doc_type,
        title=title,
        file_path=str(dest),
    )

    # Process the document (extract -> chunk -> embed -> store).
    result = await processor.process_document(UUID(doc["id"]), case_id)

    # Record the new document in the case's git history.
    # NOTE(review): repo_dir always exists here because the documents
    # directory was just created beneath it - confirm whether a check for an
    # actual git repository was intended.
    repo_dir = config.CASES_DIR / case_number
    if repo_dir.exists():
        doc_type_hebrew = {
            "appeal": "כתב ערר",
            "response": "תשובה",
            "decision": "החלטה",
            "reference": "מסמך עזר",
            "exhibit": "נספח",
        }.get(doc_type, doc_type)
        _git_commit_all(repo_dir, f"הוספת {doc_type_hebrew}: {title}")

    return json.dumps(
        {
            "document": doc,
            "processing": result,
        },
        default=str,
        ensure_ascii=False,
        indent=2,
    )
|
||||
|
||||
|
||||
async def document_upload_training(
|
||||
file_path: str,
|
||||
decision_number: str = "",
|
||||
decision_date: str = "",
|
||||
subject_categories: list[str] | None = None,
|
||||
title: str = "",
|
||||
) -> str:
|
||||
"""העלאת החלטה קודמת של דפנה לקורפוס הסגנון (training).
|
||||
|
||||
Args:
|
||||
file_path: נתיב מלא לקובץ ההחלטה
|
||||
decision_number: מספר ההחלטה
|
||||
decision_date: תאריך ההחלטה (YYYY-MM-DD)
|
||||
subject_categories: קטגוריות - אפשר לבחור כמה (בנייה, שימוש חורג, תכנית, היתר, הקלה, חלוקה, תמ"א 38, היטל השבחה, פיצויים 197)
|
||||
title: שם המסמך
|
||||
"""
|
||||
from datetime import date as date_type
|
||||
|
||||
from legal_mcp.services import extractor, embeddings, chunker
|
||||
|
||||
source = Path(file_path)
|
||||
if not source.exists():
|
||||
return f"קובץ לא נמצא: {file_path}"
|
||||
|
||||
if not title:
|
||||
title = source.stem
|
||||
|
||||
# Copy to training directory (skip if already there)
|
||||
config.TRAINING_DIR.mkdir(parents=True, exist_ok=True)
|
||||
dest = config.TRAINING_DIR / source.name
|
||||
if source.resolve() != dest.resolve():
|
||||
shutil.copy2(str(source), str(dest))
|
||||
|
||||
# Extract text
|
||||
text, page_count = await extractor.extract_text(str(dest))
|
||||
|
||||
# Parse date
|
||||
d_date = None
|
||||
if decision_date:
|
||||
d_date = date_type.fromisoformat(decision_date)
|
||||
|
||||
# Add to style corpus
|
||||
corpus_id = await db.add_to_style_corpus(
|
||||
document_id=None,
|
||||
decision_number=decision_number,
|
||||
decision_date=d_date,
|
||||
subject_categories=subject_categories or [],
|
||||
full_text=text,
|
||||
)
|
||||
|
||||
# Chunk and embed for RAG search over training corpus
|
||||
chunks = chunker.chunk_document(text)
|
||||
if chunks:
|
||||
# Create a document record (no case association)
|
||||
doc = await db.create_document(
|
||||
case_id=None,
|
||||
doc_type="decision",
|
||||
title=f"[קורפוס] {title}",
|
||||
file_path=str(dest),
|
||||
page_count=page_count,
|
||||
)
|
||||
doc_id = UUID(doc["id"])
|
||||
await db.update_document(doc_id, extracted_text=text, extraction_status="completed")
|
||||
|
||||
# Generate embeddings and store chunks
|
||||
texts = [c.content for c in chunks]
|
||||
embs = await embeddings.embed_texts(texts, input_type="document")
|
||||
chunk_dicts = [
|
||||
{
|
||||
"content": c.content,
|
||||
"section_type": c.section_type,
|
||||
"embedding": emb,
|
||||
"page_number": c.page_number,
|
||||
"chunk_index": c.chunk_index,
|
||||
}
|
||||
for c, emb in zip(chunks, embs)
|
||||
]
|
||||
await db.store_chunks(doc_id, None, chunk_dicts)
|
||||
|
||||
return json.dumps({
|
||||
"corpus_id": str(corpus_id),
|
||||
"title": title,
|
||||
"pages": page_count,
|
||||
"text_length": len(text),
|
||||
"chunks": len(chunks) if chunks else 0,
|
||||
}, default=str, ensure_ascii=False, indent=2)
|
||||
|
||||
|
||||
async def document_get_text(case_number: str, doc_title: str = "") -> str:
    """Return the text of documents belonging to a case.

    Args:
        case_number: Number of the appeal case.
        doc_title: Title to look for (case-insensitive substring match);
            when empty, every document of the case is returned.

    Returns:
        A JSON array of objects with the keys ``title``, ``doc_type`` and
        ``text`` (cut to the first 10000 characters, or a placeholder when
        the document has no text), or a plain Hebrew message when the case,
        its documents or the requested title cannot be found.
    """
    case_row = await db.get_case_by_number(case_number)
    if not case_row:
        return f"תיק {case_number} לא נמצא."

    documents = await db.list_documents(UUID(case_row["id"]))
    if not documents:
        return f"אין מסמכים בתיק {case_number}."

    if doc_title:
        # Lower-case the search term once instead of once per document.
        needle = doc_title.lower()
        documents = [entry for entry in documents if needle in entry["title"].lower()]
        if not documents:
            return f"מסמך '{doc_title}' לא נמצא בתיק."

    payload = []
    for entry in documents:
        body = await db.get_document_text(UUID(entry["id"]))
        if body:
            shown = body[:10000]
        else:
            shown = "(ללא טקסט)"
        payload.append(
            {
                "title": entry["title"],
                "doc_type": entry["doc_type"],
                "text": shown,
            }
        )

    return json.dumps(payload, ensure_ascii=False, indent=2)
|
||||
|
||||
|
||||
async def document_list(case_number: str) -> str:
    """List the documents of a case.

    Args:
        case_number: Number of the appeal case.

    Returns:
        The document records serialised as a JSON string (values that JSON
        cannot encode are converted with ``str``), or a plain Hebrew message
        when the case does not exist or has no documents.
    """
    case_row = await db.get_case_by_number(case_number)
    if case_row:
        documents = await db.list_documents(UUID(case_row["id"]))
        if documents:
            return json.dumps(documents, default=str, ensure_ascii=False, indent=2)
        return f"אין מסמכים בתיק {case_number}."
    return f"תיק {case_number} לא נמצא."
|
||||
Reference in New Issue
Block a user