Initial commit: MCP server + web upload interface

Ezer Mishpati is an AI system for drafting legal decisions. This commit adds: an MCP server (FastMCP) with a document-processing pipeline; a web upload interface (FastAPI) for file upload and classification; pgvector-based semantic search; and chunking and embedding of Hebrew legal documents.
2026-03-23 12:33:07 +00:00
commit 6f515dc2cb
33 changed files with 3297 additions and 0 deletions
--- a/mcp-server/src/legal_mcp/tools/documents.py
+++ b/mcp-server/src/legal_mcp/tools/documents.py
@@ -0,0 +1,218 @@
+"""MCP tools for document management and processing."""
+
+from __future__ import annotations
+
+import json
+import shutil
+import subprocess
+from pathlib import Path
+from uuid import UUID
+
+from legal_mcp import config
+from legal_mcp.services import db, processor
+
+
+async def document_upload(
+    case_number: str,
+    file_path: str,
+    doc_type: str = "appeal",
+    title: str = "",
+) -> str:
+    """Upload a document into an appeal case and run the processing pipeline.
+
+    The file is copied into the case's ``documents`` directory, a document
+    record is created, ``processor.process_document`` is run on it, and the
+    case directory is then committed to git without checking the outcome.
+
+    Args:
+        case_number: Number of the appeal case the document belongs to.
+        file_path: Full path to the source file (PDF, DOCX, RTF, TXT).
+        doc_type: Document type: appeal (statement of appeal), response,
+            decision, reference (supporting document) or exhibit.
+        title: Document title; when empty, the file name without its
+            suffix is used.
+
+    Returns:
+        A JSON string holding the created document record and the
+        processing result, or a plain Hebrew error message when the case
+        or the file cannot be found.
+    """
+    case = await db.get_case_by_number(case_number)
+    if not case:
+        return f"תיק {case_number} לא נמצא."
+
+    source = Path(file_path)
+    # is_file() also rejects directories, which copy2 cannot copy.
+    if not source.is_file():
+        return f"קובץ לא נמצא: {file_path}"
+
+    case_id = UUID(case["id"])
+    if not title:
+        title = source.stem
+
+    # Copy file to case directory. Skip the copy when the file already lives
+    # there: shutil.copy2 raises SameFileError if source and dest coincide.
+    case_dir = config.CASES_DIR / case_number / "documents"
+    case_dir.mkdir(parents=True, exist_ok=True)
+    dest = case_dir / source.name
+    if source.resolve() != dest.resolve():
+        shutil.copy2(str(source), str(dest))
+
+    # Create document record
+    doc = await db.create_document(
+        case_id=case_id,
+        doc_type=doc_type,
+        title=title,
+        file_path=str(dest),
+    )
+
+    # Process document (extract → chunk → embed → store)
+    result = await processor.process_document(UUID(doc["id"]), case_id)
+
+    # Git commit. Return codes are not inspected, so a failed add/commit
+    # does not fail the upload.
+    # NOTE(review): repo_dir always exists here because the mkdir above
+    # creates it as a parent; if the intent is "only when the case is a git
+    # repository", this should probably test for repo_dir / ".git" - confirm.
+    repo_dir = config.CASES_DIR / case_number
+    if repo_dir.exists():
+        subprocess.run(["git", "add", "."], cwd=repo_dir, capture_output=True)
+        # Hebrew label for the commit message; unknown types are used as-is.
+        doc_type_hebrew = {
+            "appeal": "כתב ערר",
+            "response": "תשובה",
+            "decision": "החלטה",
+            "reference": "מסמך עזר",
+            "exhibit": "נספח",
+        }.get(doc_type, doc_type)
+        # The commit runs with only these variables in its environment, so
+        # the author/committer identity is fixed by this call.
+        subprocess.run(
+            ["git", "commit", "-m", f"הוספת {doc_type_hebrew}: {title}"],
+            cwd=repo_dir,
+            capture_output=True,
+            env={"GIT_AUTHOR_NAME": "Ezer Mishpati", "GIT_AUTHOR_EMAIL": "legal@local",
+                 "GIT_COMMITTER_NAME": "Ezer Mishpati", "GIT_COMMITTER_EMAIL": "legal@local",
+                 "PATH": "/usr/bin:/bin"},
+        )
+
+    return json.dumps({
+        "document": doc,
+        "processing": result,
+    }, default=str, ensure_ascii=False, indent=2)
+
+
+async def document_upload_training(
+    file_path: str,
+    decision_number: str = "",
+    decision_date: str = "",
+    subject_categories: list[str] | None = None,
+    title: str = "",
+) -> str:
+    """Upload a prior decision by Dafna into the style (training) corpus.
+
+    The file is copied into the training directory, its text is extracted
+    and added to the style corpus, and the text is then chunked and embedded
+    under a case-less document record so it can be searched.
+
+    Args:
+        file_path: Full path to the decision file.
+        decision_number: Number of the decision.
+        decision_date: Date of the decision in ISO format (YYYY-MM-DD);
+            empty means no date.
+        subject_categories: Zero or more subject categories. The values
+            suggested for this tool are: בנייה (construction), שימוש חורג
+            (non-conforming use), תכנית (plan), היתר (permit), הקלה
+            (relaxation), חלוקה (subdivision), תמ"א 38 (TAMA 38),
+            היטל השבחה (betterment levy), פיצויים 197 (section 197
+            compensation).
+        title: Document title; when empty, the file name without its
+            suffix is used.
+
+    Returns:
+        A JSON string with the corpus entry id, title, page count, text
+        length and number of chunks, or a plain Hebrew error message when
+        the file is missing or the date cannot be parsed.
+    """
+    from datetime import date as date_type
+
+    from legal_mcp.services import extractor, embeddings, chunker
+
+    source = Path(file_path)
+    # is_file() also rejects directories, which copy2 cannot copy.
+    if not source.is_file():
+        return f"קובץ לא נמצא: {file_path}"
+
+    # Parse the date before touching disk or the database, so a malformed
+    # value is reported like the other input errors instead of raising
+    # after the file has already been copied and extracted.
+    d_date = None
+    if decision_date:
+        try:
+            d_date = date_type.fromisoformat(decision_date)
+        except ValueError:
+            return f"תאריך לא תקין: {decision_date} (נדרש פורמט YYYY-MM-DD)"
+
+    if not title:
+        title = source.stem
+
+    # Copy to training directory (skip if already there)
+    config.TRAINING_DIR.mkdir(parents=True, exist_ok=True)
+    dest = config.TRAINING_DIR / source.name
+    if source.resolve() != dest.resolve():
+        shutil.copy2(str(source), str(dest))
+
+    # Extract text
+    text, page_count = await extractor.extract_text(str(dest))
+
+    # Add to style corpus
+    # NOTE(review): the corpus entry is stored with document_id=None even
+    # though a document record may be created below - confirm this is
+    # intentional.
+    corpus_id = await db.add_to_style_corpus(
+        document_id=None,
+        decision_number=decision_number,
+        decision_date=d_date,
+        subject_categories=subject_categories or [],
+        full_text=text,
+    )
+
+    # Chunk and embed for RAG search over training corpus
+    chunks = chunker.chunk_document(text)
+    if chunks:
+        # Create a document record (no case association)
+        doc = await db.create_document(
+            case_id=None,
+            doc_type="decision",
+            title=f"[קורפוס] {title}",
+            file_path=str(dest),
+            page_count=page_count,
+        )
+        doc_id = UUID(doc["id"])
+        await db.update_document(doc_id, extracted_text=text, extraction_status="completed")
+
+        # Generate embeddings and store chunks
+        texts = [c.content for c in chunks]
+        embs = await embeddings.embed_texts(texts, input_type="document")
+        chunk_dicts = [
+            {
+                "content": c.content,
+                "section_type": c.section_type,
+                "embedding": emb,
+                "page_number": c.page_number,
+                "chunk_index": c.chunk_index,
+            }
+            for c, emb in zip(chunks, embs)
+        ]
+        await db.store_chunks(doc_id, None, chunk_dicts)
+
+    return json.dumps({
+        "corpus_id": str(corpus_id),
+        "title": title,
+        "pages": page_count,
+        "text_length": len(text),
+        "chunks": len(chunks) if chunks else 0,
+    }, default=str, ensure_ascii=False, indent=2)
+
+
+async def document_get_text(case_number: str, doc_title: str = "") -> str:
+    """Return the extracted text of documents in an appeal case.
+
+    Each document's text is capped at 10,000 characters; the ``truncated``
+    flag on an entry tells the caller when the stored text was longer.
+
+    Args:
+        case_number: Number of the appeal case.
+        doc_title: Case-insensitive substring of the document title to
+            match; when empty, all documents of the case are returned.
+
+    Returns:
+        A JSON array with one object per document (``title``, ``doc_type``,
+        ``text``, ``truncated``), or a plain Hebrew message when the case
+        is not found, has no documents, or no title matches.
+    """
+    max_chars = 10000
+
+    case = await db.get_case_by_number(case_number)
+    if not case:
+        return f"תיק {case_number} לא נמצא."
+
+    docs = await db.list_documents(UUID(case["id"]))
+    if not docs:
+        return f"אין מסמכים בתיק {case_number}."
+
+    if doc_title:
+        needle = doc_title.lower()
+        docs = [d for d in docs if needle in d["title"].lower()]
+        if not docs:
+            return f"מסמך '{doc_title}' לא נמצא בתיק."
+
+    results = []
+    for doc in docs:
+        text = await db.get_document_text(UUID(doc["id"]))
+        results.append({
+            "title": doc["title"],
+            "doc_type": doc["doc_type"],
+            # Placeholder string when the document has no extracted text.
+            "text": text[:max_chars] if text else "(ללא טקסט)",
+            # Make the cut visible instead of silently dropping the tail.
+            "truncated": bool(text) and len(text) > max_chars,
+        })
+
+    return json.dumps(results, ensure_ascii=False, indent=2)
+
+
+async def document_list(case_number: str) -> str:
+    """List the documents attached to an appeal case.
+
+    Args:
+        case_number: Number of the appeal case.
+
+    Returns:
+        A JSON string of the case's document records, or a plain Hebrew
+        message when the case does not exist or has no documents.
+    """
+    case_row = await db.get_case_by_number(case_number)
+    if case_row:
+        documents = await db.list_documents(UUID(case_row["id"]))
+        if documents:
+            # default=str stringifies values json cannot encode natively.
+            return json.dumps(documents, default=str, ensure_ascii=False, indent=2)
+        return f"אין מסמכים בתיק {case_number}."
+    return f"תיק {case_number} לא נמצא."