Initial commit: MCP server + web upload interface

Ezer Mishpati - AI legal decision drafting system with: - MCP server (FastMCP) with document processing pipeline - Web upload interface (FastAPI) for file upload and classification - pgvector-based semantic search - Hebrew legal document chunking and embedding
2026-03-23 12:33:07 +00:00
commit 6f515dc2cb
33 changed files with 3297 additions and 0 deletions
--- a/mcp-server/src/legal_mcp/services/style_analyzer.py
+++ b/mcp-server/src/legal_mcp/services/style_analyzer.py
@@ -0,0 +1,121 @@
+"""Style analyzer - extracts writing patterns from Dafna's decision corpus."""
+
+from __future__ import annotations
+
+import logging
+import re
+
+import anthropic
+
+from legal_mcp import config
+from legal_mcp.services import db
+
+logger = logging.getLogger(__name__)
+
+
+# Hebrew prompt template sent to the model by analyze_corpus(). It asks for six
+# kinds of writing pattern (opening_formula, transition, citation_style,
+# analysis_structure, closing_formula, characteristic_phrase) and requests the
+# answer as a JSON array of objects with "type", "text", "context" and
+# "example" keys.
+# The template is filled with str.format(decisions=...), so "{decisions}" is
+# the only placeholder and the literal braces of the JSON example are doubled
+# ("{{" / "}}") to survive formatting.
+ANALYSIS_PROMPT = """\
+אתה מנתח סגנון כתיבה משפטית. לפניך החלטות משפטיות שנכתבו על ידי אותה יושבת ראש של ועדת ערר.
+
+נתח את ההחלטות וחלץ את דפוסי הכתיבה הבאים:
+
+1. **נוסחאות פתיחה** (opening_formula) - איך מתחילות ההחלטות
+2. **ביטויי מעבר** (transition) - ביטויים שמחברים בין חלקי ההחלטה
+3. **סגנון ציטוט** (citation_style) - איך מצטטים חקיקה ופסיקה
+4. **מבנה ניתוח** (analysis_structure) - איך בנוי הניתוח המשפטי
+5. **נוסחאות סיום** (closing_formula) - איך מסתיימות ההחלטות
+6. **ביטויים אופייניים** (characteristic_phrase) - ביטויים ייחודיים שחוזרים
+
+לכל דפוס, תן:
+- הטקסט המדויק של הדפוס
+- הקשר (באיזה חלק של ההחלטה הוא מופיע)
+- דוגמה מתוך הטקסט
+
+החזר את התוצאות בפורמט הבא (JSON array):
+```json
+[
+  {{
+    "type": "opening_formula",
+    "text": "לפניי ערר על החלטת...",
+    "context": "פתיחת ההחלטה",
+    "example": "לפניי ערר על החלטת הוועדה המקומית לתכנון ובניה ירושלים"
+  }}
+]
+```
+
+ההחלטות:
+{decisions}
+"""
+
+
+async def analyze_corpus() -> dict:
+    """Analyze the style corpus and extract/update patterns.
+
+    Returns summary of patterns found.
+    """
+    pool = await db.get_pool()
+    async with pool.acquire() as conn:
+        rows = await conn.fetch(
+            "SELECT full_text, decision_number FROM style_corpus ORDER BY decision_date DESC LIMIT 20"
+        )
+
+    if not rows:
+        return {"error": "אין החלטות בקורפוס. העלה החלטות קודמות תחילה."}
+
+    # Prepare text for analysis
+    decisions_text = ""
+    for row in rows:
+        decisions_text += f"\n\n--- החלטה {row['decision_number'] or 'ללא מספר'} ---\n"
+        # Limit each decision to ~3000 chars to fit context
+        text = row["full_text"]
+        if len(text) > 3000:
+            text = text[:1500] + "\n...\n" + text[-1500:]
+        decisions_text += text
+
+    # Call Claude to analyze patterns
+    client = anthropic.Anthropic(api_key=config.ANTHROPIC_API_KEY)
+    message = client.messages.create(
+        model="claude-sonnet-4-6",
+        max_tokens=16384,
+        messages=[
+            {
+                "role": "user",
+                "content": ANALYSIS_PROMPT.format(decisions=decisions_text),
+            }
+        ],
+    )
+
+    response_text = message.content[0].text
+
+    # Extract JSON from response - prefer code-block fenced JSON
+    import json
+    code_block = re.search(r"```(?:json)?\s*(\[[\s\S]*?\])\s*```", response_text)
+    if code_block:
+        json_str = code_block.group(1)
+    else:
+        # Fallback: find the last JSON array (skip prose brackets)
+        all_arrays = list(re.finditer(r"\[[\s\S]*?\]", response_text))
+        if not all_arrays:
+            return {"error": "Could not parse analysis results", "raw": response_text}
+        json_str = all_arrays[-1].group()
+
+    try:
+        patterns = json.loads(json_str)
+    except json.JSONDecodeError as e:
+        return {"error": f"JSON parse error: {e}", "raw": response_text}
+
+    # Store patterns
+    count = 0
+    for pattern in patterns:
+        await db.upsert_style_pattern(
+            pattern_type=pattern.get("type", "other"),
+            pattern_text=pattern.get("text", ""),
+            context=pattern.get("context", ""),
+            examples=[pattern.get("example", "")],
+        )
+        count += 1
+
+    return {
+        "patterns_found": count,
+        "decisions_analyzed": len(rows),
+        "pattern_types": list({p.get("type") for p in patterns}),
+    }