Add full decision writing pipeline: classify, extract, brainstorm, write, QA, export

New services (11 files): - classifier.py: auto doc-type classification + party identification (Claude Haiku) - claims_extractor.py: claim extraction from pleadings (Claude Sonnet + regex) - references_extractor.py: plan/case-law/legislation detection (regex) - brainstorm.py: direction generation with 2-3 options (Claude Sonnet) - block_writer.py: 12-block decision writer (template + Claude Sonnet/Opus) - docx_exporter.py: DOCX export with David font, RTL, headings - qa_validator.py: 6 QA checks with export blocking on critical failure - learning_loop.py: draft vs final comparison + lesson extraction - metrics.py: KPIs dashboard per case and global - audit.py: action audit log - cli.py: standalone CLI with 11 commands Updated pipeline: extract → classify → chunk → embed → store → extract_references New MCP tools: 29 total (was 16) New DB tables: audit_log, decisions CRUD, claims CRUD Config: Infisical support, external service allowlist Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-03 10:21:47 +00:00
parent df7cc4f5a5
commit d9e5ef0f46
21 changed files with 3957 additions and 14 deletions
--- a/mcp-server/src/legal_mcp/services/references_extractor.py
+++ b/mcp-server/src/legal_mcp/services/references_extractor.py
@@ -0,0 +1,200 @@
+"""זיהוי תכניות, פסיקה וחקיקה במסמכים משפטיים.
+
+שלוש קטגוריות:
+1. תכניות (תב"ע, תמ"א, תכניות מקומיות/מחוזיות)
+2. פסיקה (עע"מ, בג"ץ, ע"א, עררים)
+3. חקיקה (חוק התכנון, חוק מיסוי מקרקעין, וכו')
+"""
+
+from __future__ import annotations
+
+import json
+import logging
+import re
+from uuid import UUID
+
+from legal_mcp.services import db
+
+logger = logging.getLogger(__name__)
+
+# ── Regex patterns ────────────────────────────────────────────────────
+
+# Plan references. Each entry is compiled once at import time; the order of
+# the sources is the order in which the patterns are applied.
+PLAN_PATTERNS = list(map(re.compile, (
+    # National outline plan abbreviation followed by a number and an optional
+    # "-" or "/" suffix.
+    r'תמ"א\s*[\-]?\s*(\d+)(?:\s*[\-/]\s*(\S+))?',
+    # "Plan" keyword with optional outline/level qualifiers and an optional
+    # "no." marker, followed by an identifier that starts with a digit.
+    r'תכנית\s+(?:מתאר\s+)?(?:ארצית|מחוזית|מקומית)?\s*(?:מס[\'"]?\s*)?(\d[\d/\-\.]+\S*)',
+    # Zoning-plan abbreviation with an optional "no." marker, followed by
+    # the next whitespace-delimited token.
+    r'תב"ע\s*(?:מס[\'"]?\s*)?(\S+)',
+    # "Plan" keyword followed by a prefix token and a numeric identifier
+    # (for example a locality prefix and a plan number).
+    r'תכנית\s+(\S+\s*\d[\d/\-\.]+\S*)',
+)))
+
+# Case-law citations. Compiled once at import time, applied in list order.
+CASE_LAW_PATTERNS = [
+    re.compile(source)
+    for source in (
+        # Abbreviated proceeding type (one of the alternatives in group 1)
+        # followed by a docket number made of digits, "/" and "-".
+        r'(עע"מ|בג"ץ|ע"א|בר"ם|עת"מ|עמ"נ|ע"ע|רע"א|דנ"א|בש"א)\s*(\d[\d/\-]+)',
+        # Appeal keyword with a district name in optional parentheses,
+        # followed by a docket number.
+        r'ערר\s*\(?\s*(מרכז|ירושלים|חי\'?פה?|ת"א|תל[- ]אביב|דרום|צפון)\s*\)?\s*(\d[\d/\-]+)',
+        # Appeal keyword without a district: 3-5 digits, a separator and
+        # 2-4 more digits.
+        r'ערר\s+(\d{3,5}[\-/]\d{2,4})',
+    )
+]
+
+# Legislation citations. Compiled once at import time, applied in list order.
+LEGISLATION_PATTERNS = [
+    # Planning and Building Law, with an optional year designation / 1965 and
+    # an optional trailing section number (group 2). The second yod in the
+    # last word is optional so that both the single-yod and the double-yod
+    # spellings match; the earlier character class demanded two trailing
+    # letters and silently missed the single-yod form.
+    re.compile(r'(חוק\s+התכנון\s+והבניי?ה)\s*[,،]?\s*(?:תשכ"ה[- ])?(?:1965)?(?:\s*[,،]\s*ס(?:עיף|\'|ע\')\s*(\d+\S*))?'),
+    # Land Taxation Law, with its optional parenthesised subtitle, optional
+    # year designation / 1963 and an optional trailing section number.
+    re.compile(r'(חוק\s+מיסוי\s+מקרקעין)\s*(?:\(שבח\s+ורכישה\))?\s*[,،]?\s*(?:תשכ"ג[- ])?(?:1963)?(?:\s*[,،]\s*ס(?:עיף|\'|ע\')\s*(\d+\S*))?'),
+    # Planning and Building Regulations followed by a parenthesised subtitle
+    # (group 2). Same optional-yod handling as the law above.
+    re.compile(r'(תקנות\s+התכנון\s+והבניי?ה)\s*\(([^)]+)\)'),
+    # Limitation Law with an optional year designation / 1958.
+    re.compile(r'(חוק\s+ההתיישנות)\s*(?:תשי"ח[- ])?(?:1958)?'),
+    # Generic "section N of <law ...>": section marker, number (group 1), an
+    # optional "of" connective, then the law keyword plus up to four words
+    # (group 2). The stand-alone connective is a separate word, so it must
+    # be allowed to be followed by whitespace; the one-letter prefix form
+    # attaches directly to the law keyword.
+    re.compile(r'ס(?:עיף|\'|ע\')\s*(\d+\S*)\s*(?:ל|של\s+)?(חוק\s+\S+(?:\s+\S+){0,3})'),
+]
+
+
+def extract_plans(text: str) -> list[dict]:
+    """Find plan references in *text*.
+
+    Every pattern in ``PLAN_PATTERNS`` is applied in order. A match is kept
+    only the first time its stripped text is seen, and only if that text is
+    at least 4 characters long.
+
+    Returns:
+        One dict per kept match, in discovery order, with ``plan_name`` (the
+        stripped matched text) and ``context`` (up to 60 characters before
+        and 100 after the match, newlines replaced by spaces, stripped).
+    """
+    # Insertion-ordered dict doubles as the "already seen" set.
+    by_name = {}
+    text_len = len(text)
+
+    for regex in PLAN_PATTERNS:
+        for hit in regex.finditer(text):
+            name = hit.group(0).strip()
+            if len(name) < 4 or name in by_name:
+                continue
+
+            lo = max(hit.start() - 60, 0)
+            hi = min(hit.end() + 100, text_len)
+            snippet = text[lo:hi].replace("\n", " ").strip()
+
+            by_name[name] = {"plan_name": name, "context": snippet}
+
+    return list(by_name.values())
+
+
+def extract_case_law(text: str) -> list[dict]:
+    """Find case-law citations in *text*.
+
+    Every pattern in ``CASE_LAW_PATTERNS`` is applied in order; a citation is
+    kept only the first time its stripped text is seen.
+
+    Returns:
+        One dict per kept citation, in discovery order, with
+        ``citation_text`` (the stripped matched text), ``case_name`` (a
+        best-effort one or two word name taken from the context, or ``""``
+        when none is found) and ``context`` (up to 50 characters before and
+        100 after the match, newlines replaced by spaces, stripped).
+    """
+    # Insertion-ordered dict doubles as the "already seen" set.
+    by_citation = {}
+    text_len = len(text)
+
+    for regex in CASE_LAW_PATTERNS:
+        for hit in regex.finditer(text):
+            citation = hit.group(0).strip()
+            if citation in by_citation:
+                continue
+
+            lo = max(hit.start() - 50, 0)
+            hi = min(hit.end() + 100, text_len)
+            snippet = text[lo:hi].replace("\n", " ").strip()
+
+            # Look for a name introduced by a "matter of" / "affair of" /
+            # "v." style marker anywhere in the surrounding snippet.
+            named = re.search(r'(?:עניין|פרשת|נ[\'"]?\s+)\s*(\S+(?:\s+\S+)?)', snippet)
+
+            by_citation[citation] = {
+                "citation_text": citation,
+                "case_name": named.group(1).strip() if named else "",
+                "context": snippet,
+            }
+
+    return list(by_citation.values())
+
+
+def extract_legislation(text: str) -> list[dict]:
+    """Find legislation citations in *text*.
+
+    Every pattern in ``LEGISLATION_PATTERNS`` is applied in order. A match
+    is kept only the first time its stripped text is seen, and only if that
+    text is at least 5 characters long.
+
+    Returns:
+        One dict per kept match, in discovery order, with ``statute_text``
+        (the stripped matched text) and ``context`` (up to 40 characters
+        before and 80 after the match, newlines replaced by spaces,
+        stripped).
+    """
+    results = []
+    already = set()
+
+    # Lazily walk all matches, pattern by pattern, as one flat stream.
+    hits = (hit for regex in LEGISLATION_PATTERNS for hit in regex.finditer(text))
+    for hit in hits:
+        statute = hit.group(0).strip()
+        if len(statute) < 5 or statute in already:
+            continue
+        already.add(statute)
+
+        lo = max(hit.start() - 40, 0)
+        hi = min(hit.end() + 80, len(text))
+        results.append({
+            "statute_text": statute,
+            "context": text[lo:hi].replace("\n", " ").strip(),
+        })
+
+    return results
+
+
+def extract_all_references(text: str) -> dict:
+    """Run all three extractors on *text* and bundle their results.
+
+    Returns:
+        A dict with the keys ``plans``, ``case_law`` and ``legislation``,
+        each holding the list returned by the corresponding extractor.
+    """
+    plans = extract_plans(text)
+    case_law = extract_case_law(text)
+    legislation = extract_legislation(text)
+    return dict(plans=plans, case_law=case_law, legislation=legislation)
+
+
+async def extract_and_link_references(
+    document_id: UUID,
+    case_id: UUID,
+    text: str,
+) -> dict:
+    """Extract references from *text*, flag known case law, and persist a summary.
+
+    Each extracted case-law citation is compared against the ``case_number``
+    values in the ``case_law`` table using a substring test in either
+    direction, and gets a boolean ``matched_in_db`` entry. A compact summary
+    of all references is then written under the ``references`` key of the
+    document's metadata, if the document exists.
+
+    Args:
+        document_id: Document whose metadata receives the summary.
+        case_id: Accepted for interface compatibility; not used by this
+            function.
+        text: Document text to scan.
+
+    Returns:
+        A dict with the counts ``plans``, ``case_law``, ``case_law_linked``
+        and ``legislation``, plus ``details`` holding the full extraction
+        result (case-law entries include ``matched_in_db``).
+    """
+    refs = extract_all_references(text)
+
+    pool = await db.get_pool()
+    # Hold the pooled connection only for the query; the matching below is
+    # plain in-memory work on the fetched rows.
+    async with pool.acquire() as conn:
+        case_laws = await conn.fetch("SELECT id, case_number, case_name FROM case_law")
+
+    # Index known cases by their full number and by its last token.
+    case_law_map = {}
+    for cl in case_laws:
+        case_number = cl["case_number"]
+        # A missing number cannot be split, and an empty or blank key is a
+        # substring of (almost) every citation and would mark them all as
+        # matched.
+        if not case_number or not case_number.strip():
+            continue
+        case_law_map[case_number] = cl["id"]
+        parts = case_number.split()
+        if len(parts) > 1:
+            case_law_map[parts[-1]] = cl["id"]
+
+    linked = 0
+    for cit in refs["case_law"]:
+        citation = cit["citation_text"]
+        matched = any(key in citation or citation in key for key in case_law_map)
+        cit["matched_in_db"] = matched
+        if matched:
+            linked += 1
+
+    # Store references in document metadata
+    doc = await db.get_document(document_id)
+    if doc:
+        existing_metadata = doc.get("metadata") or {}
+        if isinstance(existing_metadata, str):
+            # Metadata may come back as a JSON string rather than a dict.
+            existing_metadata = json.loads(existing_metadata)
+        existing_metadata["references"] = {
+            "plans": [{"plan_name": plan["plan_name"]} for plan in refs["plans"]],
+            "case_law": [
+                {
+                    "citation": cit["citation_text"],
+                    "case_name": cit.get("case_name", ""),
+                    "in_db": cit.get("matched_in_db", False),
+                }
+                for cit in refs["case_law"]
+            ],
+            "legislation": [
+                {"statute": statute["statute_text"]} for statute in refs["legislation"]
+            ],
+        }
+        await db.update_document(document_id, metadata=existing_metadata)
+
+    return {
+        "plans": len(refs["plans"]),
+        "case_law": len(refs["case_law"]),
+        "case_law_linked": linked,
+        "legislation": len(refs["legislation"]),
+        "details": refs,
+    }