Add CMPA (betterment levy) training support and update methodology

Support ingestion of betterment levy (היטל השבחה) decisions into a separate training corpus (CMPA).

Key changes:
- Add .doc file extraction via LibreOffice conversion in extractor
- Add practice_area/appeal_subtype columns to style_corpus table
- Route training files to cmp/ or cmpa/ subdirs based on appeal subtype
- Fix derive_subtype to handle ARAR-YY-NNNN format (was matching year digit)
- Expose practice_area/appeal_subtype params in MCP upload_training tool
- Add appeal_subtype filter to analyze_style for per-type style analysis
- Update betterment levy methodology in lessons.py: checklist (from generic to corpus-based), opening/closing strategies, and discussion rules

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-15 14:00:35 +00:00
parent 684a4cfd3b
commit ba39707c70
8 changed files with 145 additions and 51 deletions
--- a/mcp-server/src/legal_mcp/server.py
+++ b/mcp-server/src/legal_mcp/server.py
@@ -165,10 +165,13 @@ async def document_upload_training(
    decision_date: str = "",
    subject_categories: list[str] | None = None,
    title: str = "",
    practice_area: str = "appeals_committee",
    appeal_subtype: str = "",
 ) -> str:
-    """העלאת החלטה קודמת של דפנה לקורפוס הסגנון. קטגוריות: בנייה, שימוש חורג, תכנית, היתר, הקלה, חלוקה, תמ"א 38, היטל השבחה, פיצויים 197."""
+    """העלאת החלטה קודמת של דפנה לקורפוס הסגנון. קטגוריות: בנייה, שימוש חורג, תכנית, היתר, הקלה, חלוקה, תמ"א 38, היטל השבחה, פיצויים 197. סוג ערר: building_permit / betterment_levy / compensation_197 (ריק = אוטומטי ממספר ההחלטה)."""
    return await documents.document_upload_training(
        file_path, decision_number, decision_date, subject_categories, title,
        practice_area, appeal_subtype,
    )
@@ -319,9 +322,9 @@ async def export_docx(case_number: str, output_path: str = "") -> str:
@mcp.tool()
-async def analyze_style() -> str:
+async def analyze_style(appeal_subtype: str = "") -> str:
-    """ניתוח סגנון על קורפוס ההחלטות של דפנה. מחלץ ושומר דפוסי כתיבה."""
+    """ניתוח סגנון על קורפוס ההחלטות של דפנה. מחלץ ושומר דפוסי כתיבה. סוג ערר: building_permit / betterment_levy / compensation_197 (ריק = הכל)."""
-    return await drafting.analyze_style()
+    return await drafting.analyze_style(appeal_subtype)
@mcp.tool()
--- a/mcp-server/src/legal_mcp/services/db.py
+++ b/mcp-server/src/legal_mcp/services/db.py
@@ -104,6 +104,8 @@ CREATE TABLE IF NOT EXISTS style_corpus (
    summary TEXT DEFAULT '',
    outcome TEXT DEFAULT '',
    key_principles JSONB DEFAULT '[]',
    practice_area TEXT DEFAULT 'appeals_committee',
    appeal_subtype TEXT DEFAULT '',
    created_at TIMESTAMPTZ DEFAULT now()
 );
@@ -159,6 +161,10 @@ ALTER TABLE cases ADD COLUMN IF NOT EXISTS appeal_type TEXT DEFAULT '';
 ALTER TABLE cases ADD COLUMN IF NOT EXISTS practice_area TEXT DEFAULT 'appeals_committee';
 ALTER TABLE cases ADD COLUMN IF NOT EXISTS appeal_subtype TEXT DEFAULT '';
 -- הרחבת style_corpus עם practice_area / appeal_subtype
 ALTER TABLE style_corpus ADD COLUMN IF NOT EXISTS practice_area TEXT DEFAULT 'appeals_committee';
 ALTER TABLE style_corpus ADD COLUMN IF NOT EXISTS appeal_subtype TEXT DEFAULT '';
 -- טבלת qa_results
 CREATE TABLE IF NOT EXISTS qa_results (
    id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
@@ -882,6 +888,8 @@ async def add_to_style_corpus(
    summary: str = "",
    outcome: str = "",
    key_principles: list[str] | None = None,
    practice_area: str = "appeals_committee",
    appeal_subtype: str = "",
 ) -> UUID:
    pool = await get_pool()
    corpus_id = uuid4()
@@ -889,11 +897,13 @@ async def add_to_style_corpus(
        await conn.execute(
            """INSERT INTO style_corpus
               (id, document_id, decision_number, decision_date,
-                subject_categories, full_text, summary, outcome, key_principles)
+                subject_categories, full_text, summary, outcome, key_principles,
-               VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9)""",
+                practice_area, appeal_subtype)
               VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11)""",
            corpus_id, document_id, decision_number, decision_date,
            json.dumps(subject_categories), full_text, summary, outcome,
            json.dumps(key_principles or []),
            practice_area, appeal_subtype,
        )
    return corpus_id
--- a/mcp-server/src/legal_mcp/services/extractor.py
+++ b/mcp-server/src/legal_mcp/services/extractor.py
@@ -1,7 +1,8 @@
-"""Text extraction from PDF, DOCX, and RTF files.
+"""Text extraction from PDF, DOCX, DOC, and RTF files.
 Primary PDF extraction: PyMuPDF direct text (for born-digital PDFs).
 Fallback: Google Cloud Vision OCR (for scanned documents).
 DOC files: converted to DOCX via LibreOffice before extraction.
 Post-processing: Hebrew abbreviation quote fixer.
 """
@@ -10,6 +11,8 @@ from __future__ import annotations
 import asyncio
 import logging
 import re
 import subprocess
 import tempfile
 from pathlib import Path
 import fitz  # PyMuPDF
@@ -129,6 +132,8 @@ async def extract_text(file_path: str) -> tuple[str, int]:
        return await _extract_pdf(path)
    elif suffix == ".docx":
        return _extract_docx(path), 0
    elif suffix == ".doc":
        return _extract_doc(path), 0
    elif suffix == ".rtf":
        return _extract_rtf(path), 0
    elif suffix in (".txt", ".md"):
@@ -187,6 +192,21 @@ def _ocr_with_google_vision(image_bytes: bytes, page_num: int) -> str:
    return _fix_hebrew_quotes(text)
 def _extract_doc(path: Path) -> str:
    """Extract text from a legacy .doc file via a LibreOffice DOCX conversion.

    The file is converted by a headless LibreOffice run into a temporary
    directory and the resulting .docx is handed to ``_extract_docx``. The
    temporary directory (converted file included) is removed on return.

    Each run uses its own throwaway LibreOffice user profile, so the
    conversion does not share profile state with any other LibreOffice
    instance on the machine.

    Args:
        path: Location of the .doc file to convert.

    Returns:
        Whatever ``_extract_docx`` returns for the converted document.

    Raises:
        RuntimeError: LibreOffice exited with a non-zero status.
        FileNotFoundError: LibreOffice exited with status 0 but the expected
            .docx is missing. ``subprocess.run`` raises the same type when
            the ``libreoffice`` executable cannot be found.
        subprocess.TimeoutExpired: The conversion exceeded 120 seconds.
    """
    with tempfile.TemporaryDirectory() as tmp_dir:
        out_dir = Path(tmp_dir)
        # Private profile inside the temp dir: avoids clashing with the
        # default profile of another running LibreOffice instance, and is
        # cleaned up together with the converted file.
        profile_uri = (out_dir.resolve() / "lo_profile").as_uri()
        result = subprocess.run(
            [
                "libreoffice",
                f"-env:UserInstallation={profile_uri}",
                "--headless",
                "--convert-to", "docx",
                str(path),
                "--outdir", tmp_dir,
            ],
            capture_output=True, text=True, timeout=120,
        )
        if result.returncode != 0:
            raise RuntimeError(f"LibreOffice conversion failed: {result.stderr}")
        docx_path = out_dir / f"{path.stem}.docx"
        if not docx_path.exists():
            # A zero exit status does not guarantee an output file; surface
            # whatever LibreOffice printed so the failure can be diagnosed.
            details = (result.stderr or result.stdout).strip()
            suffix = f" (LibreOffice output: {details})" if details else ""
            raise FileNotFoundError(f"Converted file not found: {docx_path}{suffix}")
        return _extract_docx(docx_path)
 def _extract_docx(path: Path) -> str:
    """Extract text from DOCX file."""
    doc = DocxDocument(str(path))
--- a/mcp-server/src/legal_mcp/services/lessons.py
+++ b/mcp-server/src/legal_mcp/services/lessons.py
@@ -72,9 +72,14 @@ OPENING_STRATEGIES = {
        ),
    },
    "betterment_levy": {
-        "style": "direct_with_disclaimer",
+        "style": "direct_factual",
        "paragraphs": (1, 3),
-        "description": "פתיחה ישירה עם מסקנה + 'על מנת לא לצאת בחסר'",
+        "description": (
            "פתיחה ישירה ועובדתית: 'בפנינו ערר על דרישת תשלום היטל השבחה מיום [תאריך] "
            "בסך של [סכום] ₪' → רקע קצר (נכס, תכנית משביחה, מימוש) → "
            "תמצית טענות הצדדים (עוררים + משיבה בנפרד). "
            "אין הקשר תכנוני רחב. הפתיחה = עובדות בלבד."
        ),
    },
 }
@@ -101,9 +106,16 @@ SUMMARY_STRATEGIES = {
        ),
    },
    "betterment_levy": {
-        "heading": "סיכום",
+        "heading": "various",
-        "format": "numbered_hebrew_dry",
+        "format": "dry_operative",
-        "description": "אותיות עבריות, סיום יבש ללא פסקה חמה",
+        "description": (
            "סיום יבש ואופרטיבי. כותרת משתנה: 'סוף דבר' / 'לאור כל האמור לעיל' / ללא כותרת. "
            "תוכן: 'הערר נדחה/מתקבל' + הוצאות ('כל צד ישא בהוצאותיו' / חיוב בסכום). "
            "אם מתקבל: הוראות אופרטיביות (החזר, שומה מתוקנת, תנאים). "
            "חתימה: 'ניתנה פה אחד היום, [תאריך עברי], [תאריך לועזי].' "
            "לעיתים: 'התיק ייסגר.' / 'עומדת זכות ערר כדין.' "
            "אין פסקה חמה. אין חזרה על נימוקים."
        ),
    },
 }
@@ -129,7 +141,12 @@ DISCUSSION_RULES: dict[str, list[str]] = {
        "מבנה ישיר: נקודות עיקריות → ניתוח → מסקנה.",
    ],
    "betterment_levy": [
-        "מבנה ישיר עם מסקנה מוקדמת + 'על מנת לא לצאת בחסר' לנקודות נוספות.",
+        "פתיחת דיון: מסקנה מוקדמת ('לאחר שבחנו... מצאנו כי דין הערר להידחות/להתקבל').",
        "תקן ביקורת: ציון רף ההתערבות בשומה מכרעת (בר\"ם 3644/13 גלר) — אבחנה בין שמאי למשפטי.",
        "הצגת הלכה פסוקה: ציטוט ארוך מפס\"ד מרכזי → 'ברוח הדברים לעיל נבחן את טענות הצדדים'.",
        "טיפול שיטתי: כל טענה/סוגיה בנפרד → ניתוח → מסקנת ביניים.",
        "ביטויים: 'אין בידינו לקבל', 'לא מצאנו מקום להתערב', 'קביעה נכונה שאין מקום להתערב בה'.",
        "'על מנת לא לצאת בחסר' — לנקודות obiter dicta בסוף הדיון.",
    ],
 }
@@ -448,26 +465,41 @@ CONTENT_CHECKLISTS: dict[str, str] = {
 """,
    "betterment_levy": """## צ'קליסט תוכן — ערר היטל השבחה
-⚠️ שים לב: אין עדיין החלטות היטל השבחה בקורפוס האימון.
+מבוסס על ניתוח 26 החלטות של דפנה תמיר (קורפוס CMPA, אפריל 2026).
 הצ'קליסט הזה מבוסס על ידע כללי — לא על ניתוח ספציפי של סגנון דפנה.
-### א. המסגרת הנורמטיבית
+### א. תקן ביקורת (חובה בפתיחת הדיון)
 - ציין את רף ההתערבות: "ועדת הערר תיטה לאמץ את חוות דעתו של השמאי..."
 - אבחנה: התערבות מצומצמת בעניינים שמאיים-מקצועיים, התערבות רחבה בעניינים משפטיים
 - הפניה ל-בר"ם 3644/13 גלר או פסיקה דומה
 ### ב. המסגרת הנורמטיבית
 - התוספת השלישית לחוק התכנון והבנייה
- אירוע מס — מה יצר את ההשבחה?
+- סעיפי הפטור הרלוונטיים (ס' 19(ג), ס' 19(ב) וכו')
 - אירוע מס — מה יצר את ההשבחה? (תכנית, היתר, מכר)
 - מועד המימוש ומועד הקובע
-### ב. שומה
+### ג. שומה ומתודולוגיה שמאית
- שיטת השומה (שומה מכרעת / שמאי מייעץ)
+- שיטת השומה (שומה מכרעת / שומה מוסכמת / שמאי מייעץ)
- מועד הקובע
+- מבחן השימוש הטוב והיעיל (highest and best use) — מצב קודם ומצב חדש
- זכויות בנייה — לפני ואחרי
+- זכויות בנייה — לפני ואחרי (אחוזי בנייה, שטחים עיקריים, תמהיל שימושים)
 - שווי מקרקעין — מצב קודם ומצב חדש (שיטת השוואה / יחידות תועלת)
 - עלויות עודפות (חניה, מטלות ציבוריות, תשתיות)
 - מקדמי זמינות, שיעורי הפקעה
-### ג. שאלות משפטיות
+### ד. שאלות משפטיות (לפי רלוונטיות)
- פטורים (ס' 19)
+- פטורים — דירת מגורים (ס' 19(ג)(1)), שטח עד 140 מ"ר, תא משפחתי
- מועדי תשלום
+- מועד מימוש — זיכרון דברים vs הסכם מכר, העברת זכויות
- שיערוך
+- זהות החייב — בעלים, חוכר, יזם, חברה בבעלות יזם
 - מקרקעי ישראל — הסדרים מיוחדים (ס' 21 לתוספת השלישית)
 - שומות מוסכמות — תוקף, משמעות, "בלתי נצפה מראש"
 - פרשנות תכניות — ייעוד, שימושים מותרים, מדיניות ועדה מקומית
-### ד. ניתוח שמאי
+### ה. ניתוח שמאי (כשיש שומה מכרעת)
- האם השומה תקינה?
+- האם השומה מבוססת על מסד עובדתי הולם?
- פערים בין השומות
+- האם השיטה השמאית מקובלת?
 - האם ההנחות סבירות והגיוניות?
 - טעות מהותית / דופי חמור?
 - פגם מינהלי (ניגוד עניינים, משוא פנים)?
 """,
 }
--- a/mcp-server/src/legal_mcp/services/practice_area.py
+++ b/mcp-server/src/legal_mcp/services/practice_area.py
@@ -43,14 +43,17 @@ SUBTYPES_BY_AREA: dict[str, set[str]] = {
 # ── Derivation ─────────────────────────────────────────────────────
 _FIRST_DIGIT = re.compile(r"^\s*(\d)")
 _APPEALS_COMMITTEE_DIGIT_TO_SUBTYPE = {
    "1": "building_permit",
    "8": "betterment_levy",
    "9": "compensation_197",
 }
 # Extract the 4-digit case number (not the year prefix or any trailing group) in formats like:
 #   ARAR-25-8126, ARAR-24-01-8007-33 (via _CASE_NUM); 8126/25, 1170, ערר 1024-25 (via the _PLAIN_NUM fallback)
 _CASE_NUM = re.compile(r"(?:ARAR[-\s]*\d{2}[-\s]*(?:\d{2}[-\s]*)?)(\d{4})", re.IGNORECASE)
 _PLAIN_NUM = re.compile(r"(\d{4})")
 def derive_subtype(case_number: str, practice_area: str = DEFAULT_PRACTICE_AREA) -> str:
    """Infer the appeal_subtype from case_number.
@@ -58,15 +61,20 @@ def derive_subtype(case_number: str, practice_area: str = DEFAULT_PRACTICE_AREA)
    For appeals_committee, the convention is:
      1xxx → building_permit, 8xxx → betterment_levy, 9xxx → compensation_197.
-    For other practice areas there is no public numbering convention yet,
+    Handles multiple formats: ARAR-25-8126, 8126/25, 1170, ערר 1024-25.
    so we return 'unknown' until a real rule is defined.
    """
    if practice_area != "appeals_committee":
        return "unknown"
-    m = _FIRST_DIGIT.match(case_number or "")
+    cn = case_number or ""
    # Try ARAR format first (extracts the 4-digit case number after year prefix)
    m = _CASE_NUM.search(cn)
    if not m:
        # Fallback: first 4-digit number in the string
        m = _PLAIN_NUM.search(cn)
    if not m:
        return "unknown"
-    return _APPEALS_COMMITTEE_DIGIT_TO_SUBTYPE.get(m.group(1), "unknown")
+    first_digit = m.group(1)[0]
    return _APPEALS_COMMITTEE_DIGIT_TO_SUBTYPE.get(first_digit, "unknown")
 # ── Validation ─────────────────────────────────────────────────────
--- a/mcp-server/src/legal_mcp/services/style_analyzer.py
+++ b/mcp-server/src/legal_mcp/services/style_analyzer.py
@@ -109,13 +109,24 @@ SYNTHESIS_PROMPT = """\
 """
-async def analyze_corpus() -> dict:
+async def analyze_corpus(appeal_subtype: str = "") -> dict:
    """Analyze the style corpus and extract/update patterns.
    Args:
        appeal_subtype: filter by appeal subtype (e.g. 'betterment_levy', 'building_permit').
                        Empty string = all decisions.
    Returns summary of patterns found.
    """
    pool = await db.get_pool()
    async with pool.acquire() as conn:
        if appeal_subtype:
            rows = await conn.fetch(
                "SELECT full_text, decision_number FROM style_corpus "
                "WHERE appeal_subtype = $1 ORDER BY decision_date DESC LIMIT 20",
                appeal_subtype,
            )
        else:
            rows = await conn.fetch(
                "SELECT full_text, decision_number FROM style_corpus ORDER BY decision_date DESC LIMIT 20"
            )
--- a/mcp-server/src/legal_mcp/tools/documents.py
+++ b/mcp-server/src/legal_mcp/tools/documents.py
@@ -139,9 +139,16 @@ async def document_upload_training(
        appeal_subtype = pa.derive_subtype(decision_number, practice_area)
    pa.validate(practice_area, appeal_subtype)
-    # Copy to training directory (skip if already there)
+    # Copy to training directory, organized by subtype
-    config.TRAINING_DIR.mkdir(parents=True, exist_ok=True)
+    _SUBTYPE_DIRS = {
-    dest = config.TRAINING_DIR / source.name
+        "betterment_levy": "cmpa",
        "compensation_197": "cmpa",
        "building_permit": "cmp",
    }
    subdir = _SUBTYPE_DIRS.get(appeal_subtype, "")
    training_dest = config.TRAINING_DIR / subdir if subdir else config.TRAINING_DIR
    training_dest.mkdir(parents=True, exist_ok=True)
    dest = training_dest / source.name
    if source.resolve() != dest.resolve():
        shutil.copy2(str(source), str(dest))
@@ -174,11 +181,12 @@ async def document_upload_training(
            title=f"[קורפוס] {title}",
            file_path=str(dest),
            page_count=page_count,
            practice_area=practice_area,
            appeal_subtype=appeal_subtype,
        )
        doc_id = UUID(doc["id"])
-        await db.update_document(doc_id, extracted_text=text, extraction_status="completed")
+        await db.update_document(
            doc_id, extracted_text=text, extraction_status="completed",
            metadata={"practice_area": practice_area, "appeal_subtype": appeal_subtype},
        )
        # Generate embeddings and store chunks
        texts = [c.content for c in chunks]
@@ -193,10 +201,7 @@ async def document_upload_training(
            }
            for c, emb in zip(chunks, embs)
        ]
-        await db.store_chunks(
+        await db.store_chunks(doc_id, None, chunk_dicts)
            doc_id, None, chunk_dicts,
            practice_area=practice_area, appeal_subtype=appeal_subtype,
        )
    return json.dumps({
        "corpus_id": str(corpus_id),
--- a/mcp-server/src/legal_mcp/tools/drafting.py
+++ b/mcp-server/src/legal_mcp/tools/drafting.py
@@ -454,11 +454,16 @@ async def save_block_content(case_number: str, block_id: str, content: str) -> s
        return str(e)
-async def analyze_style() -> str:
+async def analyze_style(appeal_subtype: str = "") -> str:
-    """הרצת ניתוח סגנון על קורפוס ההחלטות של דפנה. מחלץ דפוסי כתיבה ושומר אותם."""
+    """הרצת ניתוח סגנון על קורפוס ההחלטות של דפנה. מחלץ דפוסי כתיבה ושומר אותם.
    Args:
        appeal_subtype: סינון לפי סוג ערר (building_permit / betterment_levy / compensation_197).
                        ריק = כל ההחלטות.
    """
    from legal_mcp.services.style_analyzer import analyze_corpus
-    result = await analyze_corpus()
+    result = await analyze_corpus(appeal_subtype)
    return json.dumps(result, ensure_ascii=False, indent=2)