Add CMPA (betterment levy) training support and update methodology

Support ingestion of betterment levy (היטל השבחה) decisions into a separate training corpus (CMPA).

Key changes:
- Add .doc file extraction via LibreOffice conversion in the extractor
- Add practice_area/appeal_subtype columns to the style_corpus table
- Route training files to cmp/ or cmpa/ subdirs based on appeal subtype
- Fix derive_subtype to handle the ARAR-YY-NNNN format (it was matching a year digit)
- Expose practice_area/appeal_subtype params in the MCP upload_training tool
- Add an appeal_subtype filter to analyze_style for per-type style analysis
- Update the betterment levy methodology in lessons.py: checklist (from generic to corpus-based), opening/closing strategies, and discussion rules

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-15 14:00:35 +00:00
parent 684a4cfd3b
commit ba39707c70
8 changed files with 145 additions and 51 deletions
--- a/mcp-server/src/legal_mcp/tools/documents.py
+++ b/mcp-server/src/legal_mcp/tools/documents.py
@@ -139,9 +139,16 @@ async def document_upload_training(
        appeal_subtype = pa.derive_subtype(decision_number, practice_area)
    pa.validate(practice_area, appeal_subtype)

-    # Copy to training directory (skip if already there)
-    config.TRAINING_DIR.mkdir(parents=True, exist_ok=True)
-    dest = config.TRAINING_DIR / source.name
+    # Copy to training directory, organized by subtype
+    _SUBTYPE_DIRS = {
+        "betterment_levy": "cmpa",
+        "compensation_197": "cmpa",
+        "building_permit": "cmp",
+    }
+    subdir = _SUBTYPE_DIRS.get(appeal_subtype, "")
+    training_dest = config.TRAINING_DIR / subdir if subdir else config.TRAINING_DIR
+    training_dest.mkdir(parents=True, exist_ok=True)
+    dest = training_dest / source.name
    if source.resolve() != dest.resolve():
        shutil.copy2(str(source), str(dest))

@@ -174,11 +181,12 @@ async def document_upload_training(
            title=f"[קורפוס] {title}",
            file_path=str(dest),
            page_count=page_count,
-            practice_area=practice_area,
-            appeal_subtype=appeal_subtype,
        )
        doc_id = UUID(doc["id"])
-        await db.update_document(doc_id, extracted_text=text, extraction_status="completed")
+        await db.update_document(
+            doc_id, extracted_text=text, extraction_status="completed",
+            metadata={"practice_area": practice_area, "appeal_subtype": appeal_subtype},
+        )

        # Generate embeddings and store chunks
        texts = [c.content for c in chunks]
@@ -193,10 +201,7 @@ async def document_upload_training(
            }
            for c, emb in zip(chunks, embs)
        ]
-        await db.store_chunks(
-            doc_id, None, chunk_dicts,
-            practice_area=practice_area, appeal_subtype=appeal_subtype,
-        )
+        await db.store_chunks(doc_id, None, chunk_dicts)

    return json.dumps({
        "corpus_id": str(corpus_id),
--- a/mcp-server/src/legal_mcp/tools/drafting.py
+++ b/mcp-server/src/legal_mcp/tools/drafting.py
@@ -454,11 +454,16 @@ async def save_block_content(case_number: str, block_id: str, content: str) -> s
        return str(e)


-async def analyze_style() -> str:
-    """הרצת ניתוח סגנון על קורפוס ההחלטות של דפנה. מחלץ דפוסי כתיבה ושומר אותם."""
+async def analyze_style(appeal_subtype: str = "") -> str:
+    """הרצת ניתוח סגנון על קורפוס ההחלטות של דפנה. מחלץ דפוסי כתיבה ושומר אותם.
+
+    Args:
+        appeal_subtype: סינון לפי סוג ערר (building_permit / betterment_levy / compensation_197).
+                        ריק = כל ההחלטות.
+    """
    from legal_mcp.services.style_analyzer import analyze_corpus

-    result = await analyze_corpus()
+    result = await analyze_corpus(appeal_subtype)
    return json.dumps(result, ensure_ascii=False, indent=2)