Add CMPA (betterment levy) training support and update methodology

Support ingestion of betterment levy (היטל השבחה) decisions into a
separate training corpus (CMPA). Key changes:

- Add .doc file extraction via LibreOffice conversion in extractor
- Add practice_area/appeal_subtype columns to style_corpus table
- Route training files to cmp/ or cmpa/ subdirs based on appeal subtype
  (files with an unmapped subtype stay in the training root)
- Fix derive_subtype to handle ARAR-YY-NNNN format (it previously matched the year digit)
- Expose practice_area/appeal_subtype params in MCP upload_training tool
- Add appeal_subtype filter to analyze_style for per-type style analysis
- Update betterment levy methodology in lessons.py: checklist (changed from
  generic to corpus-based), opening/closing strategies, and discussion rules

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-04-15 14:00:35 +00:00
parent 684a4cfd3b
commit ba39707c70
8 changed files with 145 additions and 51 deletions

View File

@@ -139,9 +139,16 @@ async def document_upload_training(
appeal_subtype = pa.derive_subtype(decision_number, practice_area)
pa.validate(practice_area, appeal_subtype)
# Copy to training directory (skip if already there)
config.TRAINING_DIR.mkdir(parents=True, exist_ok=True)
dest = config.TRAINING_DIR / source.name
# Copy to training directory, organized by subtype
_SUBTYPE_DIRS = {
"betterment_levy": "cmpa",
"compensation_197": "cmpa",
"building_permit": "cmp",
}
subdir = _SUBTYPE_DIRS.get(appeal_subtype, "")
training_dest = config.TRAINING_DIR / subdir if subdir else config.TRAINING_DIR
training_dest.mkdir(parents=True, exist_ok=True)
dest = training_dest / source.name
if source.resolve() != dest.resolve():
shutil.copy2(str(source), str(dest))
@@ -174,11 +181,12 @@ async def document_upload_training(
title=f"[קורפוס] {title}",
file_path=str(dest),
page_count=page_count,
practice_area=practice_area,
appeal_subtype=appeal_subtype,
)
doc_id = UUID(doc["id"])
await db.update_document(doc_id, extracted_text=text, extraction_status="completed")
await db.update_document(
doc_id, extracted_text=text, extraction_status="completed",
metadata={"practice_area": practice_area, "appeal_subtype": appeal_subtype},
)
# Generate embeddings and store chunks
texts = [c.content for c in chunks]
@@ -193,10 +201,7 @@ async def document_upload_training(
}
for c, emb in zip(chunks, embs)
]
await db.store_chunks(
doc_id, None, chunk_dicts,
practice_area=practice_area, appeal_subtype=appeal_subtype,
)
await db.store_chunks(doc_id, None, chunk_dicts)
return json.dumps({
"corpus_id": str(corpus_id),

View File

@@ -454,11 +454,16 @@ async def save_block_content(case_number: str, block_id: str, content: str) -> s
return str(e)
async def analyze_style() -> str:
"""הרצת ניתוח סגנון על קורפוס ההחלטות של דפנה. מחלץ דפוסי כתיבה ושומר אותם."""
async def analyze_style(appeal_subtype: str = "") -> str:
"""הרצת ניתוח סגנון על קורפוס ההחלטות של דפנה. מחלץ דפוסי כתיבה ושומר אותם.
Args:
appeal_subtype: סינון לפי סוג ערר (building_permit / betterment_levy / compensation_197).
ריק = כל ההחלטות.
"""
from legal_mcp.services.style_analyzer import analyze_corpus
result = await analyze_corpus()
result = await analyze_corpus(appeal_subtype)
return json.dumps(result, ensure_ascii=False, indent=2)