Add CMPA (betterment levy) training support and update methodology

Support ingestion of betterment levy (היטל השבחה) decisions into a
separate training corpus (CMPA). Key changes:

- Add .doc file extraction via LibreOffice conversion in extractor
- Add practice_area/appeal_subtype columns to style_corpus table
- Route training files to cmp/ or cmpa/ subdirs based on appeal subtype
- Fix derive_subtype to handle ARAR-YY-NNNN format (was matching year digit)
- Expose practice_area/appeal_subtype params in MCP upload_training tool
- Add appeal_subtype filter to analyze_style for per-type style analysis
- Update betterment levy methodology in lessons.py: checklist (from generic
  to corpus-based), opening/closing strategies, and discussion rules

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-04-15 14:00:35 +00:00
parent 684a4cfd3b
commit ba39707c70
8 changed files with 145 additions and 51 deletions

View File

@@ -1,7 +1,8 @@
"""Text extraction from PDF, DOCX, and RTF files.
"""Text extraction from PDF, DOCX, DOC, and RTF files.
Primary PDF extraction: PyMuPDF direct text (for born-digital PDFs).
Fallback: Google Cloud Vision OCR (for scanned documents).
DOC files: converted to DOCX via LibreOffice before extraction.
Post-processing: Hebrew abbreviation quote fixer.
"""
@@ -10,6 +11,8 @@ from __future__ import annotations
import asyncio
import logging
import re
import subprocess
import tempfile
from pathlib import Path
import fitz # PyMuPDF
@@ -129,6 +132,8 @@ async def extract_text(file_path: str) -> tuple[str, int]:
return await _extract_pdf(path)
elif suffix == ".docx":
return _extract_docx(path), 0
elif suffix == ".doc":
return _extract_doc(path), 0
elif suffix == ".rtf":
return _extract_rtf(path), 0
elif suffix in (".txt", ".md"):
@@ -187,6 +192,21 @@ def _ocr_with_google_vision(image_bytes: bytes, page_num: int) -> str:
return _fix_hebrew_quotes(text)
def _extract_doc(path: Path) -> str:
"""Extract text from legacy .doc file by converting to .docx via LibreOffice."""
with tempfile.TemporaryDirectory() as tmp_dir:
result = subprocess.run(
["libreoffice", "--headless", "--convert-to", "docx", str(path), "--outdir", tmp_dir],
capture_output=True, text=True, timeout=120,
)
if result.returncode != 0:
raise RuntimeError(f"LibreOffice conversion failed: {result.stderr}")
docx_path = Path(tmp_dir) / f"{path.stem}.docx"
if not docx_path.exists():
raise FileNotFoundError(f"Converted file not found: {docx_path}")
return _extract_docx(docx_path)
def _extract_docx(path: Path) -> str:
"""Extract text from DOCX file."""
doc = DocxDocument(str(path))