Add CMPA (betterment levy) training support and update methodology

Support ingestion of betterment levy (היטל השבחה) decisions into a separate training corpus (CMPA).

Key changes:
- Add .doc file extraction via LibreOffice conversion in extractor
- Add practice_area/appeal_subtype columns to style_corpus table
- Route training files to cmp/ or cmpa/ subdirs based on appeal subtype
- Fix derive_subtype to handle ARAR-YY-NNNN format (was matching year digit)
- Expose practice_area/appeal_subtype params in MCP upload_training tool
- Add appeal_subtype filter to analyze_style for per-type style analysis
- Update betterment levy methodology in lessons.py: checklist (from generic to corpus-based), opening/closing strategies, and discussion rules

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-15 14:00:35 +00:00
parent 684a4cfd3b
commit ba39707c70
8 changed files with 145 additions and 51 deletions
--- a/mcp-server/src/legal_mcp/services/extractor.py
+++ b/mcp-server/src/legal_mcp/services/extractor.py
@@ -1,7 +1,8 @@
-"""Text extraction from PDF, DOCX, and RTF files.
+"""Text extraction from PDF, DOCX, DOC, and RTF files.

 Primary PDF extraction: PyMuPDF direct text (for born-digital PDFs).
 Fallback: Google Cloud Vision OCR (for scanned documents).
+DOC files: converted to DOCX via LibreOffice before extraction.
 Post-processing: Hebrew abbreviation quote fixer.
 """

@@ -10,6 +11,8 @@ from __future__ import annotations
 import asyncio
 import logging
 import re
+import subprocess
+import tempfile
 from pathlib import Path

 import fitz  # PyMuPDF
@@ -129,6 +132,8 @@ async def extract_text(file_path: str) -> tuple[str, int]:
        return await _extract_pdf(path)
    elif suffix == ".docx":
        return _extract_docx(path), 0
+    elif suffix == ".doc":
+        return _extract_doc(path), 0
    elif suffix == ".rtf":
        return _extract_rtf(path), 0
    elif suffix in (".txt", ".md"):
@@ -187,6 +192,21 @@ def _ocr_with_google_vision(image_bytes: bytes, page_num: int) -> str:
    return _fix_hebrew_quotes(text)


+def _extract_doc(path: Path) -> str:
+    """Extract text from legacy .doc file by converting to .docx via LibreOffice."""
+    with tempfile.TemporaryDirectory() as tmp_dir:  # scratch dir for the converted file; removed when the block exits
+        result = subprocess.run(  # NOTE(review): synchronous call, reached from async extract_text — confirm blocking is acceptable
+            ["libreoffice", "--headless", "--convert-to", "docx", str(path), "--outdir", tmp_dir],
+            capture_output=True, text=True, timeout=120,  # timeout in seconds; subprocess.TimeoutExpired is not caught here
+        )
+        if result.returncode != 0:
+            raise RuntimeError(f"LibreOffice conversion failed: {result.stderr}")
+        docx_path = Path(tmp_dir) / f"{path.stem}.docx"  # expected output: input stem + ".docx" inside tmp_dir
+        if not docx_path.exists():  # covers a zero exit status with no output file present
+            raise FileNotFoundError(f"Converted file not found: {docx_path}")
+        return _extract_docx(docx_path)  # read while still inside the with-block, before tmp_dir is deleted
+
+
 def _extract_docx(path: Path) -> str:
    """Extract text from DOCX file."""
    doc = DocxDocument(str(path))