Add CMPA (betterment levy) training support and update methodology
Support ingestion of betterment levy (היטל השבחה) decisions into a separate training corpus (CMPA).

Key changes:
- Add .doc file extraction via LibreOffice conversion in extractor
- Add practice_area/appeal_subtype columns to style_corpus table
- Route training files to cmp/ or cmpa/ subdirs based on appeal subtype
- Fix derive_subtype to handle ARAR-YY-NNNN format (was matching year digit)
- Expose practice_area/appeal_subtype params in MCP upload_training tool
- Add appeal_subtype filter to analyze_style for per-type style analysis
- Update betterment levy methodology in lessons.py: checklist (from generic to corpus-based), opening/closing strategies, and discussion rules

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -1,7 +1,8 @@
|
||||
"""Text extraction from PDF, DOCX, and RTF files.
|
||||
"""Text extraction from PDF, DOCX, DOC, and RTF files.
|
||||
|
||||
Primary PDF extraction: PyMuPDF direct text (for born-digital PDFs).
|
||||
Fallback: Google Cloud Vision OCR (for scanned documents).
|
||||
DOC files: converted to DOCX via LibreOffice before extraction.
|
||||
Post-processing: Hebrew abbreviation quote fixer.
|
||||
"""
|
||||
|
||||
@@ -10,6 +11,8 @@ from __future__ import annotations
|
||||
import asyncio
|
||||
import logging
|
||||
import re
|
||||
import subprocess
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
|
||||
import fitz # PyMuPDF
|
||||
@@ -129,6 +132,8 @@ async def extract_text(file_path: str) -> tuple[str, int]:
|
||||
return await _extract_pdf(path)
|
||||
elif suffix == ".docx":
|
||||
return _extract_docx(path), 0
|
||||
elif suffix == ".doc":
|
||||
return _extract_doc(path), 0
|
||||
elif suffix == ".rtf":
|
||||
return _extract_rtf(path), 0
|
||||
elif suffix in (".txt", ".md"):
|
||||
@@ -187,6 +192,21 @@ def _ocr_with_google_vision(image_bytes: bytes, page_num: int) -> str:
|
||||
return _fix_hebrew_quotes(text)
|
||||
|
||||
|
||||
def _extract_doc(path: Path) -> str:
    """Extract text from a legacy .doc file by converting it to .docx via LibreOffice.

    The conversion runs in a throwaway temporary directory; the resulting
    .docx is handed to ``_extract_docx`` before the directory is removed.

    Args:
        path: Location of the .doc file to convert.

    Returns:
        The text returned by ``_extract_docx`` for the converted document.

    Raises:
        FileNotFoundError: If no LibreOffice executable is on PATH, or if
            LibreOffice exited successfully but produced no output file.
        RuntimeError: If LibreOffice exits with a non-zero status.
        subprocess.TimeoutExpired: If the conversion exceeds 120 seconds.
    """
    # Local import so this block does not depend on a file-level import change.
    import shutil

    # Distributions ship the launcher under either name; try both.
    office_bin = shutil.which("libreoffice") or shutil.which("soffice")
    if office_bin is None:
        raise FileNotFoundError(
            "LibreOffice executable not found on PATH "
            "(tried 'libreoffice' and 'soffice')"
        )

    with tempfile.TemporaryDirectory() as tmp_dir:
        work_dir = Path(tmp_dir).resolve()
        out_dir = work_dir / "out"
        out_dir.mkdir()

        # A private user profile per invocation keeps concurrent headless
        # conversions (or a running desktop instance) from contending for the
        # shared default profile lock.
        profile_uri = (work_dir / "profile").as_uri()

        result = subprocess.run(
            [
                office_bin,
                f"-env:UserInstallation={profile_uri}",
                "--headless",
                "--convert-to", "docx",
                "--outdir", str(out_dir),
                # Absolute path: a relative name starting with "-" can never
                # be mistaken for a command-line option.
                str(path.resolve()),
            ],
            capture_output=True, text=True, timeout=120,
        )
        if result.returncode != 0:
            raise RuntimeError(f"LibreOffice conversion failed: {result.stderr}")

        # LibreOffice names the output after the input's stem.
        docx_path = out_dir / f"{path.stem}.docx"
        if not docx_path.exists():
            raise FileNotFoundError(f"Converted file not found: {docx_path}")
        return _extract_docx(docx_path)
|
||||
|
||||
|
||||
def _extract_docx(path: Path) -> str:
|
||||
"""Extract text from DOCX file."""
|
||||
doc = DocxDocument(str(path))
|
||||
|
||||
Reference in New Issue
Block a user