"""Local document classifier — rule-based, no API calls. Classifies legal documents by filename patterns and content keywords. Falls back to Claude Code headless (`claude -p`) for ambiguous cases. """ from __future__ import annotations import json import logging import re import subprocess from pathlib import Path logger = logging.getLogger(__name__) # ── Filename patterns (checked in order, first match wins) ──────── _FILENAME_RULES: list[tuple[str, str, float]] = [ # (regex pattern on filename, doc_type, confidence) (r"כתב.ערר|כתב-ערר", "appeal", 1.0), (r"תשובה|תשובת|תגובה|תגובת|השלמת.טיעון|בקשה.להשלמת|הודעת.עמדה", "response", 1.0), (r"פרוטוקול", "protocol", 1.0), (r"החלטת?.ביניים|החלטה.לתיקון", "decision", 0.95), (r"הוראות.תכנית|תכנית", "plan", 1.0), (r"היתר", "permit", 1.0), (r"שומה|חוו.ת.דעת", "appraisal", 1.0), (r"התנגדות", "objection", 1.0), # Court decisions: case number patterns (r"(?:עעם|עע.?מ|עתמ|עת.?מ|בג.?צ|בבנ|עא|ע.?א|רעא|רע.?א|עעמ|עתמ)", "court_decision", 1.0), # ערר + number that's NOT part of our case files (i.e. precedent references) (r"^ערר.?\d", "court_decision", 0.9), ] # ── Content patterns (first 500 chars) ─────────────────────────── _CONTENT_RULES: list[tuple[str, str, float]] = [ (r"בפני\s+ועדת\s+הערר|לפנינו\s+ערר|ניתנה?\s+היום", "decision", 0.85), (r"כתב\s+ערר|העורר.{0,20}מגיש", "appeal", 0.85), (r"כתב\s+תשובה|המשיב.{0,20}משיב", "response", 0.85), (r"פרוטוקול\s+(?:דיון|ישיבה|ועדה)", "protocol", 0.9), (r"בית\s+(?:ה)?משפט|פסק\s+דין|השופט", "court_decision", 0.85), (r"הוראות\s+(?:ה)?תכנית|תב.עה|ייעוד\s+הקרקע", "plan", 0.8), ] def classify(filename: str, text: str = "") -> tuple[str, float]: """Classify a legal document by filename and content. Returns (doc_type, confidence). Confidence > 0.8 means high certainty. """ name = Path(filename).stem # Try filename rules for pattern, doc_type, confidence in _FILENAME_RULES: if re.search(pattern, name): logger.info("Local classifier: '%s' → %s (filename, %.2f)", name, doc_type, confidence) return doc_type, confidence # Try content rules (first 500 chars) snippet = text[:500] if text else "" for pattern, doc_type, confidence in _CONTENT_RULES: if re.search(pattern, snippet): logger.info("Local classifier: '%s' → %s (content, %.2f)", name, doc_type, confidence) return doc_type, confidence logger.info("Local classifier: '%s' → reference (no match, 0.3)", name) return "reference", 0.3 def classify_with_claude_code(filename: str, text: str) -> tuple[str, float]: """Fallback: use Claude Code headless to classify ambiguous documents. Only works when `claude` CLI is available (not in Docker). """ prompt = ( "סווג את המסמך המשפטי הבא לאחת הקטגוריות הבאות בלבד:\n" "appeal, response, protocol, decision, plan, permit, appraisal, " "court_decision, exhibit, objection, reference\n\n" f"שם הקובץ: {filename}\n" f"תחילת המסמך:\n{text[:500]}\n\n" 'החזר JSON בלבד: {"doc_type": "...", "confidence": 0.9}' ) try: result = subprocess.run( ["claude", "-p", prompt, "--output-format", "json", "--max-turns", "1"], capture_output=True, text=True, timeout=60, ) if result.returncode == 0 and result.stdout.strip(): data = json.loads(result.stdout) # claude -p --output-format json wraps in {"result": "..."} inner = data.get("result", data) if isinstance(inner, str): inner = json.loads(inner) doc_type = inner.get("doc_type", "reference") confidence = float(inner.get("confidence", 0.7)) logger.info("Claude Code classifier: '%s' → %s (%.2f)", filename, doc_type, confidence) return doc_type, confidence except FileNotFoundError: logger.debug("Claude CLI not available — skipping headless fallback") except (subprocess.TimeoutExpired, json.JSONDecodeError, Exception) as e: logger.warning("Claude Code classifier failed: %s", e) return "reference", 0.3