Files
legal-ai/mcp-server/src/legal_mcp/services/local_classifier.py
Chaim 3f759d3610 Improve document processing pipeline and agent workflows
- Add delete_document_chunks for reprocessing, save extracted text to disk
- Expand case directory structure (original/extracted/proofread/backup)
- Update classifier patterns (תגובה, הודעת עמדה)
- Fix proofreader agent paths for new directory layout
- Update HEARTBEAT to notify on every task completion
- Improve bidi_table with LRE/PDF directional embedding
- Add Paperclip project verification and auto-close setup issue
- Add auto-sync-cases.sh for Gitea synchronization

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-09 16:45:49 +00:00

106 lines
4.6 KiB
Python

"""Local document classifier — rule-based, no API calls.
Classifies legal documents by filename patterns and content keywords.
Falls back to Claude Code headless (`claude -p`) for ambiguous cases.
"""
from __future__ import annotations
import json
import logging
import re
import subprocess
from pathlib import Path
logger = logging.getLogger(__name__)
# ── Filename patterns (checked in order, first match wins) ────────
# Each entry is (regex searched in the filename stem, doc_type, confidence).
# A bare "." between Hebrew words stands in for whatever separator the file
# name happens to use (space, hyphen, underscore, ...).
_FILENAME_RULES: list[tuple[str, str, float]] = [
    (r"כתב.ערר|כתב-ערר", "appeal", 1.0),
    (r"תשובה|תשובת|תגובה|תגובת|השלמת.טיעון|בקשה.להשלמת|הודעת.עמדה", "response", 1.0),
    (r"פרוטוקול", "protocol", 1.0),
    (r"החלטת?.ביניים|החלטה.לתיקון", "decision", 0.95),
    (r"הוראות.תכנית|תכנית", "plan", 1.0),
    (r"היתר", "permit", 1.0),
    # "חוו?.?ת" accepts both spellings of the first word (חוות / חות) written
    # directly, plus the looser "חוו<any char>ת" form the rule used to require
    # (which on its own could not match the plain phrase "חוות דעת").
    (r"שומה|חוו?.?ת.דעת", "appraisal", 1.0),
    (r"התנגדות", "objection", 1.0),
    # Court decisions: case-type abbreviations, with or without a separator
    # (e.g. gershayim) before the last letter. The last letter is accepted in
    # both its regular and final form, since citations are commonly written
    # with the final form (בג"ץ, עע"ם, עת"ם).
    (r"(?:עע.?[מם]|עת.?[מם]|בג.?[צץ]|בבנ|ע.?א|רע.?א)", "court_decision", 1.0),
    # File names that START with "ערר" followed (optionally after one
    # character) by a digit. NOTE(review): intended for precedent references
    # rather than our own case files — the regex itself cannot tell the two
    # apart, hence the lower confidence.
    (r"^ערר.?\d", "court_decision", 0.9),
]
# ── Content patterns (first 500 chars) ───────────────────────────
# Each entry is (regex searched in the opening text, doc_type, confidence).
# Checked in order; the first matching rule wins.
_CONTENT_RULES: list[tuple[str, str, float]] = [
    # "נית(?:ן|נה?)" covers both "ניתן היום" (final nun) and "ניתנה היום";
    # a plain "ניתנה?" can never match the final-nun spelling.
    (r"בפני\s+ועדת\s+הערר|לפנינו\s+ערר|נית(?:ן|נה?)\s+היום", "decision", 0.85),
    (r"כתב\s+ערר|העורר.{0,20}מגיש", "appeal", 0.85),
    (r"כתב\s+תשובה|המשיב.{0,20}משיב", "response", 0.85),
    (r"פרוטוקול\s+(?:דיון|ישיבה|ועדה)", "protocol", 0.9),
    (r"בית\s+(?:ה)?משפט|פסק\s+דין|השופט", "court_decision", 0.85),
    # The middle alternative targets the planning abbreviation written with a
    # quote mark between "תב" and "ע": ASCII double quote (\x22), Hebrew
    # gershayim (\u05f4) or a right curly quote (\u201d). The earlier
    # wildcard form "תב.עה" required a trailing "ה" and therefore matched
    # "תביעה" (a claim) instead of the abbreviation.
    # NOTE(review): confirm no caller relied on "תביעה" being tagged "plan".
    (r"הוראות\s+(?:ה)?תכנית|תב[\x22\u05f4\u201d]ע|ייעוד\s+הקרקע", "plan", 0.8),
]
def classify(filename: str, text: str = "") -> tuple[str, float]:
    """Classify a legal document by its filename, then by its opening text.

    The filename stem is searched against ``_FILENAME_RULES`` first. If no
    filename rule matches, the first 500 characters of *text* are searched
    against ``_CONTENT_RULES``. Within each table the first matching rule wins.

    Args:
        filename: File name or path; only the stem (directory and final
            suffix removed) is inspected.
        text: Document text. When empty, only the filename rules can match.

    Returns:
        ``(doc_type, confidence)``. Confidence > 0.8 means high certainty.
        When nothing matches the result is ``("reference", 0.3)``.
    """
    name = Path(filename).stem
    snippet = text[:500] if text else ""

    # Filename rules take priority over content rules.
    for source, rules, haystack in (
        ("filename", _FILENAME_RULES, name),
        ("content", _CONTENT_RULES, snippet),
    ):
        for pattern, doc_type, confidence in rules:
            if re.search(pattern, haystack):
                # Same "'name' → type" layout as the no-match message below.
                logger.info(
                    "Local classifier: '%s' → %s (%s, %.2f)",
                    name, doc_type, source, confidence,
                )
                return doc_type, confidence

    logger.info("Local classifier: '%s' → reference (no match, 0.3)", name)
    return "reference", 0.3
def classify_with_claude_code(filename: str, text: str) -> tuple[str, float]:
    """Fallback: use Claude Code headless to classify ambiguous documents.

    Runs ``claude -p`` with a prompt containing the file name and the first
    500 characters of *text*, and parses the JSON reply.
    Only works when `claude` CLI is available (not in Docker).

    Args:
        filename: File name shown to the model.
        text: Document text; only the first 500 characters are sent.

    Returns:
        ``(doc_type, confidence)`` as reported by the model, with confidence
        bounded to ``[0.0, 1.0]``. Returns ``("reference", 0.3)`` when the CLI
        is missing, exits non-zero, times out, produces no parsable reply, or
        names a category outside the list given in the prompt. This function
        is best-effort and does not raise.
    """
    fallback = ("reference", 0.3)
    # One tuple feeds both the prompt and the reply validation so the two
    # cannot drift apart.
    allowed_types = (
        "appeal", "response", "protocol", "decision", "plan", "permit",
        "appraisal", "court_decision", "exhibit", "objection", "reference",
    )
    categories = ", ".join(allowed_types)
    prompt = (
        "סווג את המסמך המשפטי הבא לאחת הקטגוריות הבאות בלבד:\n"
        f"{categories}\n\n"
        f"שם הקובץ: {filename}\n"
        f"תחילת המסמך:\n{(text or '')[:500]}\n\n"
        'החזר JSON בלבד: {"doc_type": "...", "confidence": 0.9}'
    )
    try:
        result = subprocess.run(
            ["claude", "-p", prompt, "--output-format", "json", "--max-turns", "1"],
            # Decode explicitly as UTF-8 so Hebrew output does not depend on
            # the process locale.
            capture_output=True, text=True, encoding="utf-8", timeout=60,
        )
        if result.returncode != 0 or not result.stdout.strip():
            logger.warning(
                "Claude Code classifier failed: exit code %s, %d chars of output",
                result.returncode, len(result.stdout),
            )
            return fallback

        data = json.loads(result.stdout)
        # claude -p --output-format json wraps in {"result": "..."}
        inner = data.get("result", data)
        if isinstance(inner, str):
            # The reply may surround the JSON object with prose or a code
            # fence; parse only the outermost {...} span when one is present.
            found = re.search(r"\{.*\}", inner, re.DOTALL)
            inner = json.loads(found.group(0) if found else inner)

        doc_type = inner.get("doc_type", "reference")
        confidence = float(inner.get("confidence", 0.7))
        if doc_type not in allowed_types:
            logger.warning(
                "Claude Code classifier failed: unexpected doc_type %r", doc_type
            )
            return fallback
        confidence = min(max(confidence, 0.0), 1.0)
        logger.info(
            "Claude Code classifier: '%s' → %s (%.2f)", filename, doc_type, confidence
        )
        return doc_type, confidence
    except FileNotFoundError:
        logger.debug("Claude CLI not available — skipping headless fallback")
    except Exception as e:  # best-effort boundary: timeouts, bad JSON, bad shapes
        logger.warning("Claude Code classifier failed: %s", e)
    return fallback