- Add delete_document_chunks for reprocessing, save extracted text to disk - Expand case directory structure (original/extracted/proofread/backup) - Update classifier patterns (תגובה, הודעת עמדה) - Fix proofreader agent paths for new directory layout - Update HEARTBEAT to notify on every task completion - Improve bidi_table with LRE/PDF directional embedding - Add Paperclip project verification and auto-close setup issue - Add auto-sync-cases.sh for Gitea synchronization Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
106 lines
4.6 KiB
Python
106 lines
4.6 KiB
Python
"""Local document classifier — rule-based, no API calls.

Classifies legal documents by filename patterns and content keywords.
Falls back to Claude Code headless (`claude -p`) for ambiguous cases.
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
import logging
|
|
import re
|
|
import subprocess
|
|
from pathlib import Path
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
# ── Filename patterns (checked in order, first match wins) ────────
|
|
|
|
# Ordered rule table applied to the filename stem: the first pattern that
# matches decides the document type, so more specific rules must stay above
# more general ones.
# Each entry is (regex pattern on filename, doc_type, confidence).
_FILENAME_RULES: list[tuple[str, str, float]] = [
    # Statement of appeal
    (r"כתב.ערר|כתב-ערר", "appeal", 1.0),
    # Replies, responses, supplementary arguments and position notices
    (r"תשובה|תשובת|תגובה|תגובת|השלמת.טיעון|בקשה.להשלמת|הודעת.עמדה", "response", 1.0),
    # Hearing protocols
    (r"פרוטוקול", "protocol", 1.0),
    # Interim / correcting decisions (slightly lower confidence than the rest)
    (r"החלטת?.ביניים|החלטה.לתיקון", "decision", 0.95),
    # Plans and plan provisions
    (r"הוראות.תכנית|תכנית", "plan", 1.0),
    # Permits
    (r"היתר", "permit", 1.0),
    # Assessments and expert opinions
    (r"שומה|חוו.ת.דעת", "appraisal", 1.0),
    # Objections
    (r"התנגדות", "objection", 1.0),
    # Court decisions: case number patterns.
    # The High Court acronym is normally spelled with a final tsadi (בג"ץ),
    # so both letter forms are accepted; `.?` absorbs the optional
    # quote/gershayim between the letters.
    (r"(?:עעם|עע.?מ|עתמ|עת.?מ|בג.?[צץ]|בבנ|עא|ע.?א|רעא|רע.?א)", "court_decision", 1.0),
    # ערר + number that's NOT part of our case files (i.e. precedent references)
    (r"^ערר.?\d", "court_decision", 0.9),
]
|
|
|
|
# ── Content patterns (first 500 chars) ───────────────────────────
|
|
|
|
# Ordered rule table applied to the opening snippet of the document text when
# no filename rule matched. Each entry is (regex pattern, doc_type, confidence);
# confidences are lower than the filename rules because body text is noisier.
_CONTENT_RULES: list[tuple[str, str, float]] = [
    # Appeal-committee decision openers / "given today" closing formula
    (
        r"בפני\s+ועדת\s+הערר|לפנינו\s+ערר|ניתנה?\s+היום",
        "decision",
        0.85,
    ),
    # Statement of appeal, or the appellant "submits" within 20 characters
    (
        r"כתב\s+ערר|העורר.{0,20}מגיש",
        "appeal",
        0.85,
    ),
    # Statement of reply, or the respondent "replies" within 20 characters
    (
        r"כתב\s+תשובה|המשיב.{0,20}משיב",
        "response",
        0.85,
    ),
    # Protocol of a hearing / session / committee
    (
        r"פרוטוקול\s+(?:דיון|ישיבה|ועדה)",
        "protocol",
        0.9,
    ),
    # Court, judgment or judge mentioned
    (
        r"בית\s+(?:ה)?משפט|פסק\s+דין|השופט",
        "court_decision",
        0.85,
    ),
    # Plan provisions / land designation.
    # NOTE(review): `תב.עה` requires a trailing ה after the acronym — confirm
    # this is intended rather than a bare `תב.ע`.
    (
        r"הוראות\s+(?:ה)?תכנית|תב.עה|ייעוד\s+הקרקע",
        "plan",
        0.8,
    ),
]
|
|
|
|
|
|
def classify(filename: str, text: str = "") -> tuple[str, float]:
    """Determine the type of a legal document without any API call.

    The filename stem is tested against ``_FILENAME_RULES`` first; if nothing
    matches, the first 500 characters of *text* are tested against
    ``_CONTENT_RULES``. In both tables the first matching rule wins.

    Returns (doc_type, confidence). Confidence > 0.8 means high certainty.
    When no rule matches the result is ``("reference", 0.3)``.
    """
    stem = Path(filename).stem

    # Pass 1: filename rules
    by_name = next(
        (
            (kind, score)
            for regex, kind, score in _FILENAME_RULES
            if re.search(regex, stem)
        ),
        None,
    )
    if by_name is not None:
        logger.info("Local classifier: '%s' → %s (filename, %.2f)", stem, *by_name)
        return by_name

    # Pass 2: content rules, limited to the opening 500 characters
    head = (text or "")[:500]
    by_content = next(
        (
            (kind, score)
            for regex, kind, score in _CONTENT_RULES
            if re.search(regex, head)
        ),
        None,
    )
    if by_content is not None:
        logger.info("Local classifier: '%s' → %s (content, %.2f)", stem, *by_content)
        return by_content

    # Nothing matched: low-confidence generic label
    logger.info("Local classifier: '%s' → reference (no match, 0.3)", stem)
    return "reference", 0.3
|
|
|
|
|
|
# Labels the headless prompt allows the model to answer with.
_CLAUDE_DOC_TYPES = frozenset({
    "appeal", "response", "protocol", "decision", "plan", "permit",
    "appraisal", "court_decision", "exhibit", "objection", "reference",
})


def _extract_json_object(reply: str) -> dict:
    """Return the first JSON object contained in *reply*, or ``{}`` if none."""
    try:
        parsed = json.loads(reply)
    except json.JSONDecodeError:
        parsed = None
    if isinstance(parsed, dict):
        return parsed

    # The answer may be wrapped in a markdown fence or surrounded by prose:
    # try decoding from each "{" until one yields a complete object.
    decoder = json.JSONDecoder()
    start = reply.find("{")
    while start != -1:
        try:
            found, _ = decoder.raw_decode(reply, start)
        except json.JSONDecodeError:
            start = reply.find("{", start + 1)
        else:
            return found
    return {}


def _parse_claude_reply(stdout: str) -> tuple[str, float]:
    """Turn the CLI's stdout into a validated (doc_type, confidence) pair.

    Raises ValueError/TypeError when the reply carries no usable answer.
    """
    envelope = _extract_json_object(stdout)
    # claude -p --output-format json wraps in {"result": "..."}
    answer = envelope.get("result", envelope)
    if isinstance(answer, str):
        answer = _extract_json_object(answer)
    if not isinstance(answer, dict):
        raise ValueError("no JSON object in Claude reply")

    doc_type = str(answer.get("doc_type", "")).strip().lower()
    if doc_type not in _CLAUDE_DOC_TYPES:
        raise ValueError(f"unexpected doc_type {doc_type!r}")

    confidence = float(answer.get("confidence", 0.7))
    # Clamp into [0, 1]; with this argument order a NaN collapses to 0.0.
    return doc_type, min(1.0, max(0.0, confidence))


def classify_with_claude_code(filename: str, text: str) -> tuple[str, float]:
    """Fallback: use Claude Code headless to classify ambiguous documents.

    Sends the filename and the first 500 characters of *text* to
    ``claude -p`` and expects a JSON answer with ``doc_type`` and
    ``confidence``. Only works when `claude` CLI is available (not in Docker).

    Returns (doc_type, confidence). ``doc_type`` is always one of the
    categories named in the prompt and ``confidence`` lies in [0, 1].
    Any failure (CLI missing, timeout, non-zero exit, unparseable or invalid
    answer) yields ``("reference", 0.3)``; this function never raises.
    """
    fallback = ("reference", 0.3)
    snippet = (text or "")[:500]  # tolerate a missing/empty text
    prompt = (
        "סווג את המסמך המשפטי הבא לאחת הקטגוריות הבאות בלבד:\n"
        "appeal, response, protocol, decision, plan, permit, appraisal, "
        "court_decision, exhibit, objection, reference\n\n"
        f"שם הקובץ: {filename}\n"
        f"תחילת המסמך:\n{snippet}\n\n"
        'החזר JSON בלבד: {"doc_type": "...", "confidence": 0.9}'
    )

    try:
        result = subprocess.run(
            ["claude", "-p", prompt, "--output-format", "json", "--max-turns", "1"],
            capture_output=True,
            text=True,
            encoding="utf-8",  # Hebrew output must not depend on the locale
            timeout=60,
        )
    except FileNotFoundError:
        logger.debug("Claude CLI not available — skipping headless fallback")
        return fallback
    except Exception as e:  # best-effort fallback: never propagate to the caller
        logger.warning("Claude Code classifier failed: %s", e)
        return fallback

    if result.returncode != 0 or not result.stdout.strip():
        logger.warning(
            "Claude Code classifier failed: exit code %s, no usable output",
            result.returncode,
        )
        return fallback

    try:
        doc_type, confidence = _parse_claude_reply(result.stdout)
    except (TypeError, ValueError) as e:
        logger.warning("Claude Code classifier failed: %s", e)
        return fallback

    logger.info("Claude Code classifier: '%s' → %s (%.2f)", filename, doc_type, confidence)
    return doc_type, confidence
|