feat(ops): /operations — מוני-תור אחידים, "מה רץ עכשיו", וניהול-תהליכים

הדף הציג את התורים באופן לא-אחיד (by_status גולמי), בלי הבחנה בין "ממתין"
(בקלוג: status=pending) ל"בתור" (התור הפעיל: requested_at IS NOT NULL), בלי
הצגת הפריט שרץ כרגע, ובלי שום שליטה בתהליכים.

מה נוסף:
1. כרטיסי-תור אחידים — בתור / ממתין(בקלוג) / בעיבוד / הושלם / נכשל + "רץ עכשיו"
   (citation/case_number של הפריט בעיבוד) לכל drain (אחזור-פסיקה, מטא-דאטה,
   הלכות, יומונים). שערי-אנוש (אישור-הלכות, פסיקה-חסרה) נשארים מוני-סטטוס.
2. פאנל ניהול-תהליכים בסגנון "שירותי Windows":
   - דמון (court-fetch-service/xvfb/chat/reaper): הפעל-מחדש / עצור / הפעל.
   - cron drain: "הרץ עכשיו" (pm2 restart) + מתג הפעל/כבה תזמון.
3. כל תגי-הסטטוס מתורגמים לעברית.

מנגנון:
- הפעל/כבה תזמון = דגל ב-DB (טבלה drain_controls). pm2 cron_restart מחיה תהליך
  שנעצר ב-stop, לכן ה"כיבוי" האמין הוא דגל שכל drain בודק ב-startup (no-op מיידי
  כשכבוי). הקונטיינר כותב/קורא ישירות מ-DB.
- הרץ-עכשיו + restart/stop/start = proxy ל-pm2 דרך endpoint חדש בגשר-המארח
  (court_fetch_service /pm2/control), מאובטח Bearer + whitelist ל-legal-* בלבד.
- יומונים: drain_digests הועבר מ-crontab ל-pm2 (legal-digest-drain.config.cjs)
  כדי שיופיע ויהיה שליט כמו כל drain. drain_halacha_queue.py הובא לבקרת-גרסאות.

Invariants: מקיים G2 (הרחבת /operations + הגשר הקיים, לא מסלול מקביל) ו-G1
(drain_controls = מקור-אמת יחיד לכיבוי, נורמליזציה במקור ולא תיקון-בקריאה).
אין בליעת שגיאות שקטה (הגשר מחזיר {ok,error}; המוטציות מציגות toast).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-06-08 08:57:23 +00:00
parent 6647aa92e6
commit 638eef6803
11 changed files with 676 additions and 98 deletions

View File

@@ -9,6 +9,9 @@ Endpoints:
{ok, content_b64, filename, source_url, court, reason}
REQUIRES Authorization: Bearer <COURT_FETCH_SHARED_SECRET>.
GET /health liveness (no auth); reports camofox + VNC URL if available.
GET /pm2 read-only pm2 status of legal-* / paperclip services (no auth).
POST /pm2/control body {name, action: restart|stop|start} → run pm2 on a
whitelisted legal-* process. REQUIRES Bearer (mutating).
Run with pm2:
pm2 start scripts/legal-court-fetch-service.config.cjs
@@ -63,17 +66,38 @@ async def health(request: web.Request) -> web.Response:
# Name prefixes of the pm2 processes that GET /pm2 (pm2_status) reports;
# jlist entries whose name starts with none of these are filtered out.
_PM2_PREFIXES = ("legal-", "paperclip")
async def pm2_status(request: web.Request) -> web.Response:
"""Return a trimmed ``pm2 jlist`` for the legal-ai background services."""
def _trim_service(a: dict) -> dict:
"""Project a pm2 jlist app entry into the fields the dashboard needs."""
env = a.get("pm2_env", {}) or {}
return {
"name": a.get("name", ""),
"status": env.get("status", ""),
"restarts": env.get("restart_time", 0),
"uptime_ms": env.get("pm_uptime", 0),
"cpu": (a.get("monit") or {}).get("cpu", 0),
"memory_bytes": (a.get("monit") or {}).get("memory", 0),
"cron": env.get("cron_restart") or "",
"autorestart": env.get("autorestart", True),
}
async def _pm2_run(*args: str, timeout: float = 10) -> tuple[int, bytes, bytes]:
    """Run a pm2 subcommand and capture its output.

    Args:
        *args: Arguments passed to the ``pm2`` executable (e.g. ``"jlist"``).
        timeout: Seconds to wait for pm2 to finish.

    Returns:
        ``(returncode, stdout, stderr)``.

    Raises:
        FileNotFoundError: ``pm2`` is not on PATH.
        asyncio.TimeoutError: pm2 did not finish within ``timeout``. The child
            process is killed and reaped before the error propagates.
    """
    import asyncio as _asyncio

    proc = await _asyncio.create_subprocess_exec(
        "pm2", *args,
        stdout=_asyncio.subprocess.PIPE, stderr=_asyncio.subprocess.PIPE,
    )
    try:
        out, err = await _asyncio.wait_for(proc.communicate(), timeout=timeout)
    except _asyncio.TimeoutError:
        # wait_for only cancels communicate(); the pm2 child itself would keep
        # running (and never be reaped) unless we terminate it explicitly.
        if proc.returncode is None:
            try:
                proc.kill()
            except ProcessLookupError:
                pass  # exited between the timeout and the kill
            await proc.wait()
        raise
    return proc.returncode or 0, out, err
async def pm2_status(request: web.Request) -> web.Response:
"""Return a trimmed ``pm2 jlist`` for the legal-ai background services."""
try:
proc = await _asyncio.create_subprocess_exec(
"pm2", "jlist",
stdout=_asyncio.subprocess.PIPE, stderr=_asyncio.subprocess.PIPE,
)
out, err = await _asyncio.wait_for(proc.communicate(), timeout=10)
if proc.returncode != 0:
rc, out, err = await _pm2_run("jlist")
if rc != 0:
return web.json_response(
{"error": f"pm2 jlist failed: {err.decode('utf-8','replace')[:200]}"},
status=502,
@@ -84,26 +108,65 @@ async def pm2_status(request: web.Request) -> web.Response:
except Exception as e: # never throw
return web.json_response({"error": f"pm2 error: {e}"}, status=502)
services = []
for a in apps:
name = a.get("name", "")
if not any(name.startswith(p) for p in _PM2_PREFIXES):
continue
env = a.get("pm2_env", {}) or {}
services.append({
"name": name,
"status": env.get("status", ""),
"restarts": env.get("restart_time", 0),
"uptime_ms": env.get("pm_uptime", 0),
"cpu": (a.get("monit") or {}).get("cpu", 0),
"memory_bytes": (a.get("monit") or {}).get("memory", 0),
"cron": env.get("cron_restart") or "",
"autorestart": env.get("autorestart", True),
})
services = [
_trim_service(a) for a in apps
if any(str(a.get("name", "")).startswith(p) for p in _PM2_PREFIXES)
]
services.sort(key=lambda s: s["name"])
return web.json_response({"services": services})
# Process control (restart/stop/start) for the dashboard's "Windows-services"
# panel. Mutating, so it requires the Bearer secret (unlike read-only /pm2).
# Whitelisted to ``legal-`` names only — never paperclip or arbitrary processes.
# The set below is the complete list of pm2 subcommands pm2_control accepts;
# any other ``action`` value is rejected with HTTP 400.
_PM2_ACTIONS = {"restart", "stop", "start"}
async def pm2_control(request: web.Request) -> web.Response:
    """Run ``pm2 <action> <name>`` for a whitelisted legal-* process.

    Requires a valid Bearer header (checked by ``_check_bearer``) and a JSON
    object body ``{"name": ..., "action": ...}``.

    Responses:
        200: ``{"ok": True, "action", "service"}``. ``service`` is the trimmed
            post-action ``pm2 jlist`` entry, or ``None`` when the process state
            could not be re-read.
        400: invalid JSON, a non-object JSON body, or an unknown action.
        403: ``name`` does not start with ``legal-``.
        502: pm2 exited non-zero, is missing from PATH, or raised.

    Every error produced by this handler carries ``"ok": False`` next to
    ``"error"``. The handler never raises.
    """
    unauth = _check_bearer(request)
    if unauth is not None:
        return unauth
    try:
        body = await request.json()
    except ValueError:
        # json.JSONDecodeError and UnicodeDecodeError are both ValueErrors.
        return web.json_response(
            {"ok": False, "error": "invalid JSON body"}, status=400
        )
    if not isinstance(body, dict):
        # Valid JSON such as ``[]`` or ``null`` has no .get(); reject it here
        # instead of failing with an unhandled AttributeError.
        return web.json_response(
            {"ok": False, "error": "JSON body must be an object"}, status=400
        )
    name = str(body.get("name", "")).strip()
    action = str(body.get("action", "")).strip()
    if action not in _PM2_ACTIONS:
        return web.json_response(
            {"ok": False,
             "error": f"action must be one of {sorted(_PM2_ACTIONS)}"},
            status=400,
        )
    if not name.startswith("legal-"):
        return web.json_response(
            {"ok": False, "error": "name must be a legal-* process"}, status=403
        )
    try:
        rc, _out, err = await _pm2_run(action, name, "--silent", timeout=30)
    except FileNotFoundError:
        return web.json_response(
            {"ok": False, "error": "pm2 not found on PATH"}, status=502
        )
    except Exception as e:  # never throw
        # Some exceptions (e.g. a timeout) stringify to ""; fall back to the
        # class name so the dashboard never shows an empty error.
        return web.json_response(
            {"ok": False, "error": f"pm2 error: {str(e) or type(e).__name__}"},
            status=502,
        )
    if rc != 0:
        return web.json_response(
            {"ok": False,
             "error": f"pm2 {action} {name} failed: "
                      f"{err.decode('utf-8','replace')[:200]}"},
            status=502,
        )
    # Re-read just this process so the UI settles on the real new state.
    svc = None
    try:
        rc2, out2, _ = await _pm2_run("jlist")
        if rc2 == 0:
            for a in json.loads(out2.decode("utf-8", "replace")):
                if a.get("name") == name:
                    svc = _trim_service(a)
                    break
    except Exception:
        # The action itself already succeeded; a failed status re-read must
        # not be reported as a failed action. The caller sees service=None.
        svc = None
    return web.json_response({"ok": True, "action": action, "service": svc})
def _check_bearer(request: web.Request) -> web.Response | None:
auth = request.headers.get("Authorization", "")
expected = "Bearer " + _SHARED_SECRET
@@ -156,6 +219,7 @@ def build_app() -> web.Application:
app = web.Application(client_max_size=64 * 1024 * 1024)
app.router.add_get("/health", health)
app.router.add_get("/pm2", pm2_status)
app.router.add_post("/pm2/control", pm2_control)
app.router.add_post("/fetch", fetch)
return app

View File

@@ -1401,6 +1401,21 @@ UPDATE digests SET digest_kind =
CREATE INDEX IF NOT EXISTS idx_digests_kind ON digests(digest_kind);
"""
# Schema v33: creates the drain_controls table (one row per drain, keyed by the
# pm2 process name) that holds the per-drain ``disabled`` flag. Uses
# CREATE TABLE IF NOT EXISTS; executed from _run_schema_migrations.
SCHEMA_V33_SQL = """
-- drain_controls: a per-drain "startup type" switch for the /operations
-- dashboard's process-management panel. pm2 cron_restart resurrects a stopped
-- cron job at the next tick, so `pm2 stop` is NOT a durable "disable" for the
-- drains. Instead each drain checks this flag at startup and no-ops when
-- disabled (like a Windows service set to Disabled). The container writes it
-- directly (no host roundtrip); the drains read it. name = the pm2 process
-- name (e.g. 'legal-metadata-drain').
CREATE TABLE IF NOT EXISTS drain_controls (
name TEXT PRIMARY KEY,
disabled BOOLEAN NOT NULL DEFAULT false,
updated_at TIMESTAMPTZ NOT NULL DEFAULT now()
);
"""
async def _run_schema_migrations(pool: asyncpg.Pool) -> None:
async with pool.acquire() as conn:
@@ -1437,7 +1452,8 @@ async def _run_schema_migrations(pool: asyncpg.Pool) -> None:
await conn.execute(SCHEMA_V30_SQL)
await conn.execute(SCHEMA_V31_SQL)
await conn.execute(SCHEMA_V32_SQL)
logger.info("Database schema initialized (v1-v32)")
await conn.execute(SCHEMA_V33_SQL)
logger.info("Database schema initialized (v1-v33)")
async def init_schema() -> None:
@@ -6144,3 +6160,34 @@ async def court_fetch_job_list(status: str | None = None, limit: int = 100) -> l
limit,
)
return [_row_to_court_fetch_job(r) for r in rows]
# ── Drain controls (/operations process-management panel) ──────────────────
async def is_drain_disabled(name: str) -> bool:
    """Report whether the drain called ``name`` is currently switched off.

    Drains call this at startup. Any falsy lookup result (including no value
    at all) is reported as ``False``, i.e. the drain is treated as enabled.
    """
    lookup_sql = "SELECT disabled FROM drain_controls WHERE name = $1"
    db_pool = await get_pool()
    async with db_pool.acquire() as connection:
        flag = await connection.fetchval(lookup_sql, name)
    return True if flag else False
async def set_drain_disabled(name: str, disabled: bool) -> None:
    """Persist the on/off switch for a drain, creating its row on first use.

    ``name`` is the pm2 process name of the drain. An existing row has its
    ``disabled`` flag and ``updated_at`` timestamp overwritten.
    """
    upsert_sql = (
        "INSERT INTO drain_controls (name, disabled, updated_at) "
        "VALUES ($1, $2, now()) "
        "ON CONFLICT (name) DO UPDATE SET disabled = $2, updated_at = now()"
    )
    db_pool = await get_pool()
    async with db_pool.acquire() as connection:
        await connection.execute(upsert_sql, name, disabled)
async def get_drain_controls() -> dict[str, bool]:
    """Return every stored drain switch as ``{drain name: disabled flag}``.

    Only drains that have a row in ``drain_controls`` (i.e. were toggled at
    least once) appear in the result.
    """
    db_pool = await get_pool()
    async with db_pool.acquire() as connection:
        records = await connection.fetch("SELECT name, disabled FROM drain_controls")
    controls: dict[str, bool] = {}
    for record in records:
        controls[record["name"]] = bool(record["disabled"])
    return controls