feat(operations): מסך "סוכנים פעילים" + ניהול ריצות (live-runs/log/cancel) (G12/X15, #119)
All checks were successful
G12 Leak-Guard / leak-guard (pull_request) Successful in 6s
All checks were successful
G12 Leak-Guard / leak-guard (pull_request) Successful in 6s
פאנל ב-/operations שמראה אילו סוכני Paperclip עובדים כעת (רצים+בתור), הפלט החי
שלהם, ושליטה מבוקרת: עצירת ריצה, איפוס session. סוגר את הנקודה-העיוורת שבה drain
מונע-סוכן (למשל ריקון תור הלכות ע"י ה-CEO heartbeat) עוקף את בקרת /operations
שמכירה רק שירותי pm2, והפלט הגולמי נגיש רק ב-Paperclip UI.
מקור-נתונים: Paperclip heartbeat-runs API (אומת חי):
GET /api/companies/{cid}/live-runs — רצים+בתור (agentName/status/issue/outputSilence)
GET /api/heartbeat-runs/{id}/log — NDJSON של פלט הסוכן
GET /api/heartbeat-runs/{id}/events — timeline
POST /api/heartbeat-runs/{id}/cancel — עצירה מבוקרת (לא kill — מכבד watchdog+checkpoint)
POST /api/agents/{id}/runtime-state/reset-session
ארכיטקטורה (G12/INV-PORT1): כל המגע החדש עם Paperclip דרך השער בלבד —
web/paperclip_client.py (shell) → re-export ב-web/agent_platform_port.py →
web/app.py צורך מהשער. leak_guard.py עובר (seam שלם). אסור kill ישיר על
process_pid (עוקף את השער).
Backend:
- paperclip_client: list_live_runs / get_run_log / get_run_events / cancel_run / reset_agent_session
- agent_platform_port: re-export pc_list_live_runs / pc_get_run_log / pc_get_run_events / pc_cancel_run / pc_reset_agent_session
- app.py: GET /api/operations/agents (אגרגציה CMP+CMPA, עמיד לכשל-חברה),
GET .../runs/{id}/log, GET .../runs/{id}/events, POST .../runs/{id}/cancel,
POST .../agents/{id}/reset-session
Frontend: פאנל "סוכנים פעילים" ב-/operations (polling 4s) + dialog ללוג חי
(פרסור NDJSON→טקסט קריא) + כפתורי עצור/אפס. הוספת hooks ל-operations.ts.
בטיחות: cancel על דריינר הלכות בטוח — חילוץ checkpointed per-chunk + resumable
+ self-heal לשורות processing.
Invariants: מקיים G12/INV-PORT1 (שער-הפלטפורמה). נוגע X6 (UI↔API).
api:types יורץ אחרי deploy (openapi.json חי).
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
112
web/app.py
112
web/app.py
@@ -55,6 +55,7 @@ from web.agent_platform_port import (
|
||||
get_project_url,
|
||||
pc_accept_interaction,
|
||||
pc_archive_project,
|
||||
pc_cancel_run,
|
||||
pc_create_project,
|
||||
pc_create_workflow_issue,
|
||||
pc_get_agents,
|
||||
@@ -62,9 +63,13 @@ from web.agent_platform_port import (
|
||||
pc_get_case_issues,
|
||||
pc_get_issue_comments,
|
||||
pc_get_issue_interactions,
|
||||
pc_get_run_events,
|
||||
pc_get_run_log,
|
||||
pc_list_live_runs,
|
||||
pc_post_comment,
|
||||
pc_reject_interaction,
|
||||
pc_request,
|
||||
pc_reset_agent_session,
|
||||
pc_respond_to_interaction,
|
||||
pc_restore_project,
|
||||
pc_wake_analyst_for_appraiser_facts,
|
||||
@@ -6565,6 +6570,113 @@ async def operations_drain_toggle(name: str, body: dict = Body(...)):
|
||||
return {"ok": True, "name": name, "disabled": disabled}
|
||||
|
||||
|
||||
# ── Live agents (/operations "סוכנים פעילים") ──────────────────────────────
|
||||
# What the pm2/queue panels can't show: WHICH agent is doing the work right now
|
||||
# and its live output. An agent-driven drain (e.g. the CEO heartbeat draining
|
||||
# the halacha queue) is neither a pm2 service nor visible per-case, so this
|
||||
# pulls Paperclip's own heartbeat-run view through the platform Port (G12) and
|
||||
# adds the controls to manage a stuck/runaway run.
|
||||
|
||||
# Display labels for the Paperclip companies shown in the live-agents panel,
# keyed by company id. The insertion order here is also the order in which the
# companies are queried by the /api/operations/agents endpoint.
_OPS_COMPANY_LABELS: dict[str, str] = {
    PAPERCLIP_COMPANIES["licensing"]: "CMP — רישוי ובניה",
    PAPERCLIP_COMPANIES["betterment"]: "CMPA — היטלי השבחה",
}
|
||||
|
||||
|
||||
def _shape_live_run(raw: dict, company_id: str) -> dict:
    """Convert one Paperclip live-run record into the dashboard's flat snake_case dict.

    Args:
        raw: A single live-run object as returned by the platform (camelCase keys).
        company_id: Id of the company the run was fetched for; echoed back and
            used to look up the display label.

    Returns:
        A dict with snake_case keys. Missing/falsy text fields fall back to
        ``""`` (``agent_name`` to ``"—"``, ``status`` to ``"unknown"``), missing
        counters fall back to ``0``, and id/timestamp fields are passed
        through as-is (``None`` when absent).
    """
    output_silence = raw.get("outputSilence") or {}

    def text(key: str, fallback: str = ""):
        # Falsy values (None, "", missing) collapse to the fallback.
        return raw.get(key) or fallback

    # Prefer the run's own timestamp; fall back to the one nested in the
    # silence report when the top-level field is absent.
    last_output_at = raw.get("lastOutputAt") or output_silence.get("lastOutputAt")

    shaped = {
        "run_id": raw.get("id"),
        "agent_id": raw.get("agentId"),
        "agent_name": text("agentName", "—"),
        "company_id": company_id,
        "company_label": _OPS_COMPANY_LABELS.get(company_id, ""),
        "status": text("status", "unknown"),
        "invocation_source": text("invocationSource"),
        "trigger_detail": text("triggerDetail"),
        "issue_id": raw.get("issueId"),
        "adapter_type": text("adapterType"),
        "started_at": raw.get("startedAt"),
        "created_at": raw.get("createdAt"),
        "last_output_at": last_output_at,
        "continuation_attempt": raw.get("continuationAttempt") or 0,
        # Platform's own liveness signal: ok | suspicion | critical.
        "silence_level": output_silence.get("level") or "",
        "silence_age_ms": output_silence.get("silenceAgeMs") or 0,
    }
    return shaped
|
||||
|
||||
|
||||
@app.get("/api/operations/agents")
async def operations_agents():
    """Return queued + running heartbeat runs across all companies (read-only).

    The live runs of every company in ``_OPS_COMPANY_LABELS`` are fetched
    concurrently through the platform port. A failed fetch for one company is
    logged and reported in ``errors``; runs from the remaining companies are
    still returned, so one company's outage never blanks the panel.

    Returns:
        A dict with:
          * ``runs``    - shaped runs, running first, then queued, then any
            other status; within each group ordered by start (or creation) time.
          * ``running`` - number of runs whose status is ``"running"``.
          * ``queued``  - number of runs whose status is anything else.
          * ``errors``  - one ``"<company label>: <exception type>"`` entry per
            company whose fetch failed.
    """
    company_ids = list(_OPS_COMPANY_LABELS)
    results = await asyncio.gather(
        *(pc_list_live_runs(cid) for cid in company_ids),
        return_exceptions=True,
    )

    runs: list[dict] = []
    errors: list[str] = []
    for cid, res in zip(company_ids, results):
        # BaseException rather than Exception: with return_exceptions=True,
        # gather also hands back asyncio.CancelledError (a BaseException) as a
        # result. Missing it here would make the loop below iterate over an
        # exception object and fail the whole request.
        if isinstance(res, BaseException):
            logger.warning("live-runs fetch failed for company %s: %s", cid, res)
            errors.append(f"{_OPS_COMPANY_LABELS.get(cid, cid)}: {type(res).__name__}")
            continue
        # Treat an empty/None payload as "no live runs" instead of crashing.
        runs.extend(_shape_live_run(raw, cid) for raw in res or ())

    # Running first, then queued; within each, oldest start first.
    order = {"running": 0, "queued": 1}
    runs.sort(key=lambda r: (order.get(r["status"], 2), r["started_at"] or r["created_at"] or ""))

    running = sum(1 for r in runs if r["status"] == "running")
    return {
        "runs": runs,
        "running": running,
        # Every run that is not currently running is counted as queued.
        "queued": len(runs) - running,
        "errors": errors,
    }
|
||||
|
||||
|
||||
@app.get("/api/operations/agents/runs/{run_id}/log")
async def operations_agent_run_log(run_id: str):
    """Return the full output log (NDJSON stream) of one heartbeat run.

    The log is fetched through the platform port and returned unchanged.

    Raises:
        HTTPException: 502 when the platform call fails with an httpx error;
            the detail carries only the exception type name.
    """
    try:
        log_payload = await pc_get_run_log(run_id)
    except httpx.HTTPError as exc:
        detail = f"שגיאת Paperclip בשליפת לוג: {type(exc).__name__}"
        raise HTTPException(502, detail) from exc
    return log_payload
|
||||
|
||||
|
||||
@app.get("/api/operations/agents/runs/{run_id}/events")
async def operations_agent_run_events(run_id: str):
    """Return the lifecycle/event timeline of one heartbeat run.

    The events are fetched through the platform port and wrapped as
    ``{"events": ...}``.

    Raises:
        HTTPException: 502 when the platform call fails with an httpx error;
            the detail carries only the exception type name.
    """
    try:
        timeline = await pc_get_run_events(run_id)
    except httpx.HTTPError as exc:
        detail = f"שגיאת Paperclip בשליפת אירועים: {type(exc).__name__}"
        raise HTTPException(502, detail) from exc
    return {"events": timeline}
|
||||
|
||||
|
||||
@app.post("/api/operations/agents/runs/{run_id}/cancel")
async def operations_agent_run_cancel(run_id: str):
    """Request a graceful cancel of a queued/running heartbeat run.

    The cancel goes through the platform port's cancel call rather than a raw
    process kill. The platform's response is echoed back under ``result``.

    Raises:
        HTTPException: 502 when the platform call fails with an httpx error;
            the detail carries only the exception type name.
    """
    try:
        outcome = await pc_cancel_run(run_id)
    except httpx.HTTPError as exc:
        detail = f"שגיאת Paperclip בעצירת ריצה: {type(exc).__name__}"
        raise HTTPException(502, detail) from exc
    else:
        return {"ok": True, "run_id": run_id, "result": outcome}
|
||||
|
||||
|
||||
@app.post("/api/operations/agents/{agent_id}/reset-session")
async def operations_agent_reset_session(agent_id: str):
    """Reset a wedged agent session so its next wakeup starts clean.

    The reset is requested through the platform port. The platform's response
    is echoed back under ``result``.

    Raises:
        HTTPException: 502 when the platform call fails with an httpx error;
            the detail carries only the exception type name.
    """
    try:
        outcome = await pc_reset_agent_session(agent_id)
    except httpx.HTTPError as exc:
        detail = f"שגיאת Paperclip באיפוס session: {type(exc).__name__}"
        raise HTTPException(502, detail) from exc
    else:
        return {"ok": True, "agent_id": agent_id, "result": outcome}
|
||||
|
||||
|
||||
@app.get("/api/digests/{digest_id}")
|
||||
async def digest_get(digest_id: str):
|
||||
try:
|
||||
|
||||
Reference in New Issue
Block a user