# Hard wall-clock cap on a single fetch so a hung browser can't pin a Firefox
# process forever (anti-leak; INV-CF4 politeness). The async-with cleanup runs
# on the resulting CancelledError, tearing the browser down.
_FETCH_HARD_TIMEOUT_S = float(os.environ.get("COURT_FETCH_HARD_TIMEOUT_S", "180"))


def _reap_orphan_browsers() -> int:
    """Kill every ``camoufox-bin`` process orphaned to ``ppid=1``.

    Fetching is serial (INV-CF4), so a browser that is not owned by a live
    parent is a leftover from a prior crashed/killed fetch. The scan is pure
    ``/proc`` parsing and strictly best-effort: every filesystem or signal
    error is swallowed so nothing is ever raised into the fetch path.

    A process is matched on the basename of its executable (``argv[0]``), not
    on its whole command line, so an unrelated orphan that merely mentions
    ``camoufox-bin`` in an argument is never killed.

    Returns:
        The number of processes that were successfully sent ``SIGKILL``.
    """
    import signal  # local import: only this helper needs it

    try:
        entries = os.listdir("/proc")
    except OSError:
        return 0  # no readable procfs -> nothing we can do

    own_pid = os.getpid()
    killed = 0
    for entry in entries:
        if not entry.isdigit():
            continue
        pid = int(entry)
        if pid == own_pid:
            continue
        try:
            # /proc/<pid>/cmdline is NUL-separated; the first field is argv[0].
            with open(f"/proc/{entry}/cmdline", "rb") as f:
                argv0 = f.read().split(b"\x00", 1)[0].decode("utf-8", "replace")
            if "camoufox-bin" not in os.path.basename(argv0):
                continue
            with open(f"/proc/{entry}/status", "rb") as f:
                status = f.read().decode("utf-8", "replace")
        except OSError:
            continue  # process vanished or is unreadable

        ppid = 0
        for line in status.splitlines():
            if line.startswith("PPid:"):
                fields = line.split()
                if len(fields) > 1 and fields[1].isdigit():
                    ppid = int(fields[1])
                break
        if ppid != 1:
            continue  # still owned by a live parent -> never touch it

        try:
            os.kill(pid, signal.SIGKILL)
        except OSError:
            continue  # already gone, or not ours to kill
        killed += 1

    if killed:
        # Called both before and after a fetch, so the message stays neutral.
        logger.warning("reaped %d orphaned camoufox-bin process(es)", killed)
    return killed
+ _reap_orphan_browsers() - async with AsyncCamoufox( - headless=True, geoip=False, humanize=True, locale="he-IL" - ) as browser: - page = await browser.new_page() - page.context.on("response", lambda r: asyncio.create_task(on_resp(r))) - vp = await _reach_viewer(page, case_number=file_number, month_year=month_year) - source_url = vp.url - await vp.wait_for_timeout(9000) - if not doc_num["v"]: - raise NgcsFlowError("לא נלכד documentNumber מהצופה (ייתכן שהמסמך לא נטען)") + async def _run() -> dict: + doc_num = {"v": None} - # Pull every page batch through fetch() with X-Requested-With (WAF-safe). - imgs = await vp.evaluate( - """async (args) => { - const [dn, maxPages, batch] = args; - const url = window.location.href.split('?')[0] + '/GetImages'; - const out = {}; - for (let f = 0; f < maxPages; f += batch) { - let d; - try { - const r = await fetch(url, {method:'POST', credentials:'include', - headers:{'Content-Type':'application/json; charset=utf-8', - 'X-Requested-With':'XMLHttpRequest'}, - body: JSON.stringify({documentNumber:dn, fromIndex:f, toIndex:f+batch-1})}); - if (!r.ok) break; - const j = await r.json(); d = (j.d !== undefined) ? 
j.d : j; - } catch (e) { break; } - if (!Array.isArray(d) || d.length === 0) break; - d.forEach((html, k) => { if (html) out[f+k] = html; }); - if (d.length < batch) break; - await new Promise(r => setTimeout(r, 350)); - } - return out; - }""", - [doc_num["v"], _MAX_PAGES, _PAGE_BATCH], + async def on_resp(resp): + if "GetImages" in resp.url and not doc_num["v"]: + try: + doc_num["v"] = json.loads(resp.request.post_data).get("documentNumber") + except Exception: + pass + + async with AsyncCamoufox( + headless=True, geoip=False, humanize=True, locale="he-IL" + ) as browser: + page = await browser.new_page() + page.context.on("response", lambda r: asyncio.create_task(on_resp(r))) + vp = await _reach_viewer(page, case_number=file_number, month_year=month_year) + source_url = vp.url + await vp.wait_for_timeout(9000) + if not doc_num["v"]: + raise NgcsFlowError("לא נלכד documentNumber מהצופה (ייתכן שהמסמך לא נטען)") + + # Pull every page batch through fetch() with X-Requested-With (WAF-safe). + imgs = await vp.evaluate( + """async (args) => { + const [dn, maxPages, batch] = args; + const url = window.location.href.split('?')[0] + '/GetImages'; + const out = {}; + for (let f = 0; f < maxPages; f += batch) { + let d; + try { + const r = await fetch(url, {method:'POST', credentials:'include', + headers:{'Content-Type':'application/json; charset=utf-8', + 'X-Requested-With':'XMLHttpRequest'}, + body: JSON.stringify({documentNumber:dn, fromIndex:f, toIndex:f+batch-1})}); + if (!r.ok) break; + const j = await r.json(); d = (j.d !== undefined) ? 
j.d : j; + } catch (e) { break; } + if (!Array.isArray(d) || d.length === 0) break; + d.forEach((html, k) => { if (html) out[f+k] = html; }); + if (d.length < batch) break; + await new Promise(r => setTimeout(r, 350)); + } + return out; + }""", + [doc_num["v"], _MAX_PAGES, _PAGE_BATCH], + ) + + if not imgs: + raise NgcsFlowError("לא התקבלו עמודי-מסמך מ-GetImages") + from PIL import Image + + pages = [] + for idx in sorted(imgs, key=lambda x: int(x)): + m = re.search(r"base64,([A-Za-z0-9+/=]+)", imgs[idx] or "") + if not m: + continue + pages.append(Image.open(io.BytesIO(base64.b64decode(m.group(1)))).convert("RGB")) + if not pages: + raise NgcsFlowError("עמודי-המסמך לא ניתנים לפענוח (base64)") + + buf = io.BytesIO() + pages[0].save(buf, format="PDF", save_all=True, append_images=pages[1:]) + content = buf.getvalue() + logger.info("נט המשפט: fetched %s — %d pages, %d bytes", + case_number, len(pages), len(content)) + return { + "content": content, + "filename": f"{case_number}.pdf", + "source_url": source_url, + "court": court or "בית משפט מחוזי", + "pages": len(pages), + } + + # Hard wall-clock cap: on a hung browser, the timeout cancels _run(); the + # async-with __aexit__ tears the browser down, and the reap below sweeps any + # process that outlived the cancellation. 
+ try: + return await asyncio.wait_for(_run(), _FETCH_HARD_TIMEOUT_S) + except asyncio.TimeoutError: + _reap_orphan_browsers() + raise NgcsFlowError( + f"אחזור עבר את מגבלת-הזמן ({_FETCH_HARD_TIMEOUT_S:.0f}ש') ובוטל" ) - - if not imgs: - raise NgcsFlowError("לא התקבלו עמודי-מסמך מ-GetImages") - from PIL import Image - - pages = [] - for idx in sorted(imgs, key=lambda x: int(x)): - m = re.search(r"base64,([A-Za-z0-9+/=]+)", imgs[idx] or "") - if not m: - continue - pages.append(Image.open(io.BytesIO(base64.b64decode(m.group(1)))).convert("RGB")) - if not pages: - raise NgcsFlowError("עמודי-המסמך לא ניתנים לפענוח (base64)") - - buf = io.BytesIO() - pages[0].save(buf, format="PDF", save_all=True, append_images=pages[1:]) - content = buf.getvalue() - logger.info("נט המשפט: fetched %s — %d pages, %d bytes", - case_number, len(pages), len(content)) - return { - "content": content, - "filename": f"{case_number}.pdf", - "source_url": source_url, - "court": court or "בית משפט מחוזי", - "pages": len(pages), - } + finally: + _reap_orphan_browsers() diff --git a/scripts/SCRIPTS.md b/scripts/SCRIPTS.md index d95587f..7cf8350 100644 --- a/scripts/SCRIPTS.md +++ b/scripts/SCRIPTS.md @@ -20,6 +20,8 @@ | `eval_gold_bootstrap.py` | python | **FU-5 (GAP-11) — bootstrap ל-gold-set** של הערכת-אחזור ל-`data/eval/gold-set.jsonl`. שני מקורות: `--source citations` (cited==relevant מ-`search_relevance_feedback`; ריק עד שייצברו ציטוטים) ו-`--source known_item` (query=שם-תיק → relevant=עצמו; אות אמיתי היום). Idempotent — שומר שורות `source=chair`, מחדש `bootstrap_*`. דורש POSTGRES. | לפני eval; חוזר כשנצבר ground-truth | | `eval_retrieval.py` | python | **FU-5 (GAP-11, INV-RET4/G8) — harness הערכת-אחזור** — מריץ את מסלול-האחזור בייצור (`search_library`/`search_internal`) על ה-gold-set, מחשב precision@k/recall@k/MRR/nDCG@k (k=5,10), מצרף overall+per-corpus+per-PA ל-`data/eval/eval-report-.{json,md}` + delta מול `data/eval/baseline.json` (מתעד retrieval_config). 
`--self-test` בודק את המטריקות offline; `--update-baseline` מאמץ snapshot. **שער-CI במשמעת:** הרץ לפני/אחרי כל שינוי בשכבת-האחזור באותו קונפיג. דורש POSTGRES+VOYAGE_API_KEY. | לפני/אחרי שינוי RRF/k/embedder/rerank | | `legal-court-fetch-service.config.cjs` | pm2/js | **שירות-מארח Tier-1 לאחזור פסקי-דין מנט המשפט (X13)** — 2 apps: (א) `legal-court-fetch-xvfb` (Xvfb :99, צג-וירטואלי ל-Camoufox); (ב) `legal-court-fetch-service` (`python -m legal_mcp.court_fetch_service.server`, bound `10.0.1.1:8771`, Bearer `COURT_FETCH_SHARED_SECRET` מ-`~/.legal-court-fetch-service.env`, `DISPLAY=:99`). מריץ Camoufox דרך חבילת-הפייתון (in-process) כי הקונטיינר לא יכול דפדפן. תלות: `pip install -e "mcp-server[court-fetch]" && python -m camoufox fetch`. אחזור = ניווט→צופה→`GetImages`(X-Requested-With)→PDF, ללא CAPTCHA; כשל→`ok:false`→orchestrator מסלים ל-fallback אנושי. **אומת על עת"מ 46111-12-22 (34 עמ').** מראָה לדפוס `legal-chat-service.config.cjs`. ספ: `docs/spec/X13-court-fetch.md`. התקנה: `pm2 start scripts/legal-court-fetch-service.config.cjs && pm2 save`. בריאות: `curl http://10.0.1.1:8771/health`. | pm2 (host-side) | +| `reap_orphan_procs.py` | python | **reaper לתהליכים-יתומים שמרווים את שרת Nautilus** — הורג `task-master-mcp` (Node, מתנפח ל~3GB) ו-`camoufox-bin` (Firefox מ-X13 fetch שקרס) **רק כשהם יתומים (`ppid=1`)** — תהליך עם הורה-חי לעולם לא נוגעים בו. `/proc` טהור, בלי psutil. `--dry-run` (דיווח), `--loop N` (דמון כל N ש'). ראה זיכרון [[project_taskmaster_mcp_memory_leak]]. | דרך `legal-reaper.config.cjs` (pm2) | +| `legal-reaper.config.cjs` | pm2/js | **דמון pm2 ל-`reap_orphan_procs.py --loop`** (ברירת-מחדל 180ש', `REAP_INTERVAL_S` לעקיפה). `max_memory_restart 100M` (ה-reaper עצמו לא ידלוף). התקנה: `pm2 start scripts/legal-reaper.config.cjs && pm2 save`. לוגים: `pm2 logs legal-reaper`. 
| pm2 (host-side) | | `auto-sync-cases.sh` | bash | סנכרון תיקי ערר ל-Gitea — רץ כל דקה | `* * * * *` (cron) | | `backup-db.sh` | bash | גיבוי PostgreSQL יומי ל-`data/backups/` (gzip) | לתזמן: `0 2 * * *` | | `restore-db.sh` | bash | שחזור DB מגיבוי (companion ל-backup-db.sh) | ידני | diff --git a/scripts/legal-reaper.config.cjs b/scripts/legal-reaper.config.cjs new file mode 100644 index 0000000..621234a --- /dev/null +++ b/scripts/legal-reaper.config.cjs @@ -0,0 +1,35 @@ +/** + * pm2 ecosystem entry for legal-reaper — a host-side daemon that periodically + * reaps orphaned, runaway processes that saturate the Nautilus box: + * - task-master-mcp (Node) orphaned to ppid=1, ballooning to ~3GB each + * (memory: project_taskmaster_mcp_memory_leak). + * - camoufox-bin (Firefox) leftover from a crashed/killed X13 court fetch. + * Only ppid=1 orphans are killed — live, parented processes are never touched. + * See scripts/reap_orphan_procs.py for the safety rationale. + * + * Install (once): + * pm2 start /home/chaim/legal-ai/scripts/legal-reaper.config.cjs + * pm2 save + * Logs: + * pm2 logs legal-reaper --lines 50 + * + * Interval defaults to 180s; override with REAP_INTERVAL_S. + */ +const interval = process.env.REAP_INTERVAL_S || "180"; + +module.exports = { + apps: [ + { + name: "legal-reaper", + cwd: "/home/chaim/legal-ai", + script: "/home/chaim/legal-ai/mcp-server/.venv/bin/python", + args: `scripts/reap_orphan_procs.py --loop ${interval}`, + env: { HOME: "/home/chaim", PYTHONUNBUFFERED: "1" }, + autorestart: true, + max_restarts: 20, + restart_delay: 5000, + // The reaper itself is tiny and must never be the thing that leaks. + max_memory_restart: "100M", + }, + ], +}; diff --git a/scripts/reap_orphan_procs.py b/scripts/reap_orphan_procs.py new file mode 100644 index 0000000..07da200 --- /dev/null +++ b/scripts/reap_orphan_procs.py @@ -0,0 +1,119 @@ +#!/usr/bin/env python3 +"""Reap orphaned/runaway processes that saturate the Nautilus box. 
+ +Two known offenders (2026-06-07): + 1. ``task-master-mcp`` (Node) — spawned by the Claude Code VSCode extension, + orphaned to ``ppid=1`` when its session ends, then **balloons to ~3GB + each**. They accrue as sessions cycle and exhaust RAM within minutes, + risking the OOM-killer hitting Postgres/Paperclip. See memory + ``project_taskmaster_mcp_memory_leak``. + 2. ``camoufox-bin`` (Firefox) — the X13 court-fetch browser. A fetch that + hangs or is killed mid-flight can leave a stray browser orphaned to + ``ppid=1``. Serial-only fetching means any ``ppid=1`` camoufox-bin is + stale and safe to kill. + +Safety: only processes **orphaned to ``ppid=1``** are reaped — a process still +owned by a live parent (an attached MCP server, or a browser a fetch is +actively using) is never touched. Pure ``/proc`` parsing, no psutil dependency. + +Usage: + python scripts/reap_orphan_procs.py # one pass, print what was reaped + python scripts/reap_orphan_procs.py --dry-run # report only + python scripts/reap_orphan_procs.py --loop 180 # daemon: reap every 180s +""" + +from __future__ import annotations + +import argparse +import os +import signal +import sys +import time + +# Process-name substrings to reap when orphaned (ppid==1). 
+TARGETS = ("task-master-mcp", "camoufox-bin") + + +def _read(path: str) -> str: + try: + with open(path, "rb") as f: + return f.read().decode("utf-8", "replace") + except OSError: + return "" + + +def _proc_info(pid: str) -> tuple[int, str, int] | None: + """Return (ppid, cmdline, rss_kb) for a pid, or None if it vanished.""" + status = _read(f"/proc/{pid}/status") + if not status: + return None + ppid, rss = 0, 0 + for line in status.splitlines(): + if line.startswith("PPid:"): + try: ppid = int(line.split()[1]) + except (IndexError, ValueError): pass + elif line.startswith("VmRSS:"): + try: rss = int(line.split()[1]) + except (IndexError, ValueError): pass + cmd = _read(f"/proc/{pid}/cmdline").replace("\x00", " ").strip() + return ppid, cmd, rss + + +def find_orphans() -> list[tuple[str, str, int]]: + """Return [(pid, cmd, rss_kb)] of ppid==1 processes matching TARGETS.""" + out = [] + for pid in os.listdir("/proc"): + if not pid.isdigit(): + continue + info = _proc_info(pid) + if not info: + continue + ppid, cmd, rss = info + if ppid == 1 and any(t in cmd for t in TARGETS): + out.append((pid, cmd, rss)) + return out + + +def reap(dry_run: bool = False) -> int: + orphans = find_orphans() + freed_mb = 0 + for pid, cmd, rss in orphans: + name = next((t for t in TARGETS if t in cmd), cmd[:30]) + freed_mb += rss // 1024 + if dry_run: + print(f"[dry-run] would reap pid={pid} ({name}) rss={rss//1024}MB", flush=True) + continue + try: + os.kill(int(pid), signal.SIGKILL) + print(f"reaped pid={pid} ({name}) rss={rss//1024}MB", flush=True) + except ProcessLookupError: + pass + except PermissionError: + print(f" permission denied for pid={pid} ({name})", flush=True) + if orphans: + print(f"{'would free' if dry_run else 'freed'} ~{freed_mb}MB " + f"from {len(orphans)} orphan(s)", flush=True) + return len(orphans) + + +def main() -> int: + ap = argparse.ArgumentParser(description="Reap orphaned task-master-mcp / camoufox-bin") + ap.add_argument("--dry-run", 
action="store_true", help="report only, kill nothing") + ap.add_argument("--loop", type=int, default=0, metavar="SECONDS", + help="run forever, reaping every N seconds") + args = ap.parse_args() + if args.loop: + print(f"reaper loop: every {args.loop}s targets={TARGETS}", flush=True) + while True: + try: + reap(args.dry_run) + except Exception as e: # never let the daemon die + print(f"reap error: {e}", flush=True) + time.sleep(args.loop) + else: + reap(args.dry_run) + return 0 + + +if __name__ == "__main__": + sys.exit(main())