fix(X13): harden court-fetch against browser leaks + reaper for task-master-mcp leak

שלוש שכבות-הגנה נגד דליפת-זיכרון מדפדפנים יתומים, + טיפול בדליפה הגדולה בפועל בשרת (task-master-mcp).

- camofox_client.py:
  - asyncio.wait_for קשיח סביב כל ה-fetch (COURT_FETCH_HARD_TIMEOUT_S=180ש') — hang → ביטול → async-with tear-down → reap.
  - _reap_orphan_browsers(): הורג camoufox-bin יתומים (ppid=1) לפני ואחרי כל fetch. סדרתיות (INV-CF4) → כל ppid=1 הוא שארית בטוחה.
- scripts/reap_orphan_procs.py: reaper כללי ל-task-master-mcp (~3GB יתומים) + camoufox-bin. רק ppid=1; /proc טהור. --dry-run / --loop N.
- scripts/legal-reaper.config.cjs: דמון pm2 (loop 180s, max_memory_restart 100M).
- X13 spec + SCRIPTS.md: תיעוד שכבות-ההגנה.

max_memory_restart בשירות (1.5G) כבר נותן רשת-ביטחון ברמת-התהליך.

Invariants: מקיים INV-CF4 (politeness/serial) — ללא שינוי חוזה.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-07 19:43:53 +00:00
parent 61b9d72bcf
commit e186183527
5 changed files with 288 additions and 67 deletions
--- a/mcp-server/src/legal_mcp/court_fetch_service/camofox_client.py
+++ b/mcp-server/src/legal_mcp/court_fetch_service/camofox_client.py
@@ -53,6 +53,50 @@ _DISPLAY = os.environ.get("DISPLAY", "")
 _NAV_TIMEOUT_MS = int(float(os.environ.get("COURT_FETCH_BROWSER_TIMEOUT_S", "60")) * 1000)
 _PAGE_BATCH = 4          # the viewer's GetImages batch size
 _MAX_PAGES = 400         # hard cap on a single document
+# Hard wall-clock cap on a single fetch so a hung browser can't pin a Firefox
+# process forever (anti-leak; INV-CF4 politeness). The async-with cleanup runs
+# on the resulting CancelledError, tearing the browser down.
+_FETCH_HARD_TIMEOUT_S = float(os.environ.get("COURT_FETCH_HARD_TIMEOUT_S", "180"))
+
+
+def _reap_orphan_browsers() -> int:
+    """SIGKILL orphaned (``PPid`` 1) ``camoufox-bin`` processes; return how many.
+
+    Called before and after every fetch. Fetching is serial (INV-CF4), so such a
+    browser is a leftover from a crashed/killed fetch. Only argv[0] is matched,
+    so naming the binary in an argument is harmless. Pure /proc, best-effort.
+    """
+    killed = 0
+    try:
+        pids = [name for name in os.listdir("/proc") if name.isdigit()]
+    except OSError:  # /proc missing or unreadable: nothing we can reap
+        pids = []
+    for pid in pids:
+        try:
+            with open(f"/proc/{pid}/cmdline", "rb") as f:  # NUL-separated argv
+                argv0 = f.read().split(b"\0", 1)[0].decode("utf-8", "replace")
+            if "camoufox-bin" not in argv0:
+                continue
+            with open(f"/proc/{pid}/status", "rb") as f:
+                status = f.read().decode("utf-8", "replace")
+        except OSError:  # process exited mid-scan, or is not readable by us
+            continue
+        ppid = 0
+        for line in status.splitlines():
+            if line.startswith("PPid:"):
+                fields = line.split()
+                if len(fields) > 1 and fields[1].isdecimal():
+                    ppid = int(fields[1])
+                break
+        if ppid == 1:  # NOTE(review): orphans adopted by a subreaper are missed
+            try:
+                os.kill(int(pid), 9)  # 9 == SIGKILL
+                killed += 1
+            except OSError:  # already gone, or not ours to kill
+                pass
+    if killed:
+        logger.warning("reaped %d orphaned camoufox-bin", killed)
+    return killed


 class CamofoxUnavailable(RuntimeError):
@@ -178,74 +222,93 @@ async def fetch_admin_verdict(
        )

    month_year = f"{int(month):02d}-{year[-2:]}"
-    doc_num = {"v": None}

-    async def on_resp(resp):
-        if "GetImages" in resp.url and not doc_num["v"]:
-            try:
-                doc_num["v"] = json.loads(resp.request.post_data).get("documentNumber")
-            except Exception:
-                pass
+    # Belt-and-suspenders against browser leaks: kill any orphaned browser from
+    # a prior crashed fetch before we launch a new one (serial → safe).
+    _reap_orphan_browsers()

-    async with AsyncCamoufox(
-        headless=True, geoip=False, humanize=True, locale="he-IL"
-    ) as browser:
-        page = await browser.new_page()
-        page.context.on("response", lambda r: asyncio.create_task(on_resp(r)))
-        vp = await _reach_viewer(page, case_number=file_number, month_year=month_year)
-        source_url = vp.url
-        await vp.wait_for_timeout(9000)
-        if not doc_num["v"]:
-            raise NgcsFlowError("לא נלכד documentNumber מהצופה (ייתכן שהמסמך לא נטען)")
+    async def _run() -> dict:
+        doc_num = {"v": None}

-        # Pull every page batch through fetch() with X-Requested-With (WAF-safe).
-        imgs = await vp.evaluate(
-            """async (args) => {
-                const [dn, maxPages, batch] = args;
-                const url = window.location.href.split('?')[0] + '/GetImages';
-                const out = {};
-                for (let f = 0; f < maxPages; f += batch) {
-                    let d;
-                    try {
-                        const r = await fetch(url, {method:'POST', credentials:'include',
-                            headers:{'Content-Type':'application/json; charset=utf-8',
-                                     'X-Requested-With':'XMLHttpRequest'},
-                            body: JSON.stringify({documentNumber:dn, fromIndex:f, toIndex:f+batch-1})});
-                        if (!r.ok) break;
-                        const j = await r.json(); d = (j.d !== undefined) ? j.d : j;
-                    } catch (e) { break; }
-                    if (!Array.isArray(d) || d.length === 0) break;
-                    d.forEach((html, k) => { if (html) out[f+k] = html; });
-                    if (d.length < batch) break;
-                    await new Promise(r => setTimeout(r, 350));
-                }
-                return out;
-            }""",
-            [doc_num["v"], _MAX_PAGES, _PAGE_BATCH],
+        async def on_resp(resp):
+            if "GetImages" in resp.url and not doc_num["v"]:
+                try:
+                    doc_num["v"] = json.loads(resp.request.post_data).get("documentNumber")
+                except Exception:
+                    pass
+
+        async with AsyncCamoufox(
+            headless=True, geoip=False, humanize=True, locale="he-IL"
+        ) as browser:
+            page = await browser.new_page()
+            page.context.on("response", lambda r: asyncio.create_task(on_resp(r)))
+            vp = await _reach_viewer(page, case_number=file_number, month_year=month_year)
+            source_url = vp.url
+            await vp.wait_for_timeout(9000)
+            if not doc_num["v"]:
+                raise NgcsFlowError("לא נלכד documentNumber מהצופה (ייתכן שהמסמך לא נטען)")
+
+            # Pull every page batch through fetch() with X-Requested-With (WAF-safe).
+            imgs = await vp.evaluate(
+                """async (args) => {
+                    const [dn, maxPages, batch] = args;
+                    const url = window.location.href.split('?')[0] + '/GetImages';
+                    const out = {};
+                    for (let f = 0; f < maxPages; f += batch) {
+                        let d;
+                        try {
+                            const r = await fetch(url, {method:'POST', credentials:'include',
+                                headers:{'Content-Type':'application/json; charset=utf-8',
+                                         'X-Requested-With':'XMLHttpRequest'},
+                                body: JSON.stringify({documentNumber:dn, fromIndex:f, toIndex:f+batch-1})});
+                            if (!r.ok) break;
+                            const j = await r.json(); d = (j.d !== undefined) ? j.d : j;
+                        } catch (e) { break; }
+                        if (!Array.isArray(d) || d.length === 0) break;
+                        d.forEach((html, k) => { if (html) out[f+k] = html; });
+                        if (d.length < batch) break;
+                        await new Promise(r => setTimeout(r, 350));
+                    }
+                    return out;
+                }""",
+                [doc_num["v"], _MAX_PAGES, _PAGE_BATCH],
+            )
+
+        if not imgs:
+            raise NgcsFlowError("לא התקבלו עמודי-מסמך מ-GetImages")
+        from PIL import Image
+
+        pages = []
+        for idx in sorted(imgs, key=lambda x: int(x)):
+            m = re.search(r"base64,([A-Za-z0-9+/=]+)", imgs[idx] or "")
+            if not m:
+                continue
+            pages.append(Image.open(io.BytesIO(base64.b64decode(m.group(1)))).convert("RGB"))
+        if not pages:
+            raise NgcsFlowError("עמודי-המסמך לא ניתנים לפענוח (base64)")
+
+        buf = io.BytesIO()
+        pages[0].save(buf, format="PDF", save_all=True, append_images=pages[1:])
+        content = buf.getvalue()
+        logger.info("נט המשפט: fetched %s — %d pages, %d bytes",
+                    case_number, len(pages), len(content))
+        return {
+            "content": content,
+            "filename": f"{case_number}.pdf",
+            "source_url": source_url,
+            "court": court or "בית משפט מחוזי",
+            "pages": len(pages),
+        }
+
+    # Hard wall-clock cap: on a hung browser, the timeout cancels _run(); the
+    # async-with __aexit__ tears the browser down, and the reap below sweeps any
+    # process that outlived the cancellation.
+    try:
+        return await asyncio.wait_for(_run(), _FETCH_HARD_TIMEOUT_S)
+    except asyncio.TimeoutError:
+        _reap_orphan_browsers()
+        raise NgcsFlowError(
+            f"אחזור עבר את מגבלת-הזמן ({_FETCH_HARD_TIMEOUT_S:.0f}ש') ובוטל"
        )
-
-    if not imgs:
-        raise NgcsFlowError("לא התקבלו עמודי-מסמך מ-GetImages")
-    from PIL import Image
-
-    pages = []
-    for idx in sorted(imgs, key=lambda x: int(x)):
-        m = re.search(r"base64,([A-Za-z0-9+/=]+)", imgs[idx] or "")
-        if not m:
-            continue
-        pages.append(Image.open(io.BytesIO(base64.b64decode(m.group(1)))).convert("RGB"))
-    if not pages:
-        raise NgcsFlowError("עמודי-המסמך לא ניתנים לפענוח (base64)")
-
-    buf = io.BytesIO()
-    pages[0].save(buf, format="PDF", save_all=True, append_images=pages[1:])
-    content = buf.getvalue()
-    logger.info("נט המשפט: fetched %s — %d pages, %d bytes",
-                case_number, len(pages), len(content))
-    return {
-        "content": content,
-        "filename": f"{case_number}.pdf",
-        "source_url": source_url,
-        "court": court or "בית משפט מחוזי",
-        "pages": len(pages),
-    }
+    finally:
+        _reap_orphan_browsers()