From f4f110f0d1b598497c312ae1ea116d380cb9641a Mon Sep 17 00:00:00 2001 From: Chaim Date: Sun, 7 Jun 2026 20:31:53 +0000 Subject: [PATCH] =?UTF-8?q?feat(X13):=20scheduled=20drain=20=E2=80=94=20fu?= =?UTF-8?q?lly-autonomous=20digest=E2=86=92fetch=E2=86=92ingest=20loop?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - scripts/drain_court_fetch.py: drives orchestrator.drain_pending (host-only; no-op when queue empty). Mirrors drain_halacha_queue.py. - scripts/legal-court-fetch-drain.config.cjs: pm2 cron (hourly :17, one-shot), COURT_FETCH_DRAIN_CRON override. - fix: orchestrator default service URL 127.0.0.1 → 10.0.1.1 (the service binds the docker0 gateway; the host can't reach it on loopback). Found live — the first drain failed "connection refused" until corrected. - SCRIPTS.md entries. Validated end-to-end in PRODUCTION on a real digest: עת"מ 43830-12-24 (החברה להגנת הטבע) fetched from נט המשפט → case_law (79 chunks, source_url), digest relinked (INV-DIG3 closed), halacha queued pending_review. job=done. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../services/court_fetch_orchestrator.py | 9 ++-- scripts/SCRIPTS.md | 2 + scripts/drain_court_fetch.py | 43 +++++++++++++++++++ scripts/legal-court-fetch-drain.config.cjs | 40 +++++++++++++++++ 4 files changed, 90 insertions(+), 4 deletions(-) create mode 100644 scripts/drain_court_fetch.py create mode 100644 scripts/legal-court-fetch-drain.config.cjs diff --git a/mcp-server/src/legal_mcp/services/court_fetch_orchestrator.py b/mcp-server/src/legal_mcp/services/court_fetch_orchestrator.py index 833f03e..2cc8750 100644 --- a/mcp-server/src/legal_mcp/services/court_fetch_orchestrator.py +++ b/mcp-server/src/legal_mcp/services/court_fetch_orchestrator.py @@ -41,11 +41,12 @@ logger = logging.getLogger(__name__) # human (INV-CF3). Kept low — the .gov site shouldn't be hammered (INV-CF4). MAX_AUTONOMOUS_ATTEMPTS = int(os.environ.get("COURT_FETCH_MAX_ATTEMPTS", "2")) -# The host-side Tier-1 browser service (pm2). The MCP server runs on the host, -# so it reaches the service over loopback directly (the container bridge in -# web/court_fetch_proxy.py is a separate, optional entry point). +# The host-side Tier-1 browser service (pm2). It binds the docker0 bridge +# gateway (10.0.1.1) — same as legal-chat-service — so both the host MCP server +# and containers can reach it; the host reaches 10.0.1.1 as a local interface. +# Override with COURT_FETCH_SERVICE_URL. COURT_FETCH_SERVICE_URL = os.environ.get( - "COURT_FETCH_SERVICE_URL", "http://127.0.0.1:8771" + "COURT_FETCH_SERVICE_URL", "http://10.0.1.1:8771" ) _SHARED_SECRET = os.environ.get("COURT_FETCH_SHARED_SECRET", "").strip() _TIER1_TIMEOUT_S = float(os.environ.get("COURT_FETCH_TIER1_TIMEOUT_S", "300")) diff --git a/scripts/SCRIPTS.md b/scripts/SCRIPTS.md index 76e0d63..e0a7527 100644 --- a/scripts/SCRIPTS.md +++ b/scripts/SCRIPTS.md @@ -22,6 +22,8 @@ | `legal-court-fetch-service.config.cjs` | pm2/js | **שירות-מארח Tier-1 לאחזור פסקי-דין מנט המשפט (X13)** — 2 apps: (א) `legal-court-fetch-xvfb` (Xvfb :99, צג-וירטואלי ל-Camoufox); (ב) `legal-court-fetch-service` (`python -m legal_mcp.court_fetch_service.server`, bound `10.0.1.1:8771`, Bearer `COURT_FETCH_SHARED_SECRET` מ-`~/.legal-court-fetch-service.env`, `DISPLAY=:99`). מריץ Camoufox דרך חבילת-הפייתון (in-process) כי הקונטיינר לא יכול דפדפן. תלות: `pip install -e "mcp-server[court-fetch]" && python -m camoufox fetch`. אחזור = ניווט→צופה→`GetImages`(X-Requested-With)→PDF, ללא CAPTCHA; כשל→`ok:false`→orchestrator מסלים ל-fallback אנושי. **אומת על עת"מ 46111-12-22 (34 עמ').** מראָה לדפוס `legal-chat-service.config.cjs`. ספ: `docs/spec/X13-court-fetch.md`. התקנה: `pm2 start scripts/legal-court-fetch-service.config.cjs && pm2 save`. בריאות: `curl http://10.0.1.1:8771/health`. | pm2 (host-side) | | `reap_orphan_procs.py` | python | **reaper לתהליכים-יתומים שמרווים את שרת Nautilus** — הורג `task-master-mcp` (Node, מתנפח ל~3GB) ו-`camoufox-bin` (Firefox מ-X13 fetch שקרס) **רק כשהם יתומים (`ppid=1`)** — תהליך עם הורה-חי לעולם לא נוגעים בו. `/proc` טהור, בלי psutil. `--dry-run` (דיווח), `--loop N` (דמון כל N ש'). ראה זיכרון [[project_taskmaster_mcp_memory_leak]]. | דרך `legal-reaper.config.cjs` (pm2) | | `legal-reaper.config.cjs` | pm2/js | **דמון pm2 ל-`reap_orphan_procs.py --loop`** (ברירת-מחדל 180ש', `REAP_INTERVAL_S` לעקיפה). `max_memory_restart 100M` (ה-reaper עצמו לא ידלוף). התקנה: `pm2 start scripts/legal-reaper.config.cjs && pm2 save`. לוגים: `pm2 logs legal-reaper`. | pm2 (host-side) | +| `drain_court_fetch.py` | python | **ריקון תור-אחזור הפסיקה (X13)** — קורא ל-`court_fetch_orchestrator.drain_pending(limit)` שמוריד+קולט כל job ממתין שהיומונים מילאו, וקושר חזרה ליומון. מקומי בלבד (ingest = claude CLI). no-op מהיר כשהתור ריק. הרצה ידנית: `mcp-server/.venv/bin/python scripts/drain_court_fetch.py [limit]`. | דרך `legal-court-fetch-drain.config.cjs` (pm2 cron) | +| `legal-court-fetch-drain.config.cjs` | pm2/js | **תזמון שעתי של `drain_court_fetch.py`** (cron `17 * * * *`, `COURT_FETCH_DRAIN_CRON` לעקיפה) — הופך את לולאת יומון→אחזור→קליטה ל-fully-autonomous. `autorestart:false` (one-shot per tick). דורש `legal-court-fetch-service` רץ. התקנה: `pm2 start scripts/legal-court-fetch-drain.config.cjs && pm2 save`. | pm2 cron (host-side) | | `auto-sync-cases.sh` | bash | סנכרון תיקי ערר ל-Gitea — רץ כל דקה | `* * * * *` (cron) | | `backup-db.sh` | bash | גיבוי PostgreSQL יומי ל-`data/backups/` (gzip) | לתזמן: `0 2 * * *` | | `restore-db.sh` | bash | שחזור DB מגיבוי (companion ל-backup-db.sh) | ידני | diff --git a/scripts/drain_court_fetch.py b/scripts/drain_court_fetch.py new file mode 100644 index 0000000..41f26c6 --- /dev/null +++ b/scripts/drain_court_fetch.py @@ -0,0 +1,43 @@ +"""Drain the X13 court-verdict fetch queue (jobs the digest trigger fills). + +When a digest points at a court ruling not yet in the corpus, the digest +trigger enqueues a ``court_fetch_jobs`` row (status=pending). This script +drains those: for each pending/failed job it runs the full Tier-0/Tier-1 fetch +(via the host browser service) + the canonical ingest, then links the verdict +back to its source digest. Serial with a cooldown (INV-CF4); failures are +recorded and retried until they escalate to ``manual`` (INV-CF3). + +Host-only: ingest drives halacha extraction via the local ``claude`` CLI (same +constraint as ``drain_halacha_queue.py``). A no-op (fast) when the queue is +empty. Scheduled hourly by ``legal-court-fetch-drain`` (pm2 cron); also runnable +by hand: + + mcp-server/.venv/bin/python scripts/drain_court_fetch.py [limit] +""" + +import asyncio +import os +import sys + +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "mcp-server", "src")) + +from legal_mcp.services import court_fetch_orchestrator as orch + + +async def main() -> int: + limit = int(sys.argv[1]) if len(sys.argv) > 1 else 5 + res = await orch.drain_pending(limit=limit) + print(f"===court-fetch drain=== processed={res.get('processed', 0)} " + f"ingested={res.get('done', 0)}", flush=True) + for r in res.get("results", []): + line = f" [{r.get('status')}] {r.get('citation', '')}" + if r.get("error"): + line += f" — {r['error'][:120]}" + if r.get("case_law_id"): + line += f" → case_law {r['case_law_id']}" + print(line, flush=True) + return 0 + + +if __name__ == "__main__": + sys.exit(asyncio.run(main())) diff --git a/scripts/legal-court-fetch-drain.config.cjs b/scripts/legal-court-fetch-drain.config.cjs new file mode 100644 index 0000000..228d510 --- /dev/null +++ b/scripts/legal-court-fetch-drain.config.cjs @@ -0,0 +1,40 @@ +/** + * pm2 ecosystem entry for legal-court-fetch-drain — a scheduled (hourly) one-shot + * that drains the X13 court-verdict fetch queue the digest trigger fills, making + * the digest → fetch → ingest loop fully autonomous (no manual court_fetch_drain). + * + * Pattern: cron_restart fires the script on schedule; autorestart:false means it + * runs once and exits (pm2 shows it "stopped" between ticks — expected for a cron + * job). A no-op (fast) when the queue is empty, so hourly is cheap. + * + * Requires (already deployed): legal-court-fetch-service (+xvfb) running for the + * browser fetch, and the host env (~/.env: POSTGRES_URL, VOYAGE_API_KEY, + * COURT_FETCH_SHARED_SECRET) the venv loads via legal_mcp.config. Ingest uses the + * local claude CLI for halacha extraction (halachot land pending_review — the + * chair's approval gate is untouched). + * + * Install (once): + * pm2 start /home/chaim/legal-ai/scripts/legal-court-fetch-drain.config.cjs + * pm2 save + * Logs: pm2 logs legal-court-fetch-drain --lines 50 + * Run now (manual): mcp-server/.venv/bin/python scripts/drain_court_fetch.py + * + * Schedule override: COURT_FETCH_DRAIN_CRON (default hourly at :17 to avoid the + * top-of-hour stampede with other jobs). + */ +const cron = process.env.COURT_FETCH_DRAIN_CRON || "17 * * * *"; + +module.exports = { + apps: [ + { + name: "legal-court-fetch-drain", + cwd: "/home/chaim/legal-ai", + script: "/home/chaim/legal-ai/mcp-server/.venv/bin/python", + args: "scripts/drain_court_fetch.py 5", + env: { HOME: "/home/chaim", PYTHONUNBUFFERED: "1" }, + autorestart: false, // one-shot per cron tick + cron_restart: cron, + max_memory_restart: "800M", + }, + ], +}; -- 2.49.1