"""Camoufox driver for נט המשפט — calibrated, proven flow (X13, Tier 1). Open-source, zero-API-cost: drives a **Camoufox** stealth browser (a Firefox fork with C++ fingerprint spoofing) via its official Python package (``camoufox.async_api``) — in-process, no separate Node server. The full flow was reverse-engineered and validated end-to-end against עת"מ 46111-12-22 (2026-06-07): a 34-page verdict PDF retrieved with **no smart-card and no CAPTCHA-solving**. The proven path: 1. homepage → DOM-click ``btnExternalSearchCases`` ("תיקים לפי מס' תיק מקור"). 2. Fill the visible header case-locator: ``BamaCaseNumberTextBoxH`` = case number, ``BamaMonthYearTextBoxHT`` = "MM-YY"; click ``SearchHeaderCaseButton``. → lands on ``FolderCaseDetails/CaseDetails.aspx`` for the case. 3. Click the "פסקי דין" sidebar tab → ``Decisions/DecisionList.aspx``. 4. Click the document → popup ``Viewer/NGCSViewerPage.aspx?DocumentNumber=…``. 5. The viewer renders pages as PNG images via the ``GetImages`` PageMethod — **served without reCAPTCHA** (the reCAPTCHA on the viewer only gates the explicit save/print, which we don't use). Capture the internal ``documentNumber`` from the viewer's first ``GetImages`` call, then pull every 4-page batch via ``fetch`` **with header ``X-Requested-With: XMLHttpRequest``** (required — the F5 WAF blocks AJAX calls without it). 6. Decode the base64 PNGs → assemble a PDF (Pillow). The existing ingest pipeline OCRs it (Google Vision) → text → corpus. Operational requirements (see scripts/legal-court-fetch-service.config.cjs): * a virtual display — Camoufox/Firefox crashes headless on this server without one. Set ``DISPLAY`` to a running Xvfb (e.g. ``:99``). * RAM — a Firefox content process loading the heavy ASP.NET pages needs ~0.5–1 GB; keep the box from swapping. reCAPTCHA note: ``recaptcha_audio`` (local Whisper) remains as a fallback for the explicit-PDF-download path, but the primary image-API path needs no solving, so it is normally unused. """ from __future__ import annotations import asyncio import base64 import io import json import logging import os import re logger = logging.getLogger(__name__) NGCS_HOME = "https://www.court.gov.il/ngcs.web.site/homepage.aspx" # Headless Camoufox needs a virtual display on this server. _DISPLAY = os.environ.get("DISPLAY", "") _NAV_TIMEOUT_MS = int(float(os.environ.get("COURT_FETCH_BROWSER_TIMEOUT_S", "60")) * 1000) _PAGE_BATCH = 4 # the viewer's GetImages batch size _MAX_PAGES = 400 # hard cap on a single document # Hard wall-clock cap on a single fetch so a hung browser can't pin a Firefox # process forever (anti-leak; INV-CF4 politeness). The async-with cleanup runs # on the resulting CancelledError, tearing the browser down. _FETCH_HARD_TIMEOUT_S = float(os.environ.get("COURT_FETCH_HARD_TIMEOUT_S", "180")) def _reap_orphan_browsers() -> int: """Kill any ``camoufox-bin`` orphaned to ``ppid=1`` before we launch. Fetching is serial (INV-CF4), so any browser not owned by a live parent is a leftover from a prior crashed/killed fetch. Pure /proc, best-effort — never raises into the fetch path. """ killed = 0 try: for pid in os.listdir("/proc"): if not pid.isdigit(): continue try: with open(f"/proc/{pid}/status", "rb") as f: status = f.read().decode("utf-8", "replace") with open(f"/proc/{pid}/cmdline", "rb") as f: cmd = f.read().decode("utf-8", "replace") except OSError: continue if "camoufox-bin" not in cmd: continue ppid = 0 for line in status.splitlines(): if line.startswith("PPid:"): try: ppid = int(line.split()[1]) except (IndexError, ValueError): pass break if ppid == 1: try: os.kill(int(pid), 9) killed += 1 except OSError: pass except OSError: pass if killed: logger.warning("reaped %d orphaned camoufox-bin before fetch", killed) return killed class CamofoxUnavailable(RuntimeError): """Camoufox (or its virtual display) isn't available.""" class NgcsFlowError(RuntimeError): """A step in the נט-המשפט flow failed (navigation / not found / blocked).""" def is_enabled() -> bool: """True if the Camoufox package imports (browser binary present).""" try: import camoufox.async_api # noqa: F401 return True except Exception: return False async def health() -> dict: return {"camoufox_import": is_enabled(), "display": _DISPLAY or "(none)"} async def _fill_visible(page, id_substr: str, value: str) -> bool: for el in await page.locator(f"input[id*='{id_substr}']").all(): try: if await el.is_visible() and await el.is_editable(): await el.fill(value) return True except Exception: continue return False async def _reach_viewer(page, *, case_number: str, month_year: str): """Drive home → search → case → פסקי דין → viewer popup. Returns the popup page.""" await page.goto(NGCS_HOME, wait_until="domcontentloaded", timeout=_NAV_TIMEOUT_MS) await page.wait_for_timeout(2500) await page.eval_on_selector( "#Header1_UpperMenu1_btnExternalSearchCases", "el => el.click()" ) try: await page.wait_for_load_state("domcontentloaded", timeout=_NAV_TIMEOUT_MS) except Exception: pass await page.wait_for_timeout(4500) if not await _fill_visible(page, "BamaCaseNumberTextBoxH", case_number): raise NgcsFlowError("שדה מספר-תיק לא נמצא בעמוד החיפוש") my_filled = False for el in await page.locator("input[id*='BamaMonthYearTextBoxHT']").all(): if await el.is_visible(): await el.click() await page.keyboard.type(month_year, delay=60) my_filled = True break if not my_filled: raise NgcsFlowError("שדה חודש-שנה לא נמצא") clicked = False for b in await page.locator("[id*='SearchHeaderCaseButton']").all(): if await b.is_visible(): await b.click() clicked = True break if not clicked: raise NgcsFlowError("כפתור החיפוש לא נמצא") await page.wait_for_timeout(6000) if "CaseDetails" not in page.url: raise NgcsFlowError( f"לא הגענו לעמוד-התיק (URL={page.url[:80]}) — ייתכן שהתיק לא נמצא/לא פתוח לעיון" ) # פסקי דין tab → DecisionList psak = page.locator("a:has-text('פסקי דין')") opened = False for i in range(await psak.count()): el = psak.nth(i) if await el.is_visible(): await el.click() opened = True break if not opened: raise NgcsFlowError("לשונית 'פסקי דין' לא נמצאה בעמוד-התיק") await page.wait_for_timeout(6000) # open the verdict document viewer (popup) viewers = page.locator( "a[href*='Viewer'],[onclick*='Viewer'],a[href*='Document'],a:has-text('צפייה')" ) async with page.context.expect_page(timeout=15000) as pop: clicked = False for i in range(await viewers.count()): el = viewers.nth(i) if await el.is_visible(): await el.click() clicked = True break if not clicked: raise NgcsFlowError("לא נמצא מסמך פסק-דין לצפייה") return await pop.value async def fetch_admin_verdict( *, file_number: str, month: str, year: str, case_number: str, court: str ) -> dict: """Fetch an admin/district court verdict as a PDF. Returns ``{content: bytes, filename, source_url, court}``; raises on failure. ``file_number``/``month``/``year`` are the נט-המשפט triple (e.g. 46111/12/22). """ try: from camoufox.async_api import AsyncCamoufox except Exception as e: raise CamofoxUnavailable( "חבילת camoufox אינה מותקנת/זמינה. הרץ `pip install camoufox` ו-" "`python -m camoufox fetch`. ראה docs/spec/X13-court-fetch.md." ) from e if not _DISPLAY: # Headless Firefox crashes here without a virtual display. raise CamofoxUnavailable( "אין DISPLAY — Camoufox דורש Xvfb על שרת ללא מסך. הפעל Xvfb (למשל :99) " "והגדר DISPLAY (ראה pm2 config)." ) month_year = f"{int(month):02d}-{year[-2:]}" # Belt-and-suspenders against browser leaks: kill any orphaned browser from # a prior crashed fetch before we launch a new one (serial → safe). _reap_orphan_browsers() async def _run() -> dict: doc_num = {"v": None} async def on_resp(resp): if "GetImages" in resp.url and not doc_num["v"]: try: doc_num["v"] = json.loads(resp.request.post_data).get("documentNumber") except Exception: pass async with AsyncCamoufox( headless=True, geoip=False, humanize=True, locale="he-IL" ) as browser: page = await browser.new_page() page.context.on("response", lambda r: asyncio.create_task(on_resp(r))) vp = await _reach_viewer(page, case_number=file_number, month_year=month_year) source_url = vp.url await vp.wait_for_timeout(9000) if not doc_num["v"]: raise NgcsFlowError("לא נלכד documentNumber מהצופה (ייתכן שהמסמך לא נטען)") # Pull every page batch through fetch() with X-Requested-With (WAF-safe). imgs = await vp.evaluate( """async (args) => { const [dn, maxPages, batch] = args; const url = window.location.href.split('?')[0] + '/GetImages'; const out = {}; for (let f = 0; f < maxPages; f += batch) { let d; try { const r = await fetch(url, {method:'POST', credentials:'include', headers:{'Content-Type':'application/json; charset=utf-8', 'X-Requested-With':'XMLHttpRequest'}, body: JSON.stringify({documentNumber:dn, fromIndex:f, toIndex:f+batch-1})}); if (!r.ok) break; const j = await r.json(); d = (j.d !== undefined) ? j.d : j; } catch (e) { break; } if (!Array.isArray(d) || d.length === 0) break; d.forEach((html, k) => { if (html) out[f+k] = html; }); if (d.length < batch) break; await new Promise(r => setTimeout(r, 350)); } return out; }""", [doc_num["v"], _MAX_PAGES, _PAGE_BATCH], ) if not imgs: raise NgcsFlowError("לא התקבלו עמודי-מסמך מ-GetImages") from PIL import Image pages = [] for idx in sorted(imgs, key=lambda x: int(x)): m = re.search(r"base64,([A-Za-z0-9+/=]+)", imgs[idx] or "") if not m: continue pages.append(Image.open(io.BytesIO(base64.b64decode(m.group(1)))).convert("RGB")) if not pages: raise NgcsFlowError("עמודי-המסמך לא ניתנים לפענוח (base64)") buf = io.BytesIO() pages[0].save(buf, format="PDF", save_all=True, append_images=pages[1:]) content = buf.getvalue() logger.info("נט המשפט: fetched %s — %d pages, %d bytes", case_number, len(pages), len(content)) return { "content": content, "filename": f"{case_number}.pdf", "source_url": source_url, "court": court or "בית משפט מחוזי", "pages": len(pages), } # Hard wall-clock cap: on a hung browser, the timeout cancels _run(); the # async-with __aexit__ tears the browser down, and the reap below sweeps any # process that outlived the cancellation. try: return await asyncio.wait_for(_run(), _FETCH_HARD_TIMEOUT_S) except asyncio.TimeoutError: _reap_orphan_browsers() raise NgcsFlowError( f"אחזור עבר את מגבלת-הזמן ({_FETCH_HARD_TIMEOUT_S:.0f}ש') ובוטל" ) finally: _reap_orphan_browsers()