"""Tier 0 — Supreme Court verdict fetcher (X13), via supremedecisions.court.gov.il. Pulls a published Supreme Court verdict PDF from the **public** decisions portal — no smart-card, no CAPTCHA, no browser (pure httpx). Used for serial-format citations (בג"ץ/בר"מ/עע"מ NNNN/YY) that have no נט-format triple and so can't go through the Tier-1 נט-המשפט flow. The portal is an AngularJS SPA over a small ASP.NET JSON API, reverse-engineered and validated live (2026-06-08 on בג"ץ 3483/05 → 75 KB PDF). The flow: POST Home/SearchVerdicts body: {"document": {"Year": "YYYY", "CaseNum": "NNNN", "Month": {}, "dateType": 1, "publishDate": 8, "SearchText": [], "OldMainNumFormat": true}, "lan": 1} → {"data": [{Path, FileName, CaseName, Type, Pages, VerdictDt, ...}, ...]} GET Home/Download?path=&fileName=&type=4 → the verdict PDF Two things are required to get JSON instead of an F5 WAF block (verified): * the **X-Requested-With: XMLHttpRequest** header on every AJAX call; * a **complete** browser header set (UA + Accept + Accept-Language). A case can have many documents (interim החלטות + the final פסק דין). We pick the verdict: prefer a record whose Type contains "פסק דין", else the most-paginated / latest one. Politeness (INV-CF4): serial, with a cooldown. """ from __future__ import annotations import asyncio import datetime as _dt import logging import os import re import urllib.parse import httpx logger = logging.getLogger(__name__) _BASE = "https://supremedecisions.court.gov.il" _HEADERS = { "User-Agent": ( "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " "(KHTML, like Gecko) Chrome/126.0 Safari/537.36" ), "Accept": "application/json, text/plain, */*", "Accept-Language": "he-IL,he;q=0.9,en;q=0.8", "X-Requested-With": "XMLHttpRequest", # required — F5 WAF blocks AJAX without it "Referer": _BASE + "/", } _REQUEST_TIMEOUT_S = float(os.environ.get("COURT_FETCH_HTTP_TIMEOUT_S", "30")) _INTER_REQUEST_COOLDOWN_S = float(os.environ.get("COURT_FETCH_COOLDOWN_S", "2")) _DOC_TYPE_PDF = "4" # Empty search clause the portal expects inside the document. _EMPTY_CLAUSE = { "Text": "", "textOperator": 1, "option": 2, "Inverted": False, "Synonym": False, "NearDistance": 3, "MatchOrder": False, } class FetchedVerdict: """A downloaded verdict file held in memory, ready for ingest.""" def __init__(self, content: bytes, filename: str, source_url: str, court: str = "בית המשפט העליון", case_name: str = ""): self.content = content self.filename = filename self.source_url = source_url self.court = court self.case_name = case_name class SupremeFetchError(RuntimeError): """The public portal returned an unexpected shape / no document. Carries a Hebrew reason for the job row (INV-CF2).""" def _four_digit_year(yy: str) -> str: """2-digit citation year → 4-digit. Pivot on the current year: a 2-digit value above (this year + 4) is last century. e.g. 05→2005, 87→1987, 16→2016.""" yy = re.sub(r"\D", "", yy or "") if len(yy) == 4: return yy if len(yy) != 2: return yy n = int(yy) cutoff = (_dt.date.today().year % 100) + 4 return f"20{yy}" if n <= cutoff else f"19{yy}" def _parse_serial(case_number_norm: str, citation: str) -> tuple[str, str]: """Extract (CaseNum, YYYY) from a serial citation like 'בג"ץ 3483/05'. Works off the normalized number (e.g. '3483-05') with the raw citation as a fallback. Raises SupremeFetchError if it can't find a NNNN/YY pair. """ m = re.search(r"(\d{1,5})[-/](\d{2,4})\b", case_number_norm or "") if not m: m = re.search(r"(\d{1,5})/(\d{2,4})", citation or "") if not m: raise SupremeFetchError( f"לא ניתן לפרק '{citation}' למספר-תיק/שנה (פורמט עליון סדרתי)" ) return m.group(1), _four_digit_year(m.group(2)) def _dt_key(r: dict) -> int: m = re.search(r"/Date\((\d+)", str(r.get("VerdictDt") or "")) return int(m.group(1)) if m else 0 def _rank_candidates(records: list[dict]) -> list[dict]: """Order a case's documents by how good a corpus target each is, best first. Preference: the reasoned ruling (Type contains 'פסק') over interim החלטות; then more pages (substantive over one-liners); then most recent. We return a *ranked list*, not one pick, because the formally-labeled פסק-דין is sometimes a published-report ('s'-prefix) file that the free Download endpoint blocks (WAF) — the caller tries each until one downloads as a PDF. Records without a Path/FileName are dropped. """ usable = [r for r in records if r.get("Path") and r.get("FileName")] def _score(r: dict) -> tuple: is_verdict = 1 if "פסק" in str(r.get("Type") or "") else 0 return (is_verdict, int(r.get("Pages") or 0), _dt_key(r)) return sorted(usable, key=_score, reverse=True) async def fetch_supreme_verdict( *, citation: str, case_number_norm: str ) -> FetchedVerdict: """Fetch a Supreme Court verdict PDF by serial citation. Raises on failure.""" case_num, yyyy = _parse_serial(case_number_norm, citation) async with httpx.AsyncClient( http2=False, headers=_HEADERS, timeout=_REQUEST_TIMEOUT_S, follow_redirects=True, ) as client: document = { "Year": yyyy, "CaseNum": case_num, "Month": {}, "dateType": 1, "publishDate": 8, "SearchText": [dict(_EMPTY_CLAUSE)], "OldMainNumFormat": True, } try: await asyncio.sleep(_INTER_REQUEST_COOLDOWN_S) resp = await client.post( f"{_BASE}/Home/SearchVerdicts", json={"document": document, "lan": 1} ) resp.raise_for_status() payload = resp.json() except httpx.HTTPError as e: raise SupremeFetchError(f"חיפוש בפורטל העליון נכשל עבור {citation}: {e}") from e except ValueError as e: raise SupremeFetchError(f"תשובת-חיפוש לא-JSON מהפורטל עבור {citation}") from e records = payload.get("data") if isinstance(payload, dict) else None candidates = _rank_candidates(records or []) if not candidates: raise SupremeFetchError( f"לא נמצא מסמך-פסק עבור {citation} בפורטל העליון " f"(תיק {case_num}/{yyyy[-2:]}; ייתכן שאינו פורסם או טרם דיגיטציה)." ) # Try documents best-first until one downloads as a real PDF. The # formally-labeled פסק-דין is sometimes a published-report file the free # Download endpoint blocks (WAF) — fall back to the next substantive doc. last_reason = "" for rec in candidates[:6]: path, fname = str(rec["Path"]), str(rec["FileName"]) qs = urllib.parse.urlencode( {"path": path, "fileName": fname, "type": _DOC_TYPE_PDF} ) try: await asyncio.sleep(_INTER_REQUEST_COOLDOWN_S) dl = await client.get(f"{_BASE}/Home/Download?{qs}") dl.raise_for_status() except httpx.HTTPError as e: last_reason = f"הורדה נכשלה ({e})" continue if dl.content[:4] == b"%PDF": return FetchedVerdict( content=dl.content, filename=f"{case_number_norm}.pdf", source_url=f"{_BASE}/Home/Download?{qs}", case_name=str(rec.get("CaseName") or ""), ) last_reason = f"מסמך {fname} חסום/לא-PDF ({len(dl.content)}B)" raise SupremeFetchError( f"אף מסמך של {citation} לא ירד כ-PDF ({len(candidates)} מועמדים) — {last_reason}" )