Files
legal-ai/mcp-server/src/legal_mcp/services/court_fetch_supreme.py
Chaim 8d2f1ea0a2 feat(X13 Tier-0): decode supremedecisions API — fetch serial-format Supreme verdicts
The 211 open missing_precedents include 99 Supreme serial-format rulings
(בג"ץ/בר"מ/עע"מ NNNN/YY) with no נט-format triple — fetchable only from
supremedecisions.court.gov.il. Decoded its public JSON API (no browser, no
CAPTCHA, no smart-card); validated live on בג"ץ 3483/05 + בר"מ 10212/16.

- court_fetch_supreme.py: rewrite. POST Home/SearchVerdicts with a structured
  `document` ({Year:"YYYY", CaseNum, OldMainNumFormat:true, SearchText:[…]}) +
  X-Requested-With header → records; GET Home/Download?path=&fileName=&type=4 →
  PDF. The earlier attempt failed only on the request shape (string vs object).
  2-digit→4-digit year; try candidate docs best-first (פסק-דין→pages), skipping
  the published-report 's'-prefix files the free endpoint WAF-blocks.
- orchestrator: on successful ingest, close matching open missing_precedents
  (link to the new case_law). End-to-end validated (בר"מ 10212/16 → corpus).
- backfill_missing_precedents.py: enqueue fetchable open gaps (supreme + net)
  into court_fetch_jobs; the drainer fetches+ingests+closes. dry-run default.
- X13 spec + SCRIPTS.md updated (Tier-0 decoded, no longer a limitation).

Very old un-digitized Supreme cases (e.g. בג"ץ 389/87 → 0 records) → manual.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-08 06:53:31 +00:00

198 lines
8.0 KiB
Python

"""Tier 0 — Supreme Court verdict fetcher (X13), via supremedecisions.court.gov.il.
Pulls a published Supreme Court verdict PDF from the **public** decisions portal
— no smart-card, no CAPTCHA, no browser (pure httpx). Used for serial-format
citations (בג"ץ/בר"מ/עע"מ NNNN/YY) that have no נט-format triple and so can't go
through the Tier-1 נט-המשפט flow.
The portal is an AngularJS SPA over a small ASP.NET JSON API, reverse-engineered
and validated live (2026-06-08 on בג"ץ 3483/05 → 75 KB PDF). The flow:
  1. POST Home/SearchVerdicts
       request body:
         {"document": {"Year": "YYYY", "CaseNum": "NNNN", "Month": {},
                       "dateType": 1, "publishDate": 8,
                       "SearchText": [<empty clause>],
                       "OldMainNumFormat": true}, "lan": 1}
       response:
         {"data": [{Path, FileName, CaseName, Type, Pages, VerdictDt, ...}, ...]}
  2. GET Home/Download?path=<Path>&fileName=<FileName>&type=4 → the verdict PDF
Two things are required to get JSON instead of an F5 WAF block (verified):
* the **X-Requested-With: XMLHttpRequest** header on every AJAX call;
* a **complete** browser header set (UA + Accept + Accept-Language).
A case can have many documents (interim החלטות + the final פסק דין). We pick the
verdict: prefer a record whose Type contains "פסק דין", else the most-paginated /
latest one. Politeness (INV-CF4): serial, with a cooldown.
"""
from __future__ import annotations
import asyncio
import datetime as _dt
import logging
import os
import re
import urllib.parse
import httpx
logger = logging.getLogger(__name__)

# Root of the public Supreme Court decisions portal.
_BASE = "https://supremedecisions.court.gov.il"

# Browser-like header set sent with every request. Key order is kept as-is.
_HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/126.0 Safari/537.36"
    ),
    "Accept": "application/json, text/plain, */*",
    "Accept-Language": "he-IL,he;q=0.9,en;q=0.8",
    # Mandatory: the portal's F5 WAF rejects AJAX calls that lack this header.
    "X-Requested-With": "XMLHttpRequest",
    "Referer": f"{_BASE}/",
}

# Both knobs are read once at import time and can be overridden via environment.
_REQUEST_TIMEOUT_S = float(os.getenv("COURT_FETCH_HTTP_TIMEOUT_S", "30"))
_INTER_REQUEST_COOLDOWN_S = float(os.getenv("COURT_FETCH_COOLDOWN_S", "2"))

# Value of the Download endpoint's `type` query parameter that yields a PDF.
_DOC_TYPE_PDF = "4"

# The blank search clause the portal expects inside the search document.
_EMPTY_CLAUSE = {
    "Text": "",
    "textOperator": 1,
    "option": 2,
    "Inverted": False,
    "Synonym": False,
    "NearDistance": 3,
    "MatchOrder": False,
}
class FetchedVerdict:
    """A downloaded verdict file held in memory, ready for ingest.

    Attributes:
        content: Raw bytes of the downloaded file.
        filename: File name to store the document under.
        source_url: URL the bytes were downloaded from.
        court: Issuing court; defaults to the Supreme Court (Hebrew name).
        case_name: Case name as reported by the portal, or "" when unknown.
    """

    def __init__(self, content: bytes, filename: str, source_url: str,
                 court: str = "בית המשפט העליון", case_name: str = "") -> None:
        self.content = content
        self.filename = filename
        self.source_url = source_url
        self.court = court
        self.case_name = case_name

    def __repr__(self) -> str:
        # Report the payload size only: dumping a whole PDF into a log line or
        # traceback would be useless and very large.
        return (
            f"{type(self).__name__}(filename={self.filename!r}, "
            f"court={self.court!r}, case_name={self.case_name!r}, "
            f"source_url={self.source_url!r}, "
            f"content=<{len(self.content)} bytes>)"
        )
class SupremeFetchError(RuntimeError):
    """Raised when the public portal yields an unexpected response shape or no
    usable document.

    The exception message is a Hebrew, human-readable reason intended for the
    job row (INV-CF2).
    """
def _four_digit_year(yy: str) -> str:
    """Expand a two-digit citation year to four digits.

    Non-digit characters are stripped first. The pivot is the current year's
    last two digits plus 4: values at or below it become 20YY, values above it
    become 19YY (e.g. 05→2005, 87→1987, 16→2016). Anything that is not exactly
    two digits after stripping, including an already four-digit year, is
    returned as the stripped digit string unchanged.
    """
    digits = re.sub(r"\D", "", yy or "")
    if len(digits) != 2:
        return digits
    pivot = _dt.date.today().year % 100 + 4
    century = "20" if int(digits) <= pivot else "19"
    return century + digits
def _parse_serial(case_number_norm: str, citation: str) -> tuple[str, str]:
    """Extract (CaseNum, YYYY) from a serial citation like 'בג"ץ 3483/05'.

    The normalized number (e.g. '3483-05') is tried first; the raw citation is
    the fallback. The case number is 1-5 digits and the year 2-4 digits; the
    year is expanded to four digits via ``_four_digit_year``.

    Args:
        case_number_norm: Normalized case number; may be empty or None.
        citation: Raw citation text; may be empty or None.

    Returns:
        ``(case_num, four_digit_year)`` as strings.

    Raises:
        SupremeFetchError: If no NNNN/YY pair can be found in either input.
    """
    # Digit guards on the left (and on the right for the citation fallback):
    # without them an over-long number such as '123456/05' matches from its
    # second digit and is silently truncated to case 23456, which would fetch
    # the wrong case. Failing to match is the safer outcome.
    m = re.search(r"(?<!\d)(\d{1,5})[-/](\d{2,4})\b", case_number_norm or "")
    if not m:
        m = re.search(r"(?<!\d)(\d{1,5})/(\d{2,4})(?!\d)", citation or "")
    if not m:
        raise SupremeFetchError(
            f"לא ניתן לפרק '{citation}' למספר-תיק/שנה (פורמט עליון סדרתי)"
        )
    return m.group(1), _four_digit_year(m.group(2))
def _dt_key(r: dict) -> int:
    """Return a sortable integer taken from a record's ``VerdictDt`` field.

    The value is the run of digits following '/Date(' in the field's string
    form (presumably an ASP.NET-style epoch timestamp; it is only used for
    ordering). Returns 0 when the field is missing, empty, or not in that form.
    """
    raw = r.get("VerdictDt") or ""
    found = re.search(r"/Date\((\d+)", str(raw))
    if found is None:
        return 0
    return int(found.group(1))
def _rank_candidates(records: list[dict]) -> list[dict]:
    """Order a case's documents by how good a corpus target each is, best first.

    Preference: the reasoned ruling (Type contains 'פסק') over interim
    decisions; then more pages (substantive over one-liners); then most recent
    (by ``_dt_key``). A *ranked list* is returned rather than a single pick
    because the formally-labeled ruling is sometimes a published-report
    ('s'-prefix) file that the free Download endpoint blocks (WAF); the caller
    tries each until one downloads as a PDF.

    Args:
        records: Records from the portal's search response. Anything that is
            not a list/tuple is treated as empty.

    Returns:
        The usable records (dicts with a truthy ``Path`` and ``FileName``),
        best first. Ties keep their original relative order.
    """
    # The records come from an external JSON API, so a malformed payload must
    # degrade to "no candidates" instead of escaping as AttributeError or
    # ValueError from deep inside the sort key.
    if not isinstance(records, (list, tuple)):
        return []

    def _pages(rec: dict) -> int:
        # A missing or non-numeric page count ranks as 0 pages.
        try:
            return int(rec.get("Pages") or 0)
        except (TypeError, ValueError):
            return 0

    def _score(rec: dict) -> tuple[int, int, int]:
        is_verdict = 1 if "פסק" in str(rec.get("Type") or "") else 0
        return (is_verdict, _pages(rec), _dt_key(rec))

    usable = [
        rec for rec in records
        if isinstance(rec, dict) and rec.get("Path") and rec.get("FileName")
    ]
    return sorted(usable, key=_score, reverse=True)
async def fetch_supreme_verdict(
    *, citation: str, case_number_norm: str
) -> FetchedVerdict:
    """Fetch a Supreme Court verdict PDF by serial citation.

    Flow: parse the citation into (case number, four-digit year), POST a
    search to ``Home/SearchVerdicts``, rank the returned records with
    ``_rank_candidates``, then GET ``Home/Download`` for up to six of them,
    best first, until one response body starts with ``%PDF``. Every request is
    preceded by a cooldown sleep (``_INTER_REQUEST_COOLDOWN_S``).

    Args:
        citation: Raw citation text; used in error messages and as the parsing
            fallback.
        case_number_norm: Normalized case number (e.g. '3483-05'); used for
            parsing and as the stem of the returned file name.

    Returns:
        A ``FetchedVerdict`` holding the PDF bytes, a file name, the download
        URL and the portal's ``CaseName`` for the chosen record.

    Raises:
        SupremeFetchError: If the citation cannot be parsed, the search request
            fails or is not JSON, no usable record is returned, or none of the
            attempted documents downloads as a PDF.
    """
    case_num, yyyy = _parse_serial(case_number_norm, citation)

    # _parse_serial can succeed from `citation` alone, so the normalized number
    # may be empty here; fall back to the parsed pair so the file is never
    # named just ".pdf". A '/' would turn the name into a path, so replace it.
    if (case_number_norm or "").strip():
        stem = case_number_norm
    else:
        stem = f"{case_num}-{yyyy[-2:]}"
    filename = stem.replace("/", "-") + ".pdf"

    search_body = {
        "document": {
            "Year": yyyy, "CaseNum": case_num, "Month": {},
            "dateType": 1, "publishDate": 8,
            "SearchText": [dict(_EMPTY_CLAUSE)],
            "OldMainNumFormat": True,
        },
        "lan": 1,
    }
    # Upper bound on download attempts per case, to stay polite to the portal.
    max_attempts = 6

    async with httpx.AsyncClient(
        http2=False, headers=_HEADERS, timeout=_REQUEST_TIMEOUT_S,
        follow_redirects=True,
    ) as client:
        try:
            await asyncio.sleep(_INTER_REQUEST_COOLDOWN_S)
            resp = await client.post(
                f"{_BASE}/Home/SearchVerdicts", json=search_body
            )
            resp.raise_for_status()
            payload = resp.json()
        except httpx.HTTPError as e:
            raise SupremeFetchError(f"חיפוש בפורטל העליון נכשל עבור {citation}: {e}") from e
        except ValueError as e:
            # resp.json() raises a ValueError subclass on a non-JSON body.
            raise SupremeFetchError(f"תשובת-חיפוש לא-JSON מהפורטל עבור {citation}") from e

        records = payload.get("data") if isinstance(payload, dict) else None
        # Only a list of records is meaningful; any other shape means "none".
        if not isinstance(records, list):
            records = []
        candidates = _rank_candidates(records)
        if not candidates:
            raise SupremeFetchError(
                f"לא נמצא מסמך-פסק עבור {citation} בפורטל העליון "
                f"(תיק {case_num}/{yyyy[-2:]}; ייתכן שאינו פורסם או טרם דיגיטציה)."
            )

        # Try documents best-first until one downloads as a real PDF. The
        # formally-labeled ruling is sometimes a published-report file the free
        # Download endpoint blocks (WAF), so fall back to the next document.
        last_reason = ""
        for rec in candidates[:max_attempts]:
            path, fname = str(rec["Path"]), str(rec["FileName"])
            qs = urllib.parse.urlencode(
                {"path": path, "fileName": fname, "type": _DOC_TYPE_PDF}
            )
            url = f"{_BASE}/Home/Download?{qs}"
            try:
                await asyncio.sleep(_INTER_REQUEST_COOLDOWN_S)
                dl = await client.get(url)
                dl.raise_for_status()
            except httpx.HTTPError as e:
                last_reason = f"הורדה נכשלה ({e})"
                continue
            # A 200 response is not proof of a PDF, so verify the magic bytes.
            if dl.content[:4] == b"%PDF":
                return FetchedVerdict(
                    content=dl.content,
                    filename=filename,
                    source_url=url,
                    case_name=str(rec.get("CaseName") or ""),
                )
            last_reason = f"מסמך {fname} חסום/לא-PDF ({len(dl.content)}B)"

        raise SupremeFetchError(
            f"אף מסמך של {citation} לא ירד כ-PDF ({len(candidates)} מועמדים) — {last_reason}"
        )