feat: production MCP server with Israeli legislation (multi-source)

Complete production implementation with shell+adapter architecture, 13 MCP tools, SQLite FTS5 search, and multi-source ingestion pipeline. Ingestion fetches from UCI mirror, UNODC SHERLOC PDFs, and Knesset mobile PDFs (135 provisions, 33 definitions). 3 acts with full text, 7 acts metadata-only due to gov.il/nevo.co.il access restrictions. Knesset OData API used for metadata enrichment. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-19 20:40:01 +01:00
parent 21aa81d2b0
commit 1e28f8a6b1
41 changed files with 9136 additions and 51 deletions
--- a/scripts/lib/fetcher.ts
+++ b/scripts/lib/fetcher.ts
@@ -0,0 +1,239 @@
+/**
+ * Multi-source fetcher for Israeli legislation.
+ *
+ * Sources (in priority order):
+ * 1. Knesset OData API     -- structured metadata (always accessible)
+ * 2. Accessible English PDFs -- UNODC, Knesset mobile PDFs
+ * 3. Accessible HTML pages  -- UCI mirror, etc.
+ *
+ * gov.il and nevo.co.il are Cloudflare-blocked for automated access;
+ * knesset.gov.il HTML pages use bot protection. We therefore use the
+ * OData API (no bot protection) for metadata and known accessible
+ * mirrors for the actual law text.
+ *
+ * - 500ms minimum delay between requests
+ * - User-Agent header identifying the MCP
+ * - No auth needed (Government Open Data / public mirrors)
+ */
+
+import { execSync } from 'child_process';
+import * as fs from 'fs';
+import * as path from 'path';
+
+const USER_AGENT =
+  'Israel-Law-MCP/1.0 (https://github.com/Ansvar-Systems/israel-law-mcp; hello@ansvar.ai)';
+const MIN_DELAY_MS = 500;
+
+let lastRequestTime = 0;
+
+async function rateLimit(): Promise<void> {
+  const now = Date.now();
+  const elapsed = now - lastRequestTime;
+  if (elapsed < MIN_DELAY_MS) {
+    await new Promise((resolve) => setTimeout(resolve, MIN_DELAY_MS - elapsed));
+  }
+  lastRequestTime = Date.now();
+}
+
+export interface FetchResult {
+  status: number;
+  body: string;
+  contentType: string;
+}
+
+// ---------------------------------------------------------------------------
+// Generic HTTP fetch with rate limiting + retries
+// ---------------------------------------------------------------------------
+
+export async function fetchWithRateLimit(
+  url: string,
+  maxRetries = 3,
+): Promise<FetchResult> {
+  await rateLimit();
+
+  for (let attempt = 0; attempt <= maxRetries; attempt++) {
+    try {
+      const response = await fetch(url, {
+        headers: {
+          'User-Agent': USER_AGENT,
+          Accept: 'text/html, application/xhtml+xml, application/json, application/pdf, */*',
+        },
+        redirect: 'follow',
+      });
+
+      if (response.status === 429 || response.status >= 500) {
+        if (attempt < maxRetries) {
+          const backoff = Math.pow(2, attempt + 1) * 1000;
+          console.log(`  HTTP ${response.status} for ${url}, retrying in ${backoff}ms...`);
+          await new Promise((resolve) => setTimeout(resolve, backoff));
+          continue;
+        }
+      }
+
+      const body = await response.text();
+      return {
+        status: response.status,
+        body,
+        contentType: response.headers.get('content-type') ?? '',
+      };
+    } catch (err) {
+      if (attempt < maxRetries) {
+        const backoff = Math.pow(2, attempt + 1) * 1000;
+        const msg = err instanceof Error ? err.message : String(err);
+        console.log(`  Network error for ${url}: ${msg}, retrying in ${backoff}ms...`);
+        await new Promise((resolve) => setTimeout(resolve, backoff));
+        continue;
+      }
+      throw err;
+    }
+  }
+
+  throw new Error(`Failed to fetch ${url} after ${maxRetries} retries`);
+}
+
+// ---------------------------------------------------------------------------
+// PDF download + text extraction via pdftotext
+// ---------------------------------------------------------------------------
+
+export async function fetchPdfAsText(
+  url: string,
+  cacheDir: string,
+  cacheKey: string,
+): Promise<string | null> {
+  const pdfPath = path.join(cacheDir, `${cacheKey}.pdf`);
+  const txtPath = path.join(cacheDir, `${cacheKey}.txt`);
+
+  // Use cached text if available
+  if (fs.existsSync(txtPath)) {
+    return fs.readFileSync(txtPath, 'utf-8');
+  }
+
+  await rateLimit();
+
+  try {
+    // Download PDF via curl (follows redirects, handles binary)
+    execSync(
+      `curl -sL -o "${pdfPath}" "${url}"`,
+      { timeout: 30_000 },
+    );
+
+    if (!fs.existsSync(pdfPath) || fs.statSync(pdfPath).size < 100) {
+      console.log(`  PDF download failed or empty for ${url}`);
+      return null;
+    }
+
+    // Extract text with pdftotext
+    try {
+      const text = execSync(`pdftotext "${pdfPath}" -`, {
+        timeout: 30_000,
+        maxBuffer: 5 * 1024 * 1024,
+      }).toString('utf-8');
+
+      if (text.trim().length > 50) {
+        fs.writeFileSync(txtPath, text);
+        return text;
+      }
+    } catch {
+      console.log(`  pdftotext extraction failed for ${pdfPath}`);
+    }
+
+    return null;
+  } catch (err) {
+    const msg = err instanceof Error ? err.message : String(err);
+    console.log(`  PDF fetch error for ${url}: ${msg}`);
+    return null;
+  }
+}
+
+// ---------------------------------------------------------------------------
+// Knesset OData API -- always accessible, returns JSON metadata
+// ---------------------------------------------------------------------------
+
+export interface KnessetLawMetadata {
+  IsraelLawID: number;
+  KnessetNum: number | null;
+  Name: string; // Hebrew name
+  IsBasicLaw: boolean;
+  IsFavoriteLaw: boolean;
+  PublicationDate: string;
+  LatestPublicationDate: string;
+  LawValidityID: number;
+  LawValidityDesc: string; // Hebrew validity description
+  ValidityStartDate: string | null;
+  LastUpdatedDate: string;
+}
+
+const KNESSET_ODATA_BASE = 'https://knesset.gov.il/Odata/ParliamentInfo.svc';
+
+export async function fetchKnessetODataLaw(
+  israelLawId: number,
+): Promise<KnessetLawMetadata | null> {
+  const url = `${KNESSET_ODATA_BASE}/KNS_IsraelLaw?$filter=IsraelLawID%20eq%20${israelLawId}&$format=json`;
+  const result = await fetchWithRateLimit(url);
+
+  if (result.status !== 200) return null;
+
+  try {
+    const data = JSON.parse(result.body);
+    const values = data.value as KnessetLawMetadata[];
+    return values.length > 0 ? values[0] : null;
+  } catch {
+    return null;
+  }
+}
+
+export async function searchKnessetODataLaws(
+  nameSubstring: string,
+): Promise<KnessetLawMetadata[]> {
+  const url = `${KNESSET_ODATA_BASE}/KNS_IsraelLaw?$filter=substringof('${encodeURIComponent(nameSubstring)}',Name)&$format=json`;
+  const result = await fetchWithRateLimit(url);
+
+  if (result.status !== 200) return [];
+
+  try {
+    const data = JSON.parse(result.body);
+    return (data.value as KnessetLawMetadata[]) ?? [];
+  } catch {
+    return [];
+  }
+}
+
+// ---------------------------------------------------------------------------
+// Source URL registry -- maps act IDs to known accessible English sources
+// ---------------------------------------------------------------------------
+
+export interface SourceConfig {
+  /** Primary English text URL (HTML or PDF) */
+  url: string;
+  /** 'html' | 'pdf' */
+  format: 'html' | 'pdf';
+  /** Knesset OData IsraelLawID for metadata enrichment */
+  knessetLawId?: number;
+  /** Description of the source for provenance tracking */
+  sourceNote: string;
+}
+
+/**
+ * Known accessible English translation sources for each act.
+ * These are verified to be reachable without Cloudflare blocks.
+ */
+export const SOURCE_REGISTRY: Record<string, SourceConfig> = {
+  'privacy-protection-law-1981': {
+    url: 'https://ics.uci.edu/~kobsa/privacy/israel.htm',
+    format: 'html',
+    knessetLawId: 2000234,
+    sourceNote: 'UCI mirror of English translation by Haim Ravia Law Offices',
+  },
+  'computer-law-1995': {
+    url: 'https://www.unodc.org/cld/uploads/res/document/computer-law_html/Israel_Computers_Law_5755_1995.pdf',
+    format: 'pdf',
+    knessetLawId: 2000357,
+    sourceNote: 'UNODC SHERLOC database English translation',
+  },
+  'basic-law-human-dignity-1992': {
+    url: 'https://m.knesset.gov.il/EN/activity/documents/BasicLawsPDF/BasicLawLiberty.pdf',
+    format: 'pdf',
+    knessetLawId: undefined, // Basic Laws have different numbering
+    sourceNote: 'Official Knesset English translation PDF',
+  },
+};
--- a/scripts/lib/parser.ts
+++ b/scripts/lib/parser.ts
@@ -0,0 +1,682 @@
+/**
+ * Multi-format parser for Israeli legislation.
+ *
+ * Handles two content formats:
+ *   1. HTML  -- from UCI mirror (Privacy Protection Law)
+ *   2. Plain text -- from pdftotext extraction (Computer Law, Basic Law)
+ *
+ * Israeli laws use "Section N" numbering, not "Article N".
+ * Basic Laws use numbered sections without the "Section" prefix in some formats.
+ */
+
+// ---------------------------------------------------------------------------
+// Interfaces
+// ---------------------------------------------------------------------------
+
+export interface ActIndexEntry {
+  id: string;
+  lawName: string;
+  year: number;
+  title: string;
+  titleEn: string;
+  abbreviation: string;
+  status: 'in_force' | 'amended' | 'repealed' | 'not_yet_in_force';
+  issuedDate: string;
+  inForceDate: string;
+  url: string;
+}
+
+export interface ParsedProvision {
+  provision_ref: string;
+  chapter?: string;
+  section: string;
+  title: string;
+  content: string;
+}
+
+export interface ParsedDefinition {
+  term: string;
+  definition: string;
+  source_provision?: string;
+}
+
+export interface ParsedAct {
+  id: string;
+  type: 'statute';
+  title: string;
+  title_en: string;
+  short_name: string;
+  status: 'in_force' | 'amended' | 'repealed' | 'not_yet_in_force';
+  issued_date: string;
+  in_force_date: string;
+  url: string;
+  description?: string;
+  provisions: ParsedProvision[];
+  definitions: ParsedDefinition[];
+}
+
+// ---------------------------------------------------------------------------
+// HTML utilities
+// ---------------------------------------------------------------------------
+
+function stripHtml(html: string): string {
+  return html
+    .replace(/<[^>]+>/g, ' ')
+    .replace(/&nbsp;/g, ' ')
+    .replace(/&amp;/g, '&')
+    .replace(/&lt;/g, '<')
+    .replace(/&gt;/g, '>')
+    .replace(/&quot;/g, '"')
+    .replace(/&#39;/g, "'")
+    .replace(/\s+/g, ' ')
+    .trim();
+}
+
+function normalizeText(text: string): string {
+  return text.replace(/\s+/g, ' ').trim();
+}
+
+// ---------------------------------------------------------------------------
+// HTML Parser -- for UCI mirror format (Privacy Protection Law)
+//
+// Structure: <B>N. Title</B> ... <P><B>N+1. Title</B>
+// Chapters: <B>CHAPTER ...: ...</B>
+// ---------------------------------------------------------------------------
+
+export function parsePrivacyLawHtml(html: string, act: ActIndexEntry): ParsedAct {
+  const provisions: ParsedProvision[] = [];
+  const definitions: ParsedDefinition[] = [];
+
+  // Extract the law body (inside the main table)
+  const bodyMatch = html.match(/PROTECTION OF PRIVACY LAW[\s\S]*?(?=<\/TD>\s*<\/TR>\s*<\/TABLE>\s*<BR>)/i);
+  const body = bodyMatch ? bodyMatch[0] : html;
+
+  let currentChapter = '';
+
+  // Split by bold section numbers: <B>N. or <B><a name=...>N.
+  // Pattern: <B> optionally <a name="..."></a> then section number. title</B>
+  const sectionPattern = /<B>(?:<a[^>]*><\/a>)?\s*(\d+[A-Z]?)\.\s+([^<]+)<\/B>/gi;
+  const chapterPattern = /<B>\s*(CHAPTER\s+[^:]+:\s*[^<]+)<\/B>/gi;
+
+  // First, collect chapter positions
+  const chapters: Array<{ pos: number; name: string }> = [];
+  let chMatch;
+  while ((chMatch = chapterPattern.exec(body)) !== null) {
+    chapters.push({ pos: chMatch.index, name: normalizeText(stripHtml(chMatch[1])) });
+  }
+
+  // Also collect article positions (Article One: Data Bases, Article Two: Direct Mail)
+  const articlePattern = /<B>\s*(Article\s+[^:]+:\s*[^<]+)<\/B>/gi;
+  while ((chMatch = articlePattern.exec(body)) !== null) {
+    chapters.push({ pos: chMatch.index, name: normalizeText(stripHtml(chMatch[1])) });
+  }
+  chapters.sort((a, b) => a.pos - b.pos);
+
+  // Collect all section matches
+  const sectionMatches: Array<{ pos: number; num: string; title: string }> = [];
+  let secMatch;
+  while ((secMatch = sectionPattern.exec(body)) !== null) {
+    sectionMatches.push({
+      pos: secMatch.index,
+      num: secMatch[1].trim(),
+      title: normalizeText(stripHtml(secMatch[2])),
+    });
+  }
+
+  // For each section, determine its chapter and extract content
+  for (let i = 0; i < sectionMatches.length; i++) {
+    const sec = sectionMatches[i];
+    const nextSec = sectionMatches[i + 1];
+
+    // Determine chapter for this section
+    for (const ch of chapters) {
+      if (ch.pos < sec.pos) {
+        currentChapter = ch.name;
+      }
+    }
+
+    // Extract content between this section and next section
+    const startPos = sec.pos;
+    const endPos = nextSec ? nextSec.pos : body.length;
+    const rawContent = body.substring(startPos, endPos);
+    const content = normalizeText(stripHtml(rawContent));
+
+    if (content.length > 10) {
+      const provRef = `sec${sec.num}`;
+
+      provisions.push({
+        provision_ref: provRef,
+        chapter: currentChapter || undefined,
+        section: sec.num,
+        title: sec.title,
+        content: content.substring(0, 8000),
+      });
+
+      // Extract definitions from Section 3 and Section 7 (definition sections)
+      if (sec.num === '3' || sec.num === '7' || sec.num === '17C') {
+        extractDefinitionsFromContent(content, provRef, definitions);
+      }
+    }
+  }
+
+  return {
+    id: act.id,
+    type: 'statute',
+    title: act.title,
+    title_en: act.titleEn,
+    short_name: act.abbreviation,
+    status: act.status,
+    issued_date: act.issuedDate,
+    in_force_date: act.inForceDate,
+    url: act.url,
+    provisions,
+    definitions,
+  };
+}
+
+// ---------------------------------------------------------------------------
+// Plain-text Parser -- for pdftotext output (Computer Law, Basic Law)
+//
+// Computer Law format:
+//   "Section N\n\nTitle text\n\nN. content..."
+//   or just "N. content..."
+//
+// Basic Law format:
+//   "Title label\n\nN. content..."
+// ---------------------------------------------------------------------------
+
+export function parseComputerLawText(text: string, act: ActIndexEntry): ParsedAct {
+  const provisions: ParsedProvision[] = [];
+  const definitions: ParsedDefinition[] = [];
+
+  // The Computer Law PDF from UNODC has a two-part structure:
+  //   1. Table of Contents (contains "Section N" + "Go" lines)
+  //   2. Actual law text starting with "Computers Law, 5755"
+  // We skip the ToC and parse only the actual law text.
+
+  const lawTextStart = text.indexOf('Computers Law, 5755');
+  const lawText = lawTextStart >= 0 ? text.substring(lawTextStart) : text;
+
+  let currentChapter = '';
+  const lines = lawText.split('\n');
+  const sections: Array<{ num: string; title: string; content: string; chapter: string }> = [];
+  let currentSection: { num: string; title: string; content: string; chapter: string } | null = null;
+
+  // Track marginal note lines (title labels that appear before section numbers)
+  let marginalNoteLines: string[] = [];
+
+  for (let i = 0; i < lines.length; i++) {
+    const line = lines[i].trim();
+
+    // Detect chapter headings
+    const chapterMatch = line.match(/^Chapter\s+(One|Two|Three|Four|Five|Six|Seven|Eight|Nine|Ten|\w+):\s*(.+)/i);
+    if (chapterMatch) {
+      currentChapter = normalizeText(line);
+      marginalNoteLines = [];
+      continue;
+    }
+
+    // Pattern 1: "N." alone on a line (section number with content on next line)
+    // This is the common PDF format where the number is isolated
+    const sectionAloneMatch = line.match(/^(\d+[A-Za-z]?)\.\s*$/);
+    if (sectionAloneMatch) {
+      // Save previous section
+      if (currentSection) {
+        sections.push(currentSection);
+      }
+
+      // The marginal note lines before this number are the title
+      const titleCandidates = marginalNoteLines.filter((l) =>
+        l.length > 0 && l.length < 100
+        && !l.match(/^Chapter\s+/i) && !l.match(/^Go$/i)
+        && !l.match(/^Section\s+\d+/) && !l.match(/^Clause\s+/i)
+        && !l.match(/^\*/) && !l.match(/^Contents$/)
+        && !l.match(/^\d+$/) && !l.match(/^Computers Law/i)
+        && !l.match(/^Published in/i)
+      );
+      const title = normalizeText(titleCandidates.join(' '));
+
+      currentSection = {
+        num: sectionAloneMatch[1],
+        title: title,
+        content: '',
+        chapter: currentChapter,
+      };
+      marginalNoteLines = [];
+      continue;
+    }
+
+    // Pattern 2: "N. (a) content" or "N. Content text" on the same line
+    const sectionInlineMatch = line.match(/^(\d+[A-Za-z]?)\.\s+(.+)/);
+    if (sectionInlineMatch) {
+      // Check this isn't a page footnote like "* Published in..."
+      if (sectionInlineMatch[2].match(/^Published in/i)) {
+        continue;
+      }
+
+      // Save previous section
+      if (currentSection) {
+        sections.push(currentSection);
+      }
+
+      // Marginal note = title
+      const titleCandidates = marginalNoteLines.filter((l) =>
+        l.length > 0 && l.length < 100
+        && !l.match(/^Chapter\s+/i) && !l.match(/^Go$/i)
+        && !l.match(/^Section\s+\d+/) && !l.match(/^Clause\s+/i)
+        && !l.match(/^\*/) && !l.match(/^Contents$/)
+        && !l.match(/^\d+$/) && !l.match(/^Computers Law/i)
+      );
+      const title = normalizeText(titleCandidates.join(' '));
+
+      currentSection = {
+        num: sectionInlineMatch[1],
+        title: title,
+        content: normalizeText(sectionInlineMatch[0]),
+        chapter: currentChapter,
+      };
+      marginalNoteLines = [];
+      continue;
+    }
+
+    // Accumulate content for current section
+    if (currentSection && line.length > 0) {
+      // Skip page numbers (standalone digits), headers, and footnote markers
+      if (line.match(/^\d+$/) && line.length <= 3) continue;
+      if (line.match(/^Computers Law, 1995/i)) continue;
+
+      currentSection.content += ' ' + normalizeText(line);
+    } else if (!currentSection && line.length > 0) {
+      // Track marginal note lines (before any section starts, or between sections)
+      if (!line.match(/^Go$/i) && !line.match(/^Section\s+\d+/)
+          && !line.match(/^Computers Law/i) && !line.match(/^\d+$/)
+          && !line.match(/^\*$/) && !line.match(/^Published in/i)) {
+        marginalNoteLines.push(line);
+      } else {
+        // Reset on non-title lines
+        if (line.match(/^Go$/i) || line.match(/^Section\s+\d+/)) {
+          marginalNoteLines = [];
+        }
+      }
+    } else if (line.length === 0 && !currentSection) {
+      // Empty line resets marginal notes only if we haven't started collecting them recently
+      // Keep them -- marginal notes can span across blank lines in PDF
+    }
+  }
+
+  // Save last section
+  if (currentSection) {
+    sections.push(currentSection);
+  }
+
+  for (const sec of sections) {
+    const content = normalizeText(sec.content);
+    if (content.length > 10) {
+      provisions.push({
+        provision_ref: `sec${sec.num}`,
+        chapter: sec.chapter || undefined,
+        section: sec.num,
+        title: sec.title,
+        content: content.substring(0, 8000),
+      });
+
+      // Extract definitions from Section 1 (uses regular quotes in PDF text)
+      if (sec.num === '1') {
+        extractDefinitionsFromPlainText(content, `sec${sec.num}`, definitions);
+      }
+    }
+  }
+
+  return {
+    id: act.id,
+    type: 'statute',
+    title: act.title,
+    title_en: act.titleEn,
+    short_name: act.abbreviation,
+    status: act.status,
+    issued_date: act.issuedDate,
+    in_force_date: act.inForceDate,
+    url: act.url,
+    provisions,
+    definitions,
+  };
+}
+
+export function parseBasicLawText(text: string, act: ActIndexEntry): ParsedAct {
+  const provisions: ParsedProvision[] = [];
+  const definitions: ParsedDefinition[] = [];
+
+  // Basic Law format from Knesset PDF:
+  // "Title label\n\n1.\n\nContent text..."
+  // or "Title label\n\n1. Content text..."
+
+  const lines = text.split('\n');
+  const sections: Array<{ num: string; title: string; content: string }> = [];
+  let currentSection: { num: string; title: string; content: string } | null = null;
+
+  for (let i = 0; i < lines.length; i++) {
+    const line = lines[i].trim();
+
+    // Detect section start: "N." at start of line
+    const sectionMatch = line.match(/^(\d+[a-z]?)\.\s*(.*)/);
+    if (sectionMatch) {
+      // Look back for title (marginal label)
+      let title = '';
+      for (let j = i - 1; j >= Math.max(0, i - 4); j--) {
+        const prevLine = lines[j].trim();
+        if (prevLine.length > 0 && !prevLine.match(/^\d+[a-z]?\.\s/)
+            && !prevLine.match(/^\(Amendment/) && prevLine.length < 100) {
+          title = prevLine + (title ? ' ' + title : '');
+        } else if (prevLine.length === 0 && title.length > 0) {
+          break;
+        }
+      }
+
+      // Save previous section
+      if (currentSection) {
+        sections.push(currentSection);
+      }
+
+      currentSection = {
+        num: sectionMatch[1],
+        title: normalizeText(title),
+        content: sectionMatch[2] ? normalizeText(sectionMatch[0]) : '',
+      };
+      continue;
+    }
+
+    // Accumulate content
+    if (currentSection && line.length > 0) {
+      // Skip header / footer lines
+      if (line.match(/^BASIC-LAW:/i)) continue;
+      if (line.match(/^This unofficial/i)) continue;
+      if (line.match(/^For the full/i)) continue;
+      if (line.match(/^Special thanks/i)) continue;
+
+      currentSection.content += ' ' + normalizeText(line);
+    }
+  }
+
+  if (currentSection) {
+    sections.push(currentSection);
+  }
+
+  for (const sec of sections) {
+    const content = normalizeText(sec.content);
+    if (content.length > 10) {
+      provisions.push({
+        provision_ref: `sec${sec.num}`,
+        chapter: undefined,
+        section: sec.num,
+        title: sec.title,
+        content: content.substring(0, 8000),
+      });
+    }
+  }
+
+  return {
+    id: act.id,
+    type: 'statute',
+    title: act.title,
+    title_en: act.titleEn,
+    short_name: act.abbreviation,
+    status: act.status,
+    issued_date: act.issuedDate,
+    in_force_date: act.inForceDate,
+    url: act.url,
+    provisions,
+    definitions,
+  };
+}
+
+// ---------------------------------------------------------------------------
+// Definition extractors
+// ---------------------------------------------------------------------------
+
+function extractDefinitionsFromContent(
+  content: string,
+  sourceProvision: string,
+  definitions: ParsedDefinition[],
+): void {
+  // Pattern: "term" - definition text; or "term" has the meaning...
+  const defPattern = /["\u201c]([^"\u201d]+)["\u201d]\s*[-\u2013\u2014]\s*([^;]+(?:;|$))/g;
+  let match;
+  while ((match = defPattern.exec(content)) !== null) {
+    const term = normalizeText(match[1]);
+    const definition = normalizeText(match[2]).replace(/;$/, '').trim();
+    if (term.length > 1 && term.length < 80 && definition.length > 5) {
+      // Avoid duplicates
+      if (!definitions.some((d) => d.term === term)) {
+        definitions.push({ term, definition, source_provision: sourceProvision });
+      }
+    }
+  }
+}
+
+function extractDefinitionsFromPlainText(
+  content: string,
+  sourceProvision: string,
+  definitions: ParsedDefinition[],
+): void {
+  // Computer Law definitions format (PDF uses regular quotes):
+  // "computer material" - software or information;
+  // Also handle curly quotes from other sources
+  const patterns = [
+    /["\u201c]([^"\u201d]+)["\u201d]\s*[-\u2013\u2014]+\s*([^;]+;)/g,
+    /"([^"]+)"\s*[-\u2013\u2014]+\s*([^;]+;)/g,
+  ];
+  const seen = new Set<string>();
+  for (const defPattern of patterns) {
+    let match;
+    while ((match = defPattern.exec(content)) !== null) {
+      const term = normalizeText(match[1]);
+      const definition = normalizeText(match[2]).replace(/;$/, '').trim();
+      if (term.length > 1 && term.length < 80 && definition.length > 5 && !seen.has(term)) {
+        seen.add(term);
+        if (!definitions.some((d) => d.term === term)) {
+          definitions.push({ term, definition, source_provision: sourceProvision });
+        }
+      }
+    }
+  }
+}
+
+// ---------------------------------------------------------------------------
+// Generic HTML parser (fallback for future sources)
+// ---------------------------------------------------------------------------
+
+export function parseIsraeliLawHtml(html: string, act: ActIndexEntry): ParsedAct {
+  // Route to the appropriate specific parser based on act ID
+  if (act.id === 'privacy-protection-law-1981') {
+    return parsePrivacyLawHtml(html, act);
+  }
+
+  // Generic fallback: try to split by bold section numbers
+  const provisions: ParsedProvision[] = [];
+  const definitions: ParsedDefinition[] = [];
+
+  let currentChapter = '';
+  const sectionPattern = /<B>(?:<a[^>]*><\/a>)?\s*(\d+[A-Z]?)\.\s+([^<]+)<\/B>/gi;
+  const chapterPattern = /<B>\s*(CHAPTER\s+[^:]+:\s*[^<]+)<\/B>/gi;
+
+  const chapters: Array<{ pos: number; name: string }> = [];
+  let chMatch;
+  while ((chMatch = chapterPattern.exec(html)) !== null) {
+    chapters.push({ pos: chMatch.index, name: normalizeText(stripHtml(chMatch[1])) });
+  }
+
+  const sectionMatches: Array<{ pos: number; num: string; title: string }> = [];
+  let secMatch;
+  while ((secMatch = sectionPattern.exec(html)) !== null) {
+    sectionMatches.push({
+      pos: secMatch.index,
+      num: secMatch[1].trim(),
+      title: normalizeText(stripHtml(secMatch[2])),
+    });
+  }
+
+  for (let i = 0; i < sectionMatches.length; i++) {
+    const sec = sectionMatches[i];
+    const nextSec = sectionMatches[i + 1];
+
+    for (const ch of chapters) {
+      if (ch.pos < sec.pos) currentChapter = ch.name;
+    }
+
+    const startPos = sec.pos;
+    const endPos = nextSec ? nextSec.pos : html.length;
+    const content = normalizeText(stripHtml(html.substring(startPos, endPos)));
+
+    if (content.length > 10) {
+      provisions.push({
+        provision_ref: `sec${sec.num}`,
+        chapter: currentChapter || undefined,
+        section: sec.num,
+        title: sec.title,
+        content: content.substring(0, 8000),
+      });
+    }
+  }
+
+  return {
+    id: act.id,
+    type: 'statute',
+    title: act.title,
+    title_en: act.titleEn,
+    short_name: act.abbreviation,
+    status: act.status,
+    issued_date: act.issuedDate,
+    in_force_date: act.inForceDate,
+    url: act.url,
+    provisions,
+    definitions,
+  };
+}
+
+// ---------------------------------------------------------------------------
+// Key Israeli Acts -- updated with correct Knesset OData IDs and
+// accessible English source URLs
+// ---------------------------------------------------------------------------
+
+export const KEY_ISRAELI_ACTS: ActIndexEntry[] = [
+  {
+    id: 'privacy-protection-law-1981',
+    lawName: 'Privacy Protection Law',
+    year: 1981,
+    title: '\u05d7\u05d5\u05e7 \u05d4\u05d2\u05e0\u05ea \u05d4\u05e4\u05e8\u05d8\u05d9\u05d5\u05ea, \u05ea\u05e9\u05de"\u05d0-1981',
+    titleEn: 'Protection of Privacy Law, 5741-1981',
+    abbreviation: 'PPL',
+    status: 'in_force',
+    issuedDate: '1981-03-11',
+    inForceDate: '1981-09-11',
+    url: 'https://ics.uci.edu/~kobsa/privacy/israel.htm',
+  },
+  {
+    id: 'data-security-regulations-2017',
+    lawName: 'Protection of Privacy Regulations (Data Security)',
+    year: 2017,
+    title: '\u05ea\u05e7\u05e0\u05d5\u05ea \u05d4\u05d2\u05e0\u05ea \u05d4\u05e4\u05e8\u05d8\u05d9\u05d5\u05ea (\u05d0\u05d1\u05d8\u05d7\u05ea \u05de\u05d9\u05d3\u05e2), \u05ea\u05e9\u05e2"\u05d6-2017',
+    titleEn: 'Protection of Privacy Regulations (Data Security), 5777-2017',
+    abbreviation: 'DSR',
+    status: 'in_force',
+    issuedDate: '2017-03-21',
+    inForceDate: '2018-05-08',
+    url: 'https://www.gov.il/en/departments/legalinfo/data_security_regulation',
+  },
+  {
+    id: 'computer-law-1995',
+    lawName: 'Computers Law',
+    year: 1995,
+    title: '\u05d7\u05d5\u05e7 \u05d4\u05de\u05d7\u05e9\u05d1\u05d9\u05dd, \u05ea\u05e9\u05e0"\u05d4-1995',
+    titleEn: 'Computers Law, 5755-1995',
+    abbreviation: 'CL',
+    status: 'in_force',
+    issuedDate: '1995-07-25',
+    inForceDate: '1995-10-25',
+    url: 'https://www.unodc.org/cld/uploads/res/document/computer-law_html/Israel_Computers_Law_5755_1995.pdf',
+  },
+  {
+    id: 'basic-law-human-dignity-1992',
+    lawName: 'Basic Law: Human Dignity and Liberty',
+    year: 1992,
+    title: '\u05d7\u05d5\u05e7 \u05d9\u05e1\u05d5\u05d3: \u05db\u05d1\u05d5\u05d3 \u05d4\u05d0\u05d3\u05dd \u05d5\u05d7\u05d9\u05e8\u05d5\u05ea\u05d5',
+    titleEn: 'Basic Law: Human Dignity and Liberty, 5752-1992',
+    abbreviation: 'BL-HDL',
+    status: 'in_force',
+    issuedDate: '1992-03-17',
+    inForceDate: '1992-03-17',
+    url: 'https://m.knesset.gov.il/EN/activity/documents/BasicLawsPDF/BasicLawLiberty.pdf',
+  },
+  {
+    id: 'companies-law-1999',
+    lawName: 'Companies Law',
+    year: 1999,
+    title: '\u05d7\u05d5\u05e7 \u05d4\u05d7\u05d1\u05e8\u05d5\u05ea, \u05ea\u05e9\u05e0"\u05d8-1999',
+    titleEn: 'Companies Law, 5759-1999',
+    abbreviation: 'CoL',
+    status: 'in_force',
+    issuedDate: '1999-02-15',
+    inForceDate: '2000-02-01',
+    url: 'https://www.gov.il/en/departments/legalinfo/companies_law',
+  },
+  {
+    id: 'electronic-signature-law-2001',
+    lawName: 'Electronic Signature Law',
+    year: 2001,
+    title: '\u05d7\u05d5\u05e7 \u05d7\u05ea\u05d9\u05de\u05d4 \u05d0\u05dc\u05e7\u05d8\u05e8\u05d5\u05e0\u05d9\u05ea, \u05ea\u05e1"\u05d0-2001',
+    titleEn: 'Electronic Signature Law, 5761-2001',
+    abbreviation: 'ESL',
+    status: 'in_force',
+    issuedDate: '2001-08-07',
+    inForceDate: '2001-08-07',
+    url: 'https://www.gov.il/en/departments/legalinfo/electronic_signature_law',
+  },
+  {
+    id: 'credit-data-law-2002',
+    lawName: 'Credit Data Law',
+    year: 2002,
+    title: '\u05d7\u05d5\u05e7 \u05e0\u05ea\u05d5\u05e0\u05d9 \u05d0\u05e9\u05e8\u05d0\u05d9, \u05ea\u05e1"\u05d1-2002',
+    titleEn: 'Credit Data Law, 5762-2002',
+    abbreviation: 'CDL',
+    status: 'in_force',
+    issuedDate: '2002-01-01',
+    inForceDate: '2002-01-01',
+    url: 'https://www.nevo.co.il/law_html/law01/999_611.htm',
+  },
+  {
+    id: 'freedom-of-information-law-1998',
+    lawName: 'Freedom of Information Law',
+    year: 1998,
+    title: '\u05d7\u05d5\u05e7 \u05d7\u05d5\u05e4\u05e9 \u05d4\u05de\u05d9\u05d3\u05e2, \u05ea\u05e9\u05e0"\u05d7-1998',
+    titleEn: 'Freedom of Information Law, 5758-1998',
+    abbreviation: 'FoIL',
+    status: 'in_force',
+    issuedDate: '1998-05-19',
+    inForceDate: '1999-05-19',
+    url: 'https://www.gov.il/en/departments/legalinfo/freedom_of_information_law',
+  },
+  {
+    id: 'regulation-of-security-1998',
+    lawName: 'Regulation of Security in Public Bodies Law',
+    year: 1998,
+    title: '\u05d7\u05d5\u05e7 \u05d4\u05e1\u05d3\u05e8\u05ea \u05d4\u05d0\u05d1\u05d8\u05d7\u05d4 \u05d1\u05d2\u05d5\u05e4\u05d9\u05dd \u05e6\u05d9\u05d1\u05d5\u05e8\u05d9\u05d9\u05dd, \u05ea\u05e9\u05e0"\u05d7-1998',
+    titleEn: 'Regulation of Security in Public Bodies Law, 5758-1998',
+    abbreviation: 'RSPBL',
+    status: 'in_force',
+    issuedDate: '1998-01-01',
+    inForceDate: '1998-01-01',
+    url: 'https://www.nevo.co.il/law_html/law01/999_574.htm',
+  },
+  {
+    id: 'communications-law-1982',
+    lawName: 'Communications Law (Telecommunications and Broadcasting)',
+    year: 1982,
+    title: '\u05d7\u05d5\u05e7 \u05d4\u05ea\u05e7\u05e9\u05d5\u05e8\u05ea (\u05d1\u05d6\u05e7 \u05d5\u05e9\u05d9\u05d3\u05d5\u05e8\u05d9\u05dd), \u05ea\u05e9\u05de"\u05d1-1982',
+    titleEn: 'Communications Law (Telecommunications and Broadcasting), 5742-1982',
+    abbreviation: 'CommL',
+    status: 'in_force',
+    issuedDate: '1982-01-01',
+    inForceDate: '1984-02-01',
+    url: 'https://www.nevo.co.il/law_html/law01/044_001.htm',
+  },
+];