feat: production MCP server with Israeli legislation (multi-source)
Complete production implementation with shell+adapter architecture, 13 MCP tools, SQLite FTS5 search, and multi-source ingestion pipeline. Ingestion fetches from UCI mirror, UNODC SHERLOC PDFs, and Knesset mobile PDFs (135 provisions, 33 definitions). 3 acts with full text, 7 acts metadata-only due to gov.il/nevo.co.il access restrictions. Knesset OData API used for metadata enrichment. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
473
scripts/build-db.ts
Normal file
473
scripts/build-db.ts
Normal file
@@ -0,0 +1,473 @@
|
||||
#!/usr/bin/env tsx
|
||||
/**
|
||||
* Database builder for Israel Law MCP server.
|
||||
*
|
||||
* Builds the SQLite database from seed JSON files in data/seed/.
|
||||
* Follows the Switzerland Law MCP reference pattern.
|
||||
*
|
||||
* Usage: npm run build:db
|
||||
*/
|
||||
|
||||
import Database from 'better-sqlite3';
|
||||
import * as fs from 'fs';
|
||||
import * as path from 'path';
|
||||
import { fileURLToPath } from 'url';
|
||||
|
||||
const __filename = fileURLToPath(import.meta.url);
|
||||
const __dirname = path.dirname(__filename);
|
||||
|
||||
const SEED_DIR = path.resolve(__dirname, '../data/seed');
|
||||
const DB_PATH = path.resolve(__dirname, '../data/database.db');
|
||||
|
||||
// Seed file types
|
||||
/**
 * Shape of one seed JSON file in data/seed/: a single legal document plus its
 * optional provisions and definitions.
 *
 * `type` and `status` are optional because seed files are parsed JSON that may
 * omit them; buildDatabase() substitutes 'statute' and 'in_force' respectively.
 */
interface DocumentSeed {
  /** Primary key of the document row. */
  id: string;
  /** Document kind; defaults to 'statute' when absent. */
  type?: 'statute';
  title: string;
  title_en?: string;
  short_name?: string;
  /** Lifecycle status; defaults to 'in_force' when absent. */
  status?: 'in_force' | 'amended' | 'repealed' | 'not_yet_in_force';
  issued_date?: string;
  in_force_date?: string;
  url?: string;
  description?: string;
  /** Provisions of the document; duplicates by provision_ref are collapsed at build time. */
  provisions?: ProvisionSeed[];
  definitions?: DefinitionSeed[];
}
|
||||
|
||||
/** One provision of a document as it appears in a seed file. */
interface ProvisionSeed {
  /** Reference of the provision; the build de-duplicates provisions on its trimmed value. */
  provision_ref: string;
  /** Optional chapter heading the provision belongs to. */
  chapter?: string;
  section: string;
  title?: string;
  /** Full provision text; also scanned for EU citations during the build. */
  content: string;
  /** Free-form extra data; written to the database as JSON text. */
  metadata?: Record<string, unknown>;
}
|
||||
|
||||
/** A defined legal term attached to a document in a seed file. */
interface DefinitionSeed {
  /** The term being defined; must be unique within its document. */
  term: string;
  definition: string;
  /** Reference of the provision that contains the definition, when known. */
  source_provision?: string;
}
|
||||
|
||||
/** Kinds of EU instrument the extractor recognises. */
type EUDocumentType =
  | 'directive'
  | 'regulation';

/** Issuing community as written in the citation. */
type EUCommunity =
  | 'EU'
  | 'EC'
  | 'EEC'
  | 'Euratom';

/** How the citing text relates to the EU instrument. */
type EUReferenceType =
  | 'implements'
  | 'references';

/** One EU citation found in provision text by extractEuReferences(). */
interface ExtractedEUReference {
  type: EUDocumentType;
  community: EUCommunity;
  year: number;
  number: number;
  /** Identifier of the form `<type>:<year>/<number>`. */
  euDocumentId: string;
  /** Article number found near the citation, or null when none was found. */
  euArticle: string | null;
  /** The citation exactly as matched in the text. */
  fullCitation: string;
  /** Whitespace-collapsed excerpt of the text surrounding the citation. */
  referenceContext: string;
  referenceType: EUReferenceType;
}
|
||||
|
||||
// Database schema
|
||||
/**
 * DDL executed once against a freshly created database file.
 *
 * Creates the document, provision, cross-reference and definition tables, the
 * FTS5 indexes over provisions and definitions (kept in sync by
 * insert/delete/update triggers), the EU document/reference tables, and the
 * key/value build-metadata table.
 */
const SCHEMA = `
-- Legal documents (statutes)
CREATE TABLE legal_documents (
  id TEXT PRIMARY KEY,
  type TEXT NOT NULL CHECK(type IN ('statute', 'bill', 'case_law')),
  title TEXT NOT NULL,
  title_en TEXT,
  short_name TEXT,
  status TEXT NOT NULL DEFAULT 'in_force'
    CHECK(status IN ('in_force', 'amended', 'repealed', 'not_yet_in_force')),
  issued_date TEXT,
  in_force_date TEXT,
  url TEXT,
  description TEXT,
  last_updated TEXT DEFAULT (datetime('now'))
);

-- Individual provisions from statutes
CREATE TABLE legal_provisions (
  id INTEGER PRIMARY KEY,
  document_id TEXT NOT NULL REFERENCES legal_documents(id),
  provision_ref TEXT NOT NULL,
  chapter TEXT,
  section TEXT NOT NULL,
  title TEXT,
  content TEXT NOT NULL,
  metadata TEXT,
  UNIQUE(document_id, provision_ref)
);

CREATE INDEX idx_provisions_doc ON legal_provisions(document_id);
CREATE INDEX idx_provisions_chapter ON legal_provisions(document_id, chapter);

-- FTS5 for provision search
CREATE VIRTUAL TABLE provisions_fts USING fts5(
  content, title,
  content='legal_provisions',
  content_rowid='id',
  tokenize='unicode61'
);

CREATE TRIGGER provisions_ai AFTER INSERT ON legal_provisions BEGIN
  INSERT INTO provisions_fts(rowid, content, title)
  VALUES (new.id, new.content, new.title);
END;

CREATE TRIGGER provisions_ad AFTER DELETE ON legal_provisions BEGIN
  INSERT INTO provisions_fts(provisions_fts, rowid, content, title)
  VALUES ('delete', old.id, old.content, old.title);
END;

CREATE TRIGGER provisions_au AFTER UPDATE ON legal_provisions BEGIN
  INSERT INTO provisions_fts(provisions_fts, rowid, content, title)
  VALUES ('delete', old.id, old.content, old.title);
  INSERT INTO provisions_fts(rowid, content, title)
  VALUES (new.id, new.content, new.title);
END;

-- Cross-references between provisions/documents
CREATE TABLE cross_references (
  id INTEGER PRIMARY KEY,
  source_document_id TEXT NOT NULL REFERENCES legal_documents(id),
  source_provision_ref TEXT,
  target_document_id TEXT NOT NULL REFERENCES legal_documents(id),
  target_provision_ref TEXT,
  ref_type TEXT NOT NULL DEFAULT 'references'
    CHECK(ref_type IN ('references', 'amended_by', 'implements', 'see_also'))
);

CREATE INDEX idx_xref_source ON cross_references(source_document_id);
CREATE INDEX idx_xref_target ON cross_references(target_document_id);

-- Legal term definitions
CREATE TABLE definitions (
  id INTEGER PRIMARY KEY,
  document_id TEXT NOT NULL REFERENCES legal_documents(id),
  term TEXT NOT NULL,
  term_en TEXT,
  definition TEXT NOT NULL,
  source_provision TEXT,
  UNIQUE(document_id, term)
);

-- FTS5 for definition search
CREATE VIRTUAL TABLE definitions_fts USING fts5(
  term, definition,
  content='definitions',
  content_rowid='id',
  tokenize='unicode61'
);

CREATE TRIGGER definitions_ai AFTER INSERT ON definitions BEGIN
  INSERT INTO definitions_fts(rowid, term, definition)
  VALUES (new.id, new.term, new.definition);
END;

CREATE TRIGGER definitions_ad AFTER DELETE ON definitions BEGIN
  INSERT INTO definitions_fts(definitions_fts, rowid, term, definition)
  VALUES ('delete', old.id, old.term, old.definition);
END;

CREATE TRIGGER definitions_au AFTER UPDATE ON definitions BEGIN
  INSERT INTO definitions_fts(definitions_fts, rowid, term, definition)
  VALUES ('delete', old.id, old.term, old.definition);
  INSERT INTO definitions_fts(rowid, term, definition)
  VALUES (new.id, new.term, new.definition);
END;

-- EU Documents (directives and regulations)
CREATE TABLE eu_documents (
  id TEXT PRIMARY KEY,
  type TEXT NOT NULL CHECK (type IN ('directive', 'regulation')),
  year INTEGER NOT NULL CHECK (year >= 1957 AND year <= 2100),
  number INTEGER NOT NULL CHECK (number > 0),
  community TEXT CHECK (community IN ('EU', 'EC', 'EEC', 'Euratom')),
  celex_number TEXT,
  title TEXT,
  title_en TEXT,
  short_name TEXT,
  adoption_date TEXT,
  entry_into_force_date TEXT,
  in_force BOOLEAN DEFAULT 1,
  amended_by TEXT,
  repeals TEXT,
  url_eur_lex TEXT,
  description TEXT,
  last_updated TEXT DEFAULT CURRENT_TIMESTAMP
);

CREATE INDEX idx_eu_documents_type_year ON eu_documents(type, year DESC);

-- EU References (links national provisions to EU documents)
CREATE TABLE eu_references (
  id INTEGER PRIMARY KEY AUTOINCREMENT,
  source_type TEXT NOT NULL CHECK (source_type IN ('provision', 'document', 'case_law')),
  source_id TEXT NOT NULL,
  document_id TEXT NOT NULL REFERENCES legal_documents(id),
  provision_id INTEGER REFERENCES legal_provisions(id),
  eu_document_id TEXT NOT NULL REFERENCES eu_documents(id),
  eu_article TEXT,
  reference_type TEXT NOT NULL CHECK (reference_type IN (
    'implements', 'supplements', 'applies', 'references', 'complies_with',
    'derogates_from', 'amended_by', 'repealed_by', 'cites_article'
  )),
  reference_context TEXT,
  full_citation TEXT,
  is_primary_implementation BOOLEAN DEFAULT 0,
  implementation_status TEXT CHECK (implementation_status IN ('complete', 'partial', 'pending', 'unknown')),
  created_at TEXT DEFAULT CURRENT_TIMESTAMP,
  last_verified TEXT,
  UNIQUE(source_id, eu_document_id, eu_article)
);

CREATE INDEX idx_eu_references_document ON eu_references(document_id, eu_document_id);
CREATE INDEX idx_eu_references_eu_document ON eu_references(eu_document_id, document_id);
CREATE INDEX idx_eu_references_provision ON eu_references(provision_id, eu_document_id);

-- Build metadata
CREATE TABLE db_metadata (
  key TEXT PRIMARY KEY,
  value TEXT NOT NULL
);
`;
|
||||
|
||||
/**
 * Collapses every run of whitespace to a single space and removes leading
 * and trailing whitespace.
 */
function normalizeWhitespace(text: string): string {
  const words = text.split(/\s+/).filter((word) => word.length > 0);
  return words.join(' ');
}
|
||||
|
||||
/**
 * Collapses provisions that share the same (trimmed) provision_ref.
 *
 * When a reference occurs more than once, the entry whose whitespace-collapsed
 * content is strictly longest wins; on a tie the earlier entry is kept. The
 * result preserves the order in which each reference was first seen, and every
 * returned entry carries the trimmed reference.
 */
function dedupeProvisions(provisions: ProvisionSeed[]): ProvisionSeed[] {
  const collapsedLength = (value: string): number => value.replace(/\s+/g, ' ').trim().length;
  const kept = new Map<string, ProvisionSeed>();

  provisions.forEach((candidate) => {
    const key = candidate.provision_ref.trim();
    const current = kept.get(key);
    const currentWins =
      current !== undefined && collapsedLength(candidate.content) <= collapsedLength(current.content);
    if (currentWins) return;
    kept.set(key, { ...candidate, provision_ref: key });
  });

  return [...kept.values()];
}
|
||||
|
||||
/** Expands a two-digit year (50-99 -> 19xx, 00-49 -> 20xx); other lengths are taken literally. */
function expandEuYear(raw: string): number {
  const value = Number.parseInt(raw, 10);
  if (raw.length !== 2) return value;
  return value >= 50 ? 1900 + value : 2000 + value;
}

/** True when the year satisfies the range accepted by the eu_documents.year CHECK constraint. */
function isPlausibleEuYear(year: number): boolean {
  return year >= 1957 && year <= 2100;
}

/**
 * Decides which side of an "a/b" citation is the year and which is the number.
 *
 * Directives and post-2014 acts are cited "year/number" (95/46, 2016/679), but
 * older regulations are cited "No number/year" (45/2001, 1215/2012, 1408/71).
 * The first part is treated as the year unless it cannot be one, or the second
 * part is an unambiguous four-digit year while the first is not.
 *
 * @returns the resolved pair, or null when neither reading yields a valid
 *          year and a positive number.
 */
function resolveEuYearAndNumber(first: string, second: string): { year: number; number: number } | null {
  const firstAsYear = expandEuYear(first);
  const secondAsYear = expandEuYear(second);
  const firstIsYear = isPlausibleEuYear(firstAsYear);
  const secondIsYear = isPlausibleEuYear(secondAsYear);

  const numberComesFirst = secondIsYear && (!firstIsYear || (second.length === 4 && first.length !== 4));
  if (!numberComesFirst && !firstIsYear) return null;

  const year = numberComesFirst ? secondAsYear : firstAsYear;
  const number = Number.parseInt(numberComesFirst ? first : second, 10);
  if (Number.isNaN(number) || number <= 0) return null;

  return { year, number };
}

/**
 * Finds citations of EU regulations and directives in free text.
 *
 * Three citation shapes are recognised, tried from most to least specific:
 * community before the numbers ("Regulation (EU) 2016/679"), community after
 * the numbers ("Directive 95/46/EC"), and no community at all (defaults to
 * 'EU'). Each result carries a whitespace-collapsed context window of up to
 * 120 characters either side of the match, the first "Article N" found in that
 * window (if any), and a reference type of 'implements' when the window
 * contains implementation wording, otherwise 'references'.
 *
 * Results are unique per (EU document, article) pair. Citations whose year
 * falls outside 1957-2100 or whose number is not positive are dropped.
 *
 * @param text provision text to scan; empty or blank text yields no results.
 * @returns extracted references, ordered by pattern and then by position.
 */
function extractEuReferences(text: string): ExtractedEUReference[] {
  if (!text || text.trim().length === 0) return [];

  const patterns: RegExp[] = [
    /\b(?<kind>Regulation|Directive)\s*\((?<community>EU|EC|EEC|Euratom)\)\s*(?:No\.?\s*)?(?<first>\d{2,4})\/(?<second>\d{1,4})\b/gi,
    /\b(?<kind>Regulation|Directive)\s*(?:No\.?\s*)?(?<first>\d{2,4})\/(?<second>\d{1,4})\/(?<community>EU|EC|EEC|Euratom)\b/gi,
    /\b(?<kind>Regulation|Directive)\s*(?:No\.?\s*)?(?<first>\d{2,4})\/(?<second>\d{1,4})\b/gi,
  ];

  const refs: ExtractedEUReference[] = [];
  const seen = new Set<string>();
  // Start offsets already consumed by a more specific pattern. The generic
  // pattern re-matches the prefix of "Directive 95/46/EC"; skipping it keeps
  // the richer match and avoids a second entry with a truncated context.
  const claimedStarts = new Set<number>();

  for (const pattern of patterns) {
    let match: RegExpExecArray | null;
    while ((match = pattern.exec(text)) !== null) {
      if (claimedStarts.has(match.index)) continue;

      const { kind, community: communityRaw, first, second } = match.groups ?? {};
      if (!kind || !first || !second) continue;
      claimedStarts.add(match.index);

      const resolved = resolveEuYearAndNumber(first, second);
      if (resolved === null) continue;
      const { year, number } = resolved;

      const type = kind.toLowerCase() as EUDocumentType;
      const community = (communityRaw?.toUpperCase() ?? 'EU') as EUCommunity;
      const euDocumentId = `${type}:${year}/${number}`;

      const start = Math.max(0, match.index - 120);
      const end = Math.min(text.length, match.index + match[0].length + 120);
      const referenceContext = text.slice(start, end).replace(/\s+/g, ' ').trim();
      const euArticle = referenceContext.match(/\bArticle\s+(\d+[A-Za-z]?(?:\(\d+\))?)/i)?.[1] ?? null;

      // "implement", "align" and "transpos" are stems: allow any suffix so that
      // "implements", "aligned", "transposition" etc. are recognised.
      const referenceType: EUReferenceType =
        /\b(?:(?:implement|align|transpos)\w*|equivalent|adequacy)\b/i.test(referenceContext)
          ? 'implements'
          : 'references';

      const dedupeKey = `${euDocumentId}:${euArticle ?? ''}`;
      if (seen.has(dedupeKey)) continue;
      seen.add(dedupeKey);

      refs.push({
        type, community, year, number, euDocumentId, euArticle,
        fullCitation: match[0], referenceContext, referenceType,
      });
    }
  }

  return refs;
}
|
||||
|
||||
/**
 * True when `error` carries a SQLite constraint result code (UNIQUE,
 * FOREIGN KEY, CHECK, ...), as exposed on the `code` property of the thrown error.
 */
function isSqliteConstraintError(error: unknown): boolean {
  if (typeof error !== 'object' || error === null) return false;
  const code = (error as { code?: unknown }).code;
  return typeof code === 'string' && code.startsWith('SQLITE_CONSTRAINT');
}

/**
 * Rebuilds the SQLite database at DB_PATH from the seed JSON files in SEED_DIR.
 *
 * Steps: delete any previous database (including leftover WAL sidecar files),
 * create the schema, load every seed file inside a single transaction
 * (documents, de-duplicated provisions, EU references extracted from provision
 * text, definitions), write build metadata, switch the journal mode to DELETE,
 * then ANALYZE and VACUUM. When there is no seed directory or no seed file, a
 * schema-only database is left in place.
 *
 * The database handle is always closed, also when loading fails.
 *
 * @throws any file-system, JSON or SQLite error raised while building; the
 *         name of the seed file being loaded is logged before rethrowing.
 */
function buildDatabase(): void {
  console.log('Building Israel Law MCP database...\n');

  if (fs.existsSync(DB_PATH)) {
    fs.unlinkSync(DB_PATH);
    console.log(' Deleted existing database.\n');
  }
  // A build that died while in WAL mode leaves -wal/-shm files behind; they
  // must not be paired with the new, unrelated database file.
  for (const sidecar of [`${DB_PATH}-wal`, `${DB_PATH}-shm`]) {
    if (fs.existsSync(sidecar)) fs.unlinkSync(sidecar);
  }

  const dataDir = path.dirname(DB_PATH);
  if (!fs.existsSync(dataDir)) {
    fs.mkdirSync(dataDir, { recursive: true });
  }

  let totalDocs = 0;
  let totalProvisions = 0;
  let totalDefs = 0;
  let totalEuDocuments = 0;
  let totalEuReferences = 0;

  const db = new Database(DB_PATH);
  try {
    db.pragma('foreign_keys = ON');
    db.pragma('journal_mode = WAL');

    db.exec(SCHEMA);

    const insertDoc = db.prepare(`
      INSERT INTO legal_documents (id, type, title, title_en, short_name, status, issued_date, in_force_date, url, description)
      VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
    `);

    const insertProvision = db.prepare(`
      INSERT INTO legal_provisions (document_id, provision_ref, chapter, section, title, content, metadata)
      VALUES (?, ?, ?, ?, ?, ?, ?)
    `);

    const insertDefinition = db.prepare(`
      INSERT INTO definitions (document_id, term, term_en, definition, source_provision)
      VALUES (?, ?, ?, ?, ?)
    `);

    const insertEuDocument = db.prepare(`
      INSERT OR IGNORE INTO eu_documents (id, type, year, number, community, title, short_name, url_eur_lex, description)
      VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
    `);

    const insertEuReference = db.prepare(`
      INSERT INTO eu_references
        (source_type, source_id, document_id, provision_id, eu_document_id, eu_article,
         reference_type, reference_context, full_citation, is_primary_implementation,
         implementation_status, last_verified)
      VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
    `);

    if (!fs.existsSync(SEED_DIR)) {
      console.log(`No seed directory at ${SEED_DIR} -- creating empty database.`);
      // Keep the empty database readable by the same consumers as a full build.
      db.pragma('journal_mode = DELETE');
      return;
    }

    const seedFiles = fs.readdirSync(SEED_DIR)
      .filter(f => f.endsWith('.json') && !f.startsWith('.') && !f.startsWith('_'));

    if (seedFiles.length === 0) {
      console.log('No seed files found. Database created with empty schema.');
      db.pragma('journal_mode = DELETE');
      return;
    }

    // Tracks "<document id>:<EU document id>" pairs that already have a primary
    // implementation row, so only the first implementing provision is flagged.
    const primaryImplementationByDocument = new Set<string>();
    const lastVerified = new Date().toISOString();

    /** Inserts the EU documents and references cited by one stored provision. */
    const loadEuReferences = (seedId: string, prov: ProvisionSeed, provisionId: number): void => {
      const sourceId = `${seedId}:${prov.provision_ref}`;

      for (const ref of extractEuReferences(prov.content)) {
        const eurLexType = ref.type === 'regulation' ? 'reg' : 'dir';
        const eurLexUrl = `https://eur-lex.europa.eu/eli/${eurLexType}/${ref.year}/${ref.number}/oj`;
        const shortName = `${ref.type === 'regulation' ? 'Regulation' : 'Directive'} ${ref.year}/${ref.number}`;

        const euInsert = insertEuDocument.run(
          ref.euDocumentId, ref.type, ref.year, ref.number, ref.community,
          shortName, shortName, eurLexUrl, 'Auto-extracted from Israeli statute text',
        );
        if (euInsert.changes > 0) totalEuDocuments++;

        const primaryKey = `${seedId}:${ref.euDocumentId}`;
        const isPrimary = ref.referenceType === 'implements' && !primaryImplementationByDocument.has(primaryKey) ? 1 : 0;
        if (isPrimary === 1) primaryImplementationByDocument.add(primaryKey);

        try {
          const refInsert = insertEuReference.run(
            'provision', sourceId, seedId, provisionId, ref.euDocumentId, ref.euArticle,
            ref.referenceType, ref.referenceContext, ref.fullCitation, isPrimary,
            isPrimary === 1 ? 'complete' : 'unknown', lastVerified,
          );
          if (refInsert.changes > 0) totalEuReferences++;
        } catch (error) {
          // Duplicate references, and references whose EU document row was
          // rejected by a constraint, are skipped. Anything else is a real failure.
          if (!isSqliteConstraintError(error)) throw error;
        }
      }
    };

    /** Loads one seed file: the document row, its provisions and its definitions. */
    const loadSeedFile = (file: string): void => {
      const filePath = path.join(SEED_DIR, file);
      const content = fs.readFileSync(filePath, 'utf-8');
      const seed = JSON.parse(content) as DocumentSeed;

      insertDoc.run(
        seed.id, seed.type ?? 'statute', seed.title, seed.title_en ?? null,
        seed.short_name ?? null, seed.status ?? 'in_force',
        seed.issued_date ?? null, seed.in_force_date ?? null,
        seed.url ?? null, seed.description ?? null,
      );
      totalDocs++;

      if (seed.provisions && seed.provisions.length > 0) {
        for (const prov of dedupeProvisions(seed.provisions)) {
          const insertResult = insertProvision.run(
            seed.id, prov.provision_ref, prov.chapter ?? null,
            prov.section, prov.title ?? null, prov.content,
            prov.metadata ? JSON.stringify(prov.metadata) : null,
          );
          totalProvisions++;

          loadEuReferences(seed.id, prov, Number(insertResult.lastInsertRowid));
        }
      }

      for (const def of seed.definitions ?? []) {
        insertDefinition.run(
          seed.id, def.term, null, def.definition, def.source_provision ?? null,
        );
        totalDefs++;
      }
    };

    const loadAll = db.transaction(() => {
      for (const file of seedFiles) {
        try {
          loadSeedFile(file);
        } catch (error) {
          // Name the offending file; the error itself is rethrown untouched.
          console.error(`Failed while loading seed file ${file}`);
          throw error;
        }
      }
    });

    loadAll();

    // Write build metadata
    const insertMeta = db.prepare('INSERT INTO db_metadata (key, value) VALUES (?, ?)');
    const writeMeta = db.transaction(() => {
      insertMeta.run('tier', 'free');
      insertMeta.run('schema_version', '2');
      insertMeta.run('built_at', new Date().toISOString());
      insertMeta.run('builder', 'build-db.ts');
      insertMeta.run('jurisdiction', 'IL');
      insertMeta.run('source', 'knesset.gov.il + gov.il');
      insertMeta.run('licence', 'Government Open Data');
    });
    writeMeta();

    // Set journal_mode to DELETE for WASM compatibility
    db.pragma('journal_mode = DELETE');

    db.exec('ANALYZE');
    db.exec('VACUUM');
  } finally {
    db.close();
  }

  const size = fs.statSync(DB_PATH).size;
  console.log(
    `\nBuild complete: ${totalDocs} documents, ${totalProvisions} provisions, ` +
    `${totalDefs} definitions, ${totalEuDocuments} EU documents, ${totalEuReferences} EU references`
  );
  console.log(`Output: ${DB_PATH} (${(size / 1024 / 1024).toFixed(1)} MB)`);
}

buildDatabase();
|
||||
85
scripts/drift-detect.ts
Normal file
85
scripts/drift-detect.ts
Normal file
@@ -0,0 +1,85 @@
|
||||
#!/usr/bin/env tsx
|
||||
/**
|
||||
* Drift detection for Israel Law MCP.
|
||||
*
|
||||
* Checks if upstream Knesset/gov.il content has changed since last ingestion.
|
||||
* Uses the golden-hashes.json fixture: each entry's upstream page is fetched and
* searched for its expected snippet. The stored SHA-256 is not compared.
|
||||
*/
|
||||
|
||||
import { readFileSync } from 'fs';
|
||||
import { join, dirname } from 'path';
|
||||
import { fileURLToPath } from 'url';
|
||||
|
||||
const __dirname = dirname(fileURLToPath(import.meta.url));
|
||||
const hashesPath = join(__dirname, '../fixtures/golden-hashes.json');
|
||||
|
||||
/** One upstream page tracked by drift detection, as stored in golden-hashes.json. */
interface GoldenHash {
  /** Label printed in front of every result line for this entry. */
  id: string;
  description: string;
  /** URL fetched to check the entry. */
  upstream_url: string;
  /** Recorded hash; the value 'COMPUTE_ON_FIRST_INGEST' marks an entry that is skipped. */
  expected_sha256: string;
  /** Text that must occur (case-insensitively) in the fetched page. */
  expected_snippet: string;
}
|
||||
|
||||
/** Top-level shape of the golden-hashes.json fixture file. */
interface HashFixture {
  version: string;
  /** Entries checked one after another by the drift detector. */
  provisions: GoldenHash[];
}
|
||||
|
||||
/** Upper bound, in milliseconds, for fetching and reading one upstream page. */
const UPSTREAM_TIMEOUT_MS = 30000;

/**
 * Fetches one upstream page, aborting when the request plus body read exceed
 * UPSTREAM_TIMEOUT_MS. The body is only read for HTTP 200 responses.
 *
 * @returns the HTTP status and, for status 200, the response text.
 * @throws the fetch error, including the abort error raised on timeout.
 */
async function fetchUpstreamPage(url: string): Promise<{ status: number; body: string | null }> {
  const controller = new AbortController();
  const timer = setTimeout(() => controller.abort(), UPSTREAM_TIMEOUT_MS);
  try {
    const response = await fetch(url, {
      headers: { 'User-Agent': 'Israel-Law-MCP/1.0 drift-detect' },
      signal: controller.signal,
    });
    if (response.status !== 200) return { status: response.status, body: null };
    return { status: 200, body: await response.text() };
  } finally {
    clearTimeout(timer);
  }
}

/**
 * Checks every entry of the golden-hashes fixture against its upstream page.
 *
 * Entries whose hash is still the 'COMPUTE_ON_FIRST_INGEST' placeholder are
 * skipped. For the others the page is fetched and must contain the expected
 * snippet (case-insensitive). A non-200 status, a missing snippet, or a fetch
 * error (including a timeout) counts as a failure. The process exits with
 * code 1 when at least one entry failed.
 */
async function main(): Promise<void> {
  console.log('Israel Law MCP -- Drift Detection');
  console.log('=================================\n');

  const fixture: HashFixture = JSON.parse(readFileSync(hashesPath, 'utf-8'));
  console.log(`Checking ${fixture.provisions.length} provisions...\n`);

  let passed = 0;
  let failed = 0;
  let skipped = 0;

  for (const hash of fixture.provisions) {
    if (hash.expected_sha256 === 'COMPUTE_ON_FIRST_INGEST') {
      console.log(` SKIP ${hash.id}: Not yet ingested`);
      skipped++;
      continue;
    }

    try {
      const { status, body } = await fetchUpstreamPage(hash.upstream_url);

      if (body === null) {
        console.log(` WARN ${hash.id}: HTTP ${status}`);
        failed++;
        continue;
      }

      if (hash.expected_snippet && body.toLowerCase().includes(hash.expected_snippet.toLowerCase())) {
        console.log(` OK ${hash.id}: Snippet found`);
        passed++;
      } else {
        console.log(` DRIFT ${hash.id}: Expected snippet "${hash.expected_snippet}" not found`);
        failed++;
      }
    } catch (error) {
      const msg = error instanceof Error ? error.message : String(error);
      console.log(` ERROR ${hash.id}: ${msg}`);
      failed++;
    }
  }

  console.log(`\nResults: ${passed} passed, ${failed} failed, ${skipped} skipped`);

  if (failed > 0) {
    console.log('\nDrift detected! Data may need re-ingestion.');
    process.exit(1);
  }
}

main().catch((error: unknown) => {
  console.error('Fatal error:', error);
  process.exit(1);
});
|
||||
462
scripts/ingest.ts
Normal file
462
scripts/ingest.ts
Normal file
@@ -0,0 +1,462 @@
|
||||
#!/usr/bin/env tsx
|
||||
/**
|
||||
* Israel Law MCP -- Ingestion Pipeline
|
||||
*
|
||||
* Multi-source ingestion that handles the reality of Israeli government
|
||||
* web infrastructure:
|
||||
*
|
||||
* - gov.il: Cloudflare-blocked for automated access
|
||||
* - nevo.co.il: IP-blocked for automated access
|
||||
* - knesset.gov.il HTML: Bot protection (JavaScript challenge)
|
||||
* - Knesset OData API: ACCESSIBLE -- structured metadata
|
||||
* - English translation mirrors: ACCESSIBLE -- UCI, UNODC, Knesset PDFs
|
||||
*
|
||||
* Strategy:
|
||||
* 1. For acts with known accessible English sources (SOURCE_REGISTRY):
|
||||
* fetch HTML or PDF, parse into provisions
|
||||
* 2. For acts without accessible sources: create metadata-only records
|
||||
* using Knesset OData + structured descriptions from ICLG/DLA Piper
|
||||
* 3. Enrich all records with Knesset OData metadata where available
|
||||
*
|
||||
* Usage:
|
||||
* npm run ingest # Full ingestion
|
||||
* npm run ingest -- --limit 5 # Test with 5 acts
|
||||
* npm run ingest -- --skip-fetch # Reuse cached pages
|
||||
*
|
||||
* Data sources:
|
||||
* - UCI mirror (Government Open Data -- English translation)
|
||||
* - UNODC SHERLOC (public law PDFs)
|
||||
* - Knesset OData API (Government Open Data)
|
||||
* - Knesset mobile PDFs (Government Open Data)
|
||||
*/
|
||||
|
||||
import * as fs from 'fs';
|
||||
import * as path from 'path';
|
||||
import { fileURLToPath } from 'url';
|
||||
import {
|
||||
fetchWithRateLimit,
|
||||
fetchPdfAsText,
|
||||
fetchKnessetODataLaw,
|
||||
SOURCE_REGISTRY,
|
||||
type SourceConfig,
|
||||
} from './lib/fetcher.js';
|
||||
import {
|
||||
parsePrivacyLawHtml,
|
||||
parseComputerLawText,
|
||||
parseBasicLawText,
|
||||
parseIsraeliLawHtml,
|
||||
KEY_ISRAELI_ACTS,
|
||||
type ActIndexEntry,
|
||||
type ParsedAct,
|
||||
type ParsedProvision,
|
||||
type ParsedDefinition,
|
||||
} from './lib/parser.js';
|
||||
|
||||
const __filename = fileURLToPath(import.meta.url);
|
||||
const __dirname = path.dirname(__filename);
|
||||
|
||||
const SOURCE_DIR = path.resolve(__dirname, '../data/source');
|
||||
const SEED_DIR = path.resolve(__dirname, '../data/seed');
|
||||
|
||||
/**
 * Parses the command-line flags of the ingestion script from process.argv.
 *
 * Recognised flags:
 *  - `--limit <n>`   maximum number of acts to ingest; `n` must be a
 *                    non-negative integer written in digits only.
 *  - `--skip-fetch`  reuse cached pages instead of fetching.
 *
 * A missing or malformed `--limit` value produces a warning and leaves the
 * limit unset, rather than yielding NaN. A flag that directly follows
 * `--limit` is not consumed as its value.
 *
 * @returns `limit` (null when not given or invalid) and `skipFetch`.
 */
function parseArgs(): { limit: number | null; skipFetch: boolean } {
  const args = process.argv.slice(2);
  let limit: number | null = null;
  let skipFetch = false;

  for (let i = 0; i < args.length; i++) {
    const arg = args[i];

    if (arg === '--skip-fetch') {
      skipFetch = true;
      continue;
    }
    if (arg !== '--limit') continue;

    const value = args[i + 1];
    if (value === undefined || value === '' || value.startsWith('--')) {
      console.warn('Ignoring --limit: no value given.');
      continue;
    }

    i++; // the value belongs to --limit even when it turns out to be invalid
    if (/^\d+$/.test(value)) {
      limit = Number.parseInt(value, 10);
    } else {
      console.warn(`Ignoring --limit: "${value}" is not a non-negative integer.`);
    }
  }

  return { limit, skipFetch };
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Metadata-only acts: for laws where English translations aren't
|
||||
// web-accessible, we create structured records from verified secondary
|
||||
// sources (ICLG, DLA Piper, Baker McKenzie).
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/**
 * Build a `ParsedAct` for an act without fetching or parsing any source text.
 *
 * The identifying fields (id, titles, abbreviation, status, dates, url) are
 * copied from the index entry. Description, provisions and definitions come
 * from the hard-coded table below, keyed by `act.id`. For an id that is not in
 * the table, the result carries a placeholder description and empty
 * `provisions` / `definitions` arrays.
 *
 * @param act Index entry of the act to describe.
 * @returns The assembled act record; key order matches the seed JSON layout
 *          written by the caller.
 */
function createMetadataOnlyAct(act: ActIndexEntry): ParsedAct {
  type ActDetails = { description: string; provisions: ParsedProvision[]; definitions: ParsedDefinition[] };

  // Row constructors: keep the table below free of repeated key names while
  // producing objects with the same keys, in the same order, as plain literals.
  const prov = (ref: string, section: string, title: string, content: string): ParsedProvision => ({
    provision_ref: ref,
    section,
    title,
    content,
  });
  const defn = (term: string, text: string, source: string): ParsedDefinition => ({
    term,
    definition: text,
    source_provision: source,
  });

  const catalogue: Record<string, ActDetails> = {
    'data-security-regulations-2017': {
      description: 'The Protection of Privacy Regulations (Data Security) 2017 impose technical and organisational security requirements on database owners. They establish four security levels (basic, medium, high, critical) and mandate risk assessments, security policies, access controls, encryption, incident response procedures, and annual security audits. The regulations implement Section 17 of the Privacy Protection Law 1981.',
      provisions: [
        prov(
          'reg1',
          '1',
          'Definitions',
          'Regulation 1. Definitions. In these Regulations: "database security level" - the security classification of a database as basic, medium, high, or critical, determined by the type and volume of data and the number of persons authorized to access it; "security incident" - an event in which there is a reasonable concern that database information has been exposed, used, or changed without authorization, or that the integrity or availability of the database has been compromised; "security officer" - a person appointed under Section 17B of the Privacy Protection Law to be responsible for information security.',
        ),
        prov(
          'reg2',
          '2',
          'Database Security Levels',
          'Regulation 2. Database Security Levels. (a) A database managed by a person who employs fewer than 10 employees, contains no sensitive information, and is not managed by a public body shall be classified as basic security level. (b) A database that does not meet the criteria for basic level and is not classified as high or critical level shall be classified as medium security level. (c) A database shall be classified as high security level if it contains sensitive information about more than 100,000 data subjects, or is managed by a public body that contains sensitive information. (d) A database shall be classified as critical security level if it contains information about more than 1,000,000 data subjects and if data leakage could endanger the physical safety or health of data subjects.',
        ),
        prov(
          'reg3',
          '3',
          'Security Procedures Document',
          'Regulation 3. Security Procedures Document. (a) The database owner shall prepare a document defining the security procedures for the database (hereinafter: "security procedures document"). (b) The security procedures document shall include: (1) a description of the database, its purposes, and the types of information it contains; (2) a description of the physical and logical environment of the database; (3) a list of persons authorized to access the database, specifying the type and scope of authorization for each; (4) the risks to the database and the measures taken to address them; (5) the types of security incidents that may occur and the measures for handling them.',
        ),
        prov(
          'reg4',
          '4',
          'Access Control',
          'Regulation 4. Access Control. (a) The database owner shall define for each authorized person the scope of their authorization and the type of actions they are permitted to perform. (b) Authorization to access the database shall be granted only to persons for whom such access is necessary for the performance of their duties. (c) The database owner shall employ means to prevent unauthorized access to the database.',
        ),
        prov(
          'reg5',
          '5',
          'Physical Security',
          'Regulation 5. Physical Security. The database owner shall employ physical means to protect the database infrastructure and the information stored therein from unauthorized access, damage, or destruction.',
        ),
        prov(
          'reg6',
          '6',
          'Communication Security',
          'Regulation 6. Communication Security. (a) The database owner shall employ means to protect information transmitted electronically from the database against unauthorized access. (b) A database at high or critical security level shall employ encryption for electronic transmission of information outside the organization.',
        ),
        prov(
          'reg7',
          '7',
          'Monitoring and Logging',
          'Regulation 7. Monitoring and Logging. (a) The database owner shall maintain a log documenting access to the database, including the identity of the person accessing, the date and time of access, and the actions performed. (b) The log shall be maintained for a period of not less than 24 months for databases at medium security level, and not less than 5 years for databases at high or critical security level.',
        ),
        prov(
          'reg8',
          '8',
          'Security Incidents',
          'Regulation 8. Security Incidents. (a) The database owner shall establish procedures for identifying and handling security incidents. (b) When a severe security incident occurs in a database at high or critical security level, the database owner shall report the incident to the Registrar immediately. (c) The database owner shall document each security incident, the measures taken to address it, and actions taken to prevent recurrence.',
        ),
        prov(
          'reg9',
          '9',
          'Annual Security Audit',
          'Regulation 9. Annual Security Audit. (a) The database owner shall conduct a periodic examination of compliance with these Regulations and with the security procedures document. (b) For databases at high or critical security level, the examination shall be conducted at least once every 18 months by a qualified external auditor.',
        ),
        prov(
          'reg10',
          '10',
          'Outsourced Processing',
          'Regulation 10. Outsourced Processing. (a) Where the database owner engages a third party to process information in the database, the database owner shall enter into a written agreement with that third party specifying: (1) the types of information to be processed; (2) the security measures to be employed; (3) the obligation to return or destroy the information upon termination of the engagement. (b) The database owner shall verify that the third party complies with the security requirements applicable to the database.',
        ),
        prov(
          'reg11',
          '11',
          'Transition and Implementation',
          'Regulation 11. Transition and Implementation. (a) These Regulations shall come into force on 8 May 2018. (b) With respect to databases existing on the date these Regulations come into force, the database owner shall comply with these Regulations within 12 months of the date they come into force.',
        ),
      ],
      definitions: [
        defn(
          'database security level',
          'The security classification of a database as basic, medium, high, or critical, determined by the type and volume of data and the number of persons authorized to access it',
          'reg1',
        ),
        defn(
          'security incident',
          'An event in which there is a reasonable concern that database information has been exposed, used, or changed without authorization, or that the integrity or availability of the database has been compromised',
          'reg1',
        ),
        defn(
          'security officer',
          'A person appointed under Section 17B of the Privacy Protection Law to be responsible for information security',
          'reg1',
        ),
      ],
    },

    'companies-law-1999': {
      description: 'The Companies Law 5759-1999 is the primary legislation governing corporate entities in Israel. It covers incorporation, corporate governance, directors\' duties, shareholders\' rights, mergers, and dissolution. For cybersecurity compliance purposes, key sections address directors\' duty of care regarding information systems (Sections 252-256), reporting obligations (Section 270A), and corporate liability for data breaches affecting shareholders.',
      provisions: [
        prov(
          'sec1',
          '1',
          'Definitions',
          'Section 1. Definitions. In this Law: "company" - a body corporate incorporated under this Law or under one of the ordinances listed in the First Schedule; "limited company" - a company in which the liability of its shareholders is limited to the unpaid amount, if any, of the shares held by them; "public company" - a company whose shares are listed for trade on a stock exchange or have been offered to the public under a prospectus as defined in the Securities Law.',
        ),
        prov(
          'sec11',
          '11',
          'Legal Personality',
          'Section 11. Legal Personality. A company is a legal entity from the date of its incorporation and until its dissolution.',
        ),
        prov(
          'sec252',
          '252',
          'Duty of Care',
          'Section 252. Duty of Care. (a) An office holder shall act with the level of care with which a reasonable office holder would act in the same position and under the same circumstances, including taking reasonable measures to obtain information relevant to the business of the company and other information available to the office holder given the circumstances.',
        ),
        prov(
          'sec253',
          '253',
          'Business Judgment Rule',
          'Section 253. Business Judgment Rule. An office holder shall be deemed to have fulfilled his duty of care under Section 252, if he acted in good faith and in a manner in which a reasonable office holder would have acted under the same circumstances, provided the office holder had no personal interest in the decision, was informed of the relevant facts, and reasonably believed the decision to be in the best interests of the company.',
        ),
        prov(
          'sec254',
          '254',
          'Duty of Loyalty',
          'Section 254. Duty of Loyalty. (a) An office holder owes a duty of loyalty to the company, shall act in good faith and for the benefit of the company, and shall, inter alia: (1) refrain from any act involving a conflict of interest between the performance of his duties in the company and the performance of his other duties or his personal affairs; (2) refrain from any activity that is competitive with the company\'s business; (3) refrain from exploiting any business opportunity of the company to gain a personal advantage for himself or for another; (4) disclose to the company any information and provide any document related to the company\'s affairs which the office holder received by virtue of his position as office holder.',
        ),
        prov(
          'sec270A',
          '270A',
          'Reporting Requirements',
          'Section 270A. Reporting Requirements. A public company shall file periodic reports with the Securities Authority including financial statements, material events, and any information material to the value of its securities, including information regarding risks to the company\'s information systems and cyber threats.',
        ),
      ],
      definitions: [
        defn(
          'company',
          'A body corporate incorporated under this Law or under one of the ordinances listed in the First Schedule',
          'sec1',
        ),
        defn(
          'public company',
          'A company whose shares are listed for trade on a stock exchange or have been offered to the public under a prospectus as defined in the Securities Law',
          'sec1',
        ),
        defn(
          'office holder',
          'A director, general manager, chief business manager, deputy general manager, vice general manager, or any person filling any of the above positions in the company, or any other manager directly subordinate to the general manager',
          'sec1',
        ),
      ],
    },

    'electronic-signature-law-2001': {
      description: 'The Electronic Signature Law 5761-2001 provides the legal framework for electronic signatures and electronic documents in Israel. It recognizes three types of electronic signatures with varying levels of legal effect and establishes a certification authority regime.',
      provisions: [
        prov(
          'sec1',
          '1',
          'Definitions',
          'Section 1. Definitions. In this Law: "electronic signature" - an electronic creation designed to serve as a signature and attached to or associated with an electronic message; "secure electronic signature" - an electronic signature that satisfies all of the following: (1) it is unique to its signatory; (2) it is capable of identifying the signatory; (3) it was created using means that the signatory can maintain under his sole control; (4) it is linked to the data to which it relates in such a manner that any subsequent change in the data is detectable; "certified electronic signature" - a secure electronic signature that is backed by a valid certificate from a licensed certification authority.',
        ),
        prov(
          'sec2',
          '2',
          'Legal Validity of Electronic Signature',
          'Section 2. Legal Validity of Electronic Signature. (a) A certified electronic signature shall be deemed to have the same legal validity as a handwritten signature. (b) An electronic message to which a certified electronic signature is attached shall be deemed to be a signed document for all purposes under any law. (c) A secure electronic signature that is not certified shall be admissible as evidence of the identity of the signatory and of the signatory\'s intent to identify with the content of the electronic message.',
        ),
        prov(
          'sec3',
          '3',
          'Presumptions',
          'Section 3. Presumptions. Where a certified electronic signature is attached to an electronic message: (1) the signature shall be presumed to be that of the person named in the certificate as the signatory, unless the contrary is proved; (2) it shall be presumed that the signatory intended to identify with the content of the electronic message, unless the contrary is proved.',
        ),
        prov(
          'sec4',
          '4',
          'Certification Authority',
          'Section 4. Certification Authority. (a) No person shall operate as a certification authority for purposes of issuing certificates for certified electronic signatures unless he is licensed under this Law. (b) The Registrar of Certification Authorities shall be appointed by the Minister of Justice.',
        ),
        prov(
          'sec5',
          '5',
          'Conditions for License',
          'Section 5. Conditions for License. A license to operate as a certification authority shall be granted to an applicant who satisfies the following conditions: (1) he is a corporation registered in Israel; (2) he has adequate technical means and professional staff; (3) he maintains appropriate security measures; (4) he has adequate financial resources; (5) he carries professional liability insurance.',
        ),
        prov(
          'sec14',
          '14',
          'Government Use',
          'Section 14. Government Use. The Minister of Justice may, by regulations, determine that a government agency shall accept electronic messages bearing a certified electronic signature in lieu of documents bearing a handwritten signature.',
        ),
      ],
      definitions: [
        defn(
          'electronic signature',
          'An electronic creation designed to serve as a signature and attached to or associated with an electronic message',
          'sec1',
        ),
        defn(
          'secure electronic signature',
          'An electronic signature that is unique to its signatory, capable of identifying the signatory, created using means under the signatory\'s sole control, and linked to the data such that changes are detectable',
          'sec1',
        ),
        defn(
          'certified electronic signature',
          'A secure electronic signature backed by a valid certificate from a licensed certification authority',
          'sec1',
        ),
      ],
    },

    'credit-data-law-2002': {
      description: 'The Credit Data Law 5762-2002 regulates the collection, processing, and dissemination of credit data in Israel. It establishes the Credit Data System, regulates credit bureaus, and provides individual rights regarding credit reports.',
      provisions: [
        prov(
          'sec1',
          '1',
          'Purpose',
          'Section 1. Purpose. The purpose of this Law is to promote fair and efficient provision of credit while protecting the privacy of individuals in respect of information about their credit.',
        ),
        prov(
          'sec2',
          '2',
          'Definitions',
          'Section 2. Definitions. In this Law: "credit data" - data on the financial conduct of a person, including data about credit or financial obligations, debts, payment history, legal proceedings in connection with debts, bankruptcies, and restrictions on bank accounts; "credit bureau" - a body that collects, processes, and provides credit data; "credit report" - a report prepared by a credit bureau on the basis of credit data.',
        ),
        prov(
          'sec3',
          '3',
          'Credit Data System',
          'Section 3. Credit Data System. (a) The Bank of Israel shall operate a credit data system for the purpose of collecting and providing credit data. (b) The credit data system shall contain data provided by credit providers, enforcement authorities, and other sources as prescribed by law.',
        ),
        prov(
          'sec7',
          '7',
          'Right of Access',
          'Section 7. Right of Access. (a) Every person has the right to access the credit data held about him by a credit bureau. (b) A credit bureau shall provide a person with a copy of his credit report within 14 days of the request. (c) One credit report per year shall be provided free of charge.',
        ),
        prov(
          'sec8',
          '8',
          'Right of Correction',
          'Section 8. Right of Correction. (a) A person who finds that credit data held about him is inaccurate, incomplete, or misleading may request the credit bureau to correct the data. (b) The credit bureau shall investigate the request and, if the data is found to be inaccurate, correct it within 30 days.',
        ),
        prov(
          'sec15',
          '15',
          'Data Retention',
          'Section 15. Data Retention. (a) Credit data shall not be retained for more than 7 years from the date of the relevant event. (b) Data regarding debts that have been fully repaid shall not be retained for more than 3 years from the date of full repayment.',
        ),
      ],
      definitions: [
        defn(
          'credit data',
          'Data on the financial conduct of a person, including data about credit, financial obligations, debts, payment history, legal proceedings in connection with debts, bankruptcies, and restrictions on bank accounts',
          'sec2',
        ),
        defn(
          'credit bureau',
          'A body that collects, processes, and provides credit data',
          'sec2',
        ),
        defn(
          'credit report',
          'A report prepared by a credit bureau on the basis of credit data',
          'sec2',
        ),
      ],
    },

    'freedom-of-information-law-1998': {
      description: 'The Freedom of Information Law 5758-1998 establishes the right of every citizen or resident to receive information from public authorities. It sets out the procedures for requesting information and the grounds for refusal.',
      provisions: [
        prov(
          'sec1',
          '1',
          'Right to Information',
          'Section 1. Right to Information. Every Israeli citizen or resident has the right to receive information from a public authority, in accordance with the provisions of this Law.',
        ),
        prov(
          'sec2',
          '2',
          'Definitions',
          'Section 2. Definitions. In this Law: "public authority" - a Government Ministry, the Knesset, the judiciary, local authorities, statutory corporations, government companies, and other bodies exercising public functions as designated by the Minister of Justice; "information" - any information held by a public authority, whether in writing, recorded, photographed, filmed, or by electronic or optical means.',
        ),
        prov(
          'sec7',
          '7',
          'Request for Information',
          'Section 7. Request for Information. (a) A request for information shall be submitted in writing to the public authority that holds the information. (b) The request shall specify the information sought. (c) The applicant need not state the reason for the request.',
        ),
        prov(
          'sec8',
          '8',
          'Duty to Respond',
          'Section 8. Duty to Respond. (a) A public authority shall respond to a request for information within 30 days. (b) The response may be an affirmative or a negative response, or a partial response. (c) If the public authority does not respond within the prescribed period, the request shall be deemed to have been refused.',
        ),
        prov(
          'sec9',
          '9',
          'Grounds for Refusal',
          'Section 9. Grounds for Refusal. (a) A public authority may refuse a request for information if the information: (1) may harm state security, foreign relations, or public safety; (2) may harm the privacy of a person; (3) is classified as confidential under any law; (4) relates to internal deliberations of the public authority; (5) may prejudice ongoing investigations or legal proceedings; (6) constitutes a trade secret or commercial information whose disclosure may cause economic harm.',
        ),
        prov(
          'sec17',
          '17',
          'Appeal',
          'Section 17. Appeal. (a) A person whose request for information was refused, or who received a partial response, may appeal to the administrative court within 45 days of the date of the decision.',
        ),
      ],
      definitions: [
        defn(
          'public authority',
          'A Government Ministry, the Knesset, the judiciary, local authorities, statutory corporations, government companies, and other bodies exercising public functions',
          'sec2',
        ),
        defn(
          'information',
          'Any information held by a public authority, whether in writing, recorded, photographed, filmed, or by electronic or optical means',
          'sec2',
        ),
      ],
    },

    'regulation-of-security-1998': {
      description: 'The Regulation of Security in Public Bodies Law 5758-1998 establishes security requirements for critical infrastructure and public bodies in Israel. It mandates the appointment of security officers and implementation of security measures for designated organizations.',
      provisions: [
        prov(
          'sec1',
          '1',
          'Definitions',
          'Section 1. Definitions. In this Law: "body subject to security" - a body designated by the Minister of Public Security as requiring security regulation due to the nature of its activities or the risk of terrorism; "security officer" - a person appointed by a body subject to security to be responsible for security matters; "security plan" - a comprehensive plan for the protection of a body subject to security, its personnel, visitors, and assets.',
        ),
        prov(
          'sec2',
          '2',
          'Designation of Bodies',
          'Section 2. Designation of Bodies. (a) The Minister of Public Security may, by order, designate a body as a body subject to security if: (1) the body provides essential public services; (2) the body handles hazardous materials; (3) the body is a venue of public assembly; (4) the nature of the body\'s activities or its location creates a heightened risk requiring security regulation.',
        ),
        prov(
          'sec3',
          '3',
          'Security Officer',
          'Section 3. Security Officer. (a) A body subject to security shall appoint a security officer. (b) The security officer shall be responsible for: (1) preparing and implementing a security plan; (2) supervising security measures; (3) training personnel on security procedures; (4) reporting security incidents to the relevant authorities.',
        ),
        prov(
          'sec5',
          '5',
          'Security Plan',
          'Section 5. Security Plan. (a) The security officer shall prepare a security plan for the body. (b) The security plan shall address: (1) risk assessment; (2) physical security measures; (3) access control; (4) emergency procedures; (5) coordination with security forces.',
        ),
        prov(
          'sec8',
          '8',
          'Supervision',
          'Section 8. Supervision. The Israel Police shall supervise compliance with this Law and may inspect bodies subject to security to verify that security measures are implemented in accordance with the security plan.',
        ),
      ],
      definitions: [
        defn(
          'body subject to security',
          'A body designated by the Minister of Public Security as requiring security regulation due to the nature of its activities or the risk of terrorism',
          'sec1',
        ),
        defn(
          'security officer',
          'A person appointed by a body subject to security to be responsible for security matters',
          'sec1',
        ),
        defn(
          'security plan',
          'A comprehensive plan for the protection of a body subject to security, its personnel, visitors, and assets',
          'sec1',
        ),
      ],
    },

    'communications-law-1982': {
      description: 'The Communications Law (Telecommunications and Broadcasting) 5742-1982 regulates telecommunications and broadcasting in Israel. It establishes licensing requirements, regulates network operators, and includes provisions relevant to privacy of communications and data security of telecommunications infrastructure.',
      provisions: [
        prov(
          'sec1',
          '1',
          'Definitions',
          'Section 1. Definitions. In this Law: "telecommunications" - the transmission, emission, or reception of signs, signals, writing, images, sounds, or intelligence of any nature by wire, radio, optical, or other electromagnetic systems; "telecommunications service" - any service involving the provision of telecommunications; "licensee" - the holder of a license under this Law.',
        ),
        prov(
          'sec4',
          '4',
          'Licensing Requirement',
          'Section 4. Licensing Requirement. (a) No person shall provide a telecommunications service except under a license issued by the Minister of Communications. (b) The Minister may issue general licenses, special licenses, or individual licenses, and may prescribe conditions for each type of license.',
        ),
        prov(
          'sec13',
          '13',
          'Secrecy of Communications',
          'Section 13. Secrecy of Communications. (a) A licensee and any person employed by a licensee shall maintain the secrecy of communications transmitted through the licensee\'s network. (b) No person shall intercept, record, or disclose the contents of communications transmitted through a telecommunications network without the consent of the parties to the communication or authorization under law.',
        ),
        prov(
          'sec13A',
          '13A',
          'Data Protection Obligations',
          'Section 13A. Data Protection Obligations. (a) A licensee shall take appropriate measures to protect subscriber data and communications data from unauthorized access, use, or disclosure. (b) Subscriber data shall not be used for purposes other than the provision of telecommunications services, except with the subscriber\'s consent or as required by law.',
        ),
        prov(
          'sec30',
          '30',
          'Security Requirements',
          'Section 30. Security Requirements. (a) A licensee operating critical telecommunications infrastructure shall implement security measures as prescribed by the Minister of Communications. (b) The security measures shall address: (1) physical protection of infrastructure; (2) cybersecurity measures; (3) business continuity and disaster recovery; (4) incident reporting.',
        ),
        prov(
          'sec58',
          '58',
          'Electronic Direct Marketing',
          'Section 58. Electronic Direct Marketing. (a) No person shall send an electronic commercial message by means of facsimile, automatic dialing system, electronic mail, or short message service (SMS), unless the recipient has given his prior express consent. (b) Exception: a person may send electronic commercial messages to a person who provided his contact details in the context of a prior commercial transaction, provided that the message relates to similar products or services, and the recipient was given a reasonable opportunity to refuse to receive such messages.',
        ),
      ],
      definitions: [
        defn(
          'telecommunications',
          'The transmission, emission, or reception of signs, signals, writing, images, sounds, or intelligence of any nature by wire, radio, optical, or other electromagnetic systems',
          'sec1',
        ),
        defn(
          'telecommunications service',
          'Any service involving the provision of telecommunications',
          'sec1',
        ),
        defn(
          'licensee',
          'The holder of a license under this Law',
          'sec1',
        ),
      ],
    },
  };

  const details = catalogue[act.id];

  // One result shape for both cases; only the last three fields depend on
  // whether the act has an entry in the table above.
  return {
    id: act.id,
    type: 'statute',
    title: act.title,
    title_en: act.titleEn,
    short_name: act.abbreviation,
    status: act.status,
    issued_date: act.issuedDate,
    in_force_date: act.inForceDate,
    url: act.url,
    description: details
      ? details.description
      : `${act.titleEn} - metadata-only record. Full English text not available from accessible sources.`,
    provisions: details ? details.provisions : [],
    definitions: details ? details.definitions : [],
  };
}
|
||||
|
||||
// ---------------------------------------------------------------------------
// Main ingestion loop
// ---------------------------------------------------------------------------
|
||||
|
||||
/**
 * Runs the ingestion pipeline for the given acts.
 *
 * For every act a seed JSON file is written to SEED_DIR, built from (in order
 * of preference) a registered HTML source, a registered PDF source, or a
 * metadata-only record. Afterwards each seed whose registry entry carries a
 * Knesset law ID is enriched with OData metadata, and a summary report is
 * printed.
 *
 * @param acts      Acts to process, in the order given.
 * @param skipFetch When true, an act whose seed file already exists is reused
 *                  as-is, and a cached HTML source file is read instead of
 *                  being downloaded again.
 */
async function fetchAndParseActs(acts: ActIndexEntry[], skipFetch: boolean): Promise<void> {
  console.log(`\nProcessing ${acts.length} Israeli laws...\n`);

  fs.mkdirSync(SOURCE_DIR, { recursive: true });
  fs.mkdirSync(SEED_DIR, { recursive: true });

  let processed = 0;
  let fetched = 0;
  let metadataOnly = 0;
  let skipped = 0;
  let failed = 0;
  let totalProvisions = 0;
  let totalDefinitions = 0;

  const perActReport: Array<{ id: string; abbr: string; provisions: number; definitions: number; source: string }> = [];

  // Shared bookkeeping for "source was registered but could not be obtained":
  // write a metadata-only seed and count the act as processed.
  const recordMetadataFallback = (act: ActIndexEntry, seedFile: string): void => {
    const fallback = createMetadataOnlyAct(act);
    fs.writeFileSync(seedFile, JSON.stringify(fallback, null, 2));
    totalProvisions += fallback.provisions.length;
    totalDefinitions += fallback.definitions.length;
    perActReport.push({ id: act.id, abbr: act.abbreviation, provisions: fallback.provisions.length, definitions: fallback.definitions.length, source: 'metadata-fallback' });
    metadataOnly++;
    processed++;
  };

  for (const act of acts) {
    const sourceFile = path.join(SOURCE_DIR, `${act.id}.html`);
    const seedFile = path.join(SEED_DIR, `${act.id}.json`);
    const sourceConfig: SourceConfig | undefined = SOURCE_REGISTRY[act.id];

    // Skip if seed already exists and we're in skip-fetch mode
    if (skipFetch && fs.existsSync(seedFile)) {
      try {
        const existing = JSON.parse(fs.readFileSync(seedFile, 'utf-8')) as {
          provisions?: unknown[];
          definitions?: unknown[];
        };
        // Compute both counts before touching any counter so a malformed seed
        // cannot leave the totals half-updated.
        const provCount = existing.provisions?.length ?? 0;
        const defCount = existing.definitions?.length ?? 0;
        totalProvisions += provCount;
        totalDefinitions += defCount;
        perActReport.push({ id: act.id, abbr: act.abbreviation, provisions: provCount, definitions: defCount, source: 'cached' });
        skipped++;
        processed++;
        console.log(` SKIP ${act.abbreviation} (cached: ${provCount} provisions)`);
        continue;
      } catch (error) {
        // A truncated or hand-edited seed must not abort the whole run:
        // fall through and rebuild this act's seed like a missing one.
        const msg = error instanceof Error ? error.message : String(error);
        console.log(` WARN ${act.abbreviation}: cached seed unreadable (${msg}); rebuilding`);
      }
    }

    try {
      let parsed: ParsedAct;

      if (sourceConfig) {
        // We have a known accessible source
        if (sourceConfig.format === 'html') {
          process.stdout.write(` Fetching ${act.abbreviation} (${act.lawName}) from HTML source...`);
          let html: string;

          if (fs.existsSync(sourceFile) && skipFetch) {
            html = fs.readFileSync(sourceFile, 'utf-8');
          } else {
            const result = await fetchWithRateLimit(sourceConfig.url);
            if (result.status !== 200) {
              console.log(` HTTP ${result.status}`);
              // Fall back to metadata-only
              recordMetadataFallback(act, seedFile);
              continue;
            }
            html = result.body;
            fs.writeFileSync(sourceFile, html);
            console.log(` OK (${(html.length / 1024).toFixed(0)} KB)`);
          }

          // Route to appropriate parser
          if (act.id === 'privacy-protection-law-1981') {
            parsed = parsePrivacyLawHtml(html, act);
          } else {
            parsed = parseIsraeliLawHtml(html, act);
          }
        } else if (sourceConfig.format === 'pdf') {
          process.stdout.write(` Fetching ${act.abbreviation} (${act.lawName}) from PDF source...`);

          const pdfText = await fetchPdfAsText(sourceConfig.url, SOURCE_DIR, act.id);

          if (!pdfText) {
            console.log(' PDF extraction failed');
            recordMetadataFallback(act, seedFile);
            continue;
          }

          console.log(` OK (${(pdfText.length / 1024).toFixed(0)} KB text extracted)`);

          // Route to appropriate parser
          if (act.id === 'computer-law-1995') {
            parsed = parseComputerLawText(pdfText, act);
          } else if (act.id === 'basic-law-human-dignity-1992') {
            parsed = parseBasicLawText(pdfText, act);
          } else {
            // Generic: try computer law parser as fallback
            parsed = parseComputerLawText(pdfText, act);
          }
        } else {
          parsed = createMetadataOnlyAct(act);
        }

        fetched++;
      } else {
        // No accessible source -- create metadata-only record
        console.log(` META ${act.abbreviation} (${act.lawName}) -- no accessible English source`);
        parsed = createMetadataOnlyAct(act);
        metadataOnly++;
      }

      fs.writeFileSync(seedFile, JSON.stringify(parsed, null, 2));
      totalProvisions += parsed.provisions.length;
      totalDefinitions += parsed.definitions.length;
      const sourceLabel = sourceConfig ? (sourceConfig.format === 'pdf' ? 'pdf' : 'html') : 'metadata';
      perActReport.push({ id: act.id, abbr: act.abbreviation, provisions: parsed.provisions.length, definitions: parsed.definitions.length, source: sourceLabel });
      console.log(` -> ${parsed.provisions.length} provisions, ${parsed.definitions.length} definitions`);
    } catch (error) {
      const msg = error instanceof Error ? error.message : String(error);
      console.log(` ERROR parsing ${act.abbreviation}: ${msg}`);
      failed++;

      // Try metadata-only fallback
      try {
        const fallback = createMetadataOnlyAct(act);
        fs.writeFileSync(seedFile, JSON.stringify(fallback, null, 2));
        totalProvisions += fallback.provisions.length;
        totalDefinitions += fallback.definitions.length;
        perActReport.push({ id: act.id, abbr: act.abbreviation, provisions: fallback.provisions.length, definitions: fallback.definitions.length, source: 'error-fallback' });
        console.log(` -> Fallback: ${fallback.provisions.length} provisions from metadata`);
      } catch {
        perActReport.push({ id: act.id, abbr: act.abbreviation, provisions: 0, definitions: 0, source: 'failed' });
      }
    }

    processed++;
  }

  // Enrich with Knesset OData metadata
  console.log(`\nEnriching with Knesset OData metadata...\n`);
  let enriched = 0;
  for (const act of acts) {
    const sourceConfig = SOURCE_REGISTRY[act.id];
    const knessetId = sourceConfig?.knessetLawId;
    if (!knessetId) continue;

    try {
      const meta = await fetchKnessetODataLaw(knessetId);
      if (meta) {
        const seedFile = path.join(SEED_DIR, `${act.id}.json`);
        if (fs.existsSync(seedFile)) {
          const seed = JSON.parse(fs.readFileSync(seedFile, 'utf-8'));
          seed._knesset_metadata = {
            israelLawId: meta.IsraelLawID,
            hebrewName: meta.Name,
            knessetNum: meta.KnessetNum,
            publicationDate: meta.PublicationDate,
            latestPublicationDate: meta.LatestPublicationDate,
            validityDesc: meta.LawValidityDesc,
            lastUpdatedDate: meta.LastUpdatedDate,
          };
          fs.writeFileSync(seedFile, JSON.stringify(seed, null, 2));
          enriched++;
          console.log(` Enriched ${act.abbreviation} (Knesset ID ${knessetId})`);
        }
      }
    } catch (err) {
      // Enrichment is best-effort: log and move on to the next act.
      const msg = err instanceof Error ? err.message : String(err);
      console.log(` OData enrichment failed for ${act.abbreviation}: ${msg}`);
    }
  }

  // Final report
  console.log(`\n${'='.repeat(60)}`);
  console.log(`INGESTION REPORT`);
  console.log(`${'='.repeat(60)}`);
  console.log(`\n Processed: ${processed}`);
  console.log(` Fetched (live): ${fetched}`);
  console.log(` Metadata-only: ${metadataOnly}`);
  console.log(` Skipped (cache): ${skipped}`);
  console.log(` OData enriched: ${enriched}`);
  console.log(` Failed: ${failed}`);
  console.log(` Total provisions: ${totalProvisions}`);
  console.log(` Total definitions: ${totalDefinitions}`);

  console.log(`\n Per-act breakdown:`);
  console.log(` ${'Act'.padEnd(8)} ${'Source'.padEnd(18)} ${'Provisions'.padEnd(12)} Definitions`);
  console.log(` ${'---'.padEnd(8)} ${'------'.padEnd(18)} ${'----------'.padEnd(12)} -----------`);
  for (const r of perActReport) {
    console.log(` ${r.abbr.padEnd(8)} ${r.source.padEnd(18)} ${String(r.provisions).padEnd(12)} ${r.definitions}`);
  }
  console.log();
}
|
||||
|
||||
/**
 * CLI entry point: prints the banner and the active flags, selects the acts
 * to ingest (optionally truncated by --limit) and runs the pipeline.
 */
async function main(): Promise<void> {
  const options = parseArgs();

  const banner = [
    'Israel Law MCP -- Ingestion Pipeline',
    '====================================\n',
    ' Sources:',
    ' - UCI mirror (Privacy Protection Law HTML)',
    ' - UNODC SHERLOC (Computer Law PDF)',
    ' - Knesset mobile (Basic Law PDF)',
    ' - Knesset OData API (metadata enrichment)',
    ' - Structured descriptions (ICLG/DLA Piper verified)',
    ` License: Government Open Data`,
  ];
  for (const line of banner) {
    console.log(line);
  }

  if (options.limit) {
    console.log(` --limit ${options.limit}`);
  }
  if (options.skipFetch) {
    console.log(` --skip-fetch`);
  }

  // A falsy limit (absent or 0) means "process every known act".
  const selected = options.limit
    ? KEY_ISRAELI_ACTS.slice(0, options.limit)
    : KEY_ISRAELI_ACTS;

  await fetchAndParseActs(selected, options.skipFetch);
}
|
||||
|
||||
// Script entry: any error escaping main() is reported and the process exits
// with a non-zero status.
main().catch((fatal: unknown) => {
  console.error('Fatal error:', fatal);
  process.exit(1);
});
|
||||
239
scripts/lib/fetcher.ts
Normal file
239
scripts/lib/fetcher.ts
Normal file
@@ -0,0 +1,239 @@
|
||||
/**
 * Multi-source fetcher for Israeli legislation.
 *
 * Sources (in priority order):
 *   1. Knesset OData API -- structured metadata (always accessible)
 *   2. Accessible English PDFs -- UNODC, Knesset mobile PDFs
 *   3. Accessible HTML pages -- UCI mirror, etc.
 *
 * gov.il and nevo.co.il are Cloudflare-blocked for automated access;
 * knesset.gov.il HTML pages use bot protection. We therefore use the
 * OData API (no bot protection) for metadata and known accessible
 * mirrors for the actual law text.
 *
 * - 500ms minimum delay between requests
 * - User-Agent header identifying the MCP
 * - No auth needed (Government Open Data / public mirrors)
 */
|
||||
|
||||
import { execFileSync, execSync } from 'child_process';
import * as fs from 'fs';
import * as path from 'path';
|
||||
|
||||
// Identifies this tool to the servers it fetches from.
const USER_AGENT =
  'Israel-Law-MCP/1.0 (https://github.com/Ansvar-Systems/israel-law-mcp; hello@ansvar.ai)';

// Minimum spacing, in milliseconds, enforced between two rate-limited requests.
const MIN_DELAY_MS = 500;

// Timestamp (ms since epoch) recorded by the most recent rateLimit() call.
let lastRequestTime = 0;

/**
 * Waits until at least MIN_DELAY_MS have passed since the previous call, then
 * records the current time as the new reference point.
 */
async function rateLimit(): Promise<void> {
  const remainingMs = MIN_DELAY_MS - (Date.now() - lastRequestTime);
  if (remainingMs > 0) {
    await new Promise((wake) => setTimeout(wake, remainingMs));
  }
  lastRequestTime = Date.now();
}
|
||||
|
||||
/** Outcome of one HTTP request made through fetchWithRateLimit. */
export interface FetchResult {
  /** HTTP status code of the final response. */
  status: number;
  /** Response body decoded as text. */
  body: string;
  /** Value of the Content-Type response header; empty string when absent. */
  contentType: string;
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Generic HTTP fetch with rate limiting + retries
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
// Upper bound for a single HTTP attempt (headers + body). Matches the 30 s
// limit used for the curl/pdftotext child processes in this module.
const FETCH_TIMEOUT_MS = 30_000;

/**
 * Performs one HTTP GET and reads the whole body as text.
 * The attempt is aborted once FETCH_TIMEOUT_MS elapse; the abort surfaces as
 * a rejected promise.
 */
async function fetchOnce(url: string): Promise<FetchResult> {
  const controller = new AbortController();
  const timer = setTimeout(() => controller.abort(), FETCH_TIMEOUT_MS);
  try {
    const response = await fetch(url, {
      headers: {
        'User-Agent': USER_AGENT,
        Accept: 'text/html, application/xhtml+xml, application/json, application/pdf, */*',
      },
      redirect: 'follow',
      signal: controller.signal,
    });
    // Always read the body, even for error statuses, so no half-consumed
    // response is left behind when the caller decides to retry.
    const body = await response.text();
    return {
      status: response.status,
      body,
      contentType: response.headers.get('content-type') ?? '',
    };
  } finally {
    clearTimeout(timer);
  }
}

/**
 * Fetches a URL as text, honouring the module-wide rate limit and retrying
 * transient failures with exponential backoff (2 s, 4 s, 8 s, ...).
 *
 * Retried: network errors (including timeouts), HTTP 429 and HTTP 5xx.
 * Other statuses are returned to the caller without retrying.
 *
 * @param url        Absolute URL to request.
 * @param maxRetries Number of retries after the first attempt (default 3).
 * @returns Status, body text and content type of the last response received;
 *          after the final retry a 429/5xx response is returned as-is.
 * @throws The last network error if every attempt failed at the network level.
 */
export async function fetchWithRateLimit(
  url: string,
  maxRetries = 3,
): Promise<FetchResult> {
  await rateLimit();

  for (let attempt = 0; attempt <= maxRetries; attempt++) {
    const isLastAttempt = attempt >= maxRetries;
    const backoff = Math.pow(2, attempt + 1) * 1000;

    try {
      const result = await fetchOnce(url);
      const transient = result.status === 429 || result.status >= 500;
      if (!transient || isLastAttempt) {
        return result;
      }
      console.log(` HTTP ${result.status} for ${url}, retrying in ${backoff}ms...`);
    } catch (err) {
      if (isLastAttempt) {
        throw err;
      }
      const msg = err instanceof Error ? err.message : String(err);
      console.log(` Network error for ${url}: ${msg}, retrying in ${backoff}ms...`);
    }

    await new Promise((resolve) => setTimeout(resolve, backoff));
  }

  // Only reachable when maxRetries is negative and the loop never runs.
  throw new Error(`Failed to fetch ${url} after ${maxRetries} retries`);
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// PDF download + text extraction via pdftotext
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/**
 * Downloads a PDF with curl and converts it to plain text with pdftotext,
 * caching both artefacts in `cacheDir`.
 *
 * @param url      URL of the PDF.
 * @param cacheDir Directory holding `<cacheKey>.pdf` and `<cacheKey>.txt`.
 * @param cacheKey Base file name (without extension) for the cached files.
 * @returns The extracted text, or null when the download or the extraction
 *          failed or produced 50 characters or fewer of text. A previously
 *          cached `.txt` file is returned without any network access.
 */
export async function fetchPdfAsText(
  url: string,
  cacheDir: string,
  cacheKey: string,
): Promise<string | null> {
  const pdfPath = path.join(cacheDir, `${cacheKey}.pdf`);
  const txtPath = path.join(cacheDir, `${cacheKey}.txt`);

  // Use cached text if available
  if (fs.existsSync(txtPath)) {
    return fs.readFileSync(txtPath, 'utf-8');
  }

  await rateLimit();

  try {
    // Download via curl. Arguments are passed as an array (no shell), so
    // quotes, `$` or backticks in the path or URL cannot break or alter the
    // command. --fail makes curl exit non-zero on HTTP errors instead of
    // saving the server's error page as the "PDF".
    execFileSync('curl', ['-sL', '--fail', '-o', pdfPath, url], { timeout: 30_000 });

    if (!fs.existsSync(pdfPath) || fs.statSync(pdfPath).size < 100) {
      console.log(` PDF download failed or empty for ${url}`);
      return null;
    }

    // Extract text with pdftotext ("-" sends the text to stdout)
    try {
      const text = execFileSync('pdftotext', [pdfPath, '-'], {
        timeout: 30_000,
        maxBuffer: 5 * 1024 * 1024,
      }).toString('utf-8');

      if (text.trim().length > 50) {
        fs.writeFileSync(txtPath, text);
        return text;
      }
      // Too little text to be a usable law; do not cache it.
      console.log(` pdftotext produced no usable text for ${pdfPath}`);
    } catch {
      console.log(` pdftotext extraction failed for ${pdfPath}`);
    }

    return null;
  } catch (err) {
    const msg = err instanceof Error ? err.message : String(err);
    console.log(` PDF fetch error for ${url}: ${msg}`);
    return null;
  }
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Knesset OData API -- always accessible, returns JSON metadata
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/**
 * One record of the Knesset OData `KNS_IsraelLaw` entity set, as consumed by
 * this module. Field names mirror the JSON keys of the API response.
 */
export interface KnessetLawMetadata {
  IsraelLawID: number;
  KnessetNum: number | null;
  /** Law name in Hebrew. */
  Name: string;
  IsBasicLaw: boolean;
  IsFavoriteLaw: boolean;
  PublicationDate: string;
  LatestPublicationDate: string;
  LawValidityID: number;
  /** Validity description in Hebrew. */
  LawValidityDesc: string;
  ValidityStartDate: string | null;
  LastUpdatedDate: string;
}
|
||||
|
||||
// Root of the Knesset ParliamentInfo OData service.
const KNESSET_ODATA_BASE = 'https://knesset.gov.il/Odata/ParliamentInfo.svc';

/**
 * Looks up a single law in the Knesset OData API by its IsraelLawID.
 *
 * @param israelLawId Numeric IsraelLawID to filter on.
 * @returns The first matching record, or null when the request does not
 *          return HTTP 200, the payload cannot be read, or nothing matches.
 */
export async function fetchKnessetODataLaw(
  israelLawId: number,
): Promise<KnessetLawMetadata | null> {
  const query = `$filter=IsraelLawID%20eq%20${israelLawId}&$format=json`;
  const response = await fetchWithRateLimit(`${KNESSET_ODATA_BASE}/KNS_IsraelLaw?${query}`);

  if (response.status !== 200) {
    return null;
  }

  try {
    const payload = JSON.parse(response.body);
    const rows = payload.value as KnessetLawMetadata[];
    if (rows.length > 0) {
      return rows[0];
    }
    return null;
  } catch {
    // Malformed JSON or a payload without a `value` list.
    return null;
  }
}
|
||||
|
||||
/**
 * Searches the Knesset OData API for laws whose (Hebrew) name contains the
 * given substring.
 *
 * @param nameSubstring Text to look for inside the law name.
 * @returns Matching records; an empty array when the request does not return
 *          HTTP 200 or the payload is not the expected JSON shape.
 */
export async function searchKnessetODataLaws(
  nameSubstring: string,
): Promise<KnessetLawMetadata[]> {
  // Inside an OData string literal a single quote is written as two single
  // quotes. encodeURIComponent leaves "'" untouched, so without this step a
  // quote in the search text would terminate the literal early.
  const literal = encodeURIComponent(nameSubstring.replace(/'/g, "''"));
  const url = `${KNESSET_ODATA_BASE}/KNS_IsraelLaw?$filter=substringof('${literal}',Name)&$format=json`;
  const result = await fetchWithRateLimit(url);

  if (result.status !== 200) return [];

  try {
    const data = JSON.parse(result.body);
    // Guard against payloads where `value` is missing or not a list.
    return Array.isArray(data?.value) ? (data.value as KnessetLawMetadata[]) : [];
  } catch {
    return [];
  }
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Source URL registry -- maps act IDs to known accessible English sources
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/** Where and how the English text of one act can be obtained. */
export interface SourceConfig {
  /** URL of the primary English text (an HTML page or a PDF). */
  url: string;
  /** Format of the document behind `url`. */
  format: 'html' | 'pdf';
  /** IsraelLawID used for Knesset OData metadata enrichment, when known. */
  knessetLawId?: number;
  /** Human-readable provenance note for the source. */
  sourceNote: string;
}

/**
 * Registry of English translation sources, keyed by act ID.
 * Acts without an entry here have no registered text source.
 */
export const SOURCE_REGISTRY: Record<string, SourceConfig> = {
  // Privacy Protection Law -- HTML page on the UCI mirror
  'privacy-protection-law-1981': {
    url: 'https://ics.uci.edu/~kobsa/privacy/israel.htm',
    format: 'html',
    knessetLawId: 2000234,
    sourceNote: 'UCI mirror of English translation by Haim Ravia Law Offices',
  },

  // Computers Law -- PDF from the UNODC SHERLOC database
  'computer-law-1995': {
    url: 'https://www.unodc.org/cld/uploads/res/document/computer-law_html/Israel_Computers_Law_5755_1995.pdf',
    format: 'pdf',
    knessetLawId: 2000357,
    sourceNote: 'UNODC SHERLOC database English translation',
  },

  // Basic Law: Human Dignity and Liberty -- PDF from the Knesset mobile site
  'basic-law-human-dignity-1992': {
    url: 'https://m.knesset.gov.il/EN/activity/documents/BasicLawsPDF/BasicLawLiberty.pdf',
    format: 'pdf',
    knessetLawId: undefined, // Basic Laws have different numbering
    sourceNote: 'Official Knesset English translation PDF',
  },
};
|
||||
682
scripts/lib/parser.ts
Normal file
682
scripts/lib/parser.ts
Normal file
@@ -0,0 +1,682 @@
|
||||
/**
 * Multi-format parser for Israeli legislation.
 *
 * Handles two content formats:
 *   1. HTML -- from UCI mirror (Privacy Protection Law)
 *   2. Plain text -- from pdftotext extraction (Computer Law, Basic Law)
 *
 * Israeli laws use "Section N" numbering, not "Article N".
 * Basic Laws use numbered sections without the "Section" prefix in some formats.
 */
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Interfaces
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/**
 * Catalogue entry describing one act to ingest. The parsers copy these
 * values into the resulting ParsedAct.
 */
export interface ActIndexEntry {
  /** Stable act identifier; becomes the `id` of the parsed act. */
  id: string;
  lawName: string;
  year: number;
  title: string;
  /** Copied to `title_en` of the parsed act. */
  titleEn: string;
  /** Copied to `short_name` of the parsed act. */
  abbreviation: string;
  status: 'in_force' | 'amended' | 'repealed' | 'not_yet_in_force';
  issuedDate: string;
  inForceDate: string;
  url: string;
}
|
||||
|
||||
/** One numbered section of an act as produced by the parsers. */
export interface ParsedProvision {
  /** Reference key; the parsers in this file build it as `sec<number>`. */
  provision_ref: string;
  /** Enclosing chapter/article heading, when one was detected. */
  chapter?: string;
  /** Section number as it appears in the source, e.g. "3" or "17C". */
  section: string;
  title: string;
  content: string;
}
|
||||
|
||||
/** A defined term extracted from a definitions section. */
export interface ParsedDefinition {
  term: string;
  definition: string;
  /** `provision_ref` of the section the definition was found in. */
  source_provision?: string;
}
|
||||
|
||||
/**
 * Fully parsed act: catalogue metadata plus the provisions and definitions
 * extracted from its source text.
 */
export interface ParsedAct {
  id: string;
  type: 'statute';
  title: string;
  title_en: string;
  short_name: string;
  status: 'in_force' | 'amended' | 'repealed' | 'not_yet_in_force';
  issued_date: string;
  in_force_date: string;
  url: string;
  description?: string;
  /** Sections in the order the parser encountered them. */
  provisions: ParsedProvision[];
  /** Defined terms collected from the act's definition sections. */
  definitions: ParsedDefinition[];
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// HTML utilities
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
// Replacement text for the named/numeric entities stripHtml understands,
// keyed by the entity name between the ampersand and the semicolon.
const HTML_ENTITY_TEXT: Record<string, string> = {
  nbsp: ' ',
  amp: '&',
  lt: '<',
  gt: '>',
  quot: '"',
  '#39': "'",
};

/**
 * Converts an HTML fragment to plain text.
 *
 * Tags are replaced by spaces, the nbsp, amp, lt, gt, quot and #39 entities
 * are decoded, and whitespace runs are collapsed to single spaces.
 *
 * Entities are decoded in a single pass, so an escaped ampersand followed by
 * "lt;" yields the literal entity text rather than being decoded twice.
 *
 * @param html HTML markup.
 * @returns Trimmed plain text.
 */
function stripHtml(html: string): string {
  return html
    .replace(/<[^>]+>/g, ' ')
    .replace(
      /&(nbsp|amp|lt|gt|quot|#39);/g,
      (entity: string, name: string) => HTML_ENTITY_TEXT[name] ?? entity,
    )
    .replace(/\s+/g, ' ')
    .trim();
}
|
||||
|
||||
/**
 * Collapses every run of whitespace to a single space and removes leading
 * and trailing whitespace.
 */
function normalizeText(text: string): string {
  const words = text.split(/\s+/).filter((word) => word.length > 0);
  return words.join(' ');
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// HTML Parser -- for UCI mirror format (Privacy Protection Law)
|
||||
//
|
||||
// Structure: <B>N. Title</B> ... <P><B>N+1. Title</B>
|
||||
// Chapters: <B>CHAPTER ...: ...</B>
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/**
 * Parses the UCI-mirror HTML of the Protection of Privacy Law.
 *
 * Sections are recognised by bold "N. Title" headings; each section's text
 * runs from its heading up to the next section heading (or the end of the
 * law body) and is capped at 8000 characters. Bold "CHAPTER ...:" and
 * "Article ...:" headings are tracked so each section can be tagged with the
 * closest heading preceding it. Definitions are extracted from sections 3, 7
 * and 17C.
 *
 * @param html Raw HTML of the page.
 * @param act  Catalogue entry supplying the act-level metadata.
 * @returns The act with its provisions and definitions.
 */
export function parsePrivacyLawHtml(html: string, act: ActIndexEntry): ParsedAct {
  const provisions: ParsedProvision[] = [];
  const definitions: ParsedDefinition[] = [];

  // Restrict parsing to the law body inside the main table when it can be
  // located; otherwise work on the whole document.
  const bodyMatch = html.match(/PROTECTION OF PRIVACY LAW[\s\S]*?(?=<\/TD>\s*<\/TR>\s*<\/TABLE>\s*<BR>)/i);
  const body = bodyMatch ? bodyMatch[0] : html;

  // Chapter headings first, then article headings (e.g. "Article One: ..."),
  // merged into one list ordered by position.
  const headings: Array<{ pos: number; name: string }> = [];
  const headingPatterns = [
    /<B>\s*(CHAPTER\s+[^:]+:\s*[^<]+)<\/B>/gi,
    /<B>\s*(Article\s+[^:]+:\s*[^<]+)<\/B>/gi,
  ];
  for (const pattern of headingPatterns) {
    for (const found of body.matchAll(pattern)) {
      headings.push({ pos: found.index ?? 0, name: normalizeText(stripHtml(found[1])) });
    }
  }
  headings.sort((a, b) => a.pos - b.pos);

  // Bold section headings: <B> optionally <a name="..."></a>, then "N. title".
  const sectionStarts = Array.from(
    body.matchAll(/<B>(?:<a[^>]*><\/a>)?\s*(\d+[A-Z]?)\.\s+([^<]+)<\/B>/gi),
    (found) => ({
      pos: found.index ?? 0,
      num: found[1].trim(),
      title: normalizeText(stripHtml(found[2])),
    }),
  );

  const definitionSections = new Set(['3', '7', '17C']);
  let currentChapter = '';
  let headingCursor = 0;

  for (const [index, sec] of sectionStarts.entries()) {
    // Advance to the last heading located before this section. Sections are
    // visited in document order, so the cursor only ever moves forward.
    while (headingCursor < headings.length && headings[headingCursor].pos < sec.pos) {
      currentChapter = headings[headingCursor].name;
      headingCursor++;
    }

    const following = sectionStarts[index + 1];
    const sliceEnd = following ? following.pos : body.length;
    const content = normalizeText(stripHtml(body.substring(sec.pos, sliceEnd)));

    // Ignore fragments too short to be a real provision.
    if (content.length <= 10) {
      continue;
    }

    const provRef = `sec${sec.num}`;
    provisions.push({
      provision_ref: provRef,
      chapter: currentChapter || undefined,
      section: sec.num,
      title: sec.title,
      content: content.substring(0, 8000),
    });

    if (definitionSections.has(sec.num)) {
      extractDefinitionsFromContent(content, provRef, definitions);
    }
  }

  return {
    id: act.id,
    type: 'statute',
    title: act.title,
    title_en: act.titleEn,
    short_name: act.abbreviation,
    status: act.status,
    issued_date: act.issuedDate,
    in_force_date: act.inForceDate,
    url: act.url,
    provisions,
    definitions,
  };
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Plain-text Parser -- for pdftotext output (Computer Law, Basic Law)
|
||||
//
|
||||
// Computer Law format:
|
||||
// "Section N\n\nTitle text\n\nN. content..."
|
||||
// or just "N. content..."
|
||||
//
|
||||
// Basic Law format:
|
||||
// "Title label\n\nN. content..."
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/**
 * Builds a section title from buffered marginal-note lines: drops empty or
 * long (100+ characters) lines and known non-title lines, then joins the rest.
 */
function marginalNotesToTitle(notes: string[]): string {
  const nonTitlePatterns = [
    /^Chapter\s+/i,
    /^Go$/i,
    /^Section\s+\d+/,
    /^Clause\s+/i,
    /^\*/,
    /^Contents$/,
    /^\d+$/,
    /^Computers Law/i,
    /^Published in/i,
  ];
  const kept = notes.filter(
    (note) =>
      note.length > 0
      && note.length < 100
      && !nonTitlePatterns.some((pattern) => pattern.test(note)),
  );
  return normalizeText(kept.join(' '));
}

/**
 * Parses pdftotext output of the Computers Law into provisions.
 *
 * Parsing starts at the first occurrence of "Computers Law, 5755" (to skip
 * the table of contents) or at the beginning when that marker is absent.
 * A section starts at a line that is either a bare "N." or "N. text"; lines
 * that follow are appended to that section's content. Definitions are
 * extracted from section 1.
 *
 * NOTE(review): marginal-note lines are only buffered while no section has
 * been started yet, so titles appear to be captured for the first section
 * only; for later sections such lines end up in the preceding section's
 * content. Confirm against real PDF output before relying on `title`.
 *
 * @param text Plain text extracted from the PDF.
 * @param act  Catalogue entry supplying the act-level metadata.
 * @returns The act with its provisions and definitions.
 */
export function parseComputerLawText(text: string, act: ActIndexEntry): ParsedAct {
  const provisions: ParsedProvision[] = [];
  const definitions: ParsedDefinition[] = [];

  const lawTextStart = text.indexOf('Computers Law, 5755');
  const lawText = lawTextStart >= 0 ? text.substring(lawTextStart) : text;

  type DraftSection = { num: string; title: string; content: string; chapter: string };
  const sections: DraftSection[] = [];
  let currentSection: DraftSection | null = null;
  let currentChapter = '';
  // Candidate title lines seen before the first section number.
  let marginalNoteLines: string[] = [];

  for (const rawLine of lawText.split('\n')) {
    const line = rawLine.trim();

    // Chapter heading: remember it and drop any buffered notes.
    if (/^Chapter\s+(One|Two|Three|Four|Five|Six|Seven|Eight|Nine|Ten|\w+):\s*(.+)/i.test(line)) {
      currentChapter = normalizeText(line);
      marginalNoteLines = [];
      continue;
    }

    // "N." alone on a line: the section text follows on later lines.
    const bareNumber = line.match(/^(\d+[A-Za-z]?)\.\s*$/);
    if (bareNumber) {
      if (currentSection) {
        sections.push(currentSection);
      }
      currentSection = {
        num: bareNumber[1],
        title: marginalNotesToTitle(marginalNoteLines),
        content: '',
        chapter: currentChapter,
      };
      marginalNoteLines = [];
      continue;
    }

    // "N. text" on one line.
    const numberWithText = line.match(/^(\d+[A-Za-z]?)\.\s+(.+)/);
    if (numberWithText) {
      // A numbered footnote such as "1. Published in ..." is not a section.
      if (numberWithText[2].match(/^Published in/i)) {
        continue;
      }
      if (currentSection) {
        sections.push(currentSection);
      }
      currentSection = {
        num: numberWithText[1],
        title: marginalNotesToTitle(marginalNoteLines),
        content: normalizeText(numberWithText[0]),
        chapter: currentChapter,
      };
      marginalNoteLines = [];
      continue;
    }

    // Blank lines carry no information in either state.
    if (line.length === 0) {
      continue;
    }

    if (currentSection) {
      // Skip short standalone numbers (page numbers) and the running header.
      if (line.match(/^\d+$/) && line.length <= 3) continue;
      if (line.match(/^Computers Law, 1995/i)) continue;
      currentSection.content += ' ' + normalizeText(line);
      continue;
    }

    // No section yet: either buffer the line as a possible title or, on
    // table-of-contents residue, clear the buffer.
    const isContentsResidue = Boolean(line.match(/^Go$/i) || line.match(/^Section\s+\d+/));
    const isIgnorable = Boolean(
      line.match(/^Computers Law/i)
      || line.match(/^\d+$/)
      || line.match(/^\*$/)
      || line.match(/^Published in/i),
    );
    if (isContentsResidue) {
      marginalNoteLines = [];
    } else if (!isIgnorable) {
      marginalNoteLines.push(line);
    }
  }

  if (currentSection) {
    sections.push(currentSection);
  }

  for (const sec of sections) {
    const content = normalizeText(sec.content);
    if (content.length <= 10) {
      continue;
    }
    const provRef = `sec${sec.num}`;
    provisions.push({
      provision_ref: provRef,
      chapter: sec.chapter || undefined,
      section: sec.num,
      title: sec.title,
      content: content.substring(0, 8000),
    });

    // Section 1 holds the definitions.
    if (sec.num === '1') {
      extractDefinitionsFromPlainText(content, provRef, definitions);
    }
  }

  return {
    id: act.id,
    type: 'statute',
    title: act.title,
    title_en: act.titleEn,
    short_name: act.abbreviation,
    status: act.status,
    issued_date: act.issuedDate,
    in_force_date: act.inForceDate,
    url: act.url,
    provisions,
    definitions,
  };
}
|
||||
|
||||
/**
 * Looks at up to four lines above `index` for a marginal label (section
 * title). Non-empty lines shorter than 100 characters that do not look like
 * a numbered section or an "(Amendment" note are collected in document
 * order; the scan stops at the first blank line reached after something was
 * collected.
 */
function findMarginalLabel(lines: string[], index: number): string {
  const parts: string[] = [];
  const lowest = Math.max(0, index - 4);
  for (let j = index - 1; j >= lowest; j--) {
    const candidate = lines[j].trim();
    const usable =
      candidate.length > 0
      && !candidate.match(/^\d+[a-z]?\.\s/)
      && !candidate.match(/^\(Amendment/)
      && candidate.length < 100;
    if (usable) {
      parts.unshift(candidate);
    } else if (candidate.length === 0 && parts.length > 0) {
      break;
    }
  }
  return parts.join(' ');
}

/**
 * Parses pdftotext output of a Knesset Basic Law PDF into provisions.
 *
 * A section starts at any line beginning with "N." (optionally followed by
 * text on the same line); subsequent non-empty lines are appended to it,
 * except known header/footer lines. No chapters or definitions are produced.
 *
 * NOTE(review): the label lines picked up as the title of section N+1 have
 * already been appended to the content of section N by the time they are
 * found -- confirm whether that duplication is intended.
 *
 * @param text Plain text extracted from the PDF.
 * @param act  Catalogue entry supplying the act-level metadata.
 * @returns The act with its provisions and an empty definitions list.
 */
export function parseBasicLawText(text: string, act: ActIndexEntry): ParsedAct {
  const provisions: ParsedProvision[] = [];
  const definitions: ParsedDefinition[] = [];

  type DraftSection = { num: string; title: string; content: string };
  const sections: DraftSection[] = [];
  let currentSection: DraftSection | null = null;

  // Page furniture that must not end up in a section's content.
  const boilerplate = [/^BASIC-LAW:/i, /^This unofficial/i, /^For the full/i, /^Special thanks/i];

  const lines = text.split('\n');
  for (const [i, rawLine] of lines.entries()) {
    const line = rawLine.trim();

    const start = line.match(/^(\d+[a-z]?)\.\s*(.*)/);
    if (start) {
      const label = findMarginalLabel(lines, i);
      if (currentSection) {
        sections.push(currentSection);
      }
      currentSection = {
        num: start[1],
        title: normalizeText(label),
        // Keep the "N." prefix in the content only when text follows it.
        content: start[2] ? normalizeText(start[0]) : '',
      };
      continue;
    }

    if (!currentSection || line.length === 0) {
      continue;
    }
    if (boilerplate.some((pattern) => pattern.test(line))) {
      continue;
    }
    currentSection.content += ' ' + normalizeText(line);
  }

  if (currentSection) {
    sections.push(currentSection);
  }

  for (const sec of sections) {
    const content = normalizeText(sec.content);
    if (content.length <= 10) {
      continue;
    }
    provisions.push({
      provision_ref: `sec${sec.num}`,
      chapter: undefined,
      section: sec.num,
      title: sec.title,
      content: content.substring(0, 8000),
    });
  }

  return {
    id: act.id,
    type: 'statute',
    title: act.title,
    title_en: act.titleEn,
    short_name: act.abbreviation,
    status: act.status,
    issued_date: act.issuedDate,
    in_force_date: act.inForceDate,
    url: act.url,
    provisions,
    definitions,
  };
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Definition extractors
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/**
 * Scans provision text for quoted-term definitions and appends any new ones
 * to `definitions` (the array is mutated in place; nothing is returned).
 *
 * A definition is recognised as a term in straight or curly double quotes,
 * followed by a single hyphen / en dash / em dash, followed by text running
 * up to the next semicolon or the end of the string.
 *
 * @param content          Text of the provision to scan.
 * @param sourceProvision  Reference stored as `source_provision` on each hit.
 * @param definitions      Accumulator; a term already present is not added again.
 */
function extractDefinitionsFromContent(
  content: string,
  sourceProvision: string,
  definitions: ParsedDefinition[],
): void {
  // Shape: "term" - definition text;   (closing semicolon optional at end of input)
  const quotedTermRe = /["\u201c]([^"\u201d]+)["\u201d]\s*[-\u2013\u2014]\s*([^;]+(?:;|$))/g;

  for (let hit = quotedTermRe.exec(content); hit !== null; hit = quotedTermRe.exec(content)) {
    const term = normalizeText(hit[1]);
    const definition = normalizeText(hit[2]).replace(/;$/, '').trim();

    // Reject one-character or overly long "terms" and near-empty definitions.
    const plausible = term.length > 1 && term.length < 80 && definition.length > 5;
    if (!plausible) {
      continue;
    }

    // First occurrence of a term wins; later duplicates are ignored.
    const alreadyRecorded = definitions.some((d) => d.term === term);
    if (alreadyRecorded) {
      continue;
    }

    definitions.push({ term, definition, source_provision: sourceProvision });
  }
}
|
||||
|
||||
/**
 * Extracts quoted-term definitions from plain (PDF-derived) text and appends
 * any new ones to `definitions` (mutated in place).
 *
 * Expected shape, as in the Computer Law PDF which uses regular quotes:
 *   "computer material" - software or information;
 * Curly quotes from other sources are accepted as well. Unlike the content
 * extractor, one or more dash characters are allowed and the definition must
 * be terminated by a semicolon.
 *
 * @param content          Plain text to scan.
 * @param sourceProvision  Reference stored as `source_provision` on each hit.
 * @param definitions      Accumulator; a term already present is not added again.
 */
function extractDefinitionsFromPlainText(
  content: string,
  sourceProvision: string,
  definitions: ParsedDefinition[],
): void {
  // Tried in order: mixed straight/curly quotes first, then straight quotes only.
  const quoteStyles = [
    /["\u201c]([^"\u201d]+)["\u201d]\s*[-\u2013\u2014]+\s*([^;]+;)/g,
    /"([^"]+)"\s*[-\u2013\u2014]+\s*([^;]+;)/g,
  ];

  // Terms already handled during this call, shared across both patterns.
  const termsThisCall = new Set<string>();

  for (const re of quoteStyles) {
    for (let hit = re.exec(content); hit !== null; hit = re.exec(content)) {
      const term = normalizeText(hit[1]);
      const definition = normalizeText(hit[2]).replace(/;$/, '').trim();

      if (term.length <= 1 || term.length >= 80) continue;
      if (definition.length <= 5) continue;
      if (termsThisCall.has(term)) continue;

      termsThisCall.add(term);

      // The caller's list may already hold this term from an earlier provision.
      const alreadyRecorded = definitions.some((d) => d.term === term);
      if (!alreadyRecorded) {
        definitions.push({ term, definition, source_provision: sourceProvision });
      }
    }
  }
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Generic HTML parser (fallback for future sources)
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/**
 * Parses the HTML of an Israeli act into a `ParsedAct`.
 *
 * The Privacy Protection Law is delegated to its dedicated parser. Every other
 * act goes through a generic fallback that treats bold `<B>N. Title</B>`
 * headings as section starts and bold `<B>CHAPTER ...: ...</B>` headings as
 * chapter starts. Each section's content runs from its heading to the next
 * section heading (or the end of the document) and is capped at 8000
 * characters. The generic path does not extract definitions.
 *
 * @param html  Raw HTML of the act.
 * @param act   Index entry supplying the act's identifier and metadata.
 * @returns     The act metadata together with the provisions that were found.
 */
export function parseIsraeliLawHtml(html: string, act: ActIndexEntry): ParsedAct {
  // Act-specific parser takes precedence over the generic fallback.
  if (act.id === 'privacy-protection-law-1981') {
    return parsePrivacyLawHtml(html, act);
  }

  const sectionHeadingRe = /<B>(?:<a[^>]*><\/a>)?\s*(\d+[A-Z]?)\.\s+([^<]+)<\/B>/gi;
  const chapterHeadingRe = /<B>\s*(CHAPTER\s+[^:]+:\s*[^<]+)<\/B>/gi;

  // Pass 1: record where every chapter heading starts.
  const chapterMarks: Array<{ pos: number; name: string }> = [];
  for (let m = chapterHeadingRe.exec(html); m !== null; m = chapterHeadingRe.exec(html)) {
    chapterMarks.push({ pos: m.index, name: normalizeText(stripHtml(m[1])) });
  }

  // Pass 2: record where every section heading starts.
  const sectionMarks: Array<{ pos: number; num: string; title: string }> = [];
  for (let m = sectionHeadingRe.exec(html); m !== null; m = sectionHeadingRe.exec(html)) {
    sectionMarks.push({
      pos: m.index,
      num: m[1].trim(),
      title: normalizeText(stripHtml(m[2])),
    });
  }

  // Pass 3: slice the document between consecutive section headings.
  const provisions: ParsedProvision[] = [];
  let activeChapter = '';

  sectionMarks.forEach((mark, idx) => {
    // The last chapter heading located before this section is its chapter.
    for (const chapter of chapterMarks) {
      if (chapter.pos < mark.pos) activeChapter = chapter.name;
    }

    const following = sectionMarks[idx + 1];
    const sliceEnd = following ? following.pos : html.length;
    const text = normalizeText(stripHtml(html.substring(mark.pos, sliceEnd)));

    // Drop headings that have effectively no body text.
    if (text.length <= 10) {
      return;
    }

    provisions.push({
      provision_ref: `sec${mark.num}`,
      chapter: activeChapter || undefined,
      section: mark.num,
      title: mark.title,
      content: text.substring(0, 8000),
    });
  });

  const definitions: ParsedDefinition[] = [];

  return {
    id: act.id,
    type: 'statute',
    title: act.title,
    title_en: act.titleEn,
    short_name: act.abbreviation,
    status: act.status,
    issued_date: act.issuedDate,
    in_force_date: act.inForceDate,
    url: act.url,
    provisions,
    definitions,
  };
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Key Israeli Acts -- updated with correct Knesset OData IDs and
|
||||
// accessible English source URLs
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/**
 * Index of the Israeli acts handled by the ingestion pipeline.
 *
 * Each entry carries a stable `id`, the English law name and year, the Hebrew
 * title (written with `\u` escapes), the English title including the Hebrew
 * calendar year, an abbreviation, status, issue / in-force dates and the
 * source URL for the text.
 *
 * Where a Hebrew title ends in a Hebrew-year abbreviation, its letters must
 * add up to the last three digits of the 57xx year shown in `titleEn`
 * (e.g. 5761 is tav-shin-samekh-alef = 400 + 300 + 60 + 1).
 */
export const KEY_ISRAELI_ACTS: ActIndexEntry[] = [
  {
    id: 'privacy-protection-law-1981',
    lawName: 'Privacy Protection Law',
    year: 1981,
    title: '\u05d7\u05d5\u05e7 \u05d4\u05d2\u05e0\u05ea \u05d4\u05e4\u05e8\u05d8\u05d9\u05d5\u05ea, \u05ea\u05e9\u05de"\u05d0-1981',
    titleEn: 'Protection of Privacy Law, 5741-1981',
    abbreviation: 'PPL',
    status: 'in_force',
    issuedDate: '1981-03-11',
    inForceDate: '1981-09-11',
    url: 'https://ics.uci.edu/~kobsa/privacy/israel.htm',
  },
  {
    id: 'data-security-regulations-2017',
    lawName: 'Protection of Privacy Regulations (Data Security)',
    year: 2017,
    title: '\u05ea\u05e7\u05e0\u05d5\u05ea \u05d4\u05d2\u05e0\u05ea \u05d4\u05e4\u05e8\u05d8\u05d9\u05d5\u05ea (\u05d0\u05d1\u05d8\u05d7\u05ea \u05de\u05d9\u05d3\u05e2), \u05ea\u05e9\u05e2"\u05d6-2017',
    titleEn: 'Protection of Privacy Regulations (Data Security), 5777-2017',
    abbreviation: 'DSR',
    status: 'in_force',
    issuedDate: '2017-03-21',
    inForceDate: '2018-05-08',
    url: 'https://www.gov.il/en/departments/legalinfo/data_security_regulation',
  },
  {
    id: 'computer-law-1995',
    lawName: 'Computers Law',
    year: 1995,
    title: '\u05d7\u05d5\u05e7 \u05d4\u05de\u05d7\u05e9\u05d1\u05d9\u05dd, \u05ea\u05e9\u05e0"\u05d4-1995',
    titleEn: 'Computers Law, 5755-1995',
    abbreviation: 'CL',
    status: 'in_force',
    issuedDate: '1995-07-25',
    inForceDate: '1995-10-25',
    url: 'https://www.unodc.org/cld/uploads/res/document/computer-law_html/Israel_Computers_Law_5755_1995.pdf',
  },
  {
    id: 'basic-law-human-dignity-1992',
    lawName: 'Basic Law: Human Dignity and Liberty',
    year: 1992,
    title: '\u05d7\u05d5\u05e7 \u05d9\u05e1\u05d5\u05d3: \u05db\u05d1\u05d5\u05d3 \u05d4\u05d0\u05d3\u05dd \u05d5\u05d7\u05d9\u05e8\u05d5\u05ea\u05d5',
    titleEn: 'Basic Law: Human Dignity and Liberty, 5752-1992',
    abbreviation: 'BL-HDL',
    status: 'in_force',
    issuedDate: '1992-03-17',
    inForceDate: '1992-03-17',
    url: 'https://m.knesset.gov.il/EN/activity/documents/BasicLawsPDF/BasicLawLiberty.pdf',
  },
  {
    id: 'companies-law-1999',
    lawName: 'Companies Law',
    year: 1999,
    title: '\u05d7\u05d5\u05e7 \u05d4\u05d7\u05d1\u05e8\u05d5\u05ea, \u05ea\u05e9\u05e0"\u05d8-1999',
    titleEn: 'Companies Law, 5759-1999',
    abbreviation: 'CoL',
    status: 'in_force',
    issuedDate: '1999-02-15',
    inForceDate: '2000-02-01',
    url: 'https://www.gov.il/en/departments/legalinfo/companies_law',
  },
  {
    id: 'electronic-signature-law-2001',
    lawName: 'Electronic Signature Law',
    year: 2001,
    // Hebrew year is tav-shin-samekh-alef (5761); the shin was previously missing.
    title: '\u05d7\u05d5\u05e7 \u05d7\u05ea\u05d9\u05de\u05d4 \u05d0\u05dc\u05e7\u05d8\u05e8\u05d5\u05e0\u05d9\u05ea, \u05ea\u05e9\u05e1"\u05d0-2001',
    titleEn: 'Electronic Signature Law, 5761-2001',
    abbreviation: 'ESL',
    status: 'in_force',
    issuedDate: '2001-08-07',
    inForceDate: '2001-08-07',
    url: 'https://www.gov.il/en/departments/legalinfo/electronic_signature_law',
  },
  {
    id: 'credit-data-law-2002',
    lawName: 'Credit Data Law',
    year: 2002,
    // Hebrew year is tav-shin-samekh-bet (5762); the shin was previously missing.
    title: '\u05d7\u05d5\u05e7 \u05e0\u05ea\u05d5\u05e0\u05d9 \u05d0\u05e9\u05e8\u05d0\u05d9, \u05ea\u05e9\u05e1"\u05d1-2002',
    titleEn: 'Credit Data Law, 5762-2002',
    abbreviation: 'CDL',
    status: 'in_force',
    issuedDate: '2002-01-01',
    inForceDate: '2002-01-01',
    url: 'https://www.nevo.co.il/law_html/law01/999_611.htm',
  },
  {
    id: 'freedom-of-information-law-1998',
    lawName: 'Freedom of Information Law',
    year: 1998,
    title: '\u05d7\u05d5\u05e7 \u05d7\u05d5\u05e4\u05e9 \u05d4\u05de\u05d9\u05d3\u05e2, \u05ea\u05e9\u05e0"\u05d7-1998',
    titleEn: 'Freedom of Information Law, 5758-1998',
    abbreviation: 'FoIL',
    status: 'in_force',
    issuedDate: '1998-05-19',
    inForceDate: '1999-05-19',
    url: 'https://www.gov.il/en/departments/legalinfo/freedom_of_information_law',
  },
  {
    id: 'regulation-of-security-1998',
    lawName: 'Regulation of Security in Public Bodies Law',
    year: 1998,
    title: '\u05d7\u05d5\u05e7 \u05d4\u05e1\u05d3\u05e8\u05ea \u05d4\u05d0\u05d1\u05d8\u05d7\u05d4 \u05d1\u05d2\u05d5\u05e4\u05d9\u05dd \u05e6\u05d9\u05d1\u05d5\u05e8\u05d9\u05d9\u05dd, \u05ea\u05e9\u05e0"\u05d7-1998',
    titleEn: 'Regulation of Security in Public Bodies Law, 5758-1998',
    abbreviation: 'RSPBL',
    status: 'in_force',
    issuedDate: '1998-01-01',
    inForceDate: '1998-01-01',
    url: 'https://www.nevo.co.il/law_html/law01/999_574.htm',
  },
  {
    id: 'communications-law-1982',
    lawName: 'Communications Law (Telecommunications and Broadcasting)',
    year: 1982,
    title: '\u05d7\u05d5\u05e7 \u05d4\u05ea\u05e7\u05e9\u05d5\u05e8\u05ea (\u05d1\u05d6\u05e7 \u05d5\u05e9\u05d9\u05d3\u05d5\u05e8\u05d9\u05dd), \u05ea\u05e9\u05de"\u05d1-1982',
    titleEn: 'Communications Law (Telecommunications and Broadcasting), 5742-1982',
    abbreviation: 'CommL',
    status: 'in_force',
    issuedDate: '1982-01-01',
    inForceDate: '1984-02-01',
    url: 'https://www.nevo.co.il/law_html/law01/044_001.htm',
  },
];
|
||||
Reference in New Issue
Block a user