Files
israel-law-mcp/scripts/add_law_from_wikisource.py
chaim 4d64a43461
Some checks failed
CI / test (18) (push) Has been cancelled
CI / test (20) (push) Has been cancelled
CI / test (22) (push) Has been cancelled
Build and Push to GHCR / Build and Push (push) Has been cancelled
Semgrep SAST / Semgrep security scan (push) Has been cancelled
Trivy Security Scan / Trivy vulnerability scan (push) Has been cancelled
Daily Data Freshness Check / check-updates (push) Has been cancelled
Daily Data Freshness Check / report (push) Has been cancelled
Drift Detection / drift-detect (push) Has been cancelled
feat: expand law seed data with full Wikisource text and add import script
Update contracts-general-part-law-1973 and planning-and-building-law-1965
with complete Hebrew provision text from Wikisource. Add land-law-1969 seed
data and add_law_from_wikisource.py script for importing laws.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-09 17:02:17 +00:00

286 lines
9.8 KiB
Python
Executable File
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""
Add an Israeli law to the database from Hebrew Wikisource.
Usage:
python scripts/add_law_from_wikisource.py "חוק_המקרקעין" --id land-law-1969 --title-en "Land Law, 5729-1969"
python scripts/add_law_from_wikisource.py "חוק_השכירות_והשאילה" --id rental-law-1971 --title-en "Rental and Lending Law, 5731-1971"
The script:
1. Fetches the law's wikitext from Hebrew Wikisource API
2. Parses chapters and sections (definitions are not extracted; the seed's `definitions` list is left empty)
3. Cleans wiki markup
4. Writes a seed JSON file to data/seed/
5. Rebuilds the database (npm run build:db)
"""
import argparse
import json
import re
import subprocess
import sys
from pathlib import Path
# Directory that contains this script file.
SCRIPT_DIR = Path(__file__).parent
# Parent of the script directory; used as the working directory for the
# database rebuild command.
PROJECT_DIR = SCRIPT_DIR.parent
# Destination directory for the generated seed JSON files.
SEED_DIR = PROJECT_DIR.joinpath("data", "seed")
# `requests` is a third-party dependency. When it is missing, exit with an
# actionable message instead of an ImportError traceback. Passing a string to
# sys.exit() writes it to stderr and sets exit status 1, so the message no
# longer pollutes stdout.
try:
    import requests
except ImportError:
    sys.exit("Error: 'requests' package required. Install with: pip install requests")
def fetch_wikitext(page_name: str) -> str:
    """Download the raw wikitext of a Hebrew Wikisource page.

    Sends a GET request to the MediaWiki API (``action=parse``,
    ``prop=wikitext``) with a 30 second timeout.

    Args:
        page_name: Title of the page to fetch.

    Returns:
        The wikitext string found under ``parse.wikitext.*`` in the response.

    Raises:
        requests.HTTPError: If the HTTP status indicates failure.
        ValueError: If the JSON response contains an ``error`` entry.
    """
    response = requests.get(
        "https://he.wikisource.org/w/api.php",
        params={
            "action": "parse",
            "page": page_name,
            "prop": "wikitext",
            "format": "json",
        },
        headers={"User-Agent": "IsraelLawMCP/1.0 (law database builder)"},
        timeout=30,
    )
    response.raise_for_status()

    payload = response.json()
    # The API reports failures (e.g. a missing page) inside the JSON body.
    if "error" in payload:
        raise ValueError(f"Wikisource API error: {payload['error'].get('info', 'Unknown error')}")

    return payload["parse"]["wikitext"]["*"]
def clean_wiki_markup(text: str) -> str:
    """Strip wiki and HTML markup from *text*, keeping the readable content.

    Processing order:
      1. Law-specific templates whose payload is kept (sub-section markers,
         internal/external links, notes); paragraph markers are dropped.
      2. All remaining ``{{...}}`` templates, innermost first; unclosed
         templates and stray ``}}`` are removed when nothing else matches.
      3. ``[[target|display]]`` / ``[[target]]`` links, keeping the visible text.
      4. HTML: ``<ref>`` footnotes, ``<small>``, ``<wbr>``, ``<div>``;
         ``<br>`` becomes a newline.
      5. Runs of four or more dashes and trailing ``|תיקון:`` amendment notes.
      6. Whitespace: runs of spaces collapse to one, three or more newlines
         collapse to two, and the result is stripped.

    Args:
        text: Raw wikitext fragment.

    Returns:
        The cleaned text (possibly empty).
    """
    # Handle {{ח:תת|...}} sub-section markers - keep text
    text = re.sub(r"\{\{ח:תת\|([^}]*)\}\}", r"\1", text)
    # Handle {{ח:פנימי|...|display}} - keep display text
    text = re.sub(r"\{\{ח:פנימי\|[^|]*\|([^}]*)\}\}", r"\1", text)
    # Handle {{ח:חיצוני|...|display}} - keep display text
    text = re.sub(r"\{\{ח:חיצוני\|[^|]*\|([^}]*)\}\}", r"\1", text)
    # Handle {{ח:הערה|...}} - keep content
    text = re.sub(r"\{\{ח:הערה\|([^}]*)\}\}", r"\1", text)
    # Remove {{ח:ת|...}} paragraph markers
    text = re.sub(r"\{\{ח:ת(?:\|[^}]*)?\}\}", "", text)

    # Remove all remaining {{ }} templates. Each pass removes only templates
    # with no braces inside, so nested templates disappear innermost-first.
    while "{{" in text:
        stripped = re.sub(r"\{\{[^{}]*\}\}", "", text)
        if stripped == text:
            # No well-formed template left: drop broken/unclosed openers and
            # any orphaned closers, then stop to avoid looping forever.
            text = re.sub(r"\{\{[^}]*$", "", text, flags=re.MULTILINE)
            text = re.sub(r"\}\}", "", text)
            break
        text = stripped

    # Remove [[ ]] wiki links, keep display text
    text = re.sub(r"\[\[[^|\]]*\|([^\]]*)\]\]", r"\1", text)
    text = re.sub(r"\[\[([^\]]*)\]\]", r"\1", text)

    # Clean HTML.
    # Self-closing refs must go first: otherwise the paired pattern below
    # treats `<ref name="x" />` as an opening tag and deletes all real text
    # up to the next `</ref>`. `[^>]*` keeps the match inside a single tag.
    text = re.sub(r"<ref[^>]*/>", "", text)
    text = re.sub(r"<ref[^>]*>.*?</ref>", "", text, flags=re.DOTALL)
    text = re.sub(r"</?small>", "", text)
    text = re.sub(r"<wbr>", "", text)
    text = re.sub(r"<br\s*/?>", "\n", text)
    text = re.sub(r"<div[^>]*>", "", text)
    text = re.sub(r"</div>", "", text)

    # Remove section delimiters
    text = re.sub(r"-{4,}", "", text)
    # Remove amendment info from titles
    text = re.sub(r"\|תיקון:[^\n]*", "", text)

    # Clean whitespace
    text = re.sub(r" +", " ", text)
    text = re.sub(r"\n{3,}", "\n\n", text)
    return text.strip()
def parse_law(wikitext: str) -> dict:
    """Split law wikitext into cleaned provisions and a list of chapter names.

    Chapters are detected from ``{{ח:קטע2|ID|NAME}}`` / ``{{ח:קטע3|ID|NAME}}``
    markers and sections from ``{{ח:סעיף|NUM|TITLE|...}}`` markers. A section's
    content runs from the end of its marker to the next section marker or the
    next chapter marker, whichever comes first. The last section is capped at
    5000 characters from its marker start so trailing page material is not
    swallowed.

    Args:
        wikitext: Full wikitext of the law page.

    Returns:
        A dict with two keys:
          * ``"provisions"``: list of dicts with ``provision_ref``, ``section``,
            ``chapter``, ``title`` and ``content``. Sections whose cleaned
            content is 5 characters or shorter are skipped.
          * ``"chapters"``: cleaned chapter names in document order.
    """
    # Local import keeps this function self-contained (stdlib only).
    from bisect import bisect_right

    # Cap applied to the final section, which has no following marker.
    last_section_span = 5000

    # Chapter boundaries as (offset, name). The ID group excludes "}" as well
    # as "|": without that, a marker lacking a NAME argument would match past
    # its own closing braces and pick up an argument of some later template.
    chapter_positions = [
        (m.start(), m.group(2).strip())
        for m in re.finditer(r"\{\{ח:קטע[23]\|([^|}]+)\|([^|}]+)", wikitext)
    ]
    # finditer yields matches left to right, so the offsets are sorted and
    # can be searched with bisect.
    chapter_starts = [offset for offset, _name in chapter_positions]

    # Find sections: {{ח:סעיף|NUM|TITLE|...}}
    section_pattern = re.compile(r"\{\{ח:סעיף\|([^|}]+)(?:\|([^|}]*))?(?:\|[^}]*)?\}\}")
    section_matches = list(section_pattern.finditer(wikitext))

    provisions = []
    for index, match in enumerate(section_matches):
        start, end = match.span()
        sec_num = match.group(1).strip()
        sec_title = (match.group(2) or "").strip()

        # Content runs until the next section marker...
        if index + 1 < len(section_matches):
            content_end = section_matches[index + 1].start()
        else:
            content_end = min(start + last_section_span, len(wikitext))

        # ...or until the first chapter marker after this one, if closer.
        following = bisect_right(chapter_starts, end)
        if following < len(chapter_starts) and chapter_starts[following] < content_end:
            content_end = chapter_starts[following]

        content = clean_wiki_markup(wikitext[end:content_end])
        if len(content) <= 5:
            # Too short to be a real provision (empty or leftover punctuation).
            continue

        # The enclosing chapter is the last marker at or before this section.
        enclosing = bisect_right(chapter_starts, start)
        chapter_name = chapter_positions[enclosing - 1][1] if enclosing else ""

        provisions.append(
            {
                "provision_ref": f"sec{sec_num}",
                "section": sec_num,
                "chapter": clean_wiki_markup(chapter_name),
                "title": clean_wiki_markup(sec_title),
                "content": content,
            }
        )

    return {
        "provisions": provisions,
        "chapters": [clean_wiki_markup(name) for _, name in chapter_positions],
    }
def extract_title_from_wikitext(wikitext: str) -> str:
    """Return the Hebrew title taken from the ``{{ח:כותרת|...}}`` template.

    Args:
        wikitext: Wikitext to search.

    Returns:
        The payload of the first title template with surrounding whitespace
        stripped, or an empty string when no such template is found.
    """
    title_match = re.search(r"\{\{ח:כותרת\|([^}]+)\}\}", wikitext)
    return title_match.group(1).strip() if title_match else ""
def build_seed(
    law_id: str,
    title_he: str,
    title_en: str,
    short_name: str,
    provisions: list,
    url: str,
    status: str = "in_force",
    issued_date: str = "",
    in_force_date: str = "",
    description: str = "",
) -> dict:
    """Assemble the seed document that is serialized to JSON for one law.

    Args:
        law_id: Value stored under ``id``.
        title_he: Hebrew title, stored under ``title``.
        title_en: English title.
        short_name: Short abbreviation.
        provisions: Provision dicts, stored as given (not copied).
        url: Source URL of the law.
        status: Status string; defaults to ``"in_force"``.
        issued_date: Issue date string; defaults to empty.
        in_force_date: In-force date string; defaults to empty.
        description: Free-text description; defaults to empty.

    Returns:
        A dict whose ``type`` is always ``"statute"`` and whose
        ``definitions`` is always a fresh empty list.
    """
    # Built in three groups purely for readability; merge order fixes the
    # key order of the resulting dict.
    identity = {
        "id": law_id,
        "type": "statute",
        "title": title_he,
        "title_en": title_en,
        "short_name": short_name,
    }
    lifecycle = {
        "status": status,
        "issued_date": issued_date,
        "in_force_date": in_force_date,
    }
    body = {
        "url": url,
        "description": description,
        "provisions": provisions,
        "definitions": [],
    }
    return {**identity, **lifecycle, **body}
def main():
    """Command-line entry point: fetch a law, parse it, write its seed file.

    Steps:
      1. Fetch the page's wikitext from Hebrew Wikisource.
      2. Take the Hebrew title from the wikitext, falling back to the page
         name with underscores replaced by spaces.
      3. Parse provisions and chapters; exit with status 1 if none are found.
      4. Print the number of provisions per chapter. With ``--dry-run``, print
         a sample of the first provision and stop without writing anything.
      5. Write ``<id>.json`` into the seed directory (created if missing).
      6. Unless ``--no-rebuild`` is given, run ``npm run build:db`` in the
         project directory; exit with status 1 if it fails or cannot start.

    Error messages are written to stderr; progress goes to stdout.
    """
    parser = argparse.ArgumentParser(description="Add Israeli law from Wikisource")
    parser.add_argument("page_name", help="Wikisource page name (e.g. חוק_המקרקעין)")
    parser.add_argument("--id", required=True, help="Law ID for database (e.g. land-law-1969)")
    parser.add_argument("--title-en", default="", help="English title")
    parser.add_argument("--short-name", default="", help="Short abbreviation")
    parser.add_argument("--status", default="in_force", choices=["in_force", "amended", "repealed"])
    parser.add_argument("--issued-date", default="", help="Date issued (YYYY-MM-DD)")
    parser.add_argument("--in-force-date", default="", help="Date in force (YYYY-MM-DD)")
    parser.add_argument("--description", default="", help="English description")
    parser.add_argument("--no-rebuild", action="store_true", help="Skip database rebuild")
    parser.add_argument("--dry-run", action="store_true", help="Print stats without writing")
    args = parser.parse_args()

    # Step 1: Fetch from Wikisource
    print(f"Fetching '{args.page_name}' from Hebrew Wikisource...")
    wikitext = fetch_wikitext(args.page_name)
    print(f" Retrieved {len(wikitext):,} characters")

    # Step 2: Extract title
    title_he = extract_title_from_wikitext(wikitext)
    if not title_he:
        title_he = args.page_name.replace("_", " ")
    print(f" Hebrew title: {title_he}")

    # Step 3: Parse
    print("Parsing sections...")
    parsed = parse_law(wikitext)
    provisions = parsed["provisions"]
    chapters = parsed["chapters"]
    print(f" Found {len(provisions)} provisions in {len(set(chapters))} chapters/sections")
    if not provisions:
        print(
            "ERROR: No provisions found. The page may not be a law or uses different markup.",
            file=sys.stderr,
        )
        sys.exit(1)

    # Step 4: Print chapter distribution (dict keeps first-seen chapter order)
    ch_dist = {}
    for provision in provisions:
        ch_dist[provision["chapter"]] = ch_dist.get(provision["chapter"], 0) + 1
    print("\n Chapter distribution:")
    for ch, count in ch_dist.items():
        print(f" {ch}: {count} sections")

    if args.dry_run:
        print("\n[Dry run - not writing files]")
        print("\nSample (first provision):")
        # Only the first 300 characters of the JSON are shown.
        print(json.dumps(provisions[0], ensure_ascii=False, indent=2)[:300])
        return

    # Step 5: Build and write seed
    url = f"https://he.wikisource.org/wiki/{args.page_name}"
    seed = build_seed(
        law_id=args.id,
        title_he=title_he,
        title_en=args.title_en,
        short_name=args.short_name,
        provisions=provisions,
        url=url,
        status=args.status,
        issued_date=args.issued_date,
        in_force_date=args.in_force_date,
        description=args.description,
    )
    # A fresh checkout may not have the seed directory yet; without this the
    # open() below fails with FileNotFoundError.
    SEED_DIR.mkdir(parents=True, exist_ok=True)
    output_path = SEED_DIR / f"{args.id}.json"
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(seed, f, ensure_ascii=False, indent=2)
    print(f"\nSeed written to: {output_path}")

    # Step 6: Rebuild database
    if not args.no_rebuild:
        print("\nRebuilding database...")
        try:
            build = subprocess.run(
                ["npm", "run", "build:db"],
                cwd=str(PROJECT_DIR),
                capture_output=True,
                text=True,
            )
        except FileNotFoundError as exc:
            # Raised when the command cannot be started at all (for example
            # npm is not installed); the seed file has already been written.
            print(f"ERROR: could not run 'npm run build:db': {exc}", file=sys.stderr)
            sys.exit(1)
        if build.returncode == 0:
            print(build.stdout)
            print("Database rebuilt successfully!")
        else:
            print(f"ERROR rebuilding database:\n{build.stderr}", file=sys.stderr)
            sys.exit(1)
    else:
        print("\nSkipped database rebuild (use 'npm run build:db' manually)")

    print(f"\nDone! Added {len(provisions)} provisions for '{title_he}'")
if __name__ == "__main__":
main()