feat: expand law seed data with full Wikisource text and add import script
Some checks failed
CI / test (18) (push) Has been cancelled
CI / test (20) (push) Has been cancelled
CI / test (22) (push) Has been cancelled
Build and Push to GHCR / Build and Push (push) Has been cancelled
Semgrep SAST / Semgrep security scan (push) Has been cancelled
Trivy Security Scan / Trivy vulnerability scan (push) Has been cancelled
Daily Data Freshness Check / check-updates (push) Has been cancelled
Daily Data Freshness Check / report (push) Has been cancelled
Drift Detection / drift-detect (push) Has been cancelled
Some checks failed
CI / test (18) (push) Has been cancelled
CI / test (20) (push) Has been cancelled
CI / test (22) (push) Has been cancelled
Build and Push to GHCR / Build and Push (push) Has been cancelled
Semgrep SAST / Semgrep security scan (push) Has been cancelled
Trivy Security Scan / Trivy vulnerability scan (push) Has been cancelled
Daily Data Freshness Check / check-updates (push) Has been cancelled
Daily Data Freshness Check / report (push) Has been cancelled
Drift Detection / drift-detect (push) Has been cancelled
Update contracts-general-part-law-1973 and planning-and-building-law-1965 with complete Hebrew provision text from Wikisource. Add land-law-1969 seed data and add_law_from_wikisource.py script for importing laws. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
285
scripts/add_law_from_wikisource.py
Executable file
285
scripts/add_law_from_wikisource.py
Executable file
@@ -0,0 +1,285 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Add an Israeli law to the database from Hebrew Wikisource.
|
||||
|
||||
Usage:
|
||||
python scripts/add_law_from_wikisource.py "חוק_המקרקעין" --id land-law-1969 --title-en "Land Law, 5729-1969"
|
||||
python scripts/add_law_from_wikisource.py "חוק_השכירות_והשאילה" --id rental-law-1971 --title-en "Rental and Lending Law, 5731-1971"
|
||||
|
||||
The script:
|
||||
1. Fetches the law's wikitext from Hebrew Wikisource API
|
||||
2. Parses chapters and sections (definitions are not extracted; the seed's 'definitions' list is left empty)
|
||||
3. Cleans wiki markup
|
||||
4. Writes a seed JSON file to data/seed/
|
||||
5. Rebuilds the database (npm run build:db)
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import re
|
||||
import subprocess
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
# Directory containing this script.
SCRIPT_DIR = Path(__file__).parent
# Parent of the script directory; also the working directory for the npm build.
PROJECT_DIR = SCRIPT_DIR.parent
# Where generated seed JSON files are written.
SEED_DIR = PROJECT_DIR.joinpath("data", "seed")
|
||||
|
||||
try:
|
||||
import requests
|
||||
except ImportError:
|
||||
print("Error: 'requests' package required. Install with: pip install requests")
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
def fetch_wikitext(page_name: str) -> str:
    """Download the raw wikitext of a Hebrew Wikisource page.

    Sends a GET request to the MediaWiki ``action=parse`` endpoint asking
    for ``prop=wikitext`` in JSON format, with a 30-second timeout.

    Args:
        page_name: Title of the Wikisource page to fetch.

    Returns:
        The wikitext found under ``parse -> wikitext -> *`` in the response.

    Raises:
        requests.HTTPError: If ``raise_for_status`` rejects the HTTP status.
        ValueError: If the JSON payload contains an ``error`` entry.
    """
    response = requests.get(
        "https://he.wikisource.org/w/api.php",
        params={
            "action": "parse",
            "page": page_name,
            "prop": "wikitext",
            "format": "json",
        },
        headers={"User-Agent": "IsraelLawMCP/1.0 (law database builder)"},
        timeout=30,
    )
    response.raise_for_status()
    payload = response.json()

    # The API reports failures inside the JSON body, not via the HTTP status.
    if "error" in payload:
        info = payload["error"].get("info", "Unknown error")
        raise ValueError(f"Wikisource API error: {info}")

    return payload["parse"]["wikitext"]["*"]
|
||||
|
||||
|
||||
def clean_wiki_markup(text: str) -> str:
    """Strip Hebrew Wikisource markup from *text*, keeping readable content.

    Processing order:

    1. Unwrap the ``ח:תת``, ``ח:פנימי``, ``ח:חיצוני`` and ``ח:הערה``
       templates (their text is kept) and drop ``ח:ת`` paragraph markers.
    2. Remove every remaining ``{{...}}`` template, innermost first.
    3. Replace ``[[target|label]]`` and ``[[target]]`` links by their text.
    4. Remove ``<ref>`` footnotes and ``<small>``/``<wbr>``/``<div>`` tags,
       and turn ``<br>`` into a newline.
    5. Drop runs of four or more dashes and ``|תיקון:`` amendment notes,
       then collapse repeated spaces and blank lines.

    Args:
        text: Raw wikitext fragment.

    Returns:
        The cleaned text with leading and trailing whitespace removed.
    """
    # Templates whose payload is part of the law text: keep the payload.
    text = re.sub(r"\{\{ח:תת\|([^}]*)\}\}", r"\1", text)
    text = re.sub(r"\{\{ח:פנימי\|[^|]*\|([^}]*)\}\}", r"\1", text)
    text = re.sub(r"\{\{ח:חיצוני\|[^|]*\|([^}]*)\}\}", r"\1", text)
    text = re.sub(r"\{\{ח:הערה\|([^}]*)\}\}", r"\1", text)
    # Paragraph markers carry no text of their own.
    text = re.sub(r"\{\{ח:ת(?:\|[^}]*)?\}\}", "", text)

    # Peel off the remaining templates from the inside out; each pass removes
    # the templates that contain no further braces.
    while "{{" in text:
        stripped = re.sub(r"\{\{[^{}]*\}\}", "", text)
        if stripped == text:
            # No progress: what is left is broken or unclosed. Drop an
            # unclosed template only up to the end of ITS line ("\n" is
            # excluded from the class) so the following lines are kept.
            text = re.sub(r"\{\{[^}\n]*$", "", text, flags=re.MULTILINE)
            text = re.sub(r"\}\}", "", text)
            break
        text = stripped

    # Wiki links: keep the label when present, otherwise the target.
    text = re.sub(r"\[\[[^|\]]*\|([^\]]*)\]\]", r"\1", text)
    text = re.sub(r"\[\[([^\]]*)\]\]", r"\1", text)

    # Self-closing refs must go first: otherwise the paired pattern treats
    # "<ref name=x/>" as an opening tag and deletes all text up to the next
    # "</ref>".
    text = re.sub(r"<ref[^>]*/>", "", text)
    text = re.sub(r"<ref[^>]*>.*?</ref>", "", text, flags=re.DOTALL)
    text = re.sub(r"</?small>", "", text)
    text = re.sub(r"<wbr>", "", text)
    text = re.sub(r"<br\s*/?>", "\n", text)
    text = re.sub(r"<div[^>]*>", "", text)
    text = re.sub(r"</div>", "", text)

    # Section delimiters (horizontal rules).
    text = re.sub(r"-{4,}", "", text)
    # Amendment info that leaks out of section titles.
    text = re.sub(r"\|תיקון:[^\n]*", "", text)

    # Normalize whitespace.
    text = re.sub(r" +", " ", text)
    text = re.sub(r"\n{3,}", "\n\n", text)
    return text.strip()
|
||||
|
||||
|
||||
def parse_law(wikitext: str) -> dict:
    """Parse law wikitext into provisions and chapter names.

    Chapters are located through ``{{ח:קטע2|ID|NAME}}`` and
    ``{{ח:קטע3|ID|NAME}}`` templates, sections through
    ``{{ח:סעיף|NUM|TITLE|...}}`` templates. A section's content runs from the
    end of its template to the next section or chapter marker, whichever
    comes first. The last section has no following marker, so its content is
    capped at 5000 characters counted from the start of its template.

    Args:
        wikitext: Full wikitext of the law page.

    Returns:
        A dict with two keys:

        * ``"provisions"``: one dict per section whose cleaned content is
          longer than five characters, with keys ``provision_ref``,
          ``section``, ``chapter``, ``title`` and ``content``.
        * ``"chapters"``: cleaned chapter names in document order.
    """
    # Both fields exclude "}" as well as "|": a heading that has only one
    # parameter must not run past its closing braces and pick up a "name"
    # from a later template.
    chapter_positions = [
        (m.start(), m.group(2).strip())
        for m in re.finditer(r"\{\{ח:קטע[23]\|([^|}]+)\|([^|}]+)", wikitext)
    ]

    # Sections: {{ח:סעיף|NUM|TITLE|...}}; title and extra fields are optional.
    section_pattern = re.compile(r"\{\{ח:סעיף\|([^|}]+)(?:\|([^|}]*))?(?:\|[^}]*)?\}\}")
    sections_raw = [
        (m.start(), m.end(), m.group(1).strip(), (m.group(2) or "").strip())
        for m in section_pattern.finditer(wikitext)
    ]

    def get_chapter(pos):
        """Return the name of the last chapter that starts at or before pos."""
        current = ""
        for cp, cn in chapter_positions:
            if cp > pos:
                break
            current = cn
        return current

    provisions = []
    for i, (start, end, sec_num, sec_title) in enumerate(sections_raw):
        if i + 1 < len(sections_raw):
            content_end = sections_raw[i + 1][0]
        else:
            # Last section: nothing marks its end, so cap the slice.
            content_end = min(start + 5000, len(wikitext))

        # A chapter heading inside the slice belongs to the next chapter;
        # positions are in document order, so the first hit is the nearest.
        for cp, _cn in chapter_positions:
            if end < cp < content_end:
                content_end = cp
                break

        content = clean_wiki_markup(wikitext[end:content_end])
        title = clean_wiki_markup(sec_title)
        chapter = clean_wiki_markup(get_chapter(start))

        # Skip sections that are empty or trivially short after cleaning.
        if len(content) > 5:
            provisions.append(
                {
                    "provision_ref": f"sec{sec_num}",
                    "section": sec_num,
                    "chapter": chapter,
                    "title": title,
                    "content": content,
                }
            )

    return {
        "provisions": provisions,
        "chapters": [clean_wiki_markup(name) for _, name in chapter_positions],
    }
|
||||
|
||||
|
||||
def extract_title_from_wikitext(wikitext: str) -> str:
    """Return the Hebrew title from the first ``{{ח:כותרת|...}}`` template.

    Args:
        wikitext: Wikitext of the law page.

    Returns:
        The template's argument with surrounding whitespace stripped, or an
        empty string when no such template with a non-empty argument exists.
    """
    header = re.search(r"\{\{ח:כותרת\|([^}]+)\}\}", wikitext)
    return header.group(1).strip() if header else ""
|
||||
|
||||
|
||||
def build_seed(
    law_id: str,
    title_he: str,
    title_en: str,
    short_name: str,
    provisions: list,
    url: str,
    status: str = "in_force",
    issued_date: str = "",
    in_force_date: str = "",
    description: str = "",
) -> dict:
    """Assemble the seed record for one law.

    Args:
        law_id: Stored under ``"id"``.
        title_he: Hebrew title, stored under ``"title"``.
        title_en: English title.
        short_name: Short abbreviation.
        provisions: Provision list, stored as-is (not copied).
        url: Source URL of the law.
        status: Status value; defaults to ``"in_force"``.
        issued_date: Issue date string; defaults to empty.
        in_force_date: In-force date string; defaults to empty.
        description: Description; defaults to empty.

    Returns:
        A dict with ``"type"`` fixed to ``"statute"`` and a new empty
        ``"definitions"`` list alongside the supplied fields.
    """
    seed = dict(
        id=law_id,
        type="statute",
        title=title_he,
        title_en=title_en,
        short_name=short_name,
        status=status,
        issued_date=issued_date,
        in_force_date=in_force_date,
        url=url,
        description=description,
        provisions=provisions,
    )
    seed["definitions"] = []
    return seed
|
||||
|
||||
|
||||
def main():
    """Command-line entry point: import one law from Hebrew Wikisource.

    Steps:

    1. Fetch the page's wikitext.
    2. Take the Hebrew title from the wikitext, falling back to the page
       name with underscores replaced by spaces.
    3. Parse the wikitext into provisions and chapters.
    4. Print the number of provisions per chapter.
    5. Write ``<id>.json`` into the seed directory (skipped by ``--dry-run``).
    6. Run ``npm run build:db`` in the project directory (skipped by
       ``--no-rebuild``).

    Exits with status 1 when the fetch fails, when no provisions are found,
    or when the database rebuild fails. Error messages go to stderr.
    """
    parser = argparse.ArgumentParser(description="Add Israeli law from Wikisource")
    parser.add_argument("page_name", help="Wikisource page name (e.g. חוק_המקרקעין)")
    parser.add_argument("--id", required=True, help="Law ID for database (e.g. land-law-1969)")
    parser.add_argument("--title-en", default="", help="English title")
    parser.add_argument("--short-name", default="", help="Short abbreviation")
    parser.add_argument("--status", default="in_force", choices=["in_force", "amended", "repealed"])
    parser.add_argument("--issued-date", default="", help="Date issued (YYYY-MM-DD)")
    parser.add_argument("--in-force-date", default="", help="Date in force (YYYY-MM-DD)")
    parser.add_argument("--description", default="", help="English description")
    parser.add_argument("--no-rebuild", action="store_true", help="Skip database rebuild")
    parser.add_argument("--dry-run", action="store_true", help="Print stats without writing")
    args = parser.parse_args()

    # Step 1: Fetch from Wikisource
    print(f"Fetching '{args.page_name}' from Hebrew Wikisource...")
    try:
        wikitext = fetch_wikitext(args.page_name)
    except (requests.RequestException, ValueError) as exc:
        # Network/HTTP failures and API-reported errors (e.g. a missing page)
        # get a one-line message instead of a traceback.
        print(f"ERROR: Could not fetch '{args.page_name}': {exc}", file=sys.stderr)
        sys.exit(1)
    print(f"  Retrieved {len(wikitext):,} characters")

    # Step 2: Extract title
    title_he = extract_title_from_wikitext(wikitext)
    if not title_he:
        title_he = args.page_name.replace("_", " ")
    print(f"  Hebrew title: {title_he}")

    # Step 3: Parse
    print("Parsing sections...")
    parsed = parse_law(wikitext)
    provisions = parsed["provisions"]
    chapters = parsed["chapters"]
    print(f"  Found {len(provisions)} provisions in {len(set(chapters))} chapters/sections")

    if not provisions:
        print(
            "ERROR: No provisions found. The page may not be a law or uses different markup.",
            file=sys.stderr,
        )
        sys.exit(1)

    # Step 4: Print chapter distribution
    ch_dist = {}
    for p in provisions:
        ch_dist[p["chapter"]] = ch_dist.get(p["chapter"], 0) + 1
    print("\n  Chapter distribution:")
    for ch, count in ch_dist.items():
        print(f"    {ch}: {count} sections")

    if args.dry_run:
        print("\n[Dry run - not writing files]")
        print("\nSample (first provision):")
        print(json.dumps(provisions[0], ensure_ascii=False, indent=2)[:300])
        return

    # Step 5: Build and write seed
    url = f"https://he.wikisource.org/wiki/{args.page_name}"
    seed = build_seed(
        law_id=args.id,
        title_he=title_he,
        title_en=args.title_en,
        short_name=args.short_name,
        provisions=provisions,
        url=url,
        status=args.status,
        issued_date=args.issued_date,
        in_force_date=args.in_force_date,
        description=args.description,
    )

    # A fresh checkout may not have the seed directory yet.
    SEED_DIR.mkdir(parents=True, exist_ok=True)
    output_path = SEED_DIR / f"{args.id}.json"
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(seed, f, ensure_ascii=False, indent=2)
    print(f"\nSeed written to: {output_path}")

    # Step 6: Rebuild database
    if not args.no_rebuild:
        print("\nRebuilding database...")
        build = subprocess.run(
            ["npm", "run", "build:db"],
            cwd=str(PROJECT_DIR),
            capture_output=True,
            text=True,
        )
        if build.returncode == 0:
            print(build.stdout)
            print("Database rebuilt successfully!")
        else:
            print(f"ERROR rebuilding database:\n{build.stderr}", file=sys.stderr)
            sys.exit(1)
    else:
        print("\nSkipped database rebuild (use 'npm run build:db' manually)")

    print(f"\nDone! Added {len(provisions)} provisions for '{title_he}'")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
Reference in New Issue
Block a user