#!/usr/bin/env python3
"""
Add an Israeli law to the database from Hebrew Wikisource.

Usage:
    python scripts/add_law_from_wikisource.py "חוק_המקרקעין" --id land-law-1969 --title-en "Land Law, 5729-1969"
    python scripts/add_law_from_wikisource.py "חוק_השכירות_והשאילה" --id rental-law-1971 --title-en "Rental and Lending Law, 5731-1971"

The script:
1. Fetches the law's wikitext from Hebrew Wikisource API
2. Parses chapters and sections (the "definitions" list is written empty)
3. Cleans wiki markup
4. Writes a seed JSON file to data/seed/
5. Rebuilds the database (npm run build:db)
"""

import argparse
import json
import re
import subprocess
import sys
from bisect import bisect_right
from collections import Counter
from pathlib import Path

SCRIPT_DIR = Path(__file__).parent
PROJECT_DIR = SCRIPT_DIR.parent
SEED_DIR = PROJECT_DIR / "data" / "seed"

REQUESTS_MISSING_MESSAGE = (
    "Error: 'requests' package required. Install with: pip install requests"
)

try:
    import requests
except ImportError:
    # Deferred failure: the parsing helpers below need only the standard
    # library, so importing this module must not terminate the interpreter.
    # main() reports the missing dependency and exits before doing any work.
    requests = None


def fetch_wikitext(page_name: str) -> str:
    """Fetch wikitext content from Hebrew Wikisource API.

    Args:
        page_name: Wikisource page name, passed as the ``page`` parameter of
            an ``action=parse`` request.

    Returns:
        The raw wikitext of the page.

    Raises:
        RuntimeError: If the ``requests`` package is not installed.
        requests.HTTPError: If the HTTP response status is an error.
        ValueError: If the API response carries an ``error`` object.
    """
    if requests is None:
        raise RuntimeError(REQUESTS_MISSING_MESSAGE)

    url = "https://he.wikisource.org/w/api.php"
    params = {
        "action": "parse",
        "page": page_name,
        "prop": "wikitext",
        "format": "json",
    }
    headers = {"User-Agent": "IsraelLawMCP/1.0 (law database builder)"}

    response = requests.get(url, params=params, headers=headers, timeout=30)
    response.raise_for_status()
    data = response.json()

    if "error" in data:
        raise ValueError(
            f"Wikisource API error: {data['error'].get('info', 'Unknown error')}"
        )

    return data["parse"]["wikitext"]["*"]


def clean_wiki_markup(text: str) -> str:
    """Remove wiki markup from text, keeping readable content.

    Args:
        text: Raw wikitext fragment (section body, title, or chapter name).

    Returns:
        The fragment with templates, wiki links, HTML tags, delimiter rules
        and amendment annotations removed, whitespace collapsed, and leading
        and trailing whitespace stripped.
    """
    # Handle {{ח:תת|...}} sub-section markers - keep text
    text = re.sub(r"\{\{ח:תת\|([^}]*)\}\}", r"\1", text)

    # Handle {{ח:פנימי|...|display}} - keep display text
    text = re.sub(r"\{\{ח:פנימי\|[^|]*\|([^}]*)\}\}", r"\1", text)

    # Handle {{ח:חיצוני|...|display}} - keep display text
    text = re.sub(r"\{\{ח:חיצוני\|[^|]*\|([^}]*)\}\}", r"\1", text)

    # Handle {{ח:הערה|...}} - keep content
    text = re.sub(r"\{\{ח:הערה\|([^}]*)\}\}", r"\1", text)

    # Remove {{ח:ת|...}} paragraph markers
    text = re.sub(r"\{\{ח:ת(?:\|[^}]*)?\}\}", "", text)

    # Remove all remaining {{ }} templates. Each pass strips only innermost
    # templates, so nested ones need several passes.
    while "{{" in text:
        stripped = re.sub(r"\{\{[^{}]*\}\}", "", text)
        if stripped == text:
            # No progress means a broken/unclosed template: drop it up to
            # the end of its line, then any stray closers, and stop.
            text = re.sub(r"\{\{[^}]*$", "", text, flags=re.MULTILINE)
            text = re.sub(r"\}\}", "", text)
            break
        text = stripped

    # Remove [[ ]] wiki links, keep display text
    text = re.sub(r"\[\[[^|\]]*\|([^\]]*)\]\]", r"\1", text)
    text = re.sub(r"\[\[([^\]]*)\]\]", r"\1", text)

    # Clean HTML.
    # NOTE(review): the tag-specific patterns previously here were unreadable
    # (empty regexes, one of which inserted a newline between every
    # character). This is a reconstruction - confirm the intended tag list.
    # Self-closing <ref .../> goes first so that it cannot pair up with the
    # </ref> of a later footnote and swallow the text in between.
    text = re.sub(r"<ref\b[^>]*/>", "", text, flags=re.IGNORECASE)
    # Footnotes are dropped together with their content.
    text = re.sub(
        r"<ref\b[^>]*>.*?</ref>", "", text, flags=re.DOTALL | re.IGNORECASE
    )
    text = re.sub(r"<!--.*?-->", "", text, flags=re.DOTALL)
    # Line breaks become real newlines.
    text = re.sub(r"<br\s*/?>", "\n", text, flags=re.IGNORECASE)
    # Any other tag is removed while its inner text is kept. Requiring a
    # letter right after "<" leaves comparisons such as "a < b" untouched.
    text = re.sub(r"</?[A-Za-z][^<>]*>", "", text)

    # Remove section delimiters
    text = re.sub(r"-{4,}", "", text)

    # Remove amendment info from titles
    text = re.sub(r"\|תיקון:[^\n]*", "", text)

    # Clean whitespace
    text = re.sub(r" +", " ", text)
    text = re.sub(r"\n{3,}", "\n\n", text)

    return text.strip()


def parse_law(wikitext: str) -> dict:
    """Parse wikitext into chapters and provisions.

    Args:
        wikitext: Full wikitext of a law page.

    Returns:
        A dict with two keys:
        ``provisions`` - list of dicts with ``provision_ref``, ``section``,
        ``chapter``, ``title`` and ``content`` for every section whose
        cleaned content is longer than 5 characters;
        ``chapters`` - cleaned names of all chapter markers, in page order.
    """
    # Find chapter boundaries: {{ח:קטע2|ID|NAME}} and {{ח:קטע3|ID|NAME}}
    chapter_positions = [
        (m.start(), m.group(2).strip())
        for m in re.finditer(r"\{\{ח:קטע[23]\|([^|]+)\|([^|}]+)", wikitext)
    ]
    # finditer yields matches in ascending position, so the offsets are
    # already sorted and can be bisected.
    chapter_starts = [pos for pos, _ in chapter_positions]

    # Find sections: {{ח:סעיף|NUM|TITLE|...}}
    section_pattern = re.compile(
        r"\{\{ח:סעיף\|([^|}]+)(?:\|([^|}]*))?(?:\|[^}]*)?\}\}"
    )
    sections_raw = [
        (m.start(), m.end(), m.group(1).strip(), (m.group(2) or "").strip())
        for m in section_pattern.finditer(wikitext)
    ]

    def get_chapter(pos):
        """Return the name of the last chapter marker at or before *pos*."""
        idx = bisect_right(chapter_starts, pos)
        return chapter_positions[idx - 1][1] if idx else ""

    # Extract section contents
    provisions = []
    for i, (start, end, sec_num, sec_title) in enumerate(sections_raw):
        # Content runs until next section or chapter marker
        if i + 1 < len(sections_raw):
            content_end = sections_raw[i + 1][0]
        else:
            # The last section has no successor; cap it at 5000 characters
            # from its start rather than running to the end of the page.
            content_end = min(start + 5000, len(wikitext))

        # Stop at next chapter marker if closer
        next_chapter = bisect_right(chapter_starts, end)
        if (
            next_chapter < len(chapter_starts)
            and chapter_starts[next_chapter] < content_end
        ):
            content_end = chapter_starts[next_chapter]

        content = clean_wiki_markup(wikitext[end:content_end])
        title = clean_wiki_markup(sec_title)
        chapter = clean_wiki_markup(get_chapter(start))

        # Skip sections with no meaningful body.
        if len(content) > 5:
            provisions.append(
                {
                    "provision_ref": f"sec{sec_num}",
                    "section": sec_num,
                    "chapter": chapter,
                    "title": title,
                    "content": content,
                }
            )

    return {
        "provisions": provisions,
        "chapters": [clean_wiki_markup(name) for _, name in chapter_positions],
    }


def extract_title_from_wikitext(wikitext: str) -> str:
    """Try to extract the Hebrew title from wikitext header.

    Returns:
        The stripped argument of the first ``{{ח:כותרת|...}}`` template, or
        an empty string when the template is absent.
    """
    m = re.search(r"\{\{ח:כותרת\|([^}]+)\}\}", wikitext)
    if m:
        return m.group(1).strip()
    return ""


def build_seed(
    law_id: str,
    title_he: str,
    title_en: str,
    short_name: str,
    provisions: list,
    url: str,
    status: str = "in_force",
    issued_date: str = "",
    in_force_date: str = "",
    description: str = "",
) -> dict:
    """Build a seed JSON structure.

    Returns:
        A dict ready for ``json.dump``. ``type`` is always ``"statute"`` and
        ``definitions`` is always an empty list; every other field is taken
        from the corresponding argument.
    """
    return {
        "id": law_id,
        "type": "statute",
        "title": title_he,
        "title_en": title_en,
        "short_name": short_name,
        "status": status,
        "issued_date": issued_date,
        "in_force_date": in_force_date,
        "url": url,
        "description": description,
        "provisions": provisions,
        "definitions": [],
    }


def main():
    """Command-line entry point: fetch, parse, write the seed file, rebuild.

    Exits with status 1 when ``requests`` is missing, when no provisions are
    found, or when the database rebuild fails.
    """
    if requests is None:
        print(REQUESTS_MISSING_MESSAGE)
        sys.exit(1)

    parser = argparse.ArgumentParser(description="Add Israeli law from Wikisource")
    parser.add_argument("page_name", help="Wikisource page name (e.g. חוק_המקרקעין)")
    parser.add_argument("--id", required=True, help="Law ID for database (e.g. land-law-1969)")
    parser.add_argument("--title-en", default="", help="English title")
    parser.add_argument("--short-name", default="", help="Short abbreviation")
    parser.add_argument("--status", default="in_force", choices=["in_force", "amended", "repealed"])
    parser.add_argument("--issued-date", default="", help="Date issued (YYYY-MM-DD)")
    parser.add_argument("--in-force-date", default="", help="Date in force (YYYY-MM-DD)")
    parser.add_argument("--description", default="", help="English description")
    parser.add_argument("--no-rebuild", action="store_true", help="Skip database rebuild")
    parser.add_argument("--dry-run", action="store_true", help="Print stats without writing")
    args = parser.parse_args()

    # Step 1: Fetch from Wikisource
    print(f"Fetching '{args.page_name}' from Hebrew Wikisource...")
    wikitext = fetch_wikitext(args.page_name)
    print(f"  Retrieved {len(wikitext):,} characters")

    # Step 2: Extract title, falling back to the page name
    title_he = extract_title_from_wikitext(wikitext)
    if not title_he:
        title_he = args.page_name.replace("_", " ")
    print(f"  Hebrew title: {title_he}")

    # Step 3: Parse
    print("Parsing sections...")
    parsed = parse_law(wikitext)
    provisions = parsed["provisions"]
    chapters = parsed["chapters"]
    print(f"  Found {len(provisions)} provisions in {len(set(chapters))} chapters/sections")

    if not provisions:
        print("ERROR: No provisions found. The page may not be a law or uses different markup.")
        sys.exit(1)

    # Step 4: Print chapter distribution (Counter keeps first-seen order)
    ch_dist = Counter(p["chapter"] for p in provisions)
    print("\n  Chapter distribution:")
    for ch, count in ch_dist.items():
        print(f"    {ch}: {count} sections")

    if args.dry_run:
        print("\n[Dry run - not writing files]")
        print("\nSample (first provision):")
        print(json.dumps(provisions[0], ensure_ascii=False, indent=2)[:300])
        return

    # Step 5: Build and write seed
    url = f"https://he.wikisource.org/wiki/{args.page_name}"
    seed = build_seed(
        law_id=args.id,
        title_he=title_he,
        title_en=args.title_en,
        short_name=args.short_name,
        provisions=provisions,
        url=url,
        status=args.status,
        issued_date=args.issued_date,
        in_force_date=args.in_force_date,
        description=args.description,
    )

    # A fresh checkout may not have data/seed/ yet.
    SEED_DIR.mkdir(parents=True, exist_ok=True)
    output_path = SEED_DIR / f"{args.id}.json"
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(seed, f, ensure_ascii=False, indent=2)

    print(f"\nSeed written to: {output_path}")

    # Step 6: Rebuild database
    if not args.no_rebuild:
        print("\nRebuilding database...")
        build = subprocess.run(
            ["npm", "run", "build:db"],
            cwd=str(PROJECT_DIR),
            capture_output=True,
            text=True,
        )
        if build.returncode == 0:
            print(build.stdout)
            print("Database rebuilt successfully!")
        else:
            print(f"ERROR rebuilding database:\n{build.stderr}")
            sys.exit(1)
    else:
        print("\nSkipped database rebuild (use 'npm run build:db' manually)")

    print(f"\nDone! Added {len(provisions)} provisions for '{title_he}'")


if __name__ == "__main__":
    main()