feat: expand law seed data with full Wikisource text and add import script

Update contracts-general-part-law-1973 and planning-and-building-law-1965 with the complete Hebrew provision text from Wikisource. Add land-law-1969 seed data and the add_law_from_wikisource.py script for importing laws.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-09 17:02:17 +00:00
parent d6ae42cf7b
commit 4d64a43461
5 changed files with 6017 additions and 38 deletions
--- a/scripts/add_law_from_wikisource.py
+++ b/scripts/add_law_from_wikisource.py
@@ -0,0 +1,285 @@
+#!/usr/bin/env python3
+"""
+Add an Israeli law to the database from Hebrew Wikisource.
+
+Usage:
+    python scripts/add_law_from_wikisource.py "חוק_המקרקעין" --id land-law-1969 --title-en "Land Law, 5729-1969"
+    python scripts/add_law_from_wikisource.py "חוק_השכירות_והשאילה" --id rental-law-1971 --title-en "Rental and Lending Law, 5731-1971"
+
+The script:
+1. Fetches the law's wikitext from Hebrew Wikisource API
+2. Parses chapters and sections (definitions are not extracted; the seed's "definitions" list is left empty)
+3. Cleans wiki markup
+4. Writes a seed JSON file to data/seed/
+5. Rebuilds the database (npm run build:db)
+"""
+
+import argparse
+import json
+import re
+import subprocess
+import sys
+from pathlib import Path
+
+# Resolve to an absolute path so the project-relative locations below stay
+# correct even when __file__ is relative (e.g. Python < 3.9 invoked as
+# `python add_law_from_wikisource.py` from inside scripts/).
+SCRIPT_DIR = Path(__file__).resolve().parent
+PROJECT_DIR = SCRIPT_DIR.parent
+# Directory the generated <law-id>.json seed files are written to.
+SEED_DIR = PROJECT_DIR / "data" / "seed"
+
+# `requests` is the only third-party dependency; fail with an actionable
+# message on stderr and a non-zero exit code instead of an ImportError traceback.
+try:
+    import requests
+except ImportError:
+    print(
+        "Error: 'requests' package required. Install with: pip install requests",
+        file=sys.stderr,
+    )
+    sys.exit(1)
+
+
+def fetch_wikitext(page_name: str) -> str:
+    """Fetch the raw wikitext of a page from the Hebrew Wikisource API.
+
+    Args:
+        page_name: Wikisource page title (e.g. "חוק_המקרקעין").
+
+    Returns:
+        The page's wikitext as a single string.
+
+    Raises:
+        requests.HTTPError: If the HTTP response carries an error status.
+        ValueError: If the API reports an error (for example a missing page)
+            or the response does not contain the expected wikitext payload.
+    """
+    url = "https://he.wikisource.org/w/api.php"
+    params = {
+        "action": "parse",
+        "page": page_name,
+        "prop": "wikitext",
+        "format": "json",
+        # Resolve redirects server-side; otherwise a redirecting title returns
+        # only the redirect stub and no provisions can be parsed from it.
+        "redirects": 1,
+    }
+    headers = {"User-Agent": "IsraelLawMCP/1.0 (law database builder)"}
+    response = requests.get(url, params=params, headers=headers, timeout=30)
+    response.raise_for_status()
+    data = response.json()
+
+    if "error" in data:
+        raise ValueError(f"Wikisource API error: {data['error'].get('info', 'Unknown error')}")
+
+    # With the default (formatversion=1) JSON layout the text lives under "*".
+    try:
+        return data["parse"]["wikitext"]["*"]
+    except (KeyError, TypeError) as exc:
+        raise ValueError("Wikisource API error: response did not contain wikitext") from exc
+
+
+def clean_wiki_markup(text: str) -> str:
+    """Strip wiki markup from *text*, keeping the readable content.
+
+    The steps run in a fixed order: known law templates are unwrapped so their
+    display text survives, every remaining ``{{...}}`` template is dropped,
+    ``[[...]]`` links are reduced to their display text, a small set of HTML
+    tags is removed, and finally whitespace is normalized.
+
+    Args:
+        text: Raw wikitext fragment (section body, title, or chapter name).
+
+    Returns:
+        The cleaned text with leading and trailing whitespace stripped.
+    """
+    # Handle {{ח:תת|...}} sub-section markers - keep text
+    text = re.sub(r"\{\{ח:תת\|([^}]*)\}\}", r"\1", text)
+    # Handle {{ח:פנימי|...|display}} - keep display text
+    text = re.sub(r"\{\{ח:פנימי\|[^|]*\|([^}]*)\}\}", r"\1", text)
+    # Handle {{ח:חיצוני|...|display}} - keep display text
+    text = re.sub(r"\{\{ח:חיצוני\|[^|]*\|([^}]*)\}\}", r"\1", text)
+    # Handle {{ח:הערה|...}} - keep content
+    text = re.sub(r"\{\{ח:הערה\|([^}]*)\}\}", r"\1", text)
+    # Remove {{ח:ת|...}} paragraph markers
+    text = re.sub(r"\{\{ח:ת(?:\|[^}]*)?\}\}", "", text)
+    # Remove all remaining {{ }} templates. Each pass strips the innermost
+    # templates, so nested ones disappear after a few iterations.
+    while "{{" in text:
+        stripped = re.sub(r"\{\{[^{}]*\}\}", "", text)
+        if stripped == text:
+            # No balanced template left: what remains is broken/unclosed.
+            # Drop from the opener to end of line, then any stray closers.
+            text = re.sub(r"\{\{[^}]*$", "", text, flags=re.MULTILINE)
+            text = re.sub(r"\}\}", "", text)
+            break
+        text = stripped
+    # Remove [[ ]] wiki links, keep display text
+    text = re.sub(r"\[\[[^|\]]*\|([^\]]*)\]\]", r"\1", text)
+    text = re.sub(r"\[\[([^\]]*)\]\]", r"\1", text)
+    # Clean HTML.
+    # Self-closing refs must be removed first: otherwise the paired-ref
+    # pattern treats `<ref name="x"/>` as an opening tag and swallows all the
+    # real text up to the next `</ref>`. `[^>]*` (rather than `[^/]*`) also
+    # copes with attribute values that contain a slash.
+    text = re.sub(r"<ref[^>]*/>", "", text)
+    text = re.sub(r"<ref[^>]*>.*?</ref>", "", text, flags=re.DOTALL)
+    text = re.sub(r"</?small>", "", text)
+    text = re.sub(r"<wbr>", "", text)
+    text = re.sub(r"<br\s*/?>", "\n", text)
+    text = re.sub(r"<div[^>]*>", "", text)
+    text = re.sub(r"</div>", "", text)
+    # Remove section delimiters
+    text = re.sub(r"-{4,}", "", text)
+    # Remove amendment info from titles
+    text = re.sub(r"\|תיקון:[^\n]*", "", text)
+    # Clean whitespace: collapse runs of spaces and of 3+ newlines
+    text = re.sub(r" +", " ", text)
+    text = re.sub(r"\n{3,}", "\n\n", text)
+    return text.strip()
+
+
+def parse_law(wikitext: str) -> dict:
+    """Split a law's wikitext into cleaned provisions and chapter names.
+
+    Chapter markers are ``{{ח:קטע2|ID|NAME}}`` / ``{{ח:קטע3|ID|NAME}}`` and
+    section markers are ``{{ח:סעיף|NUM|TITLE|...}}``. A section's content is
+    the text between the end of its marker and the next section or chapter
+    marker, whichever comes first.
+
+    Args:
+        wikitext: Raw wikitext of the law page.
+
+    Returns:
+        A dict with two keys. ``"provisions"`` is a list of dicts (keys
+        ``provision_ref``, ``section``, ``chapter``, ``title``, ``content``)
+        in document order; sections whose cleaned content is 5 characters or
+        shorter are omitted. ``"chapters"`` is the list of cleaned chapter
+        names in document order (duplicates are kept).
+    """
+    # Neither the ID nor the NAME group may cross "|" or "}". Without "}" in
+    # the ID class, a one-parameter marker such as {{ח:קטע2|X}} would run past
+    # its own closing braces and pick up text from a later template as NAME.
+    chapter_positions = [
+        (m.start(), m.group(2).strip())
+        for m in re.finditer(r"\{\{ח:קטע[23]\|([^|}]+)\|([^|}]+)", wikitext)
+    ]
+
+    # Find sections: {{ח:סעיף|NUM|TITLE|...}}
+    section_pattern = re.compile(r"\{\{ח:סעיף\|([^|}]+)(?:\|([^|}]*))?(?:\|[^}]*)?\}\}")
+    # Each entry: (marker start, marker end, section number, raw title).
+    sections_raw = [
+        (m.start(), m.end(), m.group(1).strip(), (m.group(2) or "").strip())
+        for m in section_pattern.finditer(wikitext)
+    ]
+
+    def get_chapter(pos):
+        """Return the name of the last chapter marker at or before *pos*."""
+        current = ""
+        for cp, cn in chapter_positions:
+            if cp > pos:
+                break
+            current = cn
+        return current
+
+    # Extract section contents
+    provisions = []
+    for i, (start, end, sec_num, sec_title) in enumerate(sections_raw):
+        # Content runs until next section or chapter marker
+        if i + 1 < len(sections_raw):
+            content_end = sections_raw[i + 1][0]
+        else:
+            # The last section has no following section marker, so it is
+            # capped at 5000 characters from its start - presumably to avoid
+            # swallowing trailing non-provision page content.
+            content_end = min(start + 5000, len(wikitext))
+
+        # Stop at next chapter marker if closer
+        for cp, _cn in chapter_positions:
+            if end < cp < content_end:
+                content_end = cp
+                break
+
+        content = clean_wiki_markup(wikitext[end:content_end])
+        # Skip sections that are empty or trivially short after cleaning.
+        if len(content) <= 5:
+            continue
+
+        provisions.append(
+            {
+                "provision_ref": f"sec{sec_num}",
+                "section": sec_num,
+                "chapter": clean_wiki_markup(get_chapter(start)),
+                "title": clean_wiki_markup(sec_title),
+                "content": content,
+            }
+        )
+
+    return {
+        "provisions": provisions,
+        "chapters": [clean_wiki_markup(name) for _, name in chapter_positions],
+    }
+
+
+def extract_title_from_wikitext(wikitext: str) -> str:
+    """Return the Hebrew title found in the ``{{ח:כותרת|...}}`` header template.
+
+    The captured text is stripped of surrounding whitespace. An empty string
+    is returned when the template does not occur in *wikitext*.
+    """
+    header = re.search(r"\{\{ח:כותרת\|([^}]+)\}\}", wikitext)
+    return header.group(1).strip() if header else ""
+
+
+def build_seed(
+    law_id: str,
+    title_he: str,
+    title_en: str,
+    short_name: str,
+    provisions: list,
+    url: str,
+    status: str = "in_force",
+    issued_date: str = "",
+    in_force_date: str = "",
+    description: str = "",
+) -> dict:
+    """Assemble the seed document for one law.
+
+    The returned dict always has type ``"statute"`` and an empty
+    ``"definitions"`` list; every other field is taken from the arguments.
+    Keys are inserted in a fixed order, which is the order they appear in
+    when the dict is serialized.
+    """
+    # Metadata first, in the order it should be serialized.
+    seed = dict(
+        id=law_id,
+        type="statute",
+        title=title_he,
+        title_en=title_en,
+        short_name=short_name,
+        status=status,
+        issued_date=issued_date,
+        in_force_date=in_force_date,
+        url=url,
+        description=description,
+    )
+    # Payload last: the parsed provisions and a fresh, empty definitions list.
+    seed["provisions"] = provisions
+    seed["definitions"] = []
+    return seed
+
+
+def main():
+    """Command-line entry point: fetch one law, parse it, and store it as seed data.
+
+    Steps: fetch the page's wikitext, determine the Hebrew title (falling back
+    to the page name), parse provisions, print a per-chapter summary, write
+    ``<id>.json`` into the seed directory, and run ``npm run build:db`` unless
+    ``--no-rebuild`` is given. With ``--dry-run`` nothing is written.
+
+    Exits with status 1 when the page cannot be fetched, when no provisions
+    are found, or when the database rebuild fails. Error messages go to stderr.
+    """
+    parser = argparse.ArgumentParser(description="Add Israeli law from Wikisource")
+    parser.add_argument("page_name", help="Wikisource page name (e.g. חוק_המקרקעין)")
+    parser.add_argument("--id", required=True, help="Law ID for database (e.g. land-law-1969)")
+    parser.add_argument("--title-en", default="", help="English title")
+    parser.add_argument("--short-name", default="", help="Short abbreviation")
+    parser.add_argument("--status", default="in_force", choices=["in_force", "amended", "repealed"])
+    parser.add_argument("--issued-date", default="", help="Date issued (YYYY-MM-DD)")
+    parser.add_argument("--in-force-date", default="", help="Date in force (YYYY-MM-DD)")
+    parser.add_argument("--description", default="", help="English description")
+    parser.add_argument("--no-rebuild", action="store_true", help="Skip database rebuild")
+    parser.add_argument("--dry-run", action="store_true", help="Print stats without writing")
+    args = parser.parse_args()
+
+    # Step 1: Fetch from Wikisource
+    print(f"Fetching '{args.page_name}' from Hebrew Wikisource...")
+    try:
+        wikitext = fetch_wikitext(args.page_name)
+    except (requests.RequestException, ValueError) as exc:
+        # Network failures and API-reported errors are expected operational
+        # problems; report them briefly instead of dumping a traceback.
+        print(f"ERROR: could not fetch '{args.page_name}': {exc}", file=sys.stderr)
+        sys.exit(1)
+    print(f"  Retrieved {len(wikitext):,} characters")
+
+    # Step 2: Extract title
+    title_he = extract_title_from_wikitext(wikitext)
+    if not title_he:
+        title_he = args.page_name.replace("_", " ")
+    print(f"  Hebrew title: {title_he}")
+
+    # Step 3: Parse
+    print("Parsing sections...")
+    parsed = parse_law(wikitext)
+    provisions = parsed["provisions"]
+    chapters = parsed["chapters"]
+    print(f"  Found {len(provisions)} provisions in {len(set(chapters))} chapters/sections")
+
+    if not provisions:
+        print(
+            "ERROR: No provisions found. The page may not be a law or uses different markup.",
+            file=sys.stderr,
+        )
+        sys.exit(1)
+
+    # Step 4: Print chapter distribution (dict keeps first-seen chapter order)
+    ch_dist = {}
+    for p in provisions:
+        ch_dist[p["chapter"]] = ch_dist.get(p["chapter"], 0) + 1
+    print("\n  Chapter distribution:")
+    for ch, count in ch_dist.items():
+        print(f"    {ch}: {count} sections")
+
+    if args.dry_run:
+        print("\n[Dry run - not writing files]")
+        print("\nSample (first provision):")
+        # Only the first 300 characters of the JSON are shown as a preview.
+        print(json.dumps(provisions[0], ensure_ascii=False, indent=2)[:300])
+        return
+
+    # Step 5: Build and write seed
+    # Wiki page URLs use underscores in place of spaces; normalize so a page
+    # name passed with spaces does not produce a URL containing raw spaces.
+    page_slug = args.page_name.replace(" ", "_")
+    url = f"https://he.wikisource.org/wiki/{page_slug}"
+    seed = build_seed(
+        law_id=args.id,
+        title_he=title_he,
+        title_en=args.title_en,
+        short_name=args.short_name,
+        provisions=provisions,
+        url=url,
+        status=args.status,
+        issued_date=args.issued_date,
+        in_force_date=args.in_force_date,
+        description=args.description,
+    )
+
+    # A fresh checkout may not have the seed directory yet.
+    SEED_DIR.mkdir(parents=True, exist_ok=True)
+    output_path = SEED_DIR / f"{args.id}.json"
+    with open(output_path, "w", encoding="utf-8") as f:
+        json.dump(seed, f, ensure_ascii=False, indent=2)
+    print(f"\nSeed written to: {output_path}")
+
+    # Step 6: Rebuild database
+    if not args.no_rebuild:
+        print("\nRebuilding database...")
+        try:
+            build = subprocess.run(
+                ["npm", "run", "build:db"],
+                cwd=str(PROJECT_DIR),
+                capture_output=True,
+                text=True,
+            )
+        except OSError as exc:
+            # Raised e.g. when the npm executable cannot be found.
+            print(f"ERROR rebuilding database:\n{exc}", file=sys.stderr)
+            sys.exit(1)
+        if build.returncode == 0:
+            print(build.stdout)
+            print("Database rebuilt successfully!")
+        else:
+            # Show captured stdout as well: build scripts often report the
+            # actual failure there rather than on stderr.
+            if build.stdout:
+                print(build.stdout)
+            print(f"ERROR rebuilding database:\n{build.stderr}", file=sys.stderr)
+            sys.exit(1)
+    else:
+        print("\nSkipped database rebuild (use 'npm run build:db' manually)")
+
+    print(f"\nDone! Added {len(provisions)} provisions for '{title_he}'")
+
+
+# Run the CLI only when this file is executed as a script, not when imported.
+if __name__ == "__main__":
+    main()