", "", text)
+ text = re.sub(r"
", "\n", text)
+ text = re.sub(r"]*>", "", text)
+ text = re.sub(r"
", "", text)
+ # Remove section delimiters
+ text = re.sub(r"-{4,}", "", text)
+ # Remove amendment info from titles
+ text = re.sub(r"\|תיקון:[^\n]*", "", text)
+ # Clean whitespace
+ text = re.sub(r" +", " ", text)
+ text = re.sub(r"\n{3,}", "\n\n", text)
+ return text.strip()
+
+
def parse_law(wikitext: str, *, last_section_max_chars: int = 5000) -> dict:
    """Parse law wikitext into chapters and provisions.

    Chapter headings are recognised as ``{{ח:קטע2|ID|NAME}}`` and
    ``{{ח:קטע3|ID|NAME}}`` templates; sections as
    ``{{ח:סעיף|NUM|TITLE|...}}`` templates. The text of a section is
    everything between the end of its template and the start of the next
    section template or chapter heading, whichever comes first.

    Args:
        wikitext: Raw wikitext of the law page.
        last_section_max_chars: Upper bound, measured from the start of the
            last section template, on how far the last section's content may
            extend (there is no following section to delimit it).

    Returns:
        A dict with two keys:
        ``"provisions"`` - a list of dicts with the keys ``provision_ref``,
        ``section``, ``chapter``, ``title`` and ``content``, in document
        order; sections whose cleaned content is 5 characters or shorter
        are omitted.
        ``"chapters"`` - the cleaned chapter names in document order.
    """
    # Both template arguments exclude "}" so that a heading without a NAME
    # argument cannot match past its own closing braces into later text.
    chapter_positions = [
        (m.start(), clean_wiki_markup(m.group(2).strip()))
        for m in re.finditer(r"\{\{ח:קטע[23]\|([^|}]+)\|([^|}]+)", wikitext)
    ]

    # Sections: (template start, template end, number, title).
    section_pattern = re.compile(r"\{\{ח:סעיף\|([^|}]+)(?:\|([^|}]*))?(?:\|[^}]*)?\}\}")
    sections_raw = [
        (m.start(), m.end(), m.group(1).strip(), (m.group(2) or "").strip())
        for m in section_pattern.finditer(wikitext)
    ]

    def chapter_at(pos):
        """Return the name of the last chapter heading at or before *pos*."""
        current = ""
        for chapter_pos, name in chapter_positions:
            if chapter_pos > pos:
                break
            current = name
        return current

    provisions = []
    for i, (start, end, sec_num, sec_title) in enumerate(sections_raw):
        # Content runs until the next section template, or a capped distance
        # for the final section.
        if i + 1 < len(sections_raw):
            content_end = sections_raw[i + 1][0]
        else:
            content_end = min(start + last_section_max_chars, len(wikitext))

        # Stop at the first chapter heading that falls inside the content.
        next_chapter = next(
            (pos for pos, _ in chapter_positions if end < pos < content_end),
            None,
        )
        if next_chapter is not None:
            content_end = next_chapter

        content = clean_wiki_markup(wikitext[end:content_end])
        if len(content) <= 5:
            # Too short to be a real provision (e.g. an empty placeholder).
            continue

        provisions.append(
            {
                "provision_ref": f"sec{sec_num}",
                "section": sec_num,
                "chapter": chapter_at(start),
                "title": clean_wiki_markup(sec_title),
                "content": content,
            }
        )

    return {
        "provisions": provisions,
        "chapters": [name for _, name in chapter_positions],
    }
+
+
def extract_title_from_wikitext(wikitext: str) -> str:
    """Return the Hebrew title from the wikitext header, or "" if absent.

    The title is taken from the first ``{{ח:כותרת|...}}`` template found,
    with surrounding whitespace stripped.

    Args:
        wikitext: Raw wikitext of the law page.

    Returns:
        The stripped template argument, or an empty string when the
        template does not occur.
    """
    match = re.search(r"\{\{ח:כותרת\|([^}]+)\}\}", wikitext)
    return match.group(1).strip() if match else ""
+
+
def build_seed(
    law_id: str,
    title_he: str,
    title_en: str,
    short_name: str,
    provisions: list,
    url: str,
    status: str = "in_force",
    issued_date: str = "",
    in_force_date: str = "",
    description: str = "",
    definitions: "list | None" = None,
) -> dict:
    """Build the seed JSON structure for one statute.

    Args:
        law_id: Identifier stored under ``"id"``.
        title_he: Hebrew title, stored under ``"title"``.
        title_en: English title.
        short_name: Short abbreviation.
        provisions: Provision entries, stored as given.
        url: Source URL of the law.
        status: Status string; defaults to ``"in_force"``.
        issued_date: Date issued, as a string.
        in_force_date: Date in force, as a string.
        description: Free-text description.
        definitions: Definition entries; when omitted, a new empty list is
            used for each call.

    Returns:
        A dict whose ``"type"`` is always ``"statute"`` and whose remaining
        keys carry the arguments above.
    """
    return {
        "id": law_id,
        "type": "statute",
        "title": title_he,
        "title_en": title_en,
        "short_name": short_name,
        "status": status,
        "issued_date": issued_date,
        "in_force_date": in_force_date,
        "url": url,
        "description": description,
        "provisions": provisions,
        # A fresh list per call so seeds never share one mutable default.
        "definitions": [] if definitions is None else definitions,
    }
+
+
def main():
    """Command-line entry point: fetch a law from Wikisource and seed it.

    Parses the command line, fetches the page wikitext with
    ``fetch_wikitext``, derives the Hebrew title (falling back to the page
    name), parses provisions with ``parse_law`` and prints a per-chapter
    count. Unless ``--dry-run`` is given, writes ``<id>.json`` under
    ``SEED_DIR``; unless ``--no-rebuild`` is given, then runs
    ``npm run build:db`` in ``PROJECT_DIR``.

    Exits with status 1 when no provisions are found or when the database
    rebuild cannot be started or fails.
    """
    parser = argparse.ArgumentParser(description="Add Israeli law from Wikisource")
    parser.add_argument("page_name", help="Wikisource page name (e.g. חוק_המקרקעין)")
    parser.add_argument("--id", required=True, help="Law ID for database (e.g. land-law-1969)")
    parser.add_argument("--title-en", default="", help="English title")
    parser.add_argument("--short-name", default="", help="Short abbreviation")
    parser.add_argument("--status", default="in_force", choices=["in_force", "amended", "repealed"])
    parser.add_argument("--issued-date", default="", help="Date issued (YYYY-MM-DD)")
    parser.add_argument("--in-force-date", default="", help="Date in force (YYYY-MM-DD)")
    parser.add_argument("--description", default="", help="English description")
    parser.add_argument("--no-rebuild", action="store_true", help="Skip database rebuild")
    parser.add_argument("--dry-run", action="store_true", help="Print stats without writing")
    args = parser.parse_args()

    # Step 1: Fetch from Wikisource
    print(f"Fetching '{args.page_name}' from Hebrew Wikisource...")
    wikitext = fetch_wikitext(args.page_name)
    print(f" Retrieved {len(wikitext):,} characters")

    # Step 2: Extract title, falling back to the page name
    title_he = extract_title_from_wikitext(wikitext) or args.page_name.replace("_", " ")
    print(f" Hebrew title: {title_he}")

    # Step 3: Parse
    print("Parsing sections...")
    parsed = parse_law(wikitext)
    provisions = parsed["provisions"]
    chapters = parsed["chapters"]
    print(f" Found {len(provisions)} provisions in {len(set(chapters))} chapters/sections")

    if not provisions:
        print(
            "ERROR: No provisions found. The page may not be a law or uses different markup.",
            file=sys.stderr,
        )
        sys.exit(1)

    # Step 4: Print chapter distribution (in first-seen order)
    ch_dist = {}
    for provision in provisions:
        ch_dist[provision["chapter"]] = ch_dist.get(provision["chapter"], 0) + 1
    print("\n Chapter distribution:")
    for ch, count in ch_dist.items():
        print(f" {ch}: {count} sections")

    if args.dry_run:
        print("\n[Dry run - not writing files]")
        print("\nSample (first provision):")
        print(json.dumps(provisions[0], ensure_ascii=False, indent=2)[:300])
        return

    # Step 5: Build and write seed
    # MediaWiki page URLs use underscores in place of spaces.
    url = f"https://he.wikisource.org/wiki/{args.page_name.replace(' ', '_')}"
    seed = build_seed(
        law_id=args.id,
        title_he=title_he,
        title_en=args.title_en,
        short_name=args.short_name,
        provisions=provisions,
        url=url,
        status=args.status,
        issued_date=args.issued_date,
        in_force_date=args.in_force_date,
        description=args.description,
    )

    output_path = SEED_DIR / f"{args.id}.json"
    # Create the seed directory on first use instead of failing in open().
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(seed, f, ensure_ascii=False, indent=2)
    print(f"\nSeed written to: {output_path}")

    # Step 6: Rebuild database
    if args.no_rebuild:
        print("\nSkipped database rebuild (use 'npm run build:db' manually)")
    else:
        print("\nRebuilding database...")
        try:
            build = subprocess.run(
                ["npm", "run", "build:db"],
                cwd=str(PROJECT_DIR),
                capture_output=True,
                text=True,
            )
        except FileNotFoundError as err:
            # npm is not installed or not on PATH; the seed file is already written.
            print(f"ERROR rebuilding database:\n{err}", file=sys.stderr)
            sys.exit(1)
        if build.returncode != 0:
            # npm frequently reports failures on stdout, so fall back to it.
            print(f"ERROR rebuilding database:\n{build.stderr or build.stdout}", file=sys.stderr)
            sys.exit(1)
        print(build.stdout)
        print("Database rebuilt successfully!")

    print(f"\nDone! Added {len(provisions)} provisions for '{title_he}'")
+
+
# Run the command-line entry point only when executed as a script.
if __name__ == "__main__":
    main()