"""Ingest the monthly "עו"ד על נדל"ן" bulletin archive into the digests radar (X12). Each staged bulletin PDF (data/bulletins/incoming) is split by LLM into case-law pointers (digest_kind='decision') + articles (digest_kind='article'), all tagged publication='עו"ד על נדל"ן'. Idempotent — per-item content_hash dedup, so re-runs only add new items. Runs on the HOST (LLM is local-only), with the mcp-server venv: mcp-server/.venv/bin/python scripts/ingest_bulletins.py [--dir PATH] [--limit N] """ import argparse import asyncio import sys import time from pathlib import Path sys.path.insert(0, str(Path(__file__).resolve().parent.parent / "mcp-server" / "src")) from legal_mcp.services import db, bulletin_library # noqa: E402 DEFAULT_DIR = Path(__file__).resolve().parent.parent / "data" / "bulletins" / "incoming" async def main() -> None: ap = argparse.ArgumentParser() ap.add_argument("--dir", default=str(DEFAULT_DIR), help="folder of bulletin PDFs") ap.add_argument("--limit", type=int, default=0, help="process at most N files (0=all)") args = ap.parse_args() folder = Path(args.dir) files = sorted(p for p in folder.glob("*.pdf")) if args.limit: files = files[: args.limit] total = len(files) print(f"ingesting {total} bulletins from {folder}", flush=True) await db.get_pool() agg = {"cases": 0, "articles": 0, "created": 0, "skipped": 0, "linked": 0} t0 = time.time() for i, f in enumerate(files, 1): try: r = await bulletin_library.ingest_bulletin(str(f)) for k in agg: agg[k] += r.get(k, 0) print(f"[{i}/{total}] {r['file']}: cases={r['cases']} articles={r['articles']} " f"created={r['created']} skipped={r['skipped']} linked={r['linked']}", flush=True) except Exception as e: # one bad bulletin must not abort the batch print(f"[{i}/{total}] FAIL {f.name}: {type(e).__name__}: {e}", flush=True) print(f"\nDONE {total} bulletins in {(time.time()-t0)/60:.1f}min | " f"cases={agg['cases']} articles={agg['articles']} created={agg['created']} " f"skipped={agg['skipped']} linked={agg['linked']}", flush=True) await db.close_pool() if __name__ == "__main__": asyncio.run(main())