# legal-ai/scripts/ingest_bulletins.py

"""Ingest the monthly "עו"ד על נדל"ן" bulletin archive into the digests radar (X12).

Each staged bulletin PDF (data/bulletins/incoming) is split by LLM into case-law
pointers (digest_kind='decision') + articles (digest_kind='article'), all tagged
publication='עו"ד על נדל"ן'. Idempotent — per-item content_hash dedup, so re-runs
only add new items. Runs on the HOST (LLM is local-only), with the mcp-server venv:

    mcp-server/.venv/bin/python scripts/ingest_bulletins.py [--dir PATH] [--limit N]
"""
import argparse
import asyncio
import sys
import time
from pathlib import Path

# Put <two levels above this file>/mcp-server/src at the front of the import
# path; presumably this is where the `legal_mcp` package imported below lives.
sys.path.insert(
    0,
    str(Path(__file__).resolve().parent.parent.joinpath("mcp-server", "src")),
)

from legal_mcp.services import db, bulletin_library  # noqa: E402

# Default folder scanned for bulletin PDFs: data/bulletins/incoming, resolved
# two directory levels above this script so it does not depend on the CWD.
DEFAULT_DIR = Path(__file__).resolve().parent.parent.joinpath(
    "data", "bulletins", "incoming"
)


async def main() -> None:
    """Ingest every bulletin PDF in a folder and print per-file and total counts.

    Command-line options:
        --dir PATH   folder scanned (non-recursively) for ``*.pdf`` files;
                     defaults to ``DEFAULT_DIR``.
        --limit N    process at most the first N files in sorted order;
                     0 (or any non-positive value) means all files.

    Each file is passed to ``bulletin_library.ingest_bulletin``; the counters
    ``cases``, ``articles``, ``created``, ``skipped`` and ``linked`` are read
    from its result (missing counters count as 0) and summed over the batch.
    An exception on one bulletin is reported as ``FAIL`` and the batch
    continues. The DB pool obtained at the start is closed in a ``finally``
    block, so it is released even if the batch is cancelled or interrupted.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--dir", default=str(DEFAULT_DIR), help="folder of bulletin PDFs")
    ap.add_argument("--limit", type=int, default=0, help="process at most N files (0=all)")
    args = ap.parse_args()

    folder = Path(args.dir)
    files = sorted(folder.glob("*.pdf"))
    # Only a positive limit truncates: a negative value would otherwise slice
    # from the end (files[:-1]) and silently drop the last bulletins.
    if args.limit > 0:
        files = files[: args.limit]
    total = len(files)
    print(f"ingesting {total} bulletins from {folder}", flush=True)

    await db.get_pool()
    agg = {"cases": 0, "articles": 0, "created": 0, "skipped": 0, "linked": 0}
    failed: list[str] = []
    t0 = time.time()
    try:
        for i, f in enumerate(files, 1):
            try:
                r = await bulletin_library.ingest_bulletin(str(f))
                # Read every counter once, with the same default the totals
                # use, so the progress line and the aggregate always agree.
                counts = {k: r.get(k, 0) for k in agg}
                for k, v in counts.items():
                    agg[k] += v
                print(f"[{i}/{total}] {r.get('file', f.name)}: cases={counts['cases']} "
                      f"articles={counts['articles']} created={counts['created']} "
                      f"skipped={counts['skipped']} linked={counts['linked']}", flush=True)
            except Exception as e:  # one bad bulletin must not abort the batch
                failed.append(f.name)
                print(f"[{i}/{total}] FAIL {f.name}: {type(e).__name__}: {e}", flush=True)

        print(f"\nDONE {total} bulletins in {(time.time()-t0)/60:.1f}min | "
              f"cases={agg['cases']} articles={agg['articles']} created={agg['created']} "
              f"skipped={agg['skipped']} linked={agg['linked']}", flush=True)
        if failed:
            # Surface failures in the summary so they are not lost in the
            # per-file log of a long batch.
            print(f"FAILED {len(failed)}/{total}: {', '.join(failed)}", flush=True)
    finally:
        # Release the pool on every exit path, including cancellation.
        await db.close_pool()


# Run the async entry point only when executed as a script, not on import.
if __name__ == "__main__":
    asyncio.run(main())