feat(bulletins): catalog monthly "עו"ד על נדל"ן" bulletins into the radar (X12)
עלון חודשי רב-נושאי (פרסום נפרד מהיומון היומי) → מתפצל ל-N שורות digest באותה טבלה (publication='עו"ד על נדל"ן', לא קורפוס מקביל — G2):

- bulletin_splitter (LLM local-only, tools=""): מפצל ל-cases[]+articles[]; עדכוני-חקיקה מדולגים (החלטת יו"ר).
- bulletin_library.ingest_bulletin: כל מצביע-פסיקה → digest_kind='decision' + embedding + autolink (כולל X13 court-fetch); כל מאמר → digest_kind='article' (טקסט-מלא + embedding, רקע בלבד — INV-DIG1 חל).
- content_hash per-item הוא מפתח-הדדאפ (yomon_number ריק) → אידמפוטנטי.
- db.create_digest: פרמטר digest_kind (זורם ל-INSERT + upsert).
- scripts/ingest_bulletins.py (host, venv) לעיבוד הארכיון.
- spec X12 §2.1.

אומת (dry-run, ללא DB): עלון 180 → 4 cases+1 article · עלון 201 → 4 cases (כולל ערר-197) +1 article. עדכוני-חקיקה דולגו. claude_session נשאר local-only.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
56
scripts/ingest_bulletins.py
Normal file
56
scripts/ingest_bulletins.py
Normal file
@@ -0,0 +1,56 @@
|
||||
"""Ingest the monthly "עו"ד על נדל"ן" bulletin archive into the digests radar (X12).
|
||||
|
||||
Each staged bulletin PDF (data/bulletins/incoming) is split by LLM into case-law
|
||||
pointers (digest_kind='decision') + articles (digest_kind='article'), all tagged
|
||||
publication='עו"ד על נדל"ן'. Idempotent — per-item content_hash dedup, so re-runs
|
||||
only add new items. Runs on the HOST (LLM is local-only), with the mcp-server venv:
|
||||
|
||||
mcp-server/.venv/bin/python scripts/ingest_bulletins.py [--dir PATH] [--limit N]
|
||||
"""
|
||||
import argparse
|
||||
import asyncio
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
# Repository root: this script sits one directory below it (in scripts/).
_REPO_ROOT = Path(__file__).resolve().parent.parent

# Put the mcp-server sources on the import path before the project import below.
sys.path.insert(0, str(_REPO_ROOT / "mcp-server" / "src"))

from legal_mcp.services import db, bulletin_library  # noqa: E402

# Folder scanned for bulletin PDFs when --dir is not supplied.
DEFAULT_DIR = _REPO_ROOT / "data" / "bulletins" / "incoming"
|
||||
|
||||
|
||||
async def main() -> None:
    """Ingest every bulletin PDF in a folder and print per-file and total counts.

    Command-line options:
        --dir PATH   folder whose ``*.pdf`` files are processed in sorted
                     order (non-recursive); defaults to ``DEFAULT_DIR``.
        --limit N    process at most the first N files; 0 (the default) or
                     any non-positive value means "all files".

    Each file is handed to ``bulletin_library.ingest_bulletin``; its result is
    read as a mapping with ``file``, ``cases``, ``articles``, ``created``,
    ``skipped`` and ``linked`` entries. A failure on one file is printed and
    the batch moves on. ``db.close_pool()`` is awaited on every exit path once
    ``db.get_pool()`` has succeeded, including interruption or cancellation.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--dir", default=str(DEFAULT_DIR), help="folder of bulletin PDFs")
    ap.add_argument("--limit", type=int, default=0, help="process at most N files (0=all)")
    args = ap.parse_args()

    folder = Path(args.dir)
    files = sorted(folder.glob("*.pdf"))
    # Truncate only for a positive limit: a negative slice bound would
    # silently drop files from the END of the list instead of capping it.
    if args.limit > 0:
        files = files[: args.limit]
    total = len(files)
    print(f"ingesting {total} bulletins from {folder}", flush=True)

    await db.get_pool()
    agg = {"cases": 0, "articles": 0, "created": 0, "skipped": 0, "linked": 0}
    t0 = time.time()
    try:
        for i, f in enumerate(files, 1):
            try:
                r = await bulletin_library.ingest_bulletin(str(f))
                for k in agg:
                    agg[k] += r.get(k, 0)
                print(f"[{i}/{total}] {r['file']}: cases={r['cases']} articles={r['articles']} "
                      f"created={r['created']} skipped={r['skipped']} linked={r['linked']}", flush=True)
            except Exception as e:  # one bad bulletin must not abort the batch
                print(f"[{i}/{total}] FAIL {f.name}: {type(e).__name__}: {e}", flush=True)

        print(f"\nDONE {total} bulletins in {(time.time()-t0)/60:.1f}min | "
              f"cases={agg['cases']} articles={agg['articles']} created={agg['created']} "
              f"skipped={agg['skipped']} linked={agg['linked']}", flush=True)
    finally:
        # KeyboardInterrupt / CancelledError are BaseException and bypass the
        # per-file handler above; still release the pool in that case.
        await db.close_pool()
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: run the async batch to completion on a new event loop.
    asyncio.run(main())
|
||||
Reference in New Issue
Block a user