"""Batch ingest of "כל יום" daily digests staged in data/digests/incoming/ (X12). Sequential (NOT concurrent — same load-spike caution as ingest_incoming_batch.py) ingest of each yomon PDF via the standalone digest pipeline (``digest_library.ingest_digest``), which: - extracts text, dedups on content_hash (idempotent), - runs the local LLM metadata extractor (concept_tag, headline, underlying citation, two dates, practice_area, subject_tags), - stores a single embedding, - auto-links to the underlying ruling if it is already in the precedent library (INV-DIG3). The digest is a SECONDARY, radar-only source — it never enters the precedent / halacha pipeline and is never cited in a decision (INV-DIG1/2). After this run, relink unmatched digests once the originals are uploaded, or surface them via missing_precedent_create. Yomon number + issue date are parsed from the filename ("יומון 5158 - 31.5.26.pdf") as hints; the LLM also extracts them from the body and the explicit hint wins. The monthly bulletin (e.g. "201 יוני.pdf") is multi-topic and skipped (Phase 3). Run: mcp-server/.venv/bin/python scripts/ingest_digests_batch.py (optionally pass explicit file paths as args) Config (POSTGRES_URL, VOYAGE_API_KEY, ANTHROPIC_API_KEY) auto-loads from ~/.env. """ import asyncio import os import re import shutil import sys import traceback from pathlib import Path sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "mcp-server", "src")) from legal_mcp import config # noqa: E402 from legal_mcp.services import digest_library as svc # noqa: E402 INCOMING = Path(config.DATA_DIR) / "digests" / "incoming" PROCESSED = Path(config.DATA_DIR) / "digests" / "processed" # Matches "יומון 5158 - 31.5.26" → ("5158", "31.5.26") _NAME_RE = re.compile(r"יומון\s*(\d+)\s*-\s*(\d{1,2})\.(\d{1,2})\.(\d{2,4})") def _parse_name(fname: str) -> tuple[str, str | None]: """Return (yomon_number, iso_date_or_None) parsed from the filename.""" m = _NAME_RE.search(fname) if not m: return "", None num, dd, mm, yy = m.groups() year = int(yy) if year < 100: year += 2000 try: iso = f"{year:04d}-{int(mm):02d}-{int(dd):02d}" except ValueError: iso = None return num, iso def _discover() -> list[Path]: if not INCOMING.exists(): return [] out = [] for p in sorted(INCOMING.glob("*.pdf")): if "יומון" not in p.name: print(f"⊘ skip (not a single yomon): {p.name}", flush=True) continue out.append(p) return out async def main(argv: list[str]) -> None: files = [Path(a) for a in argv] if argv else _discover() if not files: print(f"No yomon PDFs found in {INCOMING}", flush=True) return PROCESSED.mkdir(parents=True, exist_ok=True) results = [] for idx, fp in enumerate(files): rec = {"file": fp.name} if not fp.exists(): rec["error"] = "file-missing" print(f"✗ {fp.name}: file missing", flush=True) results.append(rec) continue yomon_number, iso_date = _parse_name(fp.name) try: out = await svc.ingest_digest( file_path=fp, yomon_number=yomon_number, digest_date=iso_date, ) rec.update({ "status": out.get("status"), "digest_id": out.get("digest_id"), "yomon_number": out.get("yomon_number"), "underlying_citation": out.get("underlying_citation"), "linked_case_law_id": out.get("linked_case_law_id"), }) link = "🔗 linked" if out.get("linked_case_law_id") else "⚠ unlinked" print( f"✓ {fp.name}: {out.get('status')} | yomon={out.get('yomon_number')} | " f"{link} | {out.get('underlying_citation')}", flush=True, ) # Move to processed/ so re-runs are clean (idempotent anyway). try: shutil.move(str(fp), str(PROCESSED / fp.name)) except Exception as e: print(f" (could not move {fp.name}: {e})", flush=True) except Exception as e: rec["error"] = f"{type(e).__name__}: {e}" print(f"✗ {fp.name}: {e}", flush=True) traceback.print_exc() results.append(rec) print("\n===SUMMARY===", flush=True) for r in results: print(r, flush=True) linked = sum(1 for r in results if r.get("linked_case_law_id")) unlinked = sum( 1 for r in results if r.get("status") in ("completed", "exists") and not r.get("linked_case_law_id") ) print( f"\nTotal: {len(results)} | linked: {linked} | unlinked (need precedent upload): {unlinked}", flush=True, ) if __name__ == "__main__": asyncio.run(main(sys.argv[1:]))