feat(bulletins): staging endpoint /api/bulletins/upload (download archive first) #148

Merged
chaim merged 1 commit from worktree-bulletins-download into main 2026-06-08 07:24:26 +00:00

View File

@@ -3,6 +3,7 @@
from __future__ import annotations
import asyncio
import hashlib
import json
import logging
import os
@@ -6094,6 +6095,35 @@ async def digest_unlink(digest_id: str):
raise HTTPException(404, str(e))
# ── Monthly bulletins ("עו"ד על נדל"ן", i.e. "Attorney on Real Estate") — staging-only (pre-catalog) ──
# A SEPARATE publication from the daily "כל יום" ("Every Day") digests: a
# monthly, multi-topic real-estate-law newsletter. Its catalog/processing
# pipeline is not designed yet, so the endpoint in this section only STAGES the
# PDFs on disk. The goal is to download the whole back-archive first and only
# then plan how to enrich the corpus from it. No database is involved.
#
# Staging directory for uploaded bulletins: <DATA_DIR>/bulletins/incoming.
_BULLETINS_DIR = config.DATA_DIR / "bulletins" / "incoming"
@app.post("/api/bulletins/upload")
async def bulletin_upload(file: UploadFile = File(...)):
    """Stage a monthly bulletin file in the bulletins staging directory (no DB).

    The upload is stored as ``<digest>_<sanitized original name>``, where
    ``digest`` is the first 12 hex characters of the content's SHA-256. Dedup is
    by content hash, so re-running the backfill never duplicates a file even if
    the same content arrives under a different filename.

    Args:
        file: The uploaded file; its extension must be in ``ALLOWED_EXTENSIONS``.

    Returns:
        A dict with ``status`` (``"stored"`` for a new file, ``"exists"`` when
        the same content is already staged), ``filename`` (the name of the file
        actually on disk) and ``size`` (content length in bytes).

    Raises:
        HTTPException: 400 for an unsupported extension, 413 when the content
            exceeds ``MAX_FILE_SIZE``.
    """
    suffix = Path(file.filename or "").suffix.lower()
    if suffix not in ALLOWED_EXTENSIONS:
        raise HTTPException(400, f"סוג קובץ לא נתמך: {suffix}")

    # Read at most one byte past the limit: that is enough to detect an
    # oversized upload without loading an arbitrarily large body into memory.
    content = await file.read(MAX_FILE_SIZE + 1)
    if len(content) > MAX_FILE_SIZE:
        raise HTTPException(413, "קובץ גדול מדי")

    digest = hashlib.sha256(content).hexdigest()[:12]
    _BULLETINS_DIR.mkdir(parents=True, exist_ok=True)

    # Idempotent: same content (any filename) already staged → skip. Report the
    # name that is really on disk, which may differ from this upload's name.
    # Sorting makes the choice deterministic should several matches exist.
    already_staged = sorted(_BULLETINS_DIR.glob(f"{digest}_*"))
    if already_staged:
        return {
            "status": "exists",
            "filename": already_staged[0].name,
            "size": len(content),
        }

    safe = re.sub(r"[^\w.\-+א-ת ]", "_", Path(file.filename or "bulletin.pdf").name)
    dest = _BULLETINS_DIR / f"{digest}_{safe}"

    # Write atomically: a crash mid-write must never leave a truncated
    # "<digest>_*" file behind, because the dedup check above would then treat
    # it as already staged on every retry. The temp name starts with a dot so
    # it can never match the "<digest>_*" glob, and carries the PID so parallel
    # worker processes do not share a temp file.
    partial = _BULLETINS_DIR / f".{dest.name}.{os.getpid()}.part"
    try:
        partial.write_bytes(content)
        os.replace(partial, dest)
    finally:
        # No-op after a successful replace; cleans up after a failed write.
        partial.unlink(missing_ok=True)

    return {"status": "stored", "filename": dest.name, "size": len(content)}
from legal_mcp.services import internal_decisions as int_decisions_service # noqa: E402