feat(bulletins): staging endpoint /api/bulletins/upload (download archive first) #148
30
web/app.py
30
web/app.py
@@ -3,6 +3,7 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import hashlib
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
@@ -6094,6 +6095,35 @@ async def digest_unlink(digest_id: str):
|
||||
raise HTTPException(404, str(e))
|
||||
|
||||
|
||||
# ── Monthly bulletins ("עו"ד על נדל"ן") — staging-only (pre-catalog) ──
|
||||
# A SEPARATE publication from the daily "כל יום" digests: a monthly, multi-topic
|
||||
# real-estate-law newsletter. Their catalog/processing is not designed yet — this
|
||||
# endpoint only STAGES the PDFs to data/bulletins/incoming so we can download the
|
||||
# whole back-archive first, then plan how to enrich the corpus from them. No DB.
|
||||
# Directory where bulletin_upload writes staged files: <DATA_DIR>/bulletins/incoming.
_BULLETINS_DIR = config.DATA_DIR / "bulletins" / "incoming"
|
||||
|
||||
|
||||
@app.post("/api/bulletins/upload")
async def bulletin_upload(file: UploadFile = File(...)):
    """Stage a monthly bulletin file under ``_BULLETINS_DIR`` (no DB involved).

    Files are stored as ``<sha256[:12]>_<sanitized original name>``. Uploads are
    deduplicated by content hash, so re-sending the same bytes (under any
    filename) never creates a second copy.

    Args:
        file: The uploaded file; its extension must be in ``ALLOWED_EXTENSIONS``.

    Returns:
        A dict with ``status`` (``"stored"`` or ``"exists"``), ``filename`` (the
        name of the staged file on disk) and ``size`` (bytes of the upload).

    Raises:
        HTTPException: 400 if the extension is not allowed, 413 if the payload
            exceeds ``MAX_FILE_SIZE``.
    """
    suffix = Path(file.filename or "").suffix.lower()
    if suffix not in ALLOWED_EXTENSIONS:
        raise HTTPException(400, f"סוג קובץ לא נתמך: {suffix}")

    content = await file.read()
    if len(content) > MAX_FILE_SIZE:
        raise HTTPException(413, "קובץ גדול מדי")

    digest = hashlib.sha256(content).hexdigest()[:12]
    _BULLETINS_DIR.mkdir(parents=True, exist_ok=True)

    # Idempotent: same content (any filename) already staged -> skip. Report the
    # name of the file that is actually on disk, which may differ from the name
    # this upload would have produced. min() keeps the choice deterministic.
    existing = min(_BULLETINS_DIR.glob(f"{digest}_*"), default=None)
    if existing is not None:
        return {"status": "exists", "filename": existing.name, "size": len(content)}

    # Keep only the basename and replace anything outside the allowed character
    # set, so the client-supplied name cannot escape the staging directory.
    safe = re.sub(r"[^\w.\-+א-ת ]", "_", Path(file.filename or "bulletin.pdf").name)
    dest = _BULLETINS_DIR / f"{digest}_{safe}"

    # Write to a temp name that does NOT match the "<digest>_*" dedup pattern and
    # rename into place, so an interrupted write can never leave a truncated file
    # that later uploads would treat as already staged.
    tmp = _BULLETINS_DIR / f".{digest}.part"
    try:
        tmp.write_bytes(content)
        tmp.replace(dest)
    finally:
        # No-op after a successful rename; removes the partial file on failure.
        tmp.unlink(missing_ok=True)

    return {"status": "stored", "filename": dest.name, "size": len(content)}
|
||||
|
||||
|
||||
from legal_mcp.services import internal_decisions as int_decisions_service # noqa: E402
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user