import hashlib  # belongs with the top-of-file stdlib imports of web/app.py


# ── Monthly bulletins — staging-only (pre-catalog) ──
# The monthly bulletin "עו"ד על נדל"ן" ("A Lawyer on Real Estate") is a SEPARATE
# publication from the daily digests: a monthly, multi-topic real-estate-law
# newsletter. Its catalog/processing is not designed yet, so this endpoint only
# STAGES the uploaded files under data/bulletins/incoming. The plan is to
# download the whole back-archive (about 29 issues) first and only then decide
# how to enrich the corpus from it. Nothing is written to the DB.
# An n8n workflow is expected to pull the issues from a mailbox (matched by
# subject) and POST each attachment here.
_BULLETINS_DIR = config.DATA_DIR / "bulletins" / "incoming"


@app.post("/api/bulletins/upload")
async def bulletin_upload(file: UploadFile = File(...)):
    """Stage one monthly-bulletin file in ``data/bulletins/incoming`` (no DB).

    Files are stored as ``<digest>_<sanitized original name>``, where
    ``<digest>`` is the first 12 hex characters of the SHA-256 of the content.
    The digest prefix makes the endpoint idempotent: re-running the n8n
    backfill never stores the same content twice, whatever filename it
    arrives under.

    Args:
        file: The uploaded file. Its extension must be listed in the
            module-level ``ALLOWED_EXTENSIONS`` and its size must not exceed
            ``MAX_FILE_SIZE``.

    Returns:
        A dict with ``status`` (``"stored"`` or ``"exists"``), ``filename``
        (the name of the file as it exists in the staging directory) and
        ``size`` (length of the uploaded content in bytes).

    Raises:
        HTTPException: 400 for an unsupported extension or an empty file,
            413 when the content is larger than ``MAX_FILE_SIZE``.
    """
    original_name = Path(file.filename or "bulletin.pdf").name
    suffix = Path(file.filename or "").suffix.lower()
    if suffix not in ALLOWED_EXTENSIONS:
        raise HTTPException(400, f"סוג קובץ לא נתמך: {suffix}")

    content = await file.read()
    if len(content) > MAX_FILE_SIZE:
        raise HTTPException(413, "קובץ גדול מדי")
    if not content:
        # A zero-byte attachment is never a usable bulletin; refuse to stage it.
        raise HTTPException(400, "קובץ ריק")

    digest = hashlib.sha256(content).hexdigest()[:12]
    _BULLETINS_DIR.mkdir(parents=True, exist_ok=True)

    # Idempotent: same content (under any filename) already staged → skip.
    # Report the name that is actually on disk, which may differ from the
    # name of this upload. min() keeps the answer deterministic should several
    # matches ever exist.
    staged_name = min(
        (p.name for p in _BULLETINS_DIR.glob(f"{digest}_*")), default=None
    )
    if staged_name is not None:
        return {"status": "exists", "filename": staged_name, "size": len(content)}

    safe = re.sub(r"[^\w.\-+א-ת ]", "_", original_name)
    dest = _BULLETINS_DIR / f"{digest}_{safe}"

    # Write to a hidden temp file and rename it into place, so an interrupted
    # write can never leave a truncated "<digest>_*" file that later runs
    # would mistake for an already-staged bulletin. The leading dot keeps the
    # temp file out of the dedup glob above.
    tmp = _BULLETINS_DIR / f".{dest.name}.part"
    try:
        tmp.write_bytes(content)
        tmp.replace(dest)
    finally:
        # No-op after a successful rename; removes the partial file otherwise.
        tmp.unlink(missing_ok=True)

    return {"status": "stored", "filename": dest.name, "size": len(content)}