feat(bulletins): staging endpoint /api/bulletins/upload (download archive first)
העלון החודשי "עו"ד על נדל"ן" הוא פרסום נפרד מהיומון היומי (חודשי, רב-נושאי). לפני תכנון הקטלוג — נוריד את כל הארכיון (~29) לתיקייה. endpoint זה רק מ-stage את ה-PDF ל-data/bulletins/incoming (ללא DB), dedup לפי content_hash. n8n ימשוך מ-chaim.marcus@gmail (subject "עו"ד על נדל"ן") וישלח לכאן. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
30
web/app.py
30
web/app.py
@@ -3,6 +3,7 @@
|
|||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
import asyncio
|
import asyncio
|
||||||
|
import hashlib
|
||||||
import json
|
import json
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
@@ -6094,6 +6095,35 @@ async def digest_unlink(digest_id: str):
|
|||||||
raise HTTPException(404, str(e))
|
raise HTTPException(404, str(e))
|
||||||
|
|
||||||
|
|
||||||
|
# ── Monthly bulletins ("עו"ד על נדל"ן") — staging-only (pre-catalog) ──
|
||||||
|
# A SEPARATE publication from the daily "כל יום" digests: a monthly, multi-topic
|
||||||
|
# real-estate-law newsletter. Their catalog/processing is not designed yet — this
|
||||||
|
# endpoint only STAGES the PDFs to data/bulletins/incoming so we can download the
|
||||||
|
# whole back-archive first, then plan how to enrich the corpus from them. No DB.
|
||||||
|
# Directory where uploaded bulletin files are staged: <config.DATA_DIR>/bulletins/incoming.
_BULLETINS_DIR = config.DATA_DIR / "bulletins" / "incoming"
|
||||||
|
|
||||||
|
|
||||||
|
@app.post("/api/bulletins/upload")
async def bulletin_upload(file: UploadFile = File(...)):
    """Stage an uploaded monthly bulletin file in ``_BULLETINS_DIR`` (no DB).

    The file is stored as ``<digest>_<sanitized original name>``, where
    ``<digest>`` is the first 12 hex characters of the SHA-256 of the content.
    Uploading the same content again (under any filename) is a no-op, so the
    backfill can be re-run without duplicating files.

    Args:
        file: The uploaded file. Its extension must be in ``ALLOWED_EXTENSIONS``
            and its size must not exceed ``MAX_FILE_SIZE`` bytes.

    Returns:
        A dict with ``status`` (``"stored"`` for a new file, ``"exists"`` when
        the same content was already staged), ``filename`` (the name of the
        file as it exists in the staging directory) and ``size`` (the number
        of bytes uploaded).

    Raises:
        HTTPException: 400 for an unsupported extension, 413 for an oversized
            upload.
    """
    suffix = Path(file.filename or "").suffix.lower()
    if suffix not in ALLOWED_EXTENSIONS:
        raise HTTPException(400, f"סוג קובץ לא נתמך: {suffix}")

    content = await file.read()
    if len(content) > MAX_FILE_SIZE:
        raise HTTPException(413, "קובץ גדול מדי")

    digest = hashlib.sha256(content).hexdigest()[:12]
    _BULLETINS_DIR.mkdir(parents=True, exist_ok=True)

    # Idempotent: same content (any filename) already staged -> skip.
    # Report the name of the file that is actually on disk, which may differ
    # from the name this upload would have produced.
    already_staged = sorted(_BULLETINS_DIR.glob(f"{digest}_*"))
    if already_staged:
        return {
            "status": "exists",
            "filename": already_staged[0].name,
            "size": len(content),
        }

    # Path(...).name drops any directory components from the client-supplied
    # filename; the regex then replaces every remaining unexpected character.
    safe = re.sub(r"[^\w.\-+א-ת ]", "_", Path(file.filename or "bulletin.pdf").name)
    dest = _BULLETINS_DIR / f"{digest}_{safe}"

    # Write to a temporary sibling first and rename it into place, so an
    # interrupted write never leaves a truncated "<digest>_*" file that the
    # dedup check above would later mistake for a complete upload. The leading
    # dot keeps the temporary name from matching that glob.
    tmp = dest.with_name(f".{dest.name}.part")
    try:
        tmp.write_bytes(content)
        tmp.replace(dest)
    except OSError:
        # Do not leave a stray partial file behind; let the error propagate.
        if tmp.exists():
            tmp.unlink()
        raise

    return {"status": "stored", "filename": dest.name, "size": len(content)}
|
||||||
|
|
||||||
|
|
||||||
from legal_mcp.services import internal_decisions as int_decisions_service # noqa: E402
|
from legal_mcp.services import internal_decisions as int_decisions_service # noqa: E402
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user