feat(storage): #106.5 — חיווט-קריאה serve_blob (presigned + dual disk-fallback) #198

Merged
chaim merged 1 commits from worktree-storage-read-wiring-presigned into main 2026-06-11 17:43:58 +00:00
2 changed files with 119 additions and 6 deletions
Showing only changes of commit 63784f1f91 - Show all commits

View File

@@ -23,7 +23,7 @@ sys.path.insert(0, str(Path(__file__).resolve().parent.parent / "mcp-server" / "
import zipfile
from fastapi import BackgroundTasks, Body, FastAPI, File, Form, HTTPException, UploadFile
from fastapi.responses import FileResponse, StreamingResponse
from fastapi.responses import FileResponse, RedirectResponse, StreamingResponse
from typing import Any, Literal
from pydantic import BaseModel
@@ -31,7 +31,7 @@ import asyncpg
import httpx
from legal_mcp import config
from legal_mcp.services import chunker, db, embeddings, extractor, git_sync, metrics as metrics_service, processor, proofreader, research_md
from legal_mcp.services import chunker, db, embeddings, extractor, git_sync, metrics as metrics_service, processor, proofreader, research_md, storage
from legal_mcp.tools import cases as cases_tools, search as search_tools, workflow as workflow_tools, drafting as drafting_tools, precedents as precedents_tools
from legal_mcp.tools.envelope import envelope_unwrap
@@ -2769,6 +2769,39 @@ async def api_local_files(case_number: str):
return result
async def serve_blob(
path, *, media_type: str, filename: str,
bucket=storage.Bucket.DOCUMENTS,
):
"""Serve a case blob, routing through the storage layer (#106.5, INV-STG6).
- ``filesystem`` backend → stream the file from disk (unchanged behaviour).
- ``s3``/``dual`` backend → if the object is in MinIO, 302-redirect to a
short-lived presigned URL (bytes go browser↔MinIO, never through FastAPI);
under ``dual`` a not-yet-migrated file (e.g. the dynamically-built analysis
DOCX or research markdown) transparently falls back to its disk copy. This
is the tri-model panel's agreed cutover-safety design (2026-06-11): dual +
disk-fallback covers files outside the DB-tracked migration set.
The caller still owns its own existence/path-safety checks; this only
decides HOW to serve a path it already validated.
"""
from pathlib import Path as _Path
backend = storage.get_storage()
if backend.name != "filesystem":
try:
key = storage.normalize_key(path)
if await backend.exists(key, bucket=bucket):
url = await backend.presign_get(key, bucket=bucket, download_name=filename)
return RedirectResponse(url, status_code=302)
except Exception: # noqa: BLE001 — never 500 on a key/presign hiccup; fall back to disk
logger.exception("serve_blob: presign failed for %s; falling back to disk", path)
# not in object storage (dual fallback) — serve the disk copy if present
if not _Path(path).exists():
raise HTTPException(404, "קובץ לא נמצא")
return FileResponse(path, media_type=media_type, filename=filename)
@app.get("/api/cases/{case_number}/local-files/{folder}/{filename}")
async def api_read_local_file(case_number: str, folder: str, filename: str):
"""Read contents of a local case file."""
@@ -2781,7 +2814,7 @@ async def api_read_local_file(case_number: str, folder: str, filename: str):
path = case_dir / "documents" / folder / filename
if not path.exists() or not path.is_file():
raise HTTPException(404, "קובץ לא נמצא")
return FileResponse(path, media_type="text/plain; charset=utf-8", filename=filename)
return await serve_blob(path, media_type="text/plain; charset=utf-8", filename=filename)
# ── Research analysis (analysis-and-research.md) — parse + edit ────
@@ -2812,7 +2845,7 @@ async def api_research_analysis_download(case_number: str):
path = _research_file_path(case_number)
if not path.exists():
raise HTTPException(404, "טרם בוצע ניתוח משפטי לתיק זה")
return FileResponse(
return await serve_blob(
path,
media_type="text/markdown; charset=utf-8",
filename=f"analysis-{case_number}.md",
@@ -2833,7 +2866,7 @@ async def api_research_analysis_export_docx(case_number: str):
case_dir = config.find_case_dir(case_number)
if case_dir.exists():
commit_and_push(case_dir, f"ניתוח משפטי: {path.name}")
return FileResponse(
return await serve_blob(
path,
media_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
filename=path.name,
@@ -3095,7 +3128,7 @@ async def api_download_export(case_number: str, filename: str):
path = export_dir / filename
if not path.exists() or not path.parent.samefile(export_dir):
raise HTTPException(404, "קובץ לא נמצא")
return FileResponse(
return await serve_blob(
path,
media_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
filename=filename,

View File

@@ -0,0 +1,80 @@
"""Tests for #106.5 — web.app.serve_blob backend routing (INV-STG6).
Verifies the tri-model-panel cutover-safety design (2026-06-11):
- filesystem → FileResponse from disk (behaviour-preserving, current prod).
- s3/dual → 302 redirect to a presigned URL when the object is in MinIO.
- dual + miss → disk fallback (covers files outside the DB-tracked migration set
— e.g. dynamically-built analysis DOCX / research markdown).
- s3 + miss + no disk → 404.
Importing web/app.py needs a few env vars (it wires the Paperclip pool at import);
they're set before import. Skips cleanly if the heavy import can't be satisfied.
"""
from __future__ import annotations
import asyncio
import os
import sys
from pathlib import Path
import pytest
os.environ.setdefault("PAPERCLIP_DB_URL", "postgres://x:x@127.0.0.1:54329/paperclip")
os.environ.setdefault("DATA_DIR", "/home/chaim/legal-ai/data")
sys.path.insert(0, str(Path(__file__).resolve().parents[1])) # web/
sys.path.insert(0, str(Path(__file__).resolve().parents[2] / "mcp-server" / "src")) # legal_mcp
app = pytest.importorskip("app", reason="web/app.py import prerequisites unavailable")
from fastapi.responses import FileResponse, RedirectResponse # noqa: E402
from legal_mcp.services import storage # noqa: E402
DATA_DIR = Path(os.environ["DATA_DIR"])
class _FakeBackend:
def __init__(self, name: str, has: bool) -> None:
self.name, self._has = name, has
async def exists(self, key, *, bucket) -> bool: # noqa: ANN001
return self._has
async def presign_get(self, key, *, bucket, download_name=None) -> str: # noqa: ANN001
return f"https://s3.example/{key}"
@pytest.fixture()
def blob(tmp_path_factory, monkeypatch):
# a real file UNDER DATA_DIR so storage.normalize_key accepts it
d = DATA_DIR / "audit"
d.mkdir(parents=True, exist_ok=True)
f = d / "_serveblob_pytest.txt"
f.write_text("hi")
yield f
f.unlink(missing_ok=True)
def _serve(monkeypatch, name, has, path):
monkeypatch.setattr(storage, "get_storage", lambda: _FakeBackend(name, has))
return asyncio.new_event_loop().run_until_complete(
app.serve_blob(str(path), media_type="text/plain", filename="x.txt"))
def test_filesystem_serves_from_disk(blob, monkeypatch):
assert isinstance(_serve(monkeypatch, "filesystem", False, blob), FileResponse)
def test_dual_in_s3_redirects_presigned(blob, monkeypatch):
assert isinstance(_serve(monkeypatch, "dual", True, blob), RedirectResponse)
def test_dual_missing_falls_back_to_disk(blob, monkeypatch):
# the panel's safety net: a file not yet in MinIO is still served from disk
assert isinstance(_serve(monkeypatch, "dual", False, blob), FileResponse)
def test_s3_missing_no_disk_404(monkeypatch):
from fastapi import HTTPException
with pytest.raises(HTTPException) as ei:
_serve(monkeypatch, "s3", False, DATA_DIR / "audit" / "_nope.txt")
assert ei.value.status_code == 404