From 63784f1f91aa33cf2bf9707b939f6a9c61dc6f71 Mon Sep 17 00:00:00 2001 From: Chaim Date: Thu, 11 Jun 2026 17:43:31 +0000 Subject: [PATCH] =?UTF-8?q?feat(storage):=20#106.5=20=E2=80=94=20read-wiri?= =?UTF-8?q?ng=20via=20serve=5Fblob=20(presigned=20+=20dual=20disk-fallback?= =?UTF-8?q?)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit חיווט-קריאה של 4 endpoints מגישי-קבצים (api_read_local_file · research/analysis download · analysis export-docx · exports download) דרך helper serve_blob יחיד (INV-STG6). מיישם את אסטרטגיית-ה-cutover שהפאנל התלת-מודלי (Opus+DeepSeek+Gemini) אישר פה-אחד 2026-06-11: - filesystem → FileResponse מדיסק (משמר-התנהגות; ה-backend הפעיל בייצור — אפס שינוי). - s3/dual → 302 ל-presigned-URL כשהאובייקט ב-MinIO (bytes browser↔MinIO, לא דרך FastAPI). - dual + miss → **fallback-לדיסק** — מכסה שקוף קבצים שמחוץ לסט-ההגירה מתויג-ה-DB (analysis-and-research.md, DOCX דינמי, proofread). זו רשת-הביטחון שהפאנל דרש. - s3 + miss + ללא-דיסק → 404. כשל normalize_key/presign → fallback-לדיסק, לעולם לא 500 (לא נשבר בשקט — logger.exception). ה-cutover (#106.6 flip ל-s3) + WORM (#106.7) **נשארים נעולים מאחורי אישור-אדם** — הכרעת-הפאנל פה-אחד (proceed_autonomously=false). PR זה הפיך: תחת filesystem אין שינוי- התנהגות, וה-helper מוכן להפעלה כשיוחלט flip מפוקח + curl-ירוק per-endpoint. invariants: INV-STG6 (presigned) · INV-STG1 (storage layer יחיד) · G2 (serve_blob יחיד, לא 4 העתקי-לוגיקה) · INV-G10 (אפס שינוי-התנהגות בייצור filesystem). tests: 4 חדשות (web/tests/test_serve_blob.py — filesystem/dual-S3/dual-fallback/s3-404), עוברות. py_compile OK. מקור: פאנל תלת-מודלי (תיעוד ב-TaskMaster #106.6). Co-Authored-By: Claude Opus 4.8 --- web/app.py | 45 +++++++++++++++++--- web/tests/test_serve_blob.py | 80 ++++++++++++++++++++++++++++++++++++ 2 files changed, 119 insertions(+), 6 deletions(-) create mode 100644 web/tests/test_serve_blob.py diff --git a/web/app.py b/web/app.py index e31f2fd..d85cce0 100644 --- a/web/app.py +++ b/web/app.py @@ -23,7 +23,7 @@ sys.path.insert(0, str(Path(__file__).resolve().parent.parent / "mcp-server" / " import zipfile from fastapi import BackgroundTasks, Body, FastAPI, File, Form, HTTPException, UploadFile -from fastapi.responses import FileResponse, StreamingResponse +from fastapi.responses import FileResponse, RedirectResponse, StreamingResponse from typing import Any, Literal from pydantic import BaseModel @@ -31,7 +31,7 @@ import asyncpg import httpx from legal_mcp import config -from legal_mcp.services import chunker, db, embeddings, extractor, git_sync, metrics as metrics_service, processor, proofreader, research_md +from legal_mcp.services import chunker, db, embeddings, extractor, git_sync, metrics as metrics_service, processor, proofreader, research_md, storage from legal_mcp.tools import cases as cases_tools, search as search_tools, workflow as workflow_tools, drafting as drafting_tools, precedents as precedents_tools from legal_mcp.tools.envelope import envelope_unwrap @@ -2769,6 +2769,39 @@ async def api_local_files(case_number: str): return result +async def serve_blob( + path, *, media_type: str, filename: str, + bucket=storage.Bucket.DOCUMENTS, +): + """Serve a case blob, routing through the storage layer (#106.5, INV-STG6). + + - ``filesystem`` backend → stream the file from disk (unchanged behaviour). + - ``s3``/``dual`` backend → if the object is in MinIO, 302-redirect to a + short-lived presigned URL (bytes go browser↔MinIO, never through FastAPI); + under ``dual`` a not-yet-migrated file (e.g. the dynamically-built analysis + DOCX or research markdown) transparently falls back to its disk copy. This + is the tri-model panel's agreed cutover-safety design (2026-06-11): dual + + disk-fallback covers files outside the DB-tracked migration set. + + The caller still owns its own existence/path-safety checks; this only + decides HOW to serve a path it already validated. + """ + from pathlib import Path as _Path + backend = storage.get_storage() + if backend.name != "filesystem": + try: + key = storage.normalize_key(path) + if await backend.exists(key, bucket=bucket): + url = await backend.presign_get(key, bucket=bucket, download_name=filename) + return RedirectResponse(url, status_code=302) + except Exception: # noqa: BLE001 — never 500 on a key/presign hiccup; fall back to disk + logger.exception("serve_blob: presign failed for %s; falling back to disk", path) + # not in object storage (dual fallback) — serve the disk copy if present + if not _Path(path).exists(): + raise HTTPException(404, "קובץ לא נמצא") + return FileResponse(path, media_type=media_type, filename=filename) + + @app.get("/api/cases/{case_number}/local-files/{folder}/{filename}") async def api_read_local_file(case_number: str, folder: str, filename: str): """Read contents of a local case file.""" @@ -2781,7 +2814,7 @@ async def api_read_local_file(case_number: str, folder: str, filename: str): path = case_dir / "documents" / folder / filename if not path.exists() or not path.is_file(): raise HTTPException(404, "קובץ לא נמצא") - return FileResponse(path, media_type="text/plain; charset=utf-8", filename=filename) + return await serve_blob(path, media_type="text/plain; charset=utf-8", filename=filename) # ── Research analysis (analysis-and-research.md) — parse + edit ──── @@ -2812,7 +2845,7 @@ async def api_research_analysis_download(case_number: str): path = _research_file_path(case_number) if not path.exists(): raise HTTPException(404, "טרם בוצע ניתוח משפטי לתיק זה") - return FileResponse( + return await serve_blob( path, media_type="text/markdown; charset=utf-8", filename=f"analysis-{case_number}.md", @@ -2833,7 +2866,7 @@ async def api_research_analysis_export_docx(case_number: str): case_dir = config.find_case_dir(case_number) if case_dir.exists(): commit_and_push(case_dir, f"ניתוח משפטי: {path.name}") - return FileResponse( + return await serve_blob( path, media_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document", filename=path.name, @@ -3095,7 +3128,7 @@ async def api_download_export(case_number: str, filename: str): path = export_dir / filename if not path.exists() or not path.parent.samefile(export_dir): raise HTTPException(404, "קובץ לא נמצא") - return FileResponse( + return await serve_blob( path, media_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document", filename=filename, diff --git a/web/tests/test_serve_blob.py b/web/tests/test_serve_blob.py new file mode 100644 index 0000000..6ec8373 --- /dev/null +++ b/web/tests/test_serve_blob.py @@ -0,0 +1,80 @@ +"""Tests for #106.5 — web.app.serve_blob backend routing (INV-STG6). + +Verifies the tri-model-panel cutover-safety design (2026-06-11): +- filesystem → FileResponse from disk (behaviour-preserving, current prod). +- s3/dual → 302 redirect to a presigned URL when the object is in MinIO. +- dual + miss → disk fallback (covers files outside the DB-tracked migration set + — e.g. dynamically-built analysis DOCX / research markdown). +- s3 + miss + no disk → 404. + +Importing web/app.py needs a few env vars (it wires the Paperclip pool at import); +they're set before import. Skips cleanly if the heavy import can't be satisfied. +""" + +from __future__ import annotations + +import asyncio +import os +import sys +from pathlib import Path + +import pytest + +os.environ.setdefault("PAPERCLIP_DB_URL", "postgres://x:x@127.0.0.1:54329/paperclip") +os.environ.setdefault("DATA_DIR", "/home/chaim/legal-ai/data") +sys.path.insert(0, str(Path(__file__).resolve().parents[1])) # web/ +sys.path.insert(0, str(Path(__file__).resolve().parents[2] / "mcp-server" / "src")) # legal_mcp + +app = pytest.importorskip("app", reason="web/app.py import prerequisites unavailable") +from fastapi.responses import FileResponse, RedirectResponse # noqa: E402 +from legal_mcp.services import storage # noqa: E402 + +DATA_DIR = Path(os.environ["DATA_DIR"]) + + +class _FakeBackend: + def __init__(self, name: str, has: bool) -> None: + self.name, self._has = name, has + + async def exists(self, key, *, bucket) -> bool: # noqa: ANN001 + return self._has + + async def presign_get(self, key, *, bucket, download_name=None) -> str: # noqa: ANN001 + return f"https://s3.example/{key}" + + +@pytest.fixture() +def blob(tmp_path_factory, monkeypatch): + # a real file UNDER DATA_DIR so storage.normalize_key accepts it + d = DATA_DIR / "audit" + d.mkdir(parents=True, exist_ok=True) + f = d / "_serveblob_pytest.txt" + f.write_text("hi") + yield f + f.unlink(missing_ok=True) + + +def _serve(monkeypatch, name, has, path): + monkeypatch.setattr(storage, "get_storage", lambda: _FakeBackend(name, has)) + return asyncio.new_event_loop().run_until_complete( + app.serve_blob(str(path), media_type="text/plain", filename="x.txt")) + + +def test_filesystem_serves_from_disk(blob, monkeypatch): + assert isinstance(_serve(monkeypatch, "filesystem", False, blob), FileResponse) + + +def test_dual_in_s3_redirects_presigned(blob, monkeypatch): + assert isinstance(_serve(monkeypatch, "dual", True, blob), RedirectResponse) + + +def test_dual_missing_falls_back_to_disk(blob, monkeypatch): + # the panel's safety net: a file not yet in MinIO is still served from disk + assert isinstance(_serve(monkeypatch, "dual", False, blob), FileResponse) + + +def test_s3_missing_no_disk_404(monkeypatch): + from fastapi import HTTPException + with pytest.raises(HTTPException) as ei: + _serve(monkeypatch, "s3", False, DATA_DIR / "audit" / "_nope.txt") + assert ei.value.status_code == 404