feat(storage): X14 Phase 1 — unified storage layer (services/storage.py)

The single choke-point for all binary file I/O (originals, derived
artifacts, exports), replacing the scattered open()/shutil/Path.write_bytes
calls across ~8 services. Backend chosen by STORAGE_BACKEND:
- filesystem (default): disk under DATA_DIR — byte-for-byte legacy behaviour
- dual: write disk + S3, read S3→disk fallback (migration window)
- s3: MinIO via aioboto3 (lazy import; absent in the filesystem path)

Keys are DATA_DIR-relative POSIX paths; the FS backend ignores the logical
bucket and keeps the existing single tree, so the default backend is zero
behaviour change. S3 maps a governance bucket (documents/immutable/derived)
→ MinIO bucket; presigned URLs are minted against the public endpoint
(browser-reachable) and carry the Hebrew filename via RFC-5987
Content-Disposition.

- config: STORAGE_BACKEND + MINIO_* (endpoint, public-endpoint, creds,
  region, 3 bucket names, presign TTL)
- mcp_env_catalog: new "storage" category + 10 specs (X10/INV-ENV1)
- pyproject: aioboto3>=13 (consumed here, deployed with first use)
- tests: 18 unit tests (FS round-trip, key normalization/traversal guard,
  bucket resolution, backend selection, dual write-both + S3-down fallback)

No call-sites are rewired yet — that is Phase 2 (106.3). STORAGE_BACKEND
stays filesystem in prod, so behaviour is unchanged.

Invariants: keeps G2 (one storage path replaces scattered I/O); establishes
INV-STG1 (single layer), INV-STG2 (atomic keys, Hebrew name in metadata),
INV-STG3 (governance buckets), INV-STG6 (presigned serving).
Spec: docs/spec/X14-storage-minio.md.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-06-08 07:47:49 +00:00
parent ade22ca871
commit b4a28f072d
5 changed files with 751 additions and 1 deletions

View File

@@ -0,0 +1,198 @@
"""Unit tests for the unified storage layer (X14, services/storage.py).
Sync tests driving the async API via asyncio.run (matches the repo
convention — no pytest-asyncio). The filesystem backend is exercised against a
tmp DATA_DIR; the S3 path is stubbed so the suite needs no MinIO and no
aioboto3.
"""
import asyncio
import pytest
from legal_mcp import config
from legal_mcp.services import storage
from legal_mcp.services.storage import Bucket
@pytest.fixture(autouse=True)
def _tmp_datadir(tmp_path, monkeypatch):
monkeypatch.setattr(config, "DATA_DIR", tmp_path)
monkeypatch.setattr(config, "STORAGE_BACKEND", "filesystem")
storage.reset_storage_cache()
yield tmp_path
storage.reset_storage_cache()
def run(coro):
return asyncio.run(coro)
# ── normalize_key ──────────────────────────────────────────────────
def test_normalize_key_relative():
assert storage.normalize_key("cases/8174-24/originals/x.pdf") == \
"cases/8174-24/originals/x.pdf"
def test_normalize_key_rejects_parent_traversal():
with pytest.raises(ValueError):
storage.normalize_key("cases/../../etc/passwd")
def test_normalize_key_accepts_abs_under_datadir(_tmp_datadir):
abs_key = _tmp_datadir / "cases" / "x.pdf"
assert storage.normalize_key(abs_key) == "cases/x.pdf"
def test_normalize_key_rejects_abs_outside_datadir():
with pytest.raises(ValueError):
storage.normalize_key("/var/secret/x.pdf")
# ── filesystem backend ─────────────────────────────────────────────
def test_filesystem_roundtrip(_tmp_datadir):
be = storage.FilesystemBackend()
key = "cases/1/originals/a.pdf"
uri = run(be.put_bytes(key, b"hello", content_type="application/pdf"))
assert uri.startswith("file://")
assert (_tmp_datadir / key).read_bytes() == b"hello"
assert run(be.exists(key)) is True
assert run(be.get_bytes(key)) == b"hello"
run(be.delete(key))
assert run(be.exists(key)) is False
# delete is idempotent (missing_ok)
run(be.delete(key))
def test_filesystem_put_file(_tmp_datadir):
be = storage.FilesystemBackend()
src = _tmp_datadir / "src.bin"
src.write_bytes(b"payload")
run(be.put_file(src, "precedent-library/court_ruling/u.pdf"))
assert (_tmp_datadir / "precedent-library/court_ruling/u.pdf").read_bytes() == b"payload"
def test_filesystem_bucket_is_ignored_legacy_layout(_tmp_datadir):
"""Even for a non-default bucket, the FS backend keeps the single tree —
so the default backend is byte-for-byte the legacy layout."""
be = storage.FilesystemBackend()
run(be.put_bytes("thumbnails/d/p001.jpg", b"img", bucket=Bucket.DERIVED))
assert (_tmp_datadir / "thumbnails/d/p001.jpg").exists()
def test_filesystem_list_keys(_tmp_datadir):
be = storage.FilesystemBackend()
run(be.put_bytes("cases/1/a.txt", b"a"))
run(be.put_bytes("cases/1/sub/b.txt", b"b"))
run(be.put_bytes("cases/2/c.txt", b"c"))
keys = run(be.list_keys("cases/1"))
assert keys == ["cases/1/a.txt", "cases/1/sub/b.txt"]
def test_filesystem_local_path_and_ensure_local(_tmp_datadir):
be = storage.FilesystemBackend()
run(be.put_bytes("cases/1/a.pdf", b"x"))
assert be.local_path("cases/1/a.pdf") == (_tmp_datadir / "cases/1/a.pdf").resolve()
assert be.local_path("cases/1/missing.pdf") is None
# ensure_local returns the real path without copying
assert run(be.ensure_local("cases/1/a.pdf")) == (_tmp_datadir / "cases/1/a.pdf").resolve()
def test_filesystem_presign_unsupported(_tmp_datadir):
be = storage.FilesystemBackend()
with pytest.raises(NotImplementedError):
run(be.presign_get("cases/1/a.pdf"))
# ── backend selection ──────────────────────────────────────────────
def test_get_storage_default_is_filesystem(monkeypatch):
monkeypatch.setattr(config, "STORAGE_BACKEND", "filesystem")
storage.reset_storage_cache()
assert isinstance(storage.get_storage(), storage.FilesystemBackend)
def test_get_storage_unknown_falls_back(monkeypatch):
monkeypatch.setattr(config, "STORAGE_BACKEND", "bogus")
storage.reset_storage_cache()
assert isinstance(storage.get_storage(), storage.FilesystemBackend)
def test_get_storage_dual(monkeypatch):
monkeypatch.setattr(config, "STORAGE_BACKEND", "dual")
storage.reset_storage_cache()
assert isinstance(storage.get_storage(), storage.DualBackend)
# ── dual backend (S3 stubbed) ──────────────────────────────────────
class _FakeS3:
"""Minimal async S3 stub. ``store`` None ⇒ S3 'down' (raises)."""
def __init__(self, store):
self.store = store # dict or None
async def put_bytes(self, key, data, *, bucket=Bucket.DOCUMENTS, content_type=None, metadata=None):
if self.store is None:
raise RuntimeError("s3 down")
self.store[storage.normalize_key(key)] = data
return f"s3://stub/{storage.normalize_key(key)}"
async def put_file(self, src, key, *, bucket=Bucket.DOCUMENTS, content_type=None, metadata=None):
with open(src, "rb") as fh:
return await self.put_bytes(key, fh.read(), bucket=bucket)
async def get_bytes(self, key, *, bucket=Bucket.DOCUMENTS):
if self.store is None:
raise RuntimeError("s3 down")
return self.store[storage.normalize_key(key)]
async def exists(self, key, *, bucket=Bucket.DOCUMENTS):
return self.store is not None and storage.normalize_key(key) in self.store
async def delete(self, key, *, bucket=Bucket.DOCUMENTS):
if self.store is not None:
self.store.pop(storage.normalize_key(key), None)
def _dual_with(store):
dual = storage.DualBackend()
dual.s3 = _FakeS3(store)
return dual
def test_dual_writes_both(_tmp_datadir):
store = {}
dual = _dual_with(store)
run(dual.put_bytes("cases/1/a.pdf", b"data"))
assert (_tmp_datadir / "cases/1/a.pdf").read_bytes() == b"data" # disk
assert store["cases/1/a.pdf"] == b"data" # s3
def test_dual_s3_write_failure_does_not_break(_tmp_datadir):
dual = _dual_with(None) # s3 down
# disk write still succeeds; s3 failure is logged, not raised
run(dual.put_bytes("cases/1/a.pdf", b"data"))
assert (_tmp_datadir / "cases/1/a.pdf").read_bytes() == b"data"
def test_dual_get_prefers_s3(_tmp_datadir):
dual = _dual_with({"cases/1/a.pdf": b"from-s3"})
run(dual.fs.put_bytes("cases/1/a.pdf", b"from-disk"))
assert run(dual.get_bytes("cases/1/a.pdf")) == b"from-s3"
def test_dual_get_falls_back_to_disk(_tmp_datadir):
dual = _dual_with(None) # s3 down → must read disk
run(dual.fs.put_bytes("cases/1/a.pdf", b"from-disk"))
assert run(dual.get_bytes("cases/1/a.pdf")) == b"from-disk"
def test_bucket_name_resolution(monkeypatch):
monkeypatch.setattr(config, "MINIO_BUCKET_DOCUMENTS", "legal-documents")
monkeypatch.setattr(config, "MINIO_BUCKET_IMMUTABLE", "legal-immutable")
monkeypatch.setattr(config, "MINIO_BUCKET_DERIVED", "legal-derived")
assert storage._bucket_name(Bucket.DOCUMENTS) == "legal-documents"
assert storage._bucket_name(Bucket.IMMUTABLE) == "legal-immutable"
assert storage._bucket_name(Bucket.DERIVED) == "legal-derived"