The single choke-point for all binary file I/O (originals, derived artifacts, exports), replacing the scattered open()/shutil/Path.write_bytes calls across ~8 services. Backend chosen by STORAGE_BACKEND: - filesystem (default): disk under DATA_DIR — byte-for-byte legacy behaviour - dual: write disk + S3, read S3→disk fallback (migration window) - s3: MinIO via aioboto3 (lazy import; absent in the filesystem path) Keys are DATA_DIR-relative POSIX paths; the FS backend ignores the logical bucket and keeps the existing single tree, so the default backend is zero behaviour change. S3 maps a governance bucket (documents/immutable/derived) → MinIO bucket; presigned URLs are minted against the public endpoint (browser-reachable) and carry the Hebrew filename via RFC-5987 Content-Disposition. - config: STORAGE_BACKEND + MINIO_* (endpoint, public-endpoint, creds, region, 3 bucket names, presign TTL) - mcp_env_catalog: new "storage" category + 10 specs (X10/INV-ENV1) - pyproject: aioboto3>=13 (consumed here, deployed with first use) - tests: 18 unit tests (FS round-trip, key normalization/traversal guard, bucket resolution, backend selection, dual write-both + S3-down fallback) No call-sites are rewired yet — that is Phase 2 (106.3). STORAGE_BACKEND stays filesystem in prod, so behaviour is unchanged. Invariants: keeps G2 (one storage path replaces scattered I/O); establishes INV-STG1 (single layer), INV-STG2 (atomic keys, Hebrew name in metadata), INV-STG3 (governance buckets), INV-STG6 (presigned serving). Spec: docs/spec/X14-storage-minio.md. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
199 lines
7.4 KiB
Python
199 lines
7.4 KiB
Python
"""Unit tests for the unified storage layer (X14, services/storage.py).
|
|
|
|
Sync tests driving the async API via asyncio.run (matches the repo
|
|
convention — no pytest-asyncio). The filesystem backend is exercised against a
|
|
tmp DATA_DIR; the S3 path is stubbed so the suite needs no MinIO and no
|
|
aioboto3.
|
|
"""
|
|
import asyncio
|
|
|
|
import pytest
|
|
|
|
from legal_mcp import config
|
|
from legal_mcp.services import storage
|
|
from legal_mcp.services.storage import Bucket
|
|
|
|
|
|
@pytest.fixture(autouse=True)
|
|
def _tmp_datadir(tmp_path, monkeypatch):
|
|
monkeypatch.setattr(config, "DATA_DIR", tmp_path)
|
|
monkeypatch.setattr(config, "STORAGE_BACKEND", "filesystem")
|
|
storage.reset_storage_cache()
|
|
yield tmp_path
|
|
storage.reset_storage_cache()
|
|
|
|
|
|
def run(coro):
|
|
return asyncio.run(coro)
|
|
|
|
|
|
# ── normalize_key ──────────────────────────────────────────────────
|
|
|
|
def test_normalize_key_relative():
|
|
assert storage.normalize_key("cases/8174-24/originals/x.pdf") == \
|
|
"cases/8174-24/originals/x.pdf"
|
|
|
|
|
|
def test_normalize_key_rejects_parent_traversal():
|
|
with pytest.raises(ValueError):
|
|
storage.normalize_key("cases/../../etc/passwd")
|
|
|
|
|
|
def test_normalize_key_accepts_abs_under_datadir(_tmp_datadir):
|
|
abs_key = _tmp_datadir / "cases" / "x.pdf"
|
|
assert storage.normalize_key(abs_key) == "cases/x.pdf"
|
|
|
|
|
|
def test_normalize_key_rejects_abs_outside_datadir():
|
|
with pytest.raises(ValueError):
|
|
storage.normalize_key("/var/secret/x.pdf")
|
|
|
|
|
|
# ── filesystem backend ─────────────────────────────────────────────
|
|
|
|
def test_filesystem_roundtrip(_tmp_datadir):
|
|
be = storage.FilesystemBackend()
|
|
key = "cases/1/originals/a.pdf"
|
|
uri = run(be.put_bytes(key, b"hello", content_type="application/pdf"))
|
|
assert uri.startswith("file://")
|
|
assert (_tmp_datadir / key).read_bytes() == b"hello"
|
|
assert run(be.exists(key)) is True
|
|
assert run(be.get_bytes(key)) == b"hello"
|
|
run(be.delete(key))
|
|
assert run(be.exists(key)) is False
|
|
# delete is idempotent (missing_ok)
|
|
run(be.delete(key))
|
|
|
|
|
|
def test_filesystem_put_file(_tmp_datadir):
|
|
be = storage.FilesystemBackend()
|
|
src = _tmp_datadir / "src.bin"
|
|
src.write_bytes(b"payload")
|
|
run(be.put_file(src, "precedent-library/court_ruling/u.pdf"))
|
|
assert (_tmp_datadir / "precedent-library/court_ruling/u.pdf").read_bytes() == b"payload"
|
|
|
|
|
|
def test_filesystem_bucket_is_ignored_legacy_layout(_tmp_datadir):
|
|
"""Even for a non-default bucket, the FS backend keeps the single tree —
|
|
so the default backend is byte-for-byte the legacy layout."""
|
|
be = storage.FilesystemBackend()
|
|
run(be.put_bytes("thumbnails/d/p001.jpg", b"img", bucket=Bucket.DERIVED))
|
|
assert (_tmp_datadir / "thumbnails/d/p001.jpg").exists()
|
|
|
|
|
|
def test_filesystem_list_keys(_tmp_datadir):
|
|
be = storage.FilesystemBackend()
|
|
run(be.put_bytes("cases/1/a.txt", b"a"))
|
|
run(be.put_bytes("cases/1/sub/b.txt", b"b"))
|
|
run(be.put_bytes("cases/2/c.txt", b"c"))
|
|
keys = run(be.list_keys("cases/1"))
|
|
assert keys == ["cases/1/a.txt", "cases/1/sub/b.txt"]
|
|
|
|
|
|
def test_filesystem_local_path_and_ensure_local(_tmp_datadir):
|
|
be = storage.FilesystemBackend()
|
|
run(be.put_bytes("cases/1/a.pdf", b"x"))
|
|
assert be.local_path("cases/1/a.pdf") == (_tmp_datadir / "cases/1/a.pdf").resolve()
|
|
assert be.local_path("cases/1/missing.pdf") is None
|
|
# ensure_local returns the real path without copying
|
|
assert run(be.ensure_local("cases/1/a.pdf")) == (_tmp_datadir / "cases/1/a.pdf").resolve()
|
|
|
|
|
|
def test_filesystem_presign_unsupported(_tmp_datadir):
|
|
be = storage.FilesystemBackend()
|
|
with pytest.raises(NotImplementedError):
|
|
run(be.presign_get("cases/1/a.pdf"))
|
|
|
|
|
|
# ── backend selection ──────────────────────────────────────────────
|
|
|
|
def test_get_storage_default_is_filesystem(monkeypatch):
|
|
monkeypatch.setattr(config, "STORAGE_BACKEND", "filesystem")
|
|
storage.reset_storage_cache()
|
|
assert isinstance(storage.get_storage(), storage.FilesystemBackend)
|
|
|
|
|
|
def test_get_storage_unknown_falls_back(monkeypatch):
|
|
monkeypatch.setattr(config, "STORAGE_BACKEND", "bogus")
|
|
storage.reset_storage_cache()
|
|
assert isinstance(storage.get_storage(), storage.FilesystemBackend)
|
|
|
|
|
|
def test_get_storage_dual(monkeypatch):
|
|
monkeypatch.setattr(config, "STORAGE_BACKEND", "dual")
|
|
storage.reset_storage_cache()
|
|
assert isinstance(storage.get_storage(), storage.DualBackend)
|
|
|
|
|
|
# ── dual backend (S3 stubbed) ──────────────────────────────────────
|
|
|
|
class _FakeS3:
|
|
"""Minimal async S3 stub. ``store`` None ⇒ S3 'down' (raises)."""
|
|
|
|
def __init__(self, store):
|
|
self.store = store # dict or None
|
|
|
|
async def put_bytes(self, key, data, *, bucket=Bucket.DOCUMENTS, content_type=None, metadata=None):
|
|
if self.store is None:
|
|
raise RuntimeError("s3 down")
|
|
self.store[storage.normalize_key(key)] = data
|
|
return f"s3://stub/{storage.normalize_key(key)}"
|
|
|
|
async def put_file(self, src, key, *, bucket=Bucket.DOCUMENTS, content_type=None, metadata=None):
|
|
with open(src, "rb") as fh:
|
|
return await self.put_bytes(key, fh.read(), bucket=bucket)
|
|
|
|
async def get_bytes(self, key, *, bucket=Bucket.DOCUMENTS):
|
|
if self.store is None:
|
|
raise RuntimeError("s3 down")
|
|
return self.store[storage.normalize_key(key)]
|
|
|
|
async def exists(self, key, *, bucket=Bucket.DOCUMENTS):
|
|
return self.store is not None and storage.normalize_key(key) in self.store
|
|
|
|
async def delete(self, key, *, bucket=Bucket.DOCUMENTS):
|
|
if self.store is not None:
|
|
self.store.pop(storage.normalize_key(key), None)
|
|
|
|
|
|
def _dual_with(store):
|
|
dual = storage.DualBackend()
|
|
dual.s3 = _FakeS3(store)
|
|
return dual
|
|
|
|
|
|
def test_dual_writes_both(_tmp_datadir):
|
|
store = {}
|
|
dual = _dual_with(store)
|
|
run(dual.put_bytes("cases/1/a.pdf", b"data"))
|
|
assert (_tmp_datadir / "cases/1/a.pdf").read_bytes() == b"data" # disk
|
|
assert store["cases/1/a.pdf"] == b"data" # s3
|
|
|
|
|
|
def test_dual_s3_write_failure_does_not_break(_tmp_datadir):
|
|
dual = _dual_with(None) # s3 down
|
|
# disk write still succeeds; s3 failure is logged, not raised
|
|
run(dual.put_bytes("cases/1/a.pdf", b"data"))
|
|
assert (_tmp_datadir / "cases/1/a.pdf").read_bytes() == b"data"
|
|
|
|
|
|
def test_dual_get_prefers_s3(_tmp_datadir):
|
|
dual = _dual_with({"cases/1/a.pdf": b"from-s3"})
|
|
run(dual.fs.put_bytes("cases/1/a.pdf", b"from-disk"))
|
|
assert run(dual.get_bytes("cases/1/a.pdf")) == b"from-s3"
|
|
|
|
|
|
def test_dual_get_falls_back_to_disk(_tmp_datadir):
|
|
dual = _dual_with(None) # s3 down → must read disk
|
|
run(dual.fs.put_bytes("cases/1/a.pdf", b"from-disk"))
|
|
assert run(dual.get_bytes("cases/1/a.pdf")) == b"from-disk"
|
|
|
|
|
|
def test_bucket_name_resolution(monkeypatch):
|
|
monkeypatch.setattr(config, "MINIO_BUCKET_DOCUMENTS", "legal-documents")
|
|
monkeypatch.setattr(config, "MINIO_BUCKET_IMMUTABLE", "legal-immutable")
|
|
monkeypatch.setattr(config, "MINIO_BUCKET_DERIVED", "legal-derived")
|
|
assert storage._bucket_name(Bucket.DOCUMENTS) == "legal-documents"
|
|
assert storage._bucket_name(Bucket.IMMUTABLE) == "legal-immutable"
|
|
assert storage._bucket_name(Bucket.DERIVED) == "legal-derived"
|