"""Unit tests for the unified storage layer (X14, services/storage.py). Sync tests driving the async API via asyncio.run (matches the repo convention — no pytest-asyncio). The filesystem backend is exercised against a tmp DATA_DIR; the S3 path is stubbed so the suite needs no MinIO and no aioboto3. """ import asyncio import pytest from legal_mcp import config from legal_mcp.services import storage from legal_mcp.services.storage import Bucket @pytest.fixture(autouse=True) def _tmp_datadir(tmp_path, monkeypatch): monkeypatch.setattr(config, "DATA_DIR", tmp_path) monkeypatch.setattr(config, "STORAGE_BACKEND", "filesystem") storage.reset_storage_cache() yield tmp_path storage.reset_storage_cache() def run(coro): return asyncio.run(coro) # ── normalize_key ────────────────────────────────────────────────── def test_normalize_key_relative(): assert storage.normalize_key("cases/8174-24/originals/x.pdf") == \ "cases/8174-24/originals/x.pdf" def test_normalize_key_rejects_parent_traversal(): with pytest.raises(ValueError): storage.normalize_key("cases/../../etc/passwd") def test_normalize_key_accepts_abs_under_datadir(_tmp_datadir): abs_key = _tmp_datadir / "cases" / "x.pdf" assert storage.normalize_key(abs_key) == "cases/x.pdf" def test_normalize_key_rejects_abs_outside_datadir(): with pytest.raises(ValueError): storage.normalize_key("/var/secret/x.pdf") # ── filesystem backend ───────────────────────────────────────────── def test_filesystem_roundtrip(_tmp_datadir): be = storage.FilesystemBackend() key = "cases/1/originals/a.pdf" uri = run(be.put_bytes(key, b"hello", content_type="application/pdf")) assert uri.startswith("file://") assert (_tmp_datadir / key).read_bytes() == b"hello" assert run(be.exists(key)) is True assert run(be.get_bytes(key)) == b"hello" run(be.delete(key)) assert run(be.exists(key)) is False # delete is idempotent (missing_ok) run(be.delete(key)) def test_filesystem_put_file(_tmp_datadir): be = storage.FilesystemBackend() src = _tmp_datadir / "src.bin" src.write_bytes(b"payload") run(be.put_file(src, "precedent-library/court_ruling/u.pdf")) assert (_tmp_datadir / "precedent-library/court_ruling/u.pdf").read_bytes() == b"payload" def test_filesystem_bucket_is_ignored_legacy_layout(_tmp_datadir): """Even for a non-default bucket, the FS backend keeps the single tree — so the default backend is byte-for-byte the legacy layout.""" be = storage.FilesystemBackend() run(be.put_bytes("thumbnails/d/p001.jpg", b"img", bucket=Bucket.DERIVED)) assert (_tmp_datadir / "thumbnails/d/p001.jpg").exists() def test_filesystem_list_keys(_tmp_datadir): be = storage.FilesystemBackend() run(be.put_bytes("cases/1/a.txt", b"a")) run(be.put_bytes("cases/1/sub/b.txt", b"b")) run(be.put_bytes("cases/2/c.txt", b"c")) keys = run(be.list_keys("cases/1")) assert keys == ["cases/1/a.txt", "cases/1/sub/b.txt"] def test_filesystem_local_path_and_ensure_local(_tmp_datadir): be = storage.FilesystemBackend() run(be.put_bytes("cases/1/a.pdf", b"x")) assert be.local_path("cases/1/a.pdf") == (_tmp_datadir / "cases/1/a.pdf").resolve() assert be.local_path("cases/1/missing.pdf") is None # ensure_local returns the real path without copying assert run(be.ensure_local("cases/1/a.pdf")) == (_tmp_datadir / "cases/1/a.pdf").resolve() def test_filesystem_presign_unsupported(_tmp_datadir): be = storage.FilesystemBackend() with pytest.raises(NotImplementedError): run(be.presign_get("cases/1/a.pdf")) # ── backend selection ────────────────────────────────────────────── def test_get_storage_default_is_filesystem(monkeypatch): monkeypatch.setattr(config, "STORAGE_BACKEND", "filesystem") storage.reset_storage_cache() assert isinstance(storage.get_storage(), storage.FilesystemBackend) def test_get_storage_unknown_falls_back(monkeypatch): monkeypatch.setattr(config, "STORAGE_BACKEND", "bogus") storage.reset_storage_cache() assert isinstance(storage.get_storage(), storage.FilesystemBackend) def test_get_storage_dual(monkeypatch): monkeypatch.setattr(config, "STORAGE_BACKEND", "dual") storage.reset_storage_cache() assert isinstance(storage.get_storage(), storage.DualBackend) # ── dual backend (S3 stubbed) ────────────────────────────────────── class _FakeS3: """Minimal async S3 stub. ``store`` None ⇒ S3 'down' (raises).""" def __init__(self, store): self.store = store # dict or None async def put_bytes(self, key, data, *, bucket=Bucket.DOCUMENTS, content_type=None, metadata=None): if self.store is None: raise RuntimeError("s3 down") self.store[storage.normalize_key(key)] = data return f"s3://stub/{storage.normalize_key(key)}" async def put_file(self, src, key, *, bucket=Bucket.DOCUMENTS, content_type=None, metadata=None): with open(src, "rb") as fh: return await self.put_bytes(key, fh.read(), bucket=bucket) async def get_bytes(self, key, *, bucket=Bucket.DOCUMENTS): if self.store is None: raise RuntimeError("s3 down") return self.store[storage.normalize_key(key)] async def exists(self, key, *, bucket=Bucket.DOCUMENTS): return self.store is not None and storage.normalize_key(key) in self.store async def delete(self, key, *, bucket=Bucket.DOCUMENTS): if self.store is not None: self.store.pop(storage.normalize_key(key), None) def _dual_with(store): dual = storage.DualBackend() dual.s3 = _FakeS3(store) return dual def test_dual_writes_both(_tmp_datadir): store = {} dual = _dual_with(store) run(dual.put_bytes("cases/1/a.pdf", b"data")) assert (_tmp_datadir / "cases/1/a.pdf").read_bytes() == b"data" # disk assert store["cases/1/a.pdf"] == b"data" # s3 def test_dual_s3_write_failure_does_not_break(_tmp_datadir): dual = _dual_with(None) # s3 down # disk write still succeeds; s3 failure is logged, not raised run(dual.put_bytes("cases/1/a.pdf", b"data")) assert (_tmp_datadir / "cases/1/a.pdf").read_bytes() == b"data" def test_dual_get_prefers_s3(_tmp_datadir): dual = _dual_with({"cases/1/a.pdf": b"from-s3"}) run(dual.fs.put_bytes("cases/1/a.pdf", b"from-disk")) assert run(dual.get_bytes("cases/1/a.pdf")) == b"from-s3" def test_dual_get_falls_back_to_disk(_tmp_datadir): dual = _dual_with(None) # s3 down → must read disk run(dual.fs.put_bytes("cases/1/a.pdf", b"from-disk")) assert run(dual.get_bytes("cases/1/a.pdf")) == b"from-disk" def test_bucket_name_resolution(monkeypatch): monkeypatch.setattr(config, "MINIO_BUCKET_DOCUMENTS", "legal-documents") monkeypatch.setattr(config, "MINIO_BUCKET_IMMUTABLE", "legal-immutable") monkeypatch.setattr(config, "MINIO_BUCKET_DERIVED", "legal-derived") assert storage._bucket_name(Bucket.DOCUMENTS) == "legal-documents" assert storage._bucket_name(Bucket.IMMUTABLE) == "legal-immutable" assert storage._bucket_name(Bucket.DERIVED) == "legal-derived"