diff --git a/mcp-server/pyproject.toml b/mcp-server/pyproject.toml index 89ba7e1..47b50cc 100644 --- a/mcp-server/pyproject.toml +++ b/mcp-server/pyproject.toml @@ -21,6 +21,7 @@ dependencies = [ "uvicorn[standard]>=0.30.0", "httpx>=0.27.0", "infisicalsdk>=1.0.0", + "aioboto3>=13.0.0", # X14 object storage (MinIO/S3) — services/storage.py ] [project.optional-dependencies] diff --git a/mcp-server/src/legal_mcp/config.py b/mcp-server/src/legal_mcp/config.py index fd907ac..00407e6 100644 --- a/mcp-server/src/legal_mcp/config.py +++ b/mcp-server/src/legal_mcp/config.py @@ -202,6 +202,32 @@ EXPORTS_DIR = DATA_DIR / "exports" # legacy exports only # Cases directory — flat structure: data/cases/{case_number}/ CASES_DIR = DATA_DIR / "cases" +# ── Object storage (X14 / MinIO) ─────────────────────────────────── +# Single storage layer (services/storage.py) replaces the scattered file +# I/O across ~8 services (INV-STG1 / G2). Backend selector: +# "filesystem" (default) — disk under DATA_DIR; current behaviour, no change. +# "dual" — write disk + S3, read S3→disk fallback (migration). +# "s3" — MinIO only. +# See docs/spec/X14-storage-minio.md. +STORAGE_BACKEND = os.environ.get("STORAGE_BACKEND", "filesystem").strip().lower() +# Endpoint reached server-side (internal Docker network: http://minio:9000). +MINIO_ENDPOINT = os.environ.get("MINIO_ENDPOINT", "http://minio:9000") +# Public endpoint used when MINTING presigned URLs for the browser (INV-STG6) — +# the browser cannot resolve the internal hostname. Falls back to the internal +# endpoint when unset (e.g. local dev). +MINIO_PUBLIC_ENDPOINT = os.environ.get("MINIO_PUBLIC_ENDPOINT", MINIO_ENDPOINT) +MINIO_ACCESS_KEY = os.environ.get("MINIO_ACCESS_KEY", "") +MINIO_SECRET_KEY = os.environ.get("MINIO_SECRET_KEY", "") +MINIO_REGION = os.environ.get("MINIO_REGION", "us-east-1") +# Logical bucket → name. Governance boundaries (INV-STG3): documents +# (versioned), immutable (versioned + Object-Lock COMPLIANCE for final +# decisions, INV-STG4), derived (thumbnails/extracted text — regenerable). +MINIO_BUCKET_DOCUMENTS = os.environ.get("MINIO_BUCKET_DOCUMENTS", "legal-documents") +MINIO_BUCKET_IMMUTABLE = os.environ.get("MINIO_BUCKET_IMMUTABLE", "legal-immutable") +MINIO_BUCKET_DERIVED = os.environ.get("MINIO_BUCKET_DERIVED", "legal-derived") +# Default presigned-URL TTL (seconds). SigV4 hard max is 7 days; keep short. +MINIO_PRESIGN_TTL = int(os.environ.get("MINIO_PRESIGN_TTL", "900")) + def find_case_dir(case_number: str) -> Path: """Return the case directory for a given case number.""" diff --git a/mcp-server/src/legal_mcp/services/storage.py b/mcp-server/src/legal_mcp/services/storage.py new file mode 100644 index 0000000..b7a4f94 --- /dev/null +++ b/mcp-server/src/legal_mcp/services/storage.py @@ -0,0 +1,472 @@ +"""Unified object-storage layer (X14, INV-STG1). + +THE single choke-point for all binary file I/O — originals, derived +artifacts (thumbnails / extracted text), and exports. It replaces the +scattered ``open()`` / ``shutil.copy`` / ``Path.write_bytes`` calls spread +across ~8 services (G2: one storage path, no parallel routes). See +docs/spec/X14-storage-minio.md. + +Keys +---- +A *key* is a DATA_DIR-relative POSIX path, e.g.:: + + cases/8174-24/documents/originals/.pdf + precedent-library/thumbnails//p001.jpg + +The filesystem backend maps ``key -> DATA_DIR / key``, preserving the exact +current on-disk layout (zero behaviour change when ``STORAGE_BACKEND`` is the +default ``filesystem``). The S3 backend maps a logical *bucket* +(documents/immutable/derived) to a MinIO bucket and uses the key verbatim as +the object key. + +INV-STG2: keys are atomic ASCII/UUID paths; a Hebrew original filename is +carried as object metadata / a DB column, never as the key itself. +INV-STG5: pgvector stays the source of truth for text + embeddings — this +layer stores blobs only. INV-STG6: presigned URLs (minted against the public +endpoint) serve bytes straight to the browser. + +Backends (config.STORAGE_BACKEND) +--------------------------------- +- ``filesystem`` (default) — disk only; current behaviour. +- ``dual`` — write disk + S3; read S3, fall back to disk. + The migration window; disk stays authoritative. +- ``s3`` — MinIO only. + +``aioboto3`` is imported lazily so this module loads even where the dependency +is absent (the default filesystem backend needs nothing extra). +""" +from __future__ import annotations + +import logging +import shutil +import tempfile +from enum import Enum +from pathlib import Path, PurePosixPath +from typing import Iterable + +from legal_mcp import config + +logger = logging.getLogger(__name__) + + +class Bucket(str, Enum): + """Logical governance buckets (INV-STG3). Resolved to MinIO bucket names + via config; ignored by the filesystem backend (which keeps one tree).""" + + DOCUMENTS = "documents" + IMMUTABLE = "immutable" + DERIVED = "derived" + + +def _bucket_name(bucket: Bucket) -> str: + return { + Bucket.DOCUMENTS: config.MINIO_BUCKET_DOCUMENTS, + Bucket.IMMUTABLE: config.MINIO_BUCKET_IMMUTABLE, + Bucket.DERIVED: config.MINIO_BUCKET_DERIVED, + }[bucket] + + +def normalize_key(key: str | Path) -> str: + """Return a clean DATA_DIR-relative POSIX key. + + Rejects absolute paths and ``..`` traversal (defence in depth — keys are + built internally, never from raw user input). An absolute path under + DATA_DIR is accepted and re-relativised so call-sites can pass either a + key or a full ``Path`` during the migration. + """ + p = Path(key) + if p.is_absolute(): + try: + p = p.relative_to(config.DATA_DIR) + except ValueError as exc: + raise ValueError(f"absolute path outside DATA_DIR: {key!r}") from exc + posix = PurePosixPath(p.as_posix()) + parts = posix.parts + if not parts or any(part == ".." for part in parts): + raise ValueError(f"invalid storage key: {key!r}") + return posix.as_posix().lstrip("/") + + +class StorageBackend: + """Abstract backend. All methods are async except the cheap path helpers.""" + + name = "abstract" + + async def put_bytes(self, key, data, *, bucket=Bucket.DOCUMENTS, + content_type=None, metadata=None) -> str: + raise NotImplementedError + + async def put_file(self, src, key, *, bucket=Bucket.DOCUMENTS, + content_type=None, metadata=None) -> str: + with open(src, "rb") as fh: + return await self.put_bytes( + key, fh.read(), bucket=bucket, + content_type=content_type, metadata=metadata, + ) + + async def get_bytes(self, key, *, bucket=Bucket.DOCUMENTS) -> bytes: + raise NotImplementedError + + async def exists(self, key, *, bucket=Bucket.DOCUMENTS) -> bool: + raise NotImplementedError + + async def delete(self, key, *, bucket=Bucket.DOCUMENTS) -> None: + raise NotImplementedError + + async def list_keys(self, prefix, *, bucket=Bucket.DOCUMENTS) -> list[str]: + raise NotImplementedError + + async def presign_get(self, key, *, bucket=Bucket.DOCUMENTS, ttl=None, + download_name=None) -> str: + raise NotImplementedError + + async def presign_put(self, key, *, bucket=Bucket.DOCUMENTS, ttl=None, + content_type=None) -> str: + raise NotImplementedError + + def local_path(self, key, *, bucket=Bucket.DOCUMENTS) -> Path | None: + """Return a real filesystem path if one exists *without* downloading, + else ``None``. Use :meth:`ensure_local` when a path is required.""" + return None + + async def ensure_local(self, key, *, bucket=Bucket.DOCUMENTS) -> Path: + """Return a local path to the object, downloading to a temp file if the + backend has no on-disk copy. Caller owns cleanup of temp files.""" + path = self.local_path(key, bucket=bucket) + if path is not None: + return path + data = await self.get_bytes(key, bucket=bucket) + suffix = PurePosixPath(normalize_key(key)).suffix + tmp = tempfile.NamedTemporaryFile(delete=False, suffix=suffix) + tmp.write(data) + tmp.close() + return Path(tmp.name) + + +class FilesystemBackend(StorageBackend): + """Disk under DATA_DIR. ``bucket`` is ignored — the existing single tree is + preserved verbatim, so the default backend is byte-for-byte the legacy + behaviour.""" + + name = "filesystem" + + def _abs(self, key, *, bucket=Bucket.DOCUMENTS) -> Path: + rel = normalize_key(key) + path = (Path(config.DATA_DIR) / rel).resolve() + root = Path(config.DATA_DIR).resolve() + if root not in path.parents and path != root: + raise ValueError(f"resolved path escapes DATA_DIR: {key!r}") + return path + + async def put_bytes(self, key, data, *, bucket=Bucket.DOCUMENTS, + content_type=None, metadata=None) -> str: + path = self._abs(key, bucket=bucket) + path.parent.mkdir(parents=True, exist_ok=True) + path.write_bytes(data) + return f"file://{path}" + + async def put_file(self, src, key, *, bucket=Bucket.DOCUMENTS, + content_type=None, metadata=None) -> str: + path = self._abs(key, bucket=bucket) + path.parent.mkdir(parents=True, exist_ok=True) + shutil.copy2(src, path) # preserve mtime, as the legacy code did + return f"file://{path}" + + async def get_bytes(self, key, *, bucket=Bucket.DOCUMENTS) -> bytes: + return self._abs(key, bucket=bucket).read_bytes() + + async def exists(self, key, *, bucket=Bucket.DOCUMENTS) -> bool: + return self._abs(key, bucket=bucket).exists() + + async def delete(self, key, *, bucket=Bucket.DOCUMENTS) -> None: + self._abs(key, bucket=bucket).unlink(missing_ok=True) + + async def list_keys(self, prefix, *, bucket=Bucket.DOCUMENTS) -> list[str]: + root = Path(config.DATA_DIR).resolve() + base = self._abs(prefix, bucket=bucket) if prefix else root + if not base.exists(): + return [] + out: list[str] = [] + for p in sorted(base.rglob("*")): + if p.is_file(): + out.append(p.resolve().relative_to(root).as_posix()) + return out + + def local_path(self, key, *, bucket=Bucket.DOCUMENTS) -> Path | None: + path = self._abs(key, bucket=bucket) + return path if path.exists() else None + + async def presign_get(self, key, *, bucket=Bucket.DOCUMENTS, ttl=None, + download_name=None) -> str: + raise NotImplementedError( + "presigned URLs require the S3 backend (set STORAGE_BACKEND=dual|s3)" + ) + + async def presign_put(self, key, *, bucket=Bucket.DOCUMENTS, ttl=None, + content_type=None) -> str: + raise NotImplementedError( + "presigned URLs require the S3 backend (set STORAGE_BACKEND=dual|s3)" + ) + + +class S3Backend(StorageBackend): + """MinIO via aioboto3. Server-side ops use the internal endpoint; presigned + URLs are minted against the public endpoint so the browser can reach them + (INV-STG6).""" + + name = "s3" + + def __init__(self) -> None: + self._session = None # lazy aioboto3 session + + def _boto(self): + import aioboto3 # lazy — absent in the default filesystem path + from botocore.config import Config as BotoConfig + if self._session is None: + self._session = aioboto3.Session() + cfg = BotoConfig(signature_version="s3v4", s3={"addressing_style": "path"}) + return aioboto3, BotoConfig, cfg + + def _client(self, *, public: bool = False): + _aioboto3, _BotoConfig, cfg = self._boto() + endpoint = config.MINIO_PUBLIC_ENDPOINT if public else config.MINIO_ENDPOINT + return self._session.client( + "s3", + endpoint_url=endpoint, + aws_access_key_id=config.MINIO_ACCESS_KEY, + aws_secret_access_key=config.MINIO_SECRET_KEY, + region_name=config.MINIO_REGION, + config=cfg, + ) + + async def put_bytes(self, key, data, *, bucket=Bucket.DOCUMENTS, + content_type=None, metadata=None) -> str: + k = normalize_key(key) + extra = {} + if content_type: + extra["ContentType"] = content_type + if metadata: + extra["Metadata"] = {kk: str(vv) for kk, vv in metadata.items()} + async with self._client() as s3: + await s3.put_object(Bucket=_bucket_name(bucket), Key=k, Body=data, **extra) + return f"s3://{_bucket_name(bucket)}/{k}" + + async def get_bytes(self, key, *, bucket=Bucket.DOCUMENTS) -> bytes: + k = normalize_key(key) + async with self._client() as s3: + resp = await s3.get_object(Bucket=_bucket_name(bucket), Key=k) + async with resp["Body"] as stream: + return await stream.read() + + async def exists(self, key, *, bucket=Bucket.DOCUMENTS) -> bool: + from botocore.exceptions import ClientError + k = normalize_key(key) + async with self._client() as s3: + try: + await s3.head_object(Bucket=_bucket_name(bucket), Key=k) + return True + except ClientError as exc: + if exc.response["Error"]["Code"] in ("404", "NoSuchKey", "NotFound"): + return False + raise + + async def delete(self, key, *, bucket=Bucket.DOCUMENTS) -> None: + k = normalize_key(key) + async with self._client() as s3: + await s3.delete_object(Bucket=_bucket_name(bucket), Key=k) + + async def list_keys(self, prefix, *, bucket=Bucket.DOCUMENTS) -> list[str]: + pfx = normalize_key(prefix) if prefix else "" + out: list[str] = [] + async with self._client() as s3: + paginator = s3.get_paginator("list_objects_v2") + async for page in paginator.paginate(Bucket=_bucket_name(bucket), Prefix=pfx): + for obj in page.get("Contents", []): + out.append(obj["Key"]) + return out + + async def presign_get(self, key, *, bucket=Bucket.DOCUMENTS, ttl=None, + download_name=None) -> str: + k = normalize_key(key) + params = {"Bucket": _bucket_name(bucket), "Key": k} + if download_name: + # RFC 5987 — keep the Hebrew original filename on download (INV-STG2) + from urllib.parse import quote + params["ResponseContentDisposition"] = ( + f"attachment; filename*=UTF-8''{quote(download_name)}" + ) + async with self._client(public=True) as s3: + return await s3.generate_presigned_url( + "get_object", Params=params, + ExpiresIn=ttl or config.MINIO_PRESIGN_TTL, + ) + + async def presign_put(self, key, *, bucket=Bucket.DOCUMENTS, ttl=None, + content_type=None) -> str: + k = normalize_key(key) + params = {"Bucket": _bucket_name(bucket), "Key": k} + if content_type: + params["ContentType"] = content_type + async with self._client(public=True) as s3: + return await s3.generate_presigned_url( + "put_object", Params=params, + ExpiresIn=ttl or config.MINIO_PRESIGN_TTL, + ) + + +class DualBackend(StorageBackend): + """Migration window: writes go to BOTH disk and S3 (disk authoritative); + reads prefer S3 and fall back to disk. An S3 write failure is logged (never + swallowed — engineering §6) but does not break the app while disk holds the + canonical copy.""" + + name = "dual" + + def __init__(self) -> None: + self.fs = FilesystemBackend() + self.s3 = S3Backend() + + async def put_bytes(self, key, data, *, bucket=Bucket.DOCUMENTS, + content_type=None, metadata=None) -> str: + uri = await self.fs.put_bytes(key, data, bucket=bucket, + content_type=content_type, metadata=metadata) + try: + await self.s3.put_bytes(key, data, bucket=bucket, + content_type=content_type, metadata=metadata) + except Exception as exc: # noqa: BLE001 — log, don't swallow + logger.warning("dual put_bytes: S3 mirror failed for %s: %s", key, exc) + return uri + + async def put_file(self, src, key, *, bucket=Bucket.DOCUMENTS, + content_type=None, metadata=None) -> str: + uri = await self.fs.put_file(src, key, bucket=bucket, + content_type=content_type, metadata=metadata) + try: + await self.s3.put_file(src, key, bucket=bucket, + content_type=content_type, metadata=metadata) + except Exception as exc: # noqa: BLE001 + logger.warning("dual put_file: S3 mirror failed for %s: %s", key, exc) + return uri + + async def get_bytes(self, key, *, bucket=Bucket.DOCUMENTS) -> bytes: + try: + return await self.s3.get_bytes(key, bucket=bucket) + except Exception as exc: # noqa: BLE001 — fall back to disk + logger.debug("dual get_bytes: S3 miss for %s (%s); using disk", key, exc) + return await self.fs.get_bytes(key, bucket=bucket) + + async def exists(self, key, *, bucket=Bucket.DOCUMENTS) -> bool: + if await self.fs.exists(key, bucket=bucket): + return True + try: + return await self.s3.exists(key, bucket=bucket) + except Exception: # noqa: BLE001 + return False + + async def delete(self, key, *, bucket=Bucket.DOCUMENTS) -> None: + await self.fs.delete(key, bucket=bucket) + try: + await self.s3.delete(key, bucket=bucket) + except Exception as exc: # noqa: BLE001 + logger.warning("dual delete: S3 delete failed for %s: %s", key, exc) + + async def list_keys(self, prefix, *, bucket=Bucket.DOCUMENTS) -> list[str]: + return await self.fs.list_keys(prefix, bucket=bucket) + + def local_path(self, key, *, bucket=Bucket.DOCUMENTS) -> Path | None: + return self.fs.local_path(key, bucket=bucket) + + async def presign_get(self, key, *, bucket=Bucket.DOCUMENTS, ttl=None, + download_name=None) -> str: + return await self.s3.presign_get(key, bucket=bucket, ttl=ttl, + download_name=download_name) + + async def presign_put(self, key, *, bucket=Bucket.DOCUMENTS, ttl=None, + content_type=None) -> str: + return await self.s3.presign_put(key, bucket=bucket, ttl=ttl, + content_type=content_type) + + +_BACKENDS = { + "filesystem": FilesystemBackend, + "dual": DualBackend, + "s3": S3Backend, +} + +_singleton: StorageBackend | None = None + + +def get_storage() -> StorageBackend: + """Return the process-wide storage backend selected by config.STORAGE_BACKEND + (cached). Unknown values fall back to ``filesystem`` with a warning rather + than crashing the app.""" + global _singleton + if _singleton is None: + cls = _BACKENDS.get(config.STORAGE_BACKEND) + if cls is None: + logger.warning( + "unknown STORAGE_BACKEND=%r — falling back to filesystem", + config.STORAGE_BACKEND, + ) + cls = FilesystemBackend + _singleton = cls() + logger.info("storage backend = %s", _singleton.name) + return _singleton + + +def reset_storage_cache() -> None: + """Drop the cached backend (tests / after an env change).""" + global _singleton + _singleton = None + + +# ── module-level convenience wrappers ────────────────────────────── +# Thin pass-throughs so call-sites can ``from legal_mcp.services import storage`` +# and use ``await storage.put_bytes(...)`` without fetching the singleton. + +async def put_bytes(key, data, *, bucket=Bucket.DOCUMENTS, content_type=None, + metadata=None) -> str: + return await get_storage().put_bytes( + key, data, bucket=bucket, content_type=content_type, metadata=metadata) + + +async def put_file(src, key, *, bucket=Bucket.DOCUMENTS, content_type=None, + metadata=None) -> str: + return await get_storage().put_file( + src, key, bucket=bucket, content_type=content_type, metadata=metadata) + + +async def get_bytes(key, *, bucket=Bucket.DOCUMENTS) -> bytes: + return await get_storage().get_bytes(key, bucket=bucket) + + +async def exists(key, *, bucket=Bucket.DOCUMENTS) -> bool: + return await get_storage().exists(key, bucket=bucket) + + +async def delete(key, *, bucket=Bucket.DOCUMENTS) -> None: + return await get_storage().delete(key, bucket=bucket) + + +async def list_keys(prefix, *, bucket=Bucket.DOCUMENTS) -> list[str]: + return await get_storage().list_keys(prefix, bucket=bucket) + + +async def presign_get(key, *, bucket=Bucket.DOCUMENTS, ttl=None, + download_name=None) -> str: + return await get_storage().presign_get( + key, bucket=bucket, ttl=ttl, download_name=download_name) + + +async def presign_put(key, *, bucket=Bucket.DOCUMENTS, ttl=None, + content_type=None) -> str: + return await get_storage().presign_put( + key, bucket=bucket, ttl=ttl, content_type=content_type) + + +def local_path(key, *, bucket=Bucket.DOCUMENTS) -> Path | None: + return get_storage().local_path(key, bucket=bucket) + + +async def ensure_local(key, *, bucket=Bucket.DOCUMENTS) -> Path: + return await get_storage().ensure_local(key, bucket=bucket) diff --git a/mcp-server/tests/test_storage.py b/mcp-server/tests/test_storage.py new file mode 100644 index 0000000..2b8c1ca --- /dev/null +++ b/mcp-server/tests/test_storage.py @@ -0,0 +1,198 @@ +"""Unit tests for the unified storage layer (X14, services/storage.py). + +Sync tests driving the async API via asyncio.run (matches the repo +convention — no pytest-asyncio). The filesystem backend is exercised against a +tmp DATA_DIR; the S3 path is stubbed so the suite needs no MinIO and no +aioboto3. +""" +import asyncio + +import pytest + +from legal_mcp import config +from legal_mcp.services import storage +from legal_mcp.services.storage import Bucket + + +@pytest.fixture(autouse=True) +def _tmp_datadir(tmp_path, monkeypatch): + monkeypatch.setattr(config, "DATA_DIR", tmp_path) + monkeypatch.setattr(config, "STORAGE_BACKEND", "filesystem") + storage.reset_storage_cache() + yield tmp_path + storage.reset_storage_cache() + + +def run(coro): + return asyncio.run(coro) + + +# ── normalize_key ────────────────────────────────────────────────── + +def test_normalize_key_relative(): + assert storage.normalize_key("cases/8174-24/originals/x.pdf") == \ + "cases/8174-24/originals/x.pdf" + + +def test_normalize_key_rejects_parent_traversal(): + with pytest.raises(ValueError): + storage.normalize_key("cases/../../etc/passwd") + + +def test_normalize_key_accepts_abs_under_datadir(_tmp_datadir): + abs_key = _tmp_datadir / "cases" / "x.pdf" + assert storage.normalize_key(abs_key) == "cases/x.pdf" + + +def test_normalize_key_rejects_abs_outside_datadir(): + with pytest.raises(ValueError): + storage.normalize_key("/var/secret/x.pdf") + + +# ── filesystem backend ───────────────────────────────────────────── + +def test_filesystem_roundtrip(_tmp_datadir): + be = storage.FilesystemBackend() + key = "cases/1/originals/a.pdf" + uri = run(be.put_bytes(key, b"hello", content_type="application/pdf")) + assert uri.startswith("file://") + assert (_tmp_datadir / key).read_bytes() == b"hello" + assert run(be.exists(key)) is True + assert run(be.get_bytes(key)) == b"hello" + run(be.delete(key)) + assert run(be.exists(key)) is False + # delete is idempotent (missing_ok) + run(be.delete(key)) + + +def test_filesystem_put_file(_tmp_datadir): + be = storage.FilesystemBackend() + src = _tmp_datadir / "src.bin" + src.write_bytes(b"payload") + run(be.put_file(src, "precedent-library/court_ruling/u.pdf")) + assert (_tmp_datadir / "precedent-library/court_ruling/u.pdf").read_bytes() == b"payload" + + +def test_filesystem_bucket_is_ignored_legacy_layout(_tmp_datadir): + """Even for a non-default bucket, the FS backend keeps the single tree — + so the default backend is byte-for-byte the legacy layout.""" + be = storage.FilesystemBackend() + run(be.put_bytes("thumbnails/d/p001.jpg", b"img", bucket=Bucket.DERIVED)) + assert (_tmp_datadir / "thumbnails/d/p001.jpg").exists() + + +def test_filesystem_list_keys(_tmp_datadir): + be = storage.FilesystemBackend() + run(be.put_bytes("cases/1/a.txt", b"a")) + run(be.put_bytes("cases/1/sub/b.txt", b"b")) + run(be.put_bytes("cases/2/c.txt", b"c")) + keys = run(be.list_keys("cases/1")) + assert keys == ["cases/1/a.txt", "cases/1/sub/b.txt"] + + +def test_filesystem_local_path_and_ensure_local(_tmp_datadir): + be = storage.FilesystemBackend() + run(be.put_bytes("cases/1/a.pdf", b"x")) + assert be.local_path("cases/1/a.pdf") == (_tmp_datadir / "cases/1/a.pdf").resolve() + assert be.local_path("cases/1/missing.pdf") is None + # ensure_local returns the real path without copying + assert run(be.ensure_local("cases/1/a.pdf")) == (_tmp_datadir / "cases/1/a.pdf").resolve() + + +def test_filesystem_presign_unsupported(_tmp_datadir): + be = storage.FilesystemBackend() + with pytest.raises(NotImplementedError): + run(be.presign_get("cases/1/a.pdf")) + + +# ── backend selection ────────────────────────────────────────────── + +def test_get_storage_default_is_filesystem(monkeypatch): + monkeypatch.setattr(config, "STORAGE_BACKEND", "filesystem") + storage.reset_storage_cache() + assert isinstance(storage.get_storage(), storage.FilesystemBackend) + + +def test_get_storage_unknown_falls_back(monkeypatch): + monkeypatch.setattr(config, "STORAGE_BACKEND", "bogus") + storage.reset_storage_cache() + assert isinstance(storage.get_storage(), storage.FilesystemBackend) + + +def test_get_storage_dual(monkeypatch): + monkeypatch.setattr(config, "STORAGE_BACKEND", "dual") + storage.reset_storage_cache() + assert isinstance(storage.get_storage(), storage.DualBackend) + + +# ── dual backend (S3 stubbed) ────────────────────────────────────── + +class _FakeS3: + """Minimal async S3 stub. ``store`` None ⇒ S3 'down' (raises).""" + + def __init__(self, store): + self.store = store # dict or None + + async def put_bytes(self, key, data, *, bucket=Bucket.DOCUMENTS, content_type=None, metadata=None): + if self.store is None: + raise RuntimeError("s3 down") + self.store[storage.normalize_key(key)] = data + return f"s3://stub/{storage.normalize_key(key)}" + + async def put_file(self, src, key, *, bucket=Bucket.DOCUMENTS, content_type=None, metadata=None): + with open(src, "rb") as fh: + return await self.put_bytes(key, fh.read(), bucket=bucket) + + async def get_bytes(self, key, *, bucket=Bucket.DOCUMENTS): + if self.store is None: + raise RuntimeError("s3 down") + return self.store[storage.normalize_key(key)] + + async def exists(self, key, *, bucket=Bucket.DOCUMENTS): + return self.store is not None and storage.normalize_key(key) in self.store + + async def delete(self, key, *, bucket=Bucket.DOCUMENTS): + if self.store is not None: + self.store.pop(storage.normalize_key(key), None) + + +def _dual_with(store): + dual = storage.DualBackend() + dual.s3 = _FakeS3(store) + return dual + + +def test_dual_writes_both(_tmp_datadir): + store = {} + dual = _dual_with(store) + run(dual.put_bytes("cases/1/a.pdf", b"data")) + assert (_tmp_datadir / "cases/1/a.pdf").read_bytes() == b"data" # disk + assert store["cases/1/a.pdf"] == b"data" # s3 + + +def test_dual_s3_write_failure_does_not_break(_tmp_datadir): + dual = _dual_with(None) # s3 down + # disk write still succeeds; s3 failure is logged, not raised + run(dual.put_bytes("cases/1/a.pdf", b"data")) + assert (_tmp_datadir / "cases/1/a.pdf").read_bytes() == b"data" + + +def test_dual_get_prefers_s3(_tmp_datadir): + dual = _dual_with({"cases/1/a.pdf": b"from-s3"}) + run(dual.fs.put_bytes("cases/1/a.pdf", b"from-disk")) + assert run(dual.get_bytes("cases/1/a.pdf")) == b"from-s3" + + +def test_dual_get_falls_back_to_disk(_tmp_datadir): + dual = _dual_with(None) # s3 down → must read disk + run(dual.fs.put_bytes("cases/1/a.pdf", b"from-disk")) + assert run(dual.get_bytes("cases/1/a.pdf")) == b"from-disk" + + +def test_bucket_name_resolution(monkeypatch): + monkeypatch.setattr(config, "MINIO_BUCKET_DOCUMENTS", "legal-documents") + monkeypatch.setattr(config, "MINIO_BUCKET_IMMUTABLE", "legal-immutable") + monkeypatch.setattr(config, "MINIO_BUCKET_DERIVED", "legal-derived") + assert storage._bucket_name(Bucket.DOCUMENTS) == "legal-documents" + assert storage._bucket_name(Bucket.IMMUTABLE) == "legal-immutable" + assert storage._bucket_name(Bucket.DERIVED) == "legal-derived" diff --git a/web/mcp_env_catalog.py b/web/mcp_env_catalog.py index 34d2ca6..8bc2787 100644 --- a/web/mcp_env_catalog.py +++ b/web/mcp_env_catalog.py @@ -13,7 +13,8 @@ from typing import Any, Literal EnvType = Literal["bool", "int", "float", "string"] EnvCategory = Literal[ - "multimodal", "rerank", "halacha", "credentials", "connection", "general" + "multimodal", "rerank", "halacha", "credentials", "connection", + "storage", "general" ] @@ -89,6 +90,58 @@ ENV_CATALOG: dict[str, EnvSpec] = { "סף confidence ל-auto-approve של הלכות שחולצו", is_secret=False, is_editable=True, default=0.80, min=0.0, max=1.0, ), + # ── storage (X14 / MinIO) ────────────────────────────────────── + "STORAGE_BACKEND": EnvSpec( + "STORAGE_BACKEND", "storage", "string", + "מנוע אחסון: filesystem (דיסק) / dual (דיסק+S3) / s3 (MinIO בלבד)", + is_secret=False, is_editable=True, default="filesystem", + enum_values=("filesystem", "dual", "s3"), + ), + "MINIO_ENDPOINT": EnvSpec( + "MINIO_ENDPOINT", "storage", "string", + "endpoint פנימי של MinIO (server-side, רשת Docker)", + is_secret=False, is_editable=False, default="http://minio:9000", + ), + "MINIO_PUBLIC_ENDPOINT": EnvSpec( + "MINIO_PUBLIC_ENDPOINT", "storage", "string", + "endpoint ציבורי ל-presigned URLs (גישת דפדפן)", + is_secret=False, is_editable=False, default="https://s3.nautilus.marcusgroup.org", + ), + "MINIO_ACCESS_KEY": EnvSpec( + "MINIO_ACCESS_KEY", "storage", "string", + "MinIO access key (service-account מוגבל ל-3 הדליות)", + is_secret=True, is_editable=False, + ), + "MINIO_SECRET_KEY": EnvSpec( + "MINIO_SECRET_KEY", "storage", "string", + "MinIO secret key", + is_secret=True, is_editable=False, + ), + "MINIO_REGION": EnvSpec( + "MINIO_REGION", "storage", "string", + "אזור S3 (MinIO מתעלם — לחתימת SigV4)", + is_secret=False, is_editable=False, default="us-east-1", + ), + "MINIO_BUCKET_DOCUMENTS": EnvSpec( + "MINIO_BUCKET_DOCUMENTS", "storage", "string", + "דלי מסמכי-מקור (versioning)", + is_secret=False, is_editable=False, default="legal-documents", + ), + "MINIO_BUCKET_IMMUTABLE": EnvSpec( + "MINIO_BUCKET_IMMUTABLE", "storage", "string", + "דלי החלטות סופיות (versioning + Object-Lock COMPLIANCE)", + is_secret=False, is_editable=False, default="legal-immutable", + ), + "MINIO_BUCKET_DERIVED": EnvSpec( + "MINIO_BUCKET_DERIVED", "storage", "string", + "דלי נגזרים (thumbnails / extracted — ניתן-לשחזור)", + is_secret=False, is_editable=False, default="legal-derived", + ), + "MINIO_PRESIGN_TTL": EnvSpec( + "MINIO_PRESIGN_TTL", "storage", "int", + "תוקף presigned URL בשניות (מקס' SigV4 = 7 ימים)", + is_secret=False, is_editable=True, default=900, min=60, max=604800, + ), # ── general ──────────────────────────────────────────────────── "VOYAGE_MODEL": EnvSpec( "VOYAGE_MODEL", "general", "string",