feat(storage): X14 Phase 1 — unified storage layer (services/storage.py) #151

Merged
chaim merged 1 commits from worktree-storage-minio-phase1 into main 2026-06-08 07:48:19 +00:00
5 changed files with 751 additions and 1 deletions

View File

@@ -21,6 +21,7 @@ dependencies = [
"uvicorn[standard]>=0.30.0",
"httpx>=0.27.0",
"infisicalsdk>=1.0.0",
"aioboto3>=13.0.0", # X14 object storage (MinIO/S3) — services/storage.py
]
[project.optional-dependencies]

View File

@@ -202,6 +202,32 @@ EXPORTS_DIR = DATA_DIR / "exports" # legacy exports only
# Cases directory — flat structure: data/cases/{case_number}/
CASES_DIR = DATA_DIR / "cases"
# ── Object storage (X14 / MinIO) ───────────────────────────────────
# Single storage layer (services/storage.py) replaces the scattered file
# I/O across ~8 services (INV-STG1 / G2). Backend selector:
# "filesystem" (default) — disk under DATA_DIR; current behaviour, no change.
# "dual" — write disk + S3, read S3→disk fallback (migration).
# "s3" — MinIO only.
# See docs/spec/X14-storage-minio.md.
STORAGE_BACKEND = os.environ.get("STORAGE_BACKEND", "filesystem").strip().lower()
# Endpoint reached server-side (internal Docker network: http://minio:9000).
MINIO_ENDPOINT = os.environ.get("MINIO_ENDPOINT", "http://minio:9000")
# Public endpoint used when MINTING presigned URLs for the browser (INV-STG6) —
# the browser cannot resolve the internal hostname. Falls back to the internal
# endpoint when unset (e.g. local dev).
MINIO_PUBLIC_ENDPOINT = os.environ.get("MINIO_PUBLIC_ENDPOINT", MINIO_ENDPOINT)
MINIO_ACCESS_KEY = os.environ.get("MINIO_ACCESS_KEY", "")
MINIO_SECRET_KEY = os.environ.get("MINIO_SECRET_KEY", "")
MINIO_REGION = os.environ.get("MINIO_REGION", "us-east-1")
# Logical bucket → name. Governance boundaries (INV-STG3): documents
# (versioned), immutable (versioned + Object-Lock COMPLIANCE for final
# decisions, INV-STG4), derived (thumbnails/extracted text — regenerable).
MINIO_BUCKET_DOCUMENTS = os.environ.get("MINIO_BUCKET_DOCUMENTS", "legal-documents")
MINIO_BUCKET_IMMUTABLE = os.environ.get("MINIO_BUCKET_IMMUTABLE", "legal-immutable")
MINIO_BUCKET_DERIVED = os.environ.get("MINIO_BUCKET_DERIVED", "legal-derived")
# Default presigned-URL TTL (seconds). SigV4 hard max is 7 days; keep short.
MINIO_PRESIGN_TTL = int(os.environ.get("MINIO_PRESIGN_TTL", "900"))
def find_case_dir(case_number: str) -> Path:
"""Return the case directory for a given case number."""

View File

@@ -0,0 +1,472 @@
"""Unified object-storage layer (X14, INV-STG1).
THE single choke-point for all binary file I/O — originals, derived
artifacts (thumbnails / extracted text), and exports. It replaces the
scattered ``open()`` / ``shutil.copy`` / ``Path.write_bytes`` calls spread
across ~8 services (G2: one storage path, no parallel routes). See
docs/spec/X14-storage-minio.md.
Keys
----
A *key* is a DATA_DIR-relative POSIX path, e.g.::
cases/8174-24/documents/originals/<uuid>.pdf
precedent-library/thumbnails/<case_law_id>/p001.jpg
The filesystem backend maps ``key -> DATA_DIR / key``, preserving the exact
current on-disk layout (zero behaviour change when ``STORAGE_BACKEND`` is the
default ``filesystem``). The S3 backend maps a logical *bucket*
(documents/immutable/derived) to a MinIO bucket and uses the key verbatim as
the object key.
INV-STG2: keys are atomic ASCII/UUID paths; a Hebrew original filename is
carried as object metadata / a DB column, never as the key itself.
INV-STG5: pgvector stays the source of truth for text + embeddings — this
layer stores blobs only. INV-STG6: presigned URLs (minted against the public
endpoint) serve bytes straight to the browser.
Backends (config.STORAGE_BACKEND)
---------------------------------
- ``filesystem`` (default) — disk only; current behaviour.
- ``dual`` — write disk + S3; read S3, fall back to disk.
The migration window; disk stays authoritative.
- ``s3`` — MinIO only.
``aioboto3`` is imported lazily so this module loads even where the dependency
is absent (the default filesystem backend needs nothing extra).
"""
from __future__ import annotations
import logging
import shutil
import tempfile
from enum import Enum
from pathlib import Path, PurePosixPath
from typing import Iterable
from legal_mcp import config
logger = logging.getLogger(__name__)
class Bucket(str, Enum):
"""Logical governance buckets (INV-STG3). Resolved to MinIO bucket names
via config; ignored by the filesystem backend (which keeps one tree)."""
DOCUMENTS = "documents"
IMMUTABLE = "immutable"
DERIVED = "derived"
def _bucket_name(bucket: Bucket) -> str:
return {
Bucket.DOCUMENTS: config.MINIO_BUCKET_DOCUMENTS,
Bucket.IMMUTABLE: config.MINIO_BUCKET_IMMUTABLE,
Bucket.DERIVED: config.MINIO_BUCKET_DERIVED,
}[bucket]
def normalize_key(key: str | Path) -> str:
"""Return a clean DATA_DIR-relative POSIX key.
Rejects absolute paths and ``..`` traversal (defence in depth — keys are
built internally, never from raw user input). An absolute path under
DATA_DIR is accepted and re-relativised so call-sites can pass either a
key or a full ``Path`` during the migration.
"""
p = Path(key)
if p.is_absolute():
try:
p = p.relative_to(config.DATA_DIR)
except ValueError as exc:
raise ValueError(f"absolute path outside DATA_DIR: {key!r}") from exc
posix = PurePosixPath(p.as_posix())
parts = posix.parts
if not parts or any(part == ".." for part in parts):
raise ValueError(f"invalid storage key: {key!r}")
return posix.as_posix().lstrip("/")
class StorageBackend:
"""Abstract backend. All methods are async except the cheap path helpers."""
name = "abstract"
async def put_bytes(self, key, data, *, bucket=Bucket.DOCUMENTS,
content_type=None, metadata=None) -> str:
raise NotImplementedError
async def put_file(self, src, key, *, bucket=Bucket.DOCUMENTS,
content_type=None, metadata=None) -> str:
with open(src, "rb") as fh:
return await self.put_bytes(
key, fh.read(), bucket=bucket,
content_type=content_type, metadata=metadata,
)
async def get_bytes(self, key, *, bucket=Bucket.DOCUMENTS) -> bytes:
raise NotImplementedError
async def exists(self, key, *, bucket=Bucket.DOCUMENTS) -> bool:
raise NotImplementedError
async def delete(self, key, *, bucket=Bucket.DOCUMENTS) -> None:
raise NotImplementedError
async def list_keys(self, prefix, *, bucket=Bucket.DOCUMENTS) -> list[str]:
raise NotImplementedError
async def presign_get(self, key, *, bucket=Bucket.DOCUMENTS, ttl=None,
download_name=None) -> str:
raise NotImplementedError
async def presign_put(self, key, *, bucket=Bucket.DOCUMENTS, ttl=None,
content_type=None) -> str:
raise NotImplementedError
def local_path(self, key, *, bucket=Bucket.DOCUMENTS) -> Path | None:
"""Return a real filesystem path if one exists *without* downloading,
else ``None``. Use :meth:`ensure_local` when a path is required."""
return None
async def ensure_local(self, key, *, bucket=Bucket.DOCUMENTS) -> Path:
"""Return a local path to the object, downloading to a temp file if the
backend has no on-disk copy. Caller owns cleanup of temp files."""
path = self.local_path(key, bucket=bucket)
if path is not None:
return path
data = await self.get_bytes(key, bucket=bucket)
suffix = PurePosixPath(normalize_key(key)).suffix
tmp = tempfile.NamedTemporaryFile(delete=False, suffix=suffix)
tmp.write(data)
tmp.close()
return Path(tmp.name)
class FilesystemBackend(StorageBackend):
"""Disk under DATA_DIR. ``bucket`` is ignored — the existing single tree is
preserved verbatim, so the default backend is byte-for-byte the legacy
behaviour."""
name = "filesystem"
def _abs(self, key, *, bucket=Bucket.DOCUMENTS) -> Path:
rel = normalize_key(key)
path = (Path(config.DATA_DIR) / rel).resolve()
root = Path(config.DATA_DIR).resolve()
if root not in path.parents and path != root:
raise ValueError(f"resolved path escapes DATA_DIR: {key!r}")
return path
async def put_bytes(self, key, data, *, bucket=Bucket.DOCUMENTS,
content_type=None, metadata=None) -> str:
path = self._abs(key, bucket=bucket)
path.parent.mkdir(parents=True, exist_ok=True)
path.write_bytes(data)
return f"file://{path}"
async def put_file(self, src, key, *, bucket=Bucket.DOCUMENTS,
content_type=None, metadata=None) -> str:
path = self._abs(key, bucket=bucket)
path.parent.mkdir(parents=True, exist_ok=True)
shutil.copy2(src, path) # preserve mtime, as the legacy code did
return f"file://{path}"
async def get_bytes(self, key, *, bucket=Bucket.DOCUMENTS) -> bytes:
return self._abs(key, bucket=bucket).read_bytes()
async def exists(self, key, *, bucket=Bucket.DOCUMENTS) -> bool:
return self._abs(key, bucket=bucket).exists()
async def delete(self, key, *, bucket=Bucket.DOCUMENTS) -> None:
self._abs(key, bucket=bucket).unlink(missing_ok=True)
async def list_keys(self, prefix, *, bucket=Bucket.DOCUMENTS) -> list[str]:
root = Path(config.DATA_DIR).resolve()
base = self._abs(prefix, bucket=bucket) if prefix else root
if not base.exists():
return []
out: list[str] = []
for p in sorted(base.rglob("*")):
if p.is_file():
out.append(p.resolve().relative_to(root).as_posix())
return out
def local_path(self, key, *, bucket=Bucket.DOCUMENTS) -> Path | None:
path = self._abs(key, bucket=bucket)
return path if path.exists() else None
async def presign_get(self, key, *, bucket=Bucket.DOCUMENTS, ttl=None,
download_name=None) -> str:
raise NotImplementedError(
"presigned URLs require the S3 backend (set STORAGE_BACKEND=dual|s3)"
)
async def presign_put(self, key, *, bucket=Bucket.DOCUMENTS, ttl=None,
content_type=None) -> str:
raise NotImplementedError(
"presigned URLs require the S3 backend (set STORAGE_BACKEND=dual|s3)"
)
class S3Backend(StorageBackend):
"""MinIO via aioboto3. Server-side ops use the internal endpoint; presigned
URLs are minted against the public endpoint so the browser can reach them
(INV-STG6)."""
name = "s3"
def __init__(self) -> None:
self._session = None # lazy aioboto3 session
def _boto(self):
import aioboto3 # lazy — absent in the default filesystem path
from botocore.config import Config as BotoConfig
if self._session is None:
self._session = aioboto3.Session()
cfg = BotoConfig(signature_version="s3v4", s3={"addressing_style": "path"})
return aioboto3, BotoConfig, cfg
def _client(self, *, public: bool = False):
_aioboto3, _BotoConfig, cfg = self._boto()
endpoint = config.MINIO_PUBLIC_ENDPOINT if public else config.MINIO_ENDPOINT
return self._session.client(
"s3",
endpoint_url=endpoint,
aws_access_key_id=config.MINIO_ACCESS_KEY,
aws_secret_access_key=config.MINIO_SECRET_KEY,
region_name=config.MINIO_REGION,
config=cfg,
)
async def put_bytes(self, key, data, *, bucket=Bucket.DOCUMENTS,
content_type=None, metadata=None) -> str:
k = normalize_key(key)
extra = {}
if content_type:
extra["ContentType"] = content_type
if metadata:
extra["Metadata"] = {kk: str(vv) for kk, vv in metadata.items()}
async with self._client() as s3:
await s3.put_object(Bucket=_bucket_name(bucket), Key=k, Body=data, **extra)
return f"s3://{_bucket_name(bucket)}/{k}"
async def get_bytes(self, key, *, bucket=Bucket.DOCUMENTS) -> bytes:
k = normalize_key(key)
async with self._client() as s3:
resp = await s3.get_object(Bucket=_bucket_name(bucket), Key=k)
async with resp["Body"] as stream:
return await stream.read()
async def exists(self, key, *, bucket=Bucket.DOCUMENTS) -> bool:
from botocore.exceptions import ClientError
k = normalize_key(key)
async with self._client() as s3:
try:
await s3.head_object(Bucket=_bucket_name(bucket), Key=k)
return True
except ClientError as exc:
if exc.response["Error"]["Code"] in ("404", "NoSuchKey", "NotFound"):
return False
raise
async def delete(self, key, *, bucket=Bucket.DOCUMENTS) -> None:
k = normalize_key(key)
async with self._client() as s3:
await s3.delete_object(Bucket=_bucket_name(bucket), Key=k)
async def list_keys(self, prefix, *, bucket=Bucket.DOCUMENTS) -> list[str]:
pfx = normalize_key(prefix) if prefix else ""
out: list[str] = []
async with self._client() as s3:
paginator = s3.get_paginator("list_objects_v2")
async for page in paginator.paginate(Bucket=_bucket_name(bucket), Prefix=pfx):
for obj in page.get("Contents", []):
out.append(obj["Key"])
return out
async def presign_get(self, key, *, bucket=Bucket.DOCUMENTS, ttl=None,
download_name=None) -> str:
k = normalize_key(key)
params = {"Bucket": _bucket_name(bucket), "Key": k}
if download_name:
# RFC 5987 — keep the Hebrew original filename on download (INV-STG2)
from urllib.parse import quote
params["ResponseContentDisposition"] = (
f"attachment; filename*=UTF-8''{quote(download_name)}"
)
async with self._client(public=True) as s3:
return await s3.generate_presigned_url(
"get_object", Params=params,
ExpiresIn=ttl or config.MINIO_PRESIGN_TTL,
)
async def presign_put(self, key, *, bucket=Bucket.DOCUMENTS, ttl=None,
content_type=None) -> str:
k = normalize_key(key)
params = {"Bucket": _bucket_name(bucket), "Key": k}
if content_type:
params["ContentType"] = content_type
async with self._client(public=True) as s3:
return await s3.generate_presigned_url(
"put_object", Params=params,
ExpiresIn=ttl or config.MINIO_PRESIGN_TTL,
)
class DualBackend(StorageBackend):
"""Migration window: writes go to BOTH disk and S3 (disk authoritative);
reads prefer S3 and fall back to disk. An S3 write failure is logged (never
swallowed — engineering §6) but does not break the app while disk holds the
canonical copy."""
name = "dual"
def __init__(self) -> None:
self.fs = FilesystemBackend()
self.s3 = S3Backend()
async def put_bytes(self, key, data, *, bucket=Bucket.DOCUMENTS,
content_type=None, metadata=None) -> str:
uri = await self.fs.put_bytes(key, data, bucket=bucket,
content_type=content_type, metadata=metadata)
try:
await self.s3.put_bytes(key, data, bucket=bucket,
content_type=content_type, metadata=metadata)
except Exception as exc: # noqa: BLE001 — log, don't swallow
logger.warning("dual put_bytes: S3 mirror failed for %s: %s", key, exc)
return uri
async def put_file(self, src, key, *, bucket=Bucket.DOCUMENTS,
content_type=None, metadata=None) -> str:
uri = await self.fs.put_file(src, key, bucket=bucket,
content_type=content_type, metadata=metadata)
try:
await self.s3.put_file(src, key, bucket=bucket,
content_type=content_type, metadata=metadata)
except Exception as exc: # noqa: BLE001
logger.warning("dual put_file: S3 mirror failed for %s: %s", key, exc)
return uri
async def get_bytes(self, key, *, bucket=Bucket.DOCUMENTS) -> bytes:
try:
return await self.s3.get_bytes(key, bucket=bucket)
except Exception as exc: # noqa: BLE001 — fall back to disk
logger.debug("dual get_bytes: S3 miss for %s (%s); using disk", key, exc)
return await self.fs.get_bytes(key, bucket=bucket)
async def exists(self, key, *, bucket=Bucket.DOCUMENTS) -> bool:
if await self.fs.exists(key, bucket=bucket):
return True
try:
return await self.s3.exists(key, bucket=bucket)
except Exception: # noqa: BLE001
return False
async def delete(self, key, *, bucket=Bucket.DOCUMENTS) -> None:
await self.fs.delete(key, bucket=bucket)
try:
await self.s3.delete(key, bucket=bucket)
except Exception as exc: # noqa: BLE001
logger.warning("dual delete: S3 delete failed for %s: %s", key, exc)
async def list_keys(self, prefix, *, bucket=Bucket.DOCUMENTS) -> list[str]:
return await self.fs.list_keys(prefix, bucket=bucket)
def local_path(self, key, *, bucket=Bucket.DOCUMENTS) -> Path | None:
return self.fs.local_path(key, bucket=bucket)
async def presign_get(self, key, *, bucket=Bucket.DOCUMENTS, ttl=None,
download_name=None) -> str:
return await self.s3.presign_get(key, bucket=bucket, ttl=ttl,
download_name=download_name)
async def presign_put(self, key, *, bucket=Bucket.DOCUMENTS, ttl=None,
content_type=None) -> str:
return await self.s3.presign_put(key, bucket=bucket, ttl=ttl,
content_type=content_type)
_BACKENDS = {
"filesystem": FilesystemBackend,
"dual": DualBackend,
"s3": S3Backend,
}
_singleton: StorageBackend | None = None
def get_storage() -> StorageBackend:
"""Return the process-wide storage backend selected by config.STORAGE_BACKEND
(cached). Unknown values fall back to ``filesystem`` with a warning rather
than crashing the app."""
global _singleton
if _singleton is None:
cls = _BACKENDS.get(config.STORAGE_BACKEND)
if cls is None:
logger.warning(
"unknown STORAGE_BACKEND=%r — falling back to filesystem",
config.STORAGE_BACKEND,
)
cls = FilesystemBackend
_singleton = cls()
logger.info("storage backend = %s", _singleton.name)
return _singleton
def reset_storage_cache() -> None:
"""Drop the cached backend (tests / after an env change)."""
global _singleton
_singleton = None
# ── module-level convenience wrappers ──────────────────────────────
# Thin pass-throughs so call-sites can ``from legal_mcp.services import storage``
# and use ``await storage.put_bytes(...)`` without fetching the singleton.
async def put_bytes(key, data, *, bucket=Bucket.DOCUMENTS, content_type=None,
metadata=None) -> str:
return await get_storage().put_bytes(
key, data, bucket=bucket, content_type=content_type, metadata=metadata)
async def put_file(src, key, *, bucket=Bucket.DOCUMENTS, content_type=None,
metadata=None) -> str:
return await get_storage().put_file(
src, key, bucket=bucket, content_type=content_type, metadata=metadata)
async def get_bytes(key, *, bucket=Bucket.DOCUMENTS) -> bytes:
return await get_storage().get_bytes(key, bucket=bucket)
async def exists(key, *, bucket=Bucket.DOCUMENTS) -> bool:
return await get_storage().exists(key, bucket=bucket)
async def delete(key, *, bucket=Bucket.DOCUMENTS) -> None:
return await get_storage().delete(key, bucket=bucket)
async def list_keys(prefix, *, bucket=Bucket.DOCUMENTS) -> list[str]:
return await get_storage().list_keys(prefix, bucket=bucket)
async def presign_get(key, *, bucket=Bucket.DOCUMENTS, ttl=None,
download_name=None) -> str:
return await get_storage().presign_get(
key, bucket=bucket, ttl=ttl, download_name=download_name)
async def presign_put(key, *, bucket=Bucket.DOCUMENTS, ttl=None,
content_type=None) -> str:
return await get_storage().presign_put(
key, bucket=bucket, ttl=ttl, content_type=content_type)
def local_path(key, *, bucket=Bucket.DOCUMENTS) -> Path | None:
return get_storage().local_path(key, bucket=bucket)
async def ensure_local(key, *, bucket=Bucket.DOCUMENTS) -> Path:
return await get_storage().ensure_local(key, bucket=bucket)

View File

@@ -0,0 +1,198 @@
"""Unit tests for the unified storage layer (X14, services/storage.py).
Sync tests driving the async API via asyncio.run (matches the repo
convention — no pytest-asyncio). The filesystem backend is exercised against a
tmp DATA_DIR; the S3 path is stubbed so the suite needs no MinIO and no
aioboto3.
"""
import asyncio
import pytest
from legal_mcp import config
from legal_mcp.services import storage
from legal_mcp.services.storage import Bucket
@pytest.fixture(autouse=True)
def _tmp_datadir(tmp_path, monkeypatch):
monkeypatch.setattr(config, "DATA_DIR", tmp_path)
monkeypatch.setattr(config, "STORAGE_BACKEND", "filesystem")
storage.reset_storage_cache()
yield tmp_path
storage.reset_storage_cache()
def run(coro):
return asyncio.run(coro)
# ── normalize_key ──────────────────────────────────────────────────
def test_normalize_key_relative():
assert storage.normalize_key("cases/8174-24/originals/x.pdf") == \
"cases/8174-24/originals/x.pdf"
def test_normalize_key_rejects_parent_traversal():
with pytest.raises(ValueError):
storage.normalize_key("cases/../../etc/passwd")
def test_normalize_key_accepts_abs_under_datadir(_tmp_datadir):
abs_key = _tmp_datadir / "cases" / "x.pdf"
assert storage.normalize_key(abs_key) == "cases/x.pdf"
def test_normalize_key_rejects_abs_outside_datadir():
with pytest.raises(ValueError):
storage.normalize_key("/var/secret/x.pdf")
# ── filesystem backend ─────────────────────────────────────────────
def test_filesystem_roundtrip(_tmp_datadir):
be = storage.FilesystemBackend()
key = "cases/1/originals/a.pdf"
uri = run(be.put_bytes(key, b"hello", content_type="application/pdf"))
assert uri.startswith("file://")
assert (_tmp_datadir / key).read_bytes() == b"hello"
assert run(be.exists(key)) is True
assert run(be.get_bytes(key)) == b"hello"
run(be.delete(key))
assert run(be.exists(key)) is False
# delete is idempotent (missing_ok)
run(be.delete(key))
def test_filesystem_put_file(_tmp_datadir):
be = storage.FilesystemBackend()
src = _tmp_datadir / "src.bin"
src.write_bytes(b"payload")
run(be.put_file(src, "precedent-library/court_ruling/u.pdf"))
assert (_tmp_datadir / "precedent-library/court_ruling/u.pdf").read_bytes() == b"payload"
def test_filesystem_bucket_is_ignored_legacy_layout(_tmp_datadir):
"""Even for a non-default bucket, the FS backend keeps the single tree —
so the default backend is byte-for-byte the legacy layout."""
be = storage.FilesystemBackend()
run(be.put_bytes("thumbnails/d/p001.jpg", b"img", bucket=Bucket.DERIVED))
assert (_tmp_datadir / "thumbnails/d/p001.jpg").exists()
def test_filesystem_list_keys(_tmp_datadir):
be = storage.FilesystemBackend()
run(be.put_bytes("cases/1/a.txt", b"a"))
run(be.put_bytes("cases/1/sub/b.txt", b"b"))
run(be.put_bytes("cases/2/c.txt", b"c"))
keys = run(be.list_keys("cases/1"))
assert keys == ["cases/1/a.txt", "cases/1/sub/b.txt"]
def test_filesystem_local_path_and_ensure_local(_tmp_datadir):
be = storage.FilesystemBackend()
run(be.put_bytes("cases/1/a.pdf", b"x"))
assert be.local_path("cases/1/a.pdf") == (_tmp_datadir / "cases/1/a.pdf").resolve()
assert be.local_path("cases/1/missing.pdf") is None
# ensure_local returns the real path without copying
assert run(be.ensure_local("cases/1/a.pdf")) == (_tmp_datadir / "cases/1/a.pdf").resolve()
def test_filesystem_presign_unsupported(_tmp_datadir):
be = storage.FilesystemBackend()
with pytest.raises(NotImplementedError):
run(be.presign_get("cases/1/a.pdf"))
# ── backend selection ──────────────────────────────────────────────
def test_get_storage_default_is_filesystem(monkeypatch):
monkeypatch.setattr(config, "STORAGE_BACKEND", "filesystem")
storage.reset_storage_cache()
assert isinstance(storage.get_storage(), storage.FilesystemBackend)
def test_get_storage_unknown_falls_back(monkeypatch):
monkeypatch.setattr(config, "STORAGE_BACKEND", "bogus")
storage.reset_storage_cache()
assert isinstance(storage.get_storage(), storage.FilesystemBackend)
def test_get_storage_dual(monkeypatch):
monkeypatch.setattr(config, "STORAGE_BACKEND", "dual")
storage.reset_storage_cache()
assert isinstance(storage.get_storage(), storage.DualBackend)
# ── dual backend (S3 stubbed) ──────────────────────────────────────
class _FakeS3:
"""Minimal async S3 stub. ``store`` None ⇒ S3 'down' (raises)."""
def __init__(self, store):
self.store = store # dict or None
async def put_bytes(self, key, data, *, bucket=Bucket.DOCUMENTS, content_type=None, metadata=None):
if self.store is None:
raise RuntimeError("s3 down")
self.store[storage.normalize_key(key)] = data
return f"s3://stub/{storage.normalize_key(key)}"
async def put_file(self, src, key, *, bucket=Bucket.DOCUMENTS, content_type=None, metadata=None):
with open(src, "rb") as fh:
return await self.put_bytes(key, fh.read(), bucket=bucket)
async def get_bytes(self, key, *, bucket=Bucket.DOCUMENTS):
if self.store is None:
raise RuntimeError("s3 down")
return self.store[storage.normalize_key(key)]
async def exists(self, key, *, bucket=Bucket.DOCUMENTS):
return self.store is not None and storage.normalize_key(key) in self.store
async def delete(self, key, *, bucket=Bucket.DOCUMENTS):
if self.store is not None:
self.store.pop(storage.normalize_key(key), None)
def _dual_with(store):
dual = storage.DualBackend()
dual.s3 = _FakeS3(store)
return dual
def test_dual_writes_both(_tmp_datadir):
store = {}
dual = _dual_with(store)
run(dual.put_bytes("cases/1/a.pdf", b"data"))
assert (_tmp_datadir / "cases/1/a.pdf").read_bytes() == b"data" # disk
assert store["cases/1/a.pdf"] == b"data" # s3
def test_dual_s3_write_failure_does_not_break(_tmp_datadir):
dual = _dual_with(None) # s3 down
# disk write still succeeds; s3 failure is logged, not raised
run(dual.put_bytes("cases/1/a.pdf", b"data"))
assert (_tmp_datadir / "cases/1/a.pdf").read_bytes() == b"data"
def test_dual_get_prefers_s3(_tmp_datadir):
dual = _dual_with({"cases/1/a.pdf": b"from-s3"})
run(dual.fs.put_bytes("cases/1/a.pdf", b"from-disk"))
assert run(dual.get_bytes("cases/1/a.pdf")) == b"from-s3"
def test_dual_get_falls_back_to_disk(_tmp_datadir):
dual = _dual_with(None) # s3 down → must read disk
run(dual.fs.put_bytes("cases/1/a.pdf", b"from-disk"))
assert run(dual.get_bytes("cases/1/a.pdf")) == b"from-disk"
def test_bucket_name_resolution(monkeypatch):
monkeypatch.setattr(config, "MINIO_BUCKET_DOCUMENTS", "legal-documents")
monkeypatch.setattr(config, "MINIO_BUCKET_IMMUTABLE", "legal-immutable")
monkeypatch.setattr(config, "MINIO_BUCKET_DERIVED", "legal-derived")
assert storage._bucket_name(Bucket.DOCUMENTS) == "legal-documents"
assert storage._bucket_name(Bucket.IMMUTABLE) == "legal-immutable"
assert storage._bucket_name(Bucket.DERIVED) == "legal-derived"

View File

@@ -13,7 +13,8 @@ from typing import Any, Literal
EnvType = Literal["bool", "int", "float", "string"]
EnvCategory = Literal[
"multimodal", "rerank", "halacha", "credentials", "connection", "general"
"multimodal", "rerank", "halacha", "credentials", "connection",
"storage", "general"
]
@@ -89,6 +90,58 @@ ENV_CATALOG: dict[str, EnvSpec] = {
"סף confidence ל-auto-approve של הלכות שחולצו",
is_secret=False, is_editable=True, default=0.80, min=0.0, max=1.0,
),
# ── storage (X14 / MinIO) ──────────────────────────────────────
"STORAGE_BACKEND": EnvSpec(
"STORAGE_BACKEND", "storage", "string",
"מנוע אחסון: filesystem (דיסק) / dual (דיסק+S3) / s3 (MinIO בלבד)",
is_secret=False, is_editable=True, default="filesystem",
enum_values=("filesystem", "dual", "s3"),
),
"MINIO_ENDPOINT": EnvSpec(
"MINIO_ENDPOINT", "storage", "string",
"endpoint פנימי של MinIO (server-side, רשת Docker)",
is_secret=False, is_editable=False, default="http://minio:9000",
),
"MINIO_PUBLIC_ENDPOINT": EnvSpec(
"MINIO_PUBLIC_ENDPOINT", "storage", "string",
"endpoint ציבורי ל-presigned URLs (גישת דפדפן)",
is_secret=False, is_editable=False, default="https://s3.nautilus.marcusgroup.org",
),
"MINIO_ACCESS_KEY": EnvSpec(
"MINIO_ACCESS_KEY", "storage", "string",
"MinIO access key (service-account מוגבל ל-3 הדליות)",
is_secret=True, is_editable=False,
),
"MINIO_SECRET_KEY": EnvSpec(
"MINIO_SECRET_KEY", "storage", "string",
"MinIO secret key",
is_secret=True, is_editable=False,
),
"MINIO_REGION": EnvSpec(
"MINIO_REGION", "storage", "string",
"אזור S3 (MinIO מתעלם — לחתימת SigV4)",
is_secret=False, is_editable=False, default="us-east-1",
),
"MINIO_BUCKET_DOCUMENTS": EnvSpec(
"MINIO_BUCKET_DOCUMENTS", "storage", "string",
"דלי מסמכי-מקור (versioning)",
is_secret=False, is_editable=False, default="legal-documents",
),
"MINIO_BUCKET_IMMUTABLE": EnvSpec(
"MINIO_BUCKET_IMMUTABLE", "storage", "string",
"דלי החלטות סופיות (versioning + Object-Lock COMPLIANCE)",
is_secret=False, is_editable=False, default="legal-immutable",
),
"MINIO_BUCKET_DERIVED": EnvSpec(
"MINIO_BUCKET_DERIVED", "storage", "string",
"דלי נגזרים (thumbnails / extracted — ניתן-לשחזור)",
is_secret=False, is_editable=False, default="legal-derived",
),
"MINIO_PRESIGN_TTL": EnvSpec(
"MINIO_PRESIGN_TTL", "storage", "int",
"תוקף presigned URL בשניות (מקס' SigV4 = 7 ימים)",
is_secret=False, is_editable=True, default=900, min=60, max=604800,
),
# ── general ────────────────────────────────────────────────────
"VOYAGE_MODEL": EnvSpec(
"VOYAGE_MODEL", "general", "string",