feat(storage): X14 Phase 1 — unified storage layer (services/storage.py) #151
@@ -21,6 +21,7 @@ dependencies = [
|
|||||||
"uvicorn[standard]>=0.30.0",
|
"uvicorn[standard]>=0.30.0",
|
||||||
"httpx>=0.27.0",
|
"httpx>=0.27.0",
|
||||||
"infisicalsdk>=1.0.0",
|
"infisicalsdk>=1.0.0",
|
||||||
|
"aioboto3>=13.0.0", # X14 object storage (MinIO/S3) — services/storage.py
|
||||||
]
|
]
|
||||||
|
|
||||||
[project.optional-dependencies]
|
[project.optional-dependencies]
|
||||||
|
|||||||
@@ -202,6 +202,32 @@ EXPORTS_DIR = DATA_DIR / "exports" # legacy exports only
|
|||||||
# Cases directory — flat structure: data/cases/{case_number}/
|
# Cases directory — flat structure: data/cases/{case_number}/
|
||||||
CASES_DIR = DATA_DIR / "cases"
|
CASES_DIR = DATA_DIR / "cases"
|
||||||
|
|
||||||
|
# ── Object storage (X14 / MinIO) ───────────────────────────────────
|
||||||
|
# Single storage layer (services/storage.py) replaces the scattered file
|
||||||
|
# I/O across ~8 services (INV-STG1 / G2). Backend selector:
|
||||||
|
# "filesystem" (default) — disk under DATA_DIR; current behaviour, no change.
|
||||||
|
# "dual" — write disk + S3, read S3→disk fallback (migration).
|
||||||
|
# "s3" — MinIO only.
|
||||||
|
# See docs/spec/X14-storage-minio.md.
|
||||||
|
STORAGE_BACKEND = os.environ.get("STORAGE_BACKEND", "filesystem").strip().lower()
|
||||||
|
# Endpoint reached server-side (internal Docker network: http://minio:9000).
|
||||||
|
MINIO_ENDPOINT = os.environ.get("MINIO_ENDPOINT", "http://minio:9000")
|
||||||
|
# Public endpoint used when MINTING presigned URLs for the browser (INV-STG6) —
|
||||||
|
# the browser cannot resolve the internal hostname. Falls back to the internal
|
||||||
|
# endpoint when unset (e.g. local dev).
|
||||||
|
MINIO_PUBLIC_ENDPOINT = os.environ.get("MINIO_PUBLIC_ENDPOINT", MINIO_ENDPOINT)
|
||||||
|
MINIO_ACCESS_KEY = os.environ.get("MINIO_ACCESS_KEY", "")
|
||||||
|
MINIO_SECRET_KEY = os.environ.get("MINIO_SECRET_KEY", "")
|
||||||
|
MINIO_REGION = os.environ.get("MINIO_REGION", "us-east-1")
|
||||||
|
# Logical bucket → name. Governance boundaries (INV-STG3): documents
|
||||||
|
# (versioned), immutable (versioned + Object-Lock COMPLIANCE for final
|
||||||
|
# decisions, INV-STG4), derived (thumbnails/extracted text — regenerable).
|
||||||
|
MINIO_BUCKET_DOCUMENTS = os.environ.get("MINIO_BUCKET_DOCUMENTS", "legal-documents")
|
||||||
|
MINIO_BUCKET_IMMUTABLE = os.environ.get("MINIO_BUCKET_IMMUTABLE", "legal-immutable")
|
||||||
|
MINIO_BUCKET_DERIVED = os.environ.get("MINIO_BUCKET_DERIVED", "legal-derived")
|
||||||
|
# Default presigned-URL TTL (seconds). SigV4 hard max is 7 days; keep short.
|
||||||
|
MINIO_PRESIGN_TTL = int(os.environ.get("MINIO_PRESIGN_TTL", "900"))
|
||||||
|
|
||||||
|
|
||||||
def find_case_dir(case_number: str) -> Path:
|
def find_case_dir(case_number: str) -> Path:
|
||||||
"""Return the case directory for a given case number."""
|
"""Return the case directory for a given case number."""
|
||||||
|
|||||||
472
mcp-server/src/legal_mcp/services/storage.py
Normal file
472
mcp-server/src/legal_mcp/services/storage.py
Normal file
@@ -0,0 +1,472 @@
|
|||||||
|
"""Unified object-storage layer (X14, INV-STG1).
|
||||||
|
|
||||||
|
THE single choke-point for all binary file I/O — originals, derived
|
||||||
|
artifacts (thumbnails / extracted text), and exports. It replaces the
|
||||||
|
scattered ``open()`` / ``shutil.copy`` / ``Path.write_bytes`` calls spread
|
||||||
|
across ~8 services (G2: one storage path, no parallel routes). See
|
||||||
|
docs/spec/X14-storage-minio.md.
|
||||||
|
|
||||||
|
Keys
|
||||||
|
----
|
||||||
|
A *key* is a DATA_DIR-relative POSIX path, e.g.::
|
||||||
|
|
||||||
|
cases/8174-24/documents/originals/<uuid>.pdf
|
||||||
|
precedent-library/thumbnails/<case_law_id>/p001.jpg
|
||||||
|
|
||||||
|
The filesystem backend maps ``key -> DATA_DIR / key``, preserving the exact
|
||||||
|
current on-disk layout (zero behaviour change when ``STORAGE_BACKEND`` is the
|
||||||
|
default ``filesystem``). The S3 backend maps a logical *bucket*
|
||||||
|
(documents/immutable/derived) to a MinIO bucket and uses the key verbatim as
|
||||||
|
the object key.
|
||||||
|
|
||||||
|
INV-STG2: keys are atomic ASCII/UUID paths; a Hebrew original filename is
|
||||||
|
carried as object metadata / a DB column, never as the key itself.
|
||||||
|
INV-STG5: pgvector stays the source of truth for text + embeddings — this
|
||||||
|
layer stores blobs only. INV-STG6: presigned URLs (minted against the public
|
||||||
|
endpoint) serve bytes straight to the browser.
|
||||||
|
|
||||||
|
Backends (config.STORAGE_BACKEND)
|
||||||
|
---------------------------------
|
||||||
|
- ``filesystem`` (default) — disk only; current behaviour.
|
||||||
|
- ``dual`` — write disk + S3; read S3, fall back to disk.
|
||||||
|
The migration window; disk stays authoritative.
|
||||||
|
- ``s3`` — MinIO only.
|
||||||
|
|
||||||
|
``aioboto3`` is imported lazily so this module loads even where the dependency
|
||||||
|
is absent (the default filesystem backend needs nothing extra).
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import shutil
|
||||||
|
import tempfile
|
||||||
|
from enum import Enum
|
||||||
|
from pathlib import Path, PurePosixPath
|
||||||
|
from typing import Iterable
|
||||||
|
|
||||||
|
from legal_mcp import config
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class Bucket(str, Enum):
|
||||||
|
"""Logical governance buckets (INV-STG3). Resolved to MinIO bucket names
|
||||||
|
via config; ignored by the filesystem backend (which keeps one tree)."""
|
||||||
|
|
||||||
|
DOCUMENTS = "documents"
|
||||||
|
IMMUTABLE = "immutable"
|
||||||
|
DERIVED = "derived"
|
||||||
|
|
||||||
|
|
||||||
|
def _bucket_name(bucket: Bucket) -> str:
|
||||||
|
return {
|
||||||
|
Bucket.DOCUMENTS: config.MINIO_BUCKET_DOCUMENTS,
|
||||||
|
Bucket.IMMUTABLE: config.MINIO_BUCKET_IMMUTABLE,
|
||||||
|
Bucket.DERIVED: config.MINIO_BUCKET_DERIVED,
|
||||||
|
}[bucket]
|
||||||
|
|
||||||
|
|
||||||
|
def normalize_key(key: str | Path) -> str:
|
||||||
|
"""Return a clean DATA_DIR-relative POSIX key.
|
||||||
|
|
||||||
|
Rejects absolute paths and ``..`` traversal (defence in depth — keys are
|
||||||
|
built internally, never from raw user input). An absolute path under
|
||||||
|
DATA_DIR is accepted and re-relativised so call-sites can pass either a
|
||||||
|
key or a full ``Path`` during the migration.
|
||||||
|
"""
|
||||||
|
p = Path(key)
|
||||||
|
if p.is_absolute():
|
||||||
|
try:
|
||||||
|
p = p.relative_to(config.DATA_DIR)
|
||||||
|
except ValueError as exc:
|
||||||
|
raise ValueError(f"absolute path outside DATA_DIR: {key!r}") from exc
|
||||||
|
posix = PurePosixPath(p.as_posix())
|
||||||
|
parts = posix.parts
|
||||||
|
if not parts or any(part == ".." for part in parts):
|
||||||
|
raise ValueError(f"invalid storage key: {key!r}")
|
||||||
|
return posix.as_posix().lstrip("/")
|
||||||
|
|
||||||
|
|
||||||
|
class StorageBackend:
|
||||||
|
"""Abstract backend. All methods are async except the cheap path helpers."""
|
||||||
|
|
||||||
|
name = "abstract"
|
||||||
|
|
||||||
|
async def put_bytes(self, key, data, *, bucket=Bucket.DOCUMENTS,
|
||||||
|
content_type=None, metadata=None) -> str:
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
|
async def put_file(self, src, key, *, bucket=Bucket.DOCUMENTS,
|
||||||
|
content_type=None, metadata=None) -> str:
|
||||||
|
with open(src, "rb") as fh:
|
||||||
|
return await self.put_bytes(
|
||||||
|
key, fh.read(), bucket=bucket,
|
||||||
|
content_type=content_type, metadata=metadata,
|
||||||
|
)
|
||||||
|
|
||||||
|
async def get_bytes(self, key, *, bucket=Bucket.DOCUMENTS) -> bytes:
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
|
async def exists(self, key, *, bucket=Bucket.DOCUMENTS) -> bool:
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
|
async def delete(self, key, *, bucket=Bucket.DOCUMENTS) -> None:
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
|
async def list_keys(self, prefix, *, bucket=Bucket.DOCUMENTS) -> list[str]:
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
|
async def presign_get(self, key, *, bucket=Bucket.DOCUMENTS, ttl=None,
|
||||||
|
download_name=None) -> str:
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
|
async def presign_put(self, key, *, bucket=Bucket.DOCUMENTS, ttl=None,
|
||||||
|
content_type=None) -> str:
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
|
def local_path(self, key, *, bucket=Bucket.DOCUMENTS) -> Path | None:
|
||||||
|
"""Return a real filesystem path if one exists *without* downloading,
|
||||||
|
else ``None``. Use :meth:`ensure_local` when a path is required."""
|
||||||
|
return None
|
||||||
|
|
||||||
|
async def ensure_local(self, key, *, bucket=Bucket.DOCUMENTS) -> Path:
|
||||||
|
"""Return a local path to the object, downloading to a temp file if the
|
||||||
|
backend has no on-disk copy. Caller owns cleanup of temp files."""
|
||||||
|
path = self.local_path(key, bucket=bucket)
|
||||||
|
if path is not None:
|
||||||
|
return path
|
||||||
|
data = await self.get_bytes(key, bucket=bucket)
|
||||||
|
suffix = PurePosixPath(normalize_key(key)).suffix
|
||||||
|
tmp = tempfile.NamedTemporaryFile(delete=False, suffix=suffix)
|
||||||
|
tmp.write(data)
|
||||||
|
tmp.close()
|
||||||
|
return Path(tmp.name)
|
||||||
|
|
||||||
|
|
||||||
|
class FilesystemBackend(StorageBackend):
|
||||||
|
"""Disk under DATA_DIR. ``bucket`` is ignored — the existing single tree is
|
||||||
|
preserved verbatim, so the default backend is byte-for-byte the legacy
|
||||||
|
behaviour."""
|
||||||
|
|
||||||
|
name = "filesystem"
|
||||||
|
|
||||||
|
def _abs(self, key, *, bucket=Bucket.DOCUMENTS) -> Path:
|
||||||
|
rel = normalize_key(key)
|
||||||
|
path = (Path(config.DATA_DIR) / rel).resolve()
|
||||||
|
root = Path(config.DATA_DIR).resolve()
|
||||||
|
if root not in path.parents and path != root:
|
||||||
|
raise ValueError(f"resolved path escapes DATA_DIR: {key!r}")
|
||||||
|
return path
|
||||||
|
|
||||||
|
async def put_bytes(self, key, data, *, bucket=Bucket.DOCUMENTS,
|
||||||
|
content_type=None, metadata=None) -> str:
|
||||||
|
path = self._abs(key, bucket=bucket)
|
||||||
|
path.parent.mkdir(parents=True, exist_ok=True)
|
||||||
|
path.write_bytes(data)
|
||||||
|
return f"file://{path}"
|
||||||
|
|
||||||
|
async def put_file(self, src, key, *, bucket=Bucket.DOCUMENTS,
|
||||||
|
content_type=None, metadata=None) -> str:
|
||||||
|
path = self._abs(key, bucket=bucket)
|
||||||
|
path.parent.mkdir(parents=True, exist_ok=True)
|
||||||
|
shutil.copy2(src, path) # preserve mtime, as the legacy code did
|
||||||
|
return f"file://{path}"
|
||||||
|
|
||||||
|
async def get_bytes(self, key, *, bucket=Bucket.DOCUMENTS) -> bytes:
|
||||||
|
return self._abs(key, bucket=bucket).read_bytes()
|
||||||
|
|
||||||
|
async def exists(self, key, *, bucket=Bucket.DOCUMENTS) -> bool:
|
||||||
|
return self._abs(key, bucket=bucket).exists()
|
||||||
|
|
||||||
|
async def delete(self, key, *, bucket=Bucket.DOCUMENTS) -> None:
|
||||||
|
self._abs(key, bucket=bucket).unlink(missing_ok=True)
|
||||||
|
|
||||||
|
async def list_keys(self, prefix, *, bucket=Bucket.DOCUMENTS) -> list[str]:
|
||||||
|
root = Path(config.DATA_DIR).resolve()
|
||||||
|
base = self._abs(prefix, bucket=bucket) if prefix else root
|
||||||
|
if not base.exists():
|
||||||
|
return []
|
||||||
|
out: list[str] = []
|
||||||
|
for p in sorted(base.rglob("*")):
|
||||||
|
if p.is_file():
|
||||||
|
out.append(p.resolve().relative_to(root).as_posix())
|
||||||
|
return out
|
||||||
|
|
||||||
|
def local_path(self, key, *, bucket=Bucket.DOCUMENTS) -> Path | None:
|
||||||
|
path = self._abs(key, bucket=bucket)
|
||||||
|
return path if path.exists() else None
|
||||||
|
|
||||||
|
async def presign_get(self, key, *, bucket=Bucket.DOCUMENTS, ttl=None,
|
||||||
|
download_name=None) -> str:
|
||||||
|
raise NotImplementedError(
|
||||||
|
"presigned URLs require the S3 backend (set STORAGE_BACKEND=dual|s3)"
|
||||||
|
)
|
||||||
|
|
||||||
|
async def presign_put(self, key, *, bucket=Bucket.DOCUMENTS, ttl=None,
|
||||||
|
content_type=None) -> str:
|
||||||
|
raise NotImplementedError(
|
||||||
|
"presigned URLs require the S3 backend (set STORAGE_BACKEND=dual|s3)"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class S3Backend(StorageBackend):
|
||||||
|
"""MinIO via aioboto3. Server-side ops use the internal endpoint; presigned
|
||||||
|
URLs are minted against the public endpoint so the browser can reach them
|
||||||
|
(INV-STG6)."""
|
||||||
|
|
||||||
|
name = "s3"
|
||||||
|
|
||||||
|
def __init__(self) -> None:
|
||||||
|
self._session = None # lazy aioboto3 session
|
||||||
|
|
||||||
|
def _boto(self):
|
||||||
|
import aioboto3 # lazy — absent in the default filesystem path
|
||||||
|
from botocore.config import Config as BotoConfig
|
||||||
|
if self._session is None:
|
||||||
|
self._session = aioboto3.Session()
|
||||||
|
cfg = BotoConfig(signature_version="s3v4", s3={"addressing_style": "path"})
|
||||||
|
return aioboto3, BotoConfig, cfg
|
||||||
|
|
||||||
|
def _client(self, *, public: bool = False):
|
||||||
|
_aioboto3, _BotoConfig, cfg = self._boto()
|
||||||
|
endpoint = config.MINIO_PUBLIC_ENDPOINT if public else config.MINIO_ENDPOINT
|
||||||
|
return self._session.client(
|
||||||
|
"s3",
|
||||||
|
endpoint_url=endpoint,
|
||||||
|
aws_access_key_id=config.MINIO_ACCESS_KEY,
|
||||||
|
aws_secret_access_key=config.MINIO_SECRET_KEY,
|
||||||
|
region_name=config.MINIO_REGION,
|
||||||
|
config=cfg,
|
||||||
|
)
|
||||||
|
|
||||||
|
async def put_bytes(self, key, data, *, bucket=Bucket.DOCUMENTS,
|
||||||
|
content_type=None, metadata=None) -> str:
|
||||||
|
k = normalize_key(key)
|
||||||
|
extra = {}
|
||||||
|
if content_type:
|
||||||
|
extra["ContentType"] = content_type
|
||||||
|
if metadata:
|
||||||
|
extra["Metadata"] = {kk: str(vv) for kk, vv in metadata.items()}
|
||||||
|
async with self._client() as s3:
|
||||||
|
await s3.put_object(Bucket=_bucket_name(bucket), Key=k, Body=data, **extra)
|
||||||
|
return f"s3://{_bucket_name(bucket)}/{k}"
|
||||||
|
|
||||||
|
async def get_bytes(self, key, *, bucket=Bucket.DOCUMENTS) -> bytes:
|
||||||
|
k = normalize_key(key)
|
||||||
|
async with self._client() as s3:
|
||||||
|
resp = await s3.get_object(Bucket=_bucket_name(bucket), Key=k)
|
||||||
|
async with resp["Body"] as stream:
|
||||||
|
return await stream.read()
|
||||||
|
|
||||||
|
async def exists(self, key, *, bucket=Bucket.DOCUMENTS) -> bool:
|
||||||
|
from botocore.exceptions import ClientError
|
||||||
|
k = normalize_key(key)
|
||||||
|
async with self._client() as s3:
|
||||||
|
try:
|
||||||
|
await s3.head_object(Bucket=_bucket_name(bucket), Key=k)
|
||||||
|
return True
|
||||||
|
except ClientError as exc:
|
||||||
|
if exc.response["Error"]["Code"] in ("404", "NoSuchKey", "NotFound"):
|
||||||
|
return False
|
||||||
|
raise
|
||||||
|
|
||||||
|
async def delete(self, key, *, bucket=Bucket.DOCUMENTS) -> None:
|
||||||
|
k = normalize_key(key)
|
||||||
|
async with self._client() as s3:
|
||||||
|
await s3.delete_object(Bucket=_bucket_name(bucket), Key=k)
|
||||||
|
|
||||||
|
async def list_keys(self, prefix, *, bucket=Bucket.DOCUMENTS) -> list[str]:
|
||||||
|
pfx = normalize_key(prefix) if prefix else ""
|
||||||
|
out: list[str] = []
|
||||||
|
async with self._client() as s3:
|
||||||
|
paginator = s3.get_paginator("list_objects_v2")
|
||||||
|
async for page in paginator.paginate(Bucket=_bucket_name(bucket), Prefix=pfx):
|
||||||
|
for obj in page.get("Contents", []):
|
||||||
|
out.append(obj["Key"])
|
||||||
|
return out
|
||||||
|
|
||||||
|
async def presign_get(self, key, *, bucket=Bucket.DOCUMENTS, ttl=None,
|
||||||
|
download_name=None) -> str:
|
||||||
|
k = normalize_key(key)
|
||||||
|
params = {"Bucket": _bucket_name(bucket), "Key": k}
|
||||||
|
if download_name:
|
||||||
|
# RFC 5987 — keep the Hebrew original filename on download (INV-STG2)
|
||||||
|
from urllib.parse import quote
|
||||||
|
params["ResponseContentDisposition"] = (
|
||||||
|
f"attachment; filename*=UTF-8''{quote(download_name)}"
|
||||||
|
)
|
||||||
|
async with self._client(public=True) as s3:
|
||||||
|
return await s3.generate_presigned_url(
|
||||||
|
"get_object", Params=params,
|
||||||
|
ExpiresIn=ttl or config.MINIO_PRESIGN_TTL,
|
||||||
|
)
|
||||||
|
|
||||||
|
async def presign_put(self, key, *, bucket=Bucket.DOCUMENTS, ttl=None,
|
||||||
|
content_type=None) -> str:
|
||||||
|
k = normalize_key(key)
|
||||||
|
params = {"Bucket": _bucket_name(bucket), "Key": k}
|
||||||
|
if content_type:
|
||||||
|
params["ContentType"] = content_type
|
||||||
|
async with self._client(public=True) as s3:
|
||||||
|
return await s3.generate_presigned_url(
|
||||||
|
"put_object", Params=params,
|
||||||
|
ExpiresIn=ttl or config.MINIO_PRESIGN_TTL,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class DualBackend(StorageBackend):
|
||||||
|
"""Migration window: writes go to BOTH disk and S3 (disk authoritative);
|
||||||
|
reads prefer S3 and fall back to disk. An S3 write failure is logged (never
|
||||||
|
swallowed — engineering §6) but does not break the app while disk holds the
|
||||||
|
canonical copy."""
|
||||||
|
|
||||||
|
name = "dual"
|
||||||
|
|
||||||
|
def __init__(self) -> None:
|
||||||
|
self.fs = FilesystemBackend()
|
||||||
|
self.s3 = S3Backend()
|
||||||
|
|
||||||
|
async def put_bytes(self, key, data, *, bucket=Bucket.DOCUMENTS,
|
||||||
|
content_type=None, metadata=None) -> str:
|
||||||
|
uri = await self.fs.put_bytes(key, data, bucket=bucket,
|
||||||
|
content_type=content_type, metadata=metadata)
|
||||||
|
try:
|
||||||
|
await self.s3.put_bytes(key, data, bucket=bucket,
|
||||||
|
content_type=content_type, metadata=metadata)
|
||||||
|
except Exception as exc: # noqa: BLE001 — log, don't swallow
|
||||||
|
logger.warning("dual put_bytes: S3 mirror failed for %s: %s", key, exc)
|
||||||
|
return uri
|
||||||
|
|
||||||
|
async def put_file(self, src, key, *, bucket=Bucket.DOCUMENTS,
|
||||||
|
content_type=None, metadata=None) -> str:
|
||||||
|
uri = await self.fs.put_file(src, key, bucket=bucket,
|
||||||
|
content_type=content_type, metadata=metadata)
|
||||||
|
try:
|
||||||
|
await self.s3.put_file(src, key, bucket=bucket,
|
||||||
|
content_type=content_type, metadata=metadata)
|
||||||
|
except Exception as exc: # noqa: BLE001
|
||||||
|
logger.warning("dual put_file: S3 mirror failed for %s: %s", key, exc)
|
||||||
|
return uri
|
||||||
|
|
||||||
|
async def get_bytes(self, key, *, bucket=Bucket.DOCUMENTS) -> bytes:
|
||||||
|
try:
|
||||||
|
return await self.s3.get_bytes(key, bucket=bucket)
|
||||||
|
except Exception as exc: # noqa: BLE001 — fall back to disk
|
||||||
|
logger.debug("dual get_bytes: S3 miss for %s (%s); using disk", key, exc)
|
||||||
|
return await self.fs.get_bytes(key, bucket=bucket)
|
||||||
|
|
||||||
|
async def exists(self, key, *, bucket=Bucket.DOCUMENTS) -> bool:
|
||||||
|
if await self.fs.exists(key, bucket=bucket):
|
||||||
|
return True
|
||||||
|
try:
|
||||||
|
return await self.s3.exists(key, bucket=bucket)
|
||||||
|
except Exception: # noqa: BLE001
|
||||||
|
return False
|
||||||
|
|
||||||
|
async def delete(self, key, *, bucket=Bucket.DOCUMENTS) -> None:
|
||||||
|
await self.fs.delete(key, bucket=bucket)
|
||||||
|
try:
|
||||||
|
await self.s3.delete(key, bucket=bucket)
|
||||||
|
except Exception as exc: # noqa: BLE001
|
||||||
|
logger.warning("dual delete: S3 delete failed for %s: %s", key, exc)
|
||||||
|
|
||||||
|
async def list_keys(self, prefix, *, bucket=Bucket.DOCUMENTS) -> list[str]:
|
||||||
|
return await self.fs.list_keys(prefix, bucket=bucket)
|
||||||
|
|
||||||
|
def local_path(self, key, *, bucket=Bucket.DOCUMENTS) -> Path | None:
|
||||||
|
return self.fs.local_path(key, bucket=bucket)
|
||||||
|
|
||||||
|
async def presign_get(self, key, *, bucket=Bucket.DOCUMENTS, ttl=None,
|
||||||
|
download_name=None) -> str:
|
||||||
|
return await self.s3.presign_get(key, bucket=bucket, ttl=ttl,
|
||||||
|
download_name=download_name)
|
||||||
|
|
||||||
|
async def presign_put(self, key, *, bucket=Bucket.DOCUMENTS, ttl=None,
|
||||||
|
content_type=None) -> str:
|
||||||
|
return await self.s3.presign_put(key, bucket=bucket, ttl=ttl,
|
||||||
|
content_type=content_type)
|
||||||
|
|
||||||
|
|
||||||
|
_BACKENDS = {
|
||||||
|
"filesystem": FilesystemBackend,
|
||||||
|
"dual": DualBackend,
|
||||||
|
"s3": S3Backend,
|
||||||
|
}
|
||||||
|
|
||||||
|
_singleton: StorageBackend | None = None
|
||||||
|
|
||||||
|
|
||||||
|
def get_storage() -> StorageBackend:
|
||||||
|
"""Return the process-wide storage backend selected by config.STORAGE_BACKEND
|
||||||
|
(cached). Unknown values fall back to ``filesystem`` with a warning rather
|
||||||
|
than crashing the app."""
|
||||||
|
global _singleton
|
||||||
|
if _singleton is None:
|
||||||
|
cls = _BACKENDS.get(config.STORAGE_BACKEND)
|
||||||
|
if cls is None:
|
||||||
|
logger.warning(
|
||||||
|
"unknown STORAGE_BACKEND=%r — falling back to filesystem",
|
||||||
|
config.STORAGE_BACKEND,
|
||||||
|
)
|
||||||
|
cls = FilesystemBackend
|
||||||
|
_singleton = cls()
|
||||||
|
logger.info("storage backend = %s", _singleton.name)
|
||||||
|
return _singleton
|
||||||
|
|
||||||
|
|
||||||
|
def reset_storage_cache() -> None:
|
||||||
|
"""Drop the cached backend (tests / after an env change)."""
|
||||||
|
global _singleton
|
||||||
|
_singleton = None
|
||||||
|
|
||||||
|
|
||||||
|
# ── module-level convenience wrappers ──────────────────────────────
|
||||||
|
# Thin pass-throughs so call-sites can ``from legal_mcp.services import storage``
|
||||||
|
# and use ``await storage.put_bytes(...)`` without fetching the singleton.
|
||||||
|
|
||||||
|
async def put_bytes(key, data, *, bucket=Bucket.DOCUMENTS, content_type=None,
|
||||||
|
metadata=None) -> str:
|
||||||
|
return await get_storage().put_bytes(
|
||||||
|
key, data, bucket=bucket, content_type=content_type, metadata=metadata)
|
||||||
|
|
||||||
|
|
||||||
|
async def put_file(src, key, *, bucket=Bucket.DOCUMENTS, content_type=None,
|
||||||
|
metadata=None) -> str:
|
||||||
|
return await get_storage().put_file(
|
||||||
|
src, key, bucket=bucket, content_type=content_type, metadata=metadata)
|
||||||
|
|
||||||
|
|
||||||
|
async def get_bytes(key, *, bucket=Bucket.DOCUMENTS) -> bytes:
|
||||||
|
return await get_storage().get_bytes(key, bucket=bucket)
|
||||||
|
|
||||||
|
|
||||||
|
async def exists(key, *, bucket=Bucket.DOCUMENTS) -> bool:
|
||||||
|
return await get_storage().exists(key, bucket=bucket)
|
||||||
|
|
||||||
|
|
||||||
|
async def delete(key, *, bucket=Bucket.DOCUMENTS) -> None:
|
||||||
|
return await get_storage().delete(key, bucket=bucket)
|
||||||
|
|
||||||
|
|
||||||
|
async def list_keys(prefix, *, bucket=Bucket.DOCUMENTS) -> list[str]:
|
||||||
|
return await get_storage().list_keys(prefix, bucket=bucket)
|
||||||
|
|
||||||
|
|
||||||
|
async def presign_get(key, *, bucket=Bucket.DOCUMENTS, ttl=None,
|
||||||
|
download_name=None) -> str:
|
||||||
|
return await get_storage().presign_get(
|
||||||
|
key, bucket=bucket, ttl=ttl, download_name=download_name)
|
||||||
|
|
||||||
|
|
||||||
|
async def presign_put(key, *, bucket=Bucket.DOCUMENTS, ttl=None,
|
||||||
|
content_type=None) -> str:
|
||||||
|
return await get_storage().presign_put(
|
||||||
|
key, bucket=bucket, ttl=ttl, content_type=content_type)
|
||||||
|
|
||||||
|
|
||||||
|
def local_path(key, *, bucket=Bucket.DOCUMENTS) -> Path | None:
|
||||||
|
return get_storage().local_path(key, bucket=bucket)
|
||||||
|
|
||||||
|
|
||||||
|
async def ensure_local(key, *, bucket=Bucket.DOCUMENTS) -> Path:
|
||||||
|
return await get_storage().ensure_local(key, bucket=bucket)
|
||||||
198
mcp-server/tests/test_storage.py
Normal file
198
mcp-server/tests/test_storage.py
Normal file
@@ -0,0 +1,198 @@
|
|||||||
|
"""Unit tests for the unified storage layer (X14, services/storage.py).
|
||||||
|
|
||||||
|
Sync tests driving the async API via asyncio.run (matches the repo
|
||||||
|
convention — no pytest-asyncio). The filesystem backend is exercised against a
|
||||||
|
tmp DATA_DIR; the S3 path is stubbed so the suite needs no MinIO and no
|
||||||
|
aioboto3.
|
||||||
|
"""
|
||||||
|
import asyncio
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from legal_mcp import config
|
||||||
|
from legal_mcp.services import storage
|
||||||
|
from legal_mcp.services.storage import Bucket
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(autouse=True)
|
||||||
|
def _tmp_datadir(tmp_path, monkeypatch):
|
||||||
|
monkeypatch.setattr(config, "DATA_DIR", tmp_path)
|
||||||
|
monkeypatch.setattr(config, "STORAGE_BACKEND", "filesystem")
|
||||||
|
storage.reset_storage_cache()
|
||||||
|
yield tmp_path
|
||||||
|
storage.reset_storage_cache()
|
||||||
|
|
||||||
|
|
||||||
|
def run(coro):
|
||||||
|
return asyncio.run(coro)
|
||||||
|
|
||||||
|
|
||||||
|
# ── normalize_key ──────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def test_normalize_key_relative():
|
||||||
|
assert storage.normalize_key("cases/8174-24/originals/x.pdf") == \
|
||||||
|
"cases/8174-24/originals/x.pdf"
|
||||||
|
|
||||||
|
|
||||||
|
def test_normalize_key_rejects_parent_traversal():
|
||||||
|
with pytest.raises(ValueError):
|
||||||
|
storage.normalize_key("cases/../../etc/passwd")
|
||||||
|
|
||||||
|
|
||||||
|
def test_normalize_key_accepts_abs_under_datadir(_tmp_datadir):
|
||||||
|
abs_key = _tmp_datadir / "cases" / "x.pdf"
|
||||||
|
assert storage.normalize_key(abs_key) == "cases/x.pdf"
|
||||||
|
|
||||||
|
|
||||||
|
def test_normalize_key_rejects_abs_outside_datadir():
|
||||||
|
with pytest.raises(ValueError):
|
||||||
|
storage.normalize_key("/var/secret/x.pdf")
|
||||||
|
|
||||||
|
|
||||||
|
# ── filesystem backend ─────────────────────────────────────────────
|
||||||
|
|
||||||
|
def test_filesystem_roundtrip(_tmp_datadir):
|
||||||
|
be = storage.FilesystemBackend()
|
||||||
|
key = "cases/1/originals/a.pdf"
|
||||||
|
uri = run(be.put_bytes(key, b"hello", content_type="application/pdf"))
|
||||||
|
assert uri.startswith("file://")
|
||||||
|
assert (_tmp_datadir / key).read_bytes() == b"hello"
|
||||||
|
assert run(be.exists(key)) is True
|
||||||
|
assert run(be.get_bytes(key)) == b"hello"
|
||||||
|
run(be.delete(key))
|
||||||
|
assert run(be.exists(key)) is False
|
||||||
|
# delete is idempotent (missing_ok)
|
||||||
|
run(be.delete(key))
|
||||||
|
|
||||||
|
|
||||||
|
def test_filesystem_put_file(_tmp_datadir):
|
||||||
|
be = storage.FilesystemBackend()
|
||||||
|
src = _tmp_datadir / "src.bin"
|
||||||
|
src.write_bytes(b"payload")
|
||||||
|
run(be.put_file(src, "precedent-library/court_ruling/u.pdf"))
|
||||||
|
assert (_tmp_datadir / "precedent-library/court_ruling/u.pdf").read_bytes() == b"payload"
|
||||||
|
|
||||||
|
|
||||||
|
def test_filesystem_bucket_is_ignored_legacy_layout(_tmp_datadir):
|
||||||
|
"""Even for a non-default bucket, the FS backend keeps the single tree —
|
||||||
|
so the default backend is byte-for-byte the legacy layout."""
|
||||||
|
be = storage.FilesystemBackend()
|
||||||
|
run(be.put_bytes("thumbnails/d/p001.jpg", b"img", bucket=Bucket.DERIVED))
|
||||||
|
assert (_tmp_datadir / "thumbnails/d/p001.jpg").exists()
|
||||||
|
|
||||||
|
|
||||||
|
def test_filesystem_list_keys(_tmp_datadir):
|
||||||
|
be = storage.FilesystemBackend()
|
||||||
|
run(be.put_bytes("cases/1/a.txt", b"a"))
|
||||||
|
run(be.put_bytes("cases/1/sub/b.txt", b"b"))
|
||||||
|
run(be.put_bytes("cases/2/c.txt", b"c"))
|
||||||
|
keys = run(be.list_keys("cases/1"))
|
||||||
|
assert keys == ["cases/1/a.txt", "cases/1/sub/b.txt"]
|
||||||
|
|
||||||
|
|
||||||
|
def test_filesystem_local_path_and_ensure_local(_tmp_datadir):
|
||||||
|
be = storage.FilesystemBackend()
|
||||||
|
run(be.put_bytes("cases/1/a.pdf", b"x"))
|
||||||
|
assert be.local_path("cases/1/a.pdf") == (_tmp_datadir / "cases/1/a.pdf").resolve()
|
||||||
|
assert be.local_path("cases/1/missing.pdf") is None
|
||||||
|
# ensure_local returns the real path without copying
|
||||||
|
assert run(be.ensure_local("cases/1/a.pdf")) == (_tmp_datadir / "cases/1/a.pdf").resolve()
|
||||||
|
|
||||||
|
|
||||||
|
def test_filesystem_presign_unsupported(_tmp_datadir):
|
||||||
|
be = storage.FilesystemBackend()
|
||||||
|
with pytest.raises(NotImplementedError):
|
||||||
|
run(be.presign_get("cases/1/a.pdf"))
|
||||||
|
|
||||||
|
|
||||||
|
# ── backend selection ──────────────────────────────────────────────
|
||||||
|
|
||||||
|
def test_get_storage_default_is_filesystem(monkeypatch):
|
||||||
|
monkeypatch.setattr(config, "STORAGE_BACKEND", "filesystem")
|
||||||
|
storage.reset_storage_cache()
|
||||||
|
assert isinstance(storage.get_storage(), storage.FilesystemBackend)
|
||||||
|
|
||||||
|
|
||||||
|
def test_get_storage_unknown_falls_back(monkeypatch):
|
||||||
|
monkeypatch.setattr(config, "STORAGE_BACKEND", "bogus")
|
||||||
|
storage.reset_storage_cache()
|
||||||
|
assert isinstance(storage.get_storage(), storage.FilesystemBackend)
|
||||||
|
|
||||||
|
|
||||||
|
def test_get_storage_dual(monkeypatch):
|
||||||
|
monkeypatch.setattr(config, "STORAGE_BACKEND", "dual")
|
||||||
|
storage.reset_storage_cache()
|
||||||
|
assert isinstance(storage.get_storage(), storage.DualBackend)
|
||||||
|
|
||||||
|
|
||||||
|
# ── dual backend (S3 stubbed) ──────────────────────────────────────
|
||||||
|
|
||||||
|
class _FakeS3:
|
||||||
|
"""Minimal async S3 stub. ``store`` None ⇒ S3 'down' (raises)."""
|
||||||
|
|
||||||
|
def __init__(self, store):
|
||||||
|
self.store = store # dict or None
|
||||||
|
|
||||||
|
async def put_bytes(self, key, data, *, bucket=Bucket.DOCUMENTS, content_type=None, metadata=None):
|
||||||
|
if self.store is None:
|
||||||
|
raise RuntimeError("s3 down")
|
||||||
|
self.store[storage.normalize_key(key)] = data
|
||||||
|
return f"s3://stub/{storage.normalize_key(key)}"
|
||||||
|
|
||||||
|
async def put_file(self, src, key, *, bucket=Bucket.DOCUMENTS, content_type=None, metadata=None):
|
||||||
|
with open(src, "rb") as fh:
|
||||||
|
return await self.put_bytes(key, fh.read(), bucket=bucket)
|
||||||
|
|
||||||
|
async def get_bytes(self, key, *, bucket=Bucket.DOCUMENTS):
|
||||||
|
if self.store is None:
|
||||||
|
raise RuntimeError("s3 down")
|
||||||
|
return self.store[storage.normalize_key(key)]
|
||||||
|
|
||||||
|
async def exists(self, key, *, bucket=Bucket.DOCUMENTS):
|
||||||
|
return self.store is not None and storage.normalize_key(key) in self.store
|
||||||
|
|
||||||
|
async def delete(self, key, *, bucket=Bucket.DOCUMENTS):
|
||||||
|
if self.store is not None:
|
||||||
|
self.store.pop(storage.normalize_key(key), None)
|
||||||
|
|
||||||
|
|
||||||
|
def _dual_with(store):
|
||||||
|
dual = storage.DualBackend()
|
||||||
|
dual.s3 = _FakeS3(store)
|
||||||
|
return dual
|
||||||
|
|
||||||
|
|
||||||
|
def test_dual_writes_both(_tmp_datadir):
|
||||||
|
store = {}
|
||||||
|
dual = _dual_with(store)
|
||||||
|
run(dual.put_bytes("cases/1/a.pdf", b"data"))
|
||||||
|
assert (_tmp_datadir / "cases/1/a.pdf").read_bytes() == b"data" # disk
|
||||||
|
assert store["cases/1/a.pdf"] == b"data" # s3
|
||||||
|
|
||||||
|
|
||||||
|
def test_dual_s3_write_failure_does_not_break(_tmp_datadir):
|
||||||
|
dual = _dual_with(None) # s3 down
|
||||||
|
# disk write still succeeds; s3 failure is logged, not raised
|
||||||
|
run(dual.put_bytes("cases/1/a.pdf", b"data"))
|
||||||
|
assert (_tmp_datadir / "cases/1/a.pdf").read_bytes() == b"data"
|
||||||
|
|
||||||
|
|
||||||
|
def test_dual_get_prefers_s3(_tmp_datadir):
|
||||||
|
dual = _dual_with({"cases/1/a.pdf": b"from-s3"})
|
||||||
|
run(dual.fs.put_bytes("cases/1/a.pdf", b"from-disk"))
|
||||||
|
assert run(dual.get_bytes("cases/1/a.pdf")) == b"from-s3"
|
||||||
|
|
||||||
|
|
||||||
|
def test_dual_get_falls_back_to_disk(_tmp_datadir):
|
||||||
|
dual = _dual_with(None) # s3 down → must read disk
|
||||||
|
run(dual.fs.put_bytes("cases/1/a.pdf", b"from-disk"))
|
||||||
|
assert run(dual.get_bytes("cases/1/a.pdf")) == b"from-disk"
|
||||||
|
|
||||||
|
|
||||||
|
def test_bucket_name_resolution(monkeypatch):
|
||||||
|
monkeypatch.setattr(config, "MINIO_BUCKET_DOCUMENTS", "legal-documents")
|
||||||
|
monkeypatch.setattr(config, "MINIO_BUCKET_IMMUTABLE", "legal-immutable")
|
||||||
|
monkeypatch.setattr(config, "MINIO_BUCKET_DERIVED", "legal-derived")
|
||||||
|
assert storage._bucket_name(Bucket.DOCUMENTS) == "legal-documents"
|
||||||
|
assert storage._bucket_name(Bucket.IMMUTABLE) == "legal-immutable"
|
||||||
|
assert storage._bucket_name(Bucket.DERIVED) == "legal-derived"
|
||||||
@@ -13,7 +13,8 @@ from typing import Any, Literal
|
|||||||
|
|
||||||
EnvType = Literal["bool", "int", "float", "string"]
|
EnvType = Literal["bool", "int", "float", "string"]
|
||||||
EnvCategory = Literal[
|
EnvCategory = Literal[
|
||||||
"multimodal", "rerank", "halacha", "credentials", "connection", "general"
|
"multimodal", "rerank", "halacha", "credentials", "connection",
|
||||||
|
"storage", "general"
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
@@ -89,6 +90,58 @@ ENV_CATALOG: dict[str, EnvSpec] = {
|
|||||||
"סף confidence ל-auto-approve של הלכות שחולצו",
|
"סף confidence ל-auto-approve של הלכות שחולצו",
|
||||||
is_secret=False, is_editable=True, default=0.80, min=0.0, max=1.0,
|
is_secret=False, is_editable=True, default=0.80, min=0.0, max=1.0,
|
||||||
),
|
),
|
||||||
|
# ── storage (X14 / MinIO) ──────────────────────────────────────
|
||||||
|
"STORAGE_BACKEND": EnvSpec(
|
||||||
|
"STORAGE_BACKEND", "storage", "string",
|
||||||
|
"מנוע אחסון: filesystem (דיסק) / dual (דיסק+S3) / s3 (MinIO בלבד)",
|
||||||
|
is_secret=False, is_editable=True, default="filesystem",
|
||||||
|
enum_values=("filesystem", "dual", "s3"),
|
||||||
|
),
|
||||||
|
"MINIO_ENDPOINT": EnvSpec(
|
||||||
|
"MINIO_ENDPOINT", "storage", "string",
|
||||||
|
"endpoint פנימי של MinIO (server-side, רשת Docker)",
|
||||||
|
is_secret=False, is_editable=False, default="http://minio:9000",
|
||||||
|
),
|
||||||
|
"MINIO_PUBLIC_ENDPOINT": EnvSpec(
|
||||||
|
"MINIO_PUBLIC_ENDPOINT", "storage", "string",
|
||||||
|
"endpoint ציבורי ל-presigned URLs (גישת דפדפן)",
|
||||||
|
is_secret=False, is_editable=False, default="https://s3.nautilus.marcusgroup.org",
|
||||||
|
),
|
||||||
|
"MINIO_ACCESS_KEY": EnvSpec(
|
||||||
|
"MINIO_ACCESS_KEY", "storage", "string",
|
||||||
|
"MinIO access key (service-account מוגבל ל-3 הדליות)",
|
||||||
|
is_secret=True, is_editable=False,
|
||||||
|
),
|
||||||
|
"MINIO_SECRET_KEY": EnvSpec(
|
||||||
|
"MINIO_SECRET_KEY", "storage", "string",
|
||||||
|
"MinIO secret key",
|
||||||
|
is_secret=True, is_editable=False,
|
||||||
|
),
|
||||||
|
"MINIO_REGION": EnvSpec(
|
||||||
|
"MINIO_REGION", "storage", "string",
|
||||||
|
"אזור S3 (MinIO מתעלם — לחתימת SigV4)",
|
||||||
|
is_secret=False, is_editable=False, default="us-east-1",
|
||||||
|
),
|
||||||
|
"MINIO_BUCKET_DOCUMENTS": EnvSpec(
|
||||||
|
"MINIO_BUCKET_DOCUMENTS", "storage", "string",
|
||||||
|
"דלי מסמכי-מקור (versioning)",
|
||||||
|
is_secret=False, is_editable=False, default="legal-documents",
|
||||||
|
),
|
||||||
|
"MINIO_BUCKET_IMMUTABLE": EnvSpec(
|
||||||
|
"MINIO_BUCKET_IMMUTABLE", "storage", "string",
|
||||||
|
"דלי החלטות סופיות (versioning + Object-Lock COMPLIANCE)",
|
||||||
|
is_secret=False, is_editable=False, default="legal-immutable",
|
||||||
|
),
|
||||||
|
"MINIO_BUCKET_DERIVED": EnvSpec(
|
||||||
|
"MINIO_BUCKET_DERIVED", "storage", "string",
|
||||||
|
"דלי נגזרים (thumbnails / extracted — ניתן-לשחזור)",
|
||||||
|
is_secret=False, is_editable=False, default="legal-derived",
|
||||||
|
),
|
||||||
|
"MINIO_PRESIGN_TTL": EnvSpec(
|
||||||
|
"MINIO_PRESIGN_TTL", "storage", "int",
|
||||||
|
"תוקף presigned URL בשניות (מקס' SigV4 = 7 ימים)",
|
||||||
|
is_secret=False, is_editable=True, default=900, min=60, max=604800,
|
||||||
|
),
|
||||||
# ── general ────────────────────────────────────────────────────
|
# ── general ────────────────────────────────────────────────────
|
||||||
"VOYAGE_MODEL": EnvSpec(
|
"VOYAGE_MODEL": EnvSpec(
|
||||||
"VOYAGE_MODEL", "general", "string",
|
"VOYAGE_MODEL", "general", "string",
|
||||||
|
|||||||
Reference in New Issue
Block a user