"""INV-STG1 leak-guard: blobs on disk must pass through the storage layer.

No blob may be written to disk without going through, or being mirrored to,
the storage layer (``services/storage.py``). After the cutover to
``STORAGE_BACKEND=s3``, a direct disk write under DATA_DIR that bypasses
storage creates an orphan: a file in the old folders that never reaches MinIO
(lost on cleanup, not served, not backed up).

This static guard fails CI on any NEW direct blob-write (``write_bytes`` /
``write_text`` / ``shutil.copy*`` / ``shutil.move`` / ``open(..., 'wb')``) in
the web API or services that is not explicitly acknowledged with a
``# noqa: STG1`` marker.

Marking a line means the author has CONSCIOUSLY handled it, in one of two ways:

* sealed it (``_seal_blob`` / ``storage.mirror`` right after, for paths the
  disk-based pipeline still reads), or
* justified it as benign (temp file, staging-then-unlink, git-per-case
  metadata, log/flag, BytesIO buffer, storage fallback).

New unmarked writes block the build until the author does the same.
"""

from __future__ import annotations

import re
from pathlib import Path

import pytest

# Base directory the scanned paths hang off: two levels above the directory
# that contains this file (presumably the repository root — confirm if the
# test is ever moved).
_ROOT = Path(__file__).resolve().parents[2]

# storage.py is the storage layer itself — its disk writes ARE the implementation.
_EXCLUDE = {"storage.py"}

# Files under guard: web/app.py first, then every ``*.py`` module in the
# services package in sorted order, minus the excluded file names.
_SCAN = [_ROOT / "web" / "app.py"]
_SCAN.extend(
    module
    for module in sorted(
        _ROOT.joinpath("mcp-server", "src", "legal_mcp", "services").glob("*.py")
    )
    if module.name not in _EXCLUDE
)

# Direct-disk-write patterns that could land a blob in the old folders.
# A mode literal that permits writing: ``w``/``a``/``x`` optionally combined
# with ``b``/``t`` and ``+`` (``"w"``, ``"wb"``, ``"wt"``, ``"w+b"``, ``"ab+"``
# ...), or an ``r`` mode made writable by ``+`` (``"r+"``, ``"rb+"``, ``"r+b"``).
# Kept deliberately tight so ordinary short strings such as ``"raw"`` or
# ``"tar"`` are not mistaken for a mode.
_WRITE_MODE = r"(?:[wax][bt]?\+?[bt]?|r[bt]?\+[bt]?)"

# One alternative per family of direct writes:
#   * ``.write_bytes(`` / ``.write_text(``
#   * the blob-copying ``shutil.copy*`` functions (copy, copy2, copyfile,
#     copyfileobj, copytree) — not copymode/copystat, which move no content
#   * ``shutil.move(``
#   * any ``...open(`` call carrying a writable mode literal, either as the
#     first argument (``path.open("wb")``) or after a comma, positionally or as
#     ``mode=...``.  The lazy ``.*?,`` prefix lets the preceding arguments
#     contain their own parentheses, e.g. ``open(os.path.join(d, n), "wb")``.
_PATTERNS = re.compile(
    r"\.write_bytes\("
    r"|\.write_text\("
    r"|shutil\.copy(?:2|file|fileobj|tree)?\("
    r"|shutil\.move\("
    r"|open\((?:.*?,)?\s*(?:mode\s*=\s*)?['\"]" + _WRITE_MODE + r"['\"]"
)

# Substring that acknowledges a write; it may appear anywhere on the line.
_MARKER = "noqa: STG1"


def _violations() -> list[str]:
    """Collect every unacknowledged direct-write line in the scanned files.

    Each file in ``_SCAN`` is read as UTF-8 and checked line by line. A line
    is reported when it matches ``_PATTERNS`` and does not contain
    ``_MARKER``. Lines whose first non-blank character is ``#`` are ignored.

    Returns:
        One entry per offending line, formatted as
        ``<path relative to _ROOT>:<1-based line number>: <stripped source>``,
        with the source text truncated to 100 characters.
    """
    found: list[str] = []
    for path in _SCAN:
        # A listed file that is absent is skipped rather than reported.
        if not path.exists():
            continue
        source_lines = path.read_text(encoding="utf-8").splitlines()
        for lineno, raw in enumerate(source_lines, 1):
            stripped = raw.strip()
            if stripped.startswith("#"):
                continue
            if _MARKER in raw:
                continue
            if _PATTERNS.search(raw):
                found.append(f"{path.relative_to(_ROOT)}:{lineno}: {stripped[:100]}")
    return found


def test_no_unmarked_blob_disk_writes():
    """Fail when any scanned file contains a direct write without the marker."""
    violations = _violations()
    assert not violations, (
        "INV-STG1: direct blob-disk-write(s) without a `# noqa: STG1` marker — "
        "seal each via `_seal_blob`/`storage.mirror` (if the pipeline reads the "
        "disk path) or justify it as benign on the line:\n " + "\n ".join(violations)
    )