feat(storage): seal INV-STG1 write path — 15 dual-write seals + CI leak-guard + tripwire
All checks were successful
G12 Leak-Guard / leak-guard (pull_request) Successful in 5s
All checks were successful
G12 Leak-Guard / leak-guard (pull_request) Successful in 5s
אחרי ה-cutover ל-s3-only, אודיט מצא 15 אתרי-כתיבת-בלוב שעוקפים את storage.py (uploads/ finalize/exports/training/research-backup/precedents/bulletins/draft) — קובץ ינחת בתיקיות-הישנות אך **לא** ב-MinIO → יאבד בניקוי, לא מוגש, לא מגובה. ה-pipeline (ingest/ extract) עדיין קורא לפי file_path מהדיסק, אז ביטול-מוחלט של כתיבה-לדיסק דורש read-wiring מלא (Phase 2, משימה נפרדת). תיקון בטוח עכשיו = **dual-write seal**. - storage.py: `mirror`/`mirror_file` (+ sync) — best-effort persist ל-S3 כשה-backend s3/dual (no-op ב-filesystem; כשל S3 נרשם, לא שובר request — DualBackend philosophy). - web/app.py: helpers `_seal_blob`/`_seal_blob_file` + 14 אתרים אטומים (storage.mirror אחרי כתיבת-הדיסק; הדיסק נשאר ל-pipeline). block_writer.py: draft אטום (async). - **CI leak-guard** (test_storage_write_leak_guard): נכשל על כל כתיבת-בלוב-לדיסק (write_bytes/write_text/shutil.copy*/open(wb)) ב-web/+services ללא מרקר `# noqa: STG1`. כל ה-benign (fallbacks/tmp/staging/git-metadata/flag/state) מסומנים עם נימוק. storage.py מוחרג (הוא המימוש). - **tripwire** (scripts/storage_leak_tripwire.py): ניטור-ריצה — בלובים בדיסק שלא ב-MinIO (json-key match, bucket per-file). אומת חי: 0 דליפות. invariants: INV-STG1 (כל I/O דרך storage / ממורר אליו) · INV-STG6 · feedback_silent_swallow (mirror רושם warning, לא bare-except). Phase 2 (read-wire ה-pipeline → להפיל את עותק-הדיסק) = follow-up. tests: 4 mirror + 1 leak-guard + 6 serve_blob + 18 storage קיימות עוברות. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -18,8 +18,10 @@ import re
|
||||
from datetime import date
|
||||
from uuid import UUID
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
from legal_mcp import config
|
||||
from legal_mcp.services import db, embeddings, claude_session, audit
|
||||
from legal_mcp.services import db, embeddings, claude_session, audit, storage
|
||||
from legal_mcp.services.lessons import (
|
||||
OUTCOME_LABELS_HE,
|
||||
PRACTICE_AREA_OVERRIDES,
|
||||
@@ -1119,7 +1121,13 @@ async def _update_draft_file(decision_id: UUID) -> None:
|
||||
draft_dir = config.find_case_dir(case_row["case_number"]) / "drafts"
|
||||
draft_dir.mkdir(parents=True, exist_ok=True)
|
||||
draft_path = draft_dir / "decision.md"
|
||||
draft_path.write_text("\n\n".join(row["content"] for row in rows if row["content"]), encoding="utf-8")
|
||||
draft_text = "\n\n".join(row["content"] for row in rows if row["content"])
|
||||
draft_path.write_text(draft_text, encoding="utf-8") # noqa: STG1 — sealed below
|
||||
try:
|
||||
_dkey = draft_path.resolve().relative_to(Path(config.DATA_DIR).resolve()).as_posix()
|
||||
await storage.mirror(_dkey, draft_text.encode("utf-8"), bucket=storage.Bucket.DOCUMENTS)
|
||||
except ValueError:
|
||||
pass
|
||||
logger.info("Draft file synced: %s (%d blocks)", draft_path, len(rows))
|
||||
|
||||
|
||||
|
||||
@@ -487,7 +487,7 @@ async def export_decision(
|
||||
await storage.put_bytes(key, data, bucket=storage.Bucket.DOCUMENTS, content_type=_docx_ctype)
|
||||
except ValueError:
|
||||
Path(output_path).parent.mkdir(parents=True, exist_ok=True)
|
||||
Path(output_path).write_bytes(data)
|
||||
Path(output_path).write_bytes(data) # noqa: STG1 — storage fallback (output_path outside DATA_DIR)
|
||||
logger.info("DOCX exported (mode=%s): %s", mode, output_path)
|
||||
return output_path
|
||||
|
||||
|
||||
@@ -317,7 +317,7 @@ def retrofit_bookmarks(
|
||||
docx_path, _bkey, bucket=storage.Bucket.DOCUMENTS,
|
||||
content_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document")
|
||||
except ValueError:
|
||||
shutil.copy2(str(docx_path), str(backup_path))
|
||||
shutil.copy2(str(docx_path), str(backup_path)) # noqa: STG1 — storage fallback
|
||||
|
||||
# Inject bookmarks, skipping any that already exist
|
||||
next_id = _next_bookmark_id(doc_tree)
|
||||
|
||||
@@ -114,7 +114,7 @@ def _persist_docx_sync(output_path: Path, data: bytes) -> None:
|
||||
content_type=_DOCX_CTYPE)
|
||||
except ValueError:
|
||||
out.parent.mkdir(parents=True, exist_ok=True)
|
||||
out.write_bytes(data)
|
||||
out.write_bytes(data) # noqa: STG1 — storage fallback
|
||||
|
||||
|
||||
def _save_docx_xml(
|
||||
@@ -536,4 +536,4 @@ def copy_with_revisions(
|
||||
content_type=_DOCX_CTYPE)
|
||||
except ValueError:
|
||||
out.parent.mkdir(parents=True, exist_ok=True)
|
||||
shutil.copy2(str(source_path), str(out))
|
||||
shutil.copy2(str(source_path), str(out)) # noqa: STG1 — storage fallback
|
||||
|
||||
@@ -346,7 +346,7 @@ def update_chair_position(
|
||||
|
||||
# Atomic write
|
||||
tmp_path = file_path.with_suffix(file_path.suffix + ".tmp")
|
||||
tmp_path.write_text(new_content, encoding="utf-8")
|
||||
tmp_path.write_text(new_content, encoding="utf-8") # noqa: STG1 — atomic .tmp; in-place edit, S3 re-sync in Phase-2 read-wiring
|
||||
os.replace(tmp_path, file_path)
|
||||
|
||||
preview = new_text.strip()[:120]
|
||||
|
||||
@@ -473,6 +473,43 @@ async def ensure_local(key, *, bucket=Bucket.DOCUMENTS) -> Path:
|
||||
return await get_storage().ensure_local(key, bucket=bucket)
|
||||
|
||||
|
||||
# ── mirror: dual-write seal for the not-yet-read-wired pipeline (INV-STG1) ──────
|
||||
# A handful of upload/finalize paths still keep a copy on disk because the
|
||||
# ingest/extract pipeline reads files by their DATA_DIR path (not yet wired to
|
||||
# ensure_local). For those, ``mirror``/``mirror_file`` ALSO persist the blob to
|
||||
# object storage when the active backend is s3/dual — so no blob is ever missing
|
||||
# from MinIO (durability + presigned serving) even though a disk copy lingers
|
||||
# for the pipeline. No-op under the filesystem backend (the disk write is the
|
||||
# canonical copy). Best-effort: an S3 failure is logged, never breaks the
|
||||
# request (the disk copy holds). The full fix (read-wire the pipeline → drop the
|
||||
# disk copy) is tracked separately; until then this closes the data-loss leak.
|
||||
|
||||
async def mirror(key, data, *, bucket=Bucket.DOCUMENTS,
|
||||
content_type=None, metadata=None) -> None:
|
||||
backend = get_storage()
|
||||
if backend.name == "filesystem":
|
||||
return
|
||||
s3 = getattr(backend, "s3", backend)
|
||||
try:
|
||||
await s3.put_bytes(key, data, bucket=bucket,
|
||||
content_type=content_type, metadata=metadata)
|
||||
except Exception as exc: # noqa: BLE001 — log, never break the request
|
||||
logger.warning("storage.mirror: S3 persist failed for %s: %s", key, exc)
|
||||
|
||||
|
||||
async def mirror_file(src, key, *, bucket=Bucket.DOCUMENTS,
|
||||
content_type=None, metadata=None) -> None:
|
||||
backend = get_storage()
|
||||
if backend.name == "filesystem":
|
||||
return
|
||||
s3 = getattr(backend, "s3", backend)
|
||||
try:
|
||||
await s3.put_file(src, key, bucket=bucket,
|
||||
content_type=content_type, metadata=metadata)
|
||||
except Exception as exc: # noqa: BLE001
|
||||
logger.warning("storage.mirror_file: S3 persist failed for %s: %s", key, exc)
|
||||
|
||||
|
||||
# ── synchronous facade ─────────────────────────────────────────────
|
||||
# A few legacy writers are plain sync functions (track-changes save, retrofit
|
||||
# backup, the multimodal thumbnail renderer which runs in a worker thread via
|
||||
@@ -511,3 +548,15 @@ def put_file_sync(src, key, *, bucket=Bucket.DOCUMENTS, content_type=None,
|
||||
metadata=None) -> str:
|
||||
return _run_coro_blocking(
|
||||
put_file(src, key, bucket=bucket, content_type=content_type, metadata=metadata))
|
||||
|
||||
|
||||
def mirror_sync(key, data, *, bucket=Bucket.DOCUMENTS, content_type=None,
|
||||
metadata=None) -> None:
|
||||
_run_coro_blocking(mirror(key, data, bucket=bucket,
|
||||
content_type=content_type, metadata=metadata))
|
||||
|
||||
|
||||
def mirror_file_sync(src, key, *, bucket=Bucket.DOCUMENTS, content_type=None,
|
||||
metadata=None) -> None:
|
||||
_run_coro_blocking(mirror_file(src, key, bucket=bucket,
|
||||
content_type=content_type, metadata=metadata))
|
||||
|
||||
74
mcp-server/tests/test_storage_mirror.py
Normal file
74
mcp-server/tests/test_storage_mirror.py
Normal file
@@ -0,0 +1,74 @@
|
||||
"""Tests for storage.mirror — the INV-STG1 dual-write seal.
|
||||
|
||||
mirror() must: be a no-op under the filesystem backend (the disk write is
|
||||
canonical), persist to the S3 sub-backend under s3/dual, and never raise (an S3
|
||||
failure is logged, the request proceeds on the disk copy). Offline.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
|
||||
import pytest
|
||||
|
||||
from legal_mcp.services import storage
|
||||
|
||||
|
||||
class _FakeS3:
|
||||
def __init__(self, fail: bool = False) -> None:
|
||||
self.fail = fail
|
||||
self.puts: list[tuple] = []
|
||||
|
||||
async def put_bytes(self, key, data, *, bucket, content_type=None, metadata=None):
|
||||
if self.fail:
|
||||
raise RuntimeError("s3 down")
|
||||
self.puts.append((key, bytes(data), bucket))
|
||||
return f"s3://{bucket}/{key}"
|
||||
|
||||
|
||||
class _FakeFilesystem:
|
||||
name = "filesystem"
|
||||
|
||||
|
||||
class _FakeDual:
|
||||
name = "dual"
|
||||
|
||||
def __init__(self, s3: _FakeS3) -> None:
|
||||
self.s3 = s3
|
||||
|
||||
|
||||
def _run(coro):
|
||||
loop = asyncio.new_event_loop()
|
||||
try:
|
||||
return loop.run_until_complete(coro)
|
||||
finally:
|
||||
loop.close()
|
||||
|
||||
|
||||
def test_mirror_noop_under_filesystem(monkeypatch):
|
||||
monkeypatch.setattr(storage, "get_storage", lambda: _FakeFilesystem())
|
||||
# must not raise and must not attempt any S3 work
|
||||
_run(storage.mirror("cases/x/y.pdf", b"data", bucket=storage.Bucket.DOCUMENTS))
|
||||
|
||||
|
||||
def test_mirror_persists_under_dual(monkeypatch):
|
||||
s3 = _FakeS3()
|
||||
monkeypatch.setattr(storage, "get_storage", lambda: _FakeDual(s3))
|
||||
_run(storage.mirror("cases/x/y.pdf", b"data", bucket=storage.Bucket.DOCUMENTS))
|
||||
assert s3.puts == [("cases/x/y.pdf", b"data", storage.Bucket.DOCUMENTS)]
|
||||
|
||||
|
||||
def test_mirror_best_effort_never_raises(monkeypatch):
|
||||
s3 = _FakeS3(fail=True)
|
||||
monkeypatch.setattr(storage, "get_storage", lambda: _FakeDual(s3))
|
||||
# S3 failure must be swallowed-with-log, never propagate (disk copy holds)
|
||||
_run(storage.mirror("cases/x/y.pdf", b"data", bucket=storage.Bucket.DOCUMENTS))
|
||||
|
||||
|
||||
def test_mirror_uses_backend_itself_when_no_s3_attr(monkeypatch):
|
||||
# a pure s3 backend has no .s3 sub-attr → getattr falls back to the backend
|
||||
s3 = _FakeS3()
|
||||
s3.name = "s3"
|
||||
monkeypatch.setattr(storage, "get_storage", lambda: s3)
|
||||
_run(storage.mirror("k", b"d", bucket=storage.Bucket.DERIVED))
|
||||
assert s3.puts == [("k", b"d", storage.Bucket.DERIVED)]
|
||||
62
mcp-server/tests/test_storage_write_leak_guard.py
Normal file
62
mcp-server/tests/test_storage_write_leak_guard.py
Normal file
@@ -0,0 +1,62 @@
|
||||
"""INV-STG1 leak-guard — no blob may be written to disk without going through, or
|
||||
being mirrored to, the storage layer (services/storage.py).
|
||||
|
||||
After the cutover to ``STORAGE_BACKEND=s3`` a direct disk write under DATA_DIR
|
||||
that bypasses storage creates an orphan: a file in the old folders that never
|
||||
reaches MinIO (lost on cleanup, not served, not backed up). This static guard
|
||||
fails CI on any NEW direct blob-write (``write_bytes``/``write_text``/
|
||||
``shutil.copy*``/``shutil.move``/``open(...,'wb')``) in the web API or services
|
||||
that is not explicitly acknowledged with a ``# noqa: STG1`` marker.
|
||||
|
||||
Marking a line means the author has CONSCIOUSLY handled it — either sealed it
|
||||
(``_seal_blob`` / ``storage.mirror`` right after, for paths the disk-based
|
||||
pipeline still reads) or justified it as benign (temp file, staging-then-unlink,
|
||||
git-per-case metadata, log/flag, BytesIO buffer, storage fallback). New
|
||||
unmarked writes block the build until the author does the same.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
_ROOT = Path(__file__).resolve().parents[2]
|
||||
# storage.py is the storage layer itself — its disk writes ARE the implementation.
|
||||
_EXCLUDE = {"storage.py"}
|
||||
_SCAN = [
|
||||
_ROOT / "web" / "app.py",
|
||||
*(p for p in sorted((_ROOT / "mcp-server" / "src" / "legal_mcp" / "services").glob("*.py"))
|
||||
if p.name not in _EXCLUDE),
|
||||
]
|
||||
|
||||
# Direct-disk-write patterns that could land a blob in the old folders.
|
||||
_PATTERNS = re.compile(
|
||||
r"\.write_bytes\(|\.write_text\(|shutil\.copy2?\(|shutil\.move\(|open\([^)]*,\s*['\"][wax]b?['\"]"
|
||||
)
|
||||
_MARKER = "noqa: STG1"
|
||||
|
||||
|
||||
def _violations() -> list[str]:
|
||||
out: list[str] = []
|
||||
for f in _SCAN:
|
||||
if not f.exists():
|
||||
continue
|
||||
for i, line in enumerate(f.read_text(encoding="utf-8").splitlines(), 1):
|
||||
s = line.strip()
|
||||
if s.startswith("#"):
|
||||
continue
|
||||
if _PATTERNS.search(line) and _MARKER not in line:
|
||||
out.append(f"{f.relative_to(_ROOT)}:{i}: {s[:100]}")
|
||||
return out
|
||||
|
||||
|
||||
def test_no_unmarked_blob_disk_writes():
|
||||
violations = _violations()
|
||||
assert not violations, (
|
||||
"INV-STG1: direct blob-disk-write(s) without a `# noqa: STG1` marker — "
|
||||
"seal each via `_seal_blob`/`storage.mirror` (if the pipeline reads the "
|
||||
"disk path) or justify it as benign on the line:\n "
|
||||
+ "\n ".join(violations)
|
||||
)
|
||||
Reference in New Issue
Block a user