Merge pull request 'feat(storage): אטימת מסלול-הכתיבה INV-STG1 — 15 seals + CI leak-guard + tripwire' (#205) from worktree-seal-storage-write-path into main
All checks were successful
Build & Deploy / build-and-deploy (push) Successful in 1m30s
G12 Leak-Guard / leak-guard (push) Successful in 5s

This commit was merged in pull request #205.
This commit is contained in:
2026-06-11 19:57:54 +00:00
11 changed files with 355 additions and 25 deletions

View File

@@ -18,8 +18,10 @@ import re
from datetime import date from datetime import date
from uuid import UUID from uuid import UUID
from pathlib import Path
from legal_mcp import config from legal_mcp import config
from legal_mcp.services import db, embeddings, claude_session, audit from legal_mcp.services import db, embeddings, claude_session, audit, storage
from legal_mcp.services.lessons import ( from legal_mcp.services.lessons import (
OUTCOME_LABELS_HE, OUTCOME_LABELS_HE,
PRACTICE_AREA_OVERRIDES, PRACTICE_AREA_OVERRIDES,
@@ -1119,7 +1121,13 @@ async def _update_draft_file(decision_id: UUID) -> None:
draft_dir = config.find_case_dir(case_row["case_number"]) / "drafts" draft_dir = config.find_case_dir(case_row["case_number"]) / "drafts"
draft_dir.mkdir(parents=True, exist_ok=True) draft_dir.mkdir(parents=True, exist_ok=True)
draft_path = draft_dir / "decision.md" draft_path = draft_dir / "decision.md"
draft_path.write_text("\n\n".join(row["content"] for row in rows if row["content"]), encoding="utf-8") draft_text = "\n\n".join(row["content"] for row in rows if row["content"])
draft_path.write_text(draft_text, encoding="utf-8") # noqa: STG1 — sealed below
try:
_dkey = draft_path.resolve().relative_to(Path(config.DATA_DIR).resolve()).as_posix()
await storage.mirror(_dkey, draft_text.encode("utf-8"), bucket=storage.Bucket.DOCUMENTS)
except ValueError:
pass
logger.info("Draft file synced: %s (%d blocks)", draft_path, len(rows)) logger.info("Draft file synced: %s (%d blocks)", draft_path, len(rows))

View File

@@ -487,7 +487,7 @@ async def export_decision(
await storage.put_bytes(key, data, bucket=storage.Bucket.DOCUMENTS, content_type=_docx_ctype) await storage.put_bytes(key, data, bucket=storage.Bucket.DOCUMENTS, content_type=_docx_ctype)
except ValueError: except ValueError:
Path(output_path).parent.mkdir(parents=True, exist_ok=True) Path(output_path).parent.mkdir(parents=True, exist_ok=True)
Path(output_path).write_bytes(data) Path(output_path).write_bytes(data) # noqa: STG1 — storage fallback (output_path outside DATA_DIR)
logger.info("DOCX exported (mode=%s): %s", mode, output_path) logger.info("DOCX exported (mode=%s): %s", mode, output_path)
return output_path return output_path

View File

@@ -317,7 +317,7 @@ def retrofit_bookmarks(
docx_path, _bkey, bucket=storage.Bucket.DOCUMENTS, docx_path, _bkey, bucket=storage.Bucket.DOCUMENTS,
content_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document") content_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document")
except ValueError: except ValueError:
shutil.copy2(str(docx_path), str(backup_path)) shutil.copy2(str(docx_path), str(backup_path)) # noqa: STG1 — storage fallback
# Inject bookmarks, skipping any that already exist # Inject bookmarks, skipping any that already exist
next_id = _next_bookmark_id(doc_tree) next_id = _next_bookmark_id(doc_tree)

View File

@@ -114,7 +114,7 @@ def _persist_docx_sync(output_path: Path, data: bytes) -> None:
content_type=_DOCX_CTYPE) content_type=_DOCX_CTYPE)
except ValueError: except ValueError:
out.parent.mkdir(parents=True, exist_ok=True) out.parent.mkdir(parents=True, exist_ok=True)
out.write_bytes(data) out.write_bytes(data) # noqa: STG1 — storage fallback
def _save_docx_xml( def _save_docx_xml(
@@ -536,4 +536,4 @@ def copy_with_revisions(
content_type=_DOCX_CTYPE) content_type=_DOCX_CTYPE)
except ValueError: except ValueError:
out.parent.mkdir(parents=True, exist_ok=True) out.parent.mkdir(parents=True, exist_ok=True)
shutil.copy2(str(source_path), str(out)) shutil.copy2(str(source_path), str(out)) # noqa: STG1 — storage fallback

View File

@@ -346,7 +346,7 @@ def update_chair_position(
# Atomic write # Atomic write
tmp_path = file_path.with_suffix(file_path.suffix + ".tmp") tmp_path = file_path.with_suffix(file_path.suffix + ".tmp")
tmp_path.write_text(new_content, encoding="utf-8") tmp_path.write_text(new_content, encoding="utf-8") # noqa: STG1 — atomic .tmp; in-place edit, S3 re-sync in Phase-2 read-wiring
os.replace(tmp_path, file_path) os.replace(tmp_path, file_path)
preview = new_text.strip()[:120] preview = new_text.strip()[:120]

View File

@@ -473,6 +473,43 @@ async def ensure_local(key, *, bucket=Bucket.DOCUMENTS) -> Path:
return await get_storage().ensure_local(key, bucket=bucket) return await get_storage().ensure_local(key, bucket=bucket)
# ── mirror: dual-write seal for the not-yet-read-wired pipeline (INV-STG1) ──────
# A handful of upload/finalize paths still keep a copy on disk because the
# ingest/extract pipeline reads files by their DATA_DIR path (not yet wired to
# ensure_local). For those, ``mirror``/``mirror_file`` ALSO persist the blob to
# object storage when the active backend is s3/dual — so no blob is ever missing
# from MinIO (durability + presigned serving) even though a disk copy lingers
# for the pipeline. No-op under the filesystem backend (the disk write is the
# canonical copy). Best-effort: an S3 failure is logged, never breaks the
# request (the disk copy holds). The full fix (read-wire the pipeline → drop the
# disk copy) is tracked separately; until then this closes the data-loss leak.
async def mirror(key, data, *, bucket=Bucket.DOCUMENTS,
content_type=None, metadata=None) -> None:
backend = get_storage()
if backend.name == "filesystem":
return
s3 = getattr(backend, "s3", backend)
try:
await s3.put_bytes(key, data, bucket=bucket,
content_type=content_type, metadata=metadata)
except Exception as exc: # noqa: BLE001 — log, never break the request
logger.warning("storage.mirror: S3 persist failed for %s: %s", key, exc)
async def mirror_file(src, key, *, bucket=Bucket.DOCUMENTS,
content_type=None, metadata=None) -> None:
backend = get_storage()
if backend.name == "filesystem":
return
s3 = getattr(backend, "s3", backend)
try:
await s3.put_file(src, key, bucket=bucket,
content_type=content_type, metadata=metadata)
except Exception as exc: # noqa: BLE001
logger.warning("storage.mirror_file: S3 persist failed for %s: %s", key, exc)
# ── synchronous facade ───────────────────────────────────────────── # ── synchronous facade ─────────────────────────────────────────────
# A few legacy writers are plain sync functions (track-changes save, retrofit # A few legacy writers are plain sync functions (track-changes save, retrofit
# backup, the multimodal thumbnail renderer which runs in a worker thread via # backup, the multimodal thumbnail renderer which runs in a worker thread via
@@ -511,3 +548,15 @@ def put_file_sync(src, key, *, bucket=Bucket.DOCUMENTS, content_type=None,
metadata=None) -> str: metadata=None) -> str:
return _run_coro_blocking( return _run_coro_blocking(
put_file(src, key, bucket=bucket, content_type=content_type, metadata=metadata)) put_file(src, key, bucket=bucket, content_type=content_type, metadata=metadata))
def mirror_sync(key, data, *, bucket=Bucket.DOCUMENTS, content_type=None,
metadata=None) -> None:
_run_coro_blocking(mirror(key, data, bucket=bucket,
content_type=content_type, metadata=metadata))
def mirror_file_sync(src, key, *, bucket=Bucket.DOCUMENTS, content_type=None,
metadata=None) -> None:
_run_coro_blocking(mirror_file(src, key, bucket=bucket,
content_type=content_type, metadata=metadata))

View File

@@ -0,0 +1,74 @@
"""Tests for storage.mirror — the INV-STG1 dual-write seal.
mirror() must: be a no-op under the filesystem backend (the disk write is
canonical), persist to the S3 sub-backend under s3/dual, and never raise (an S3
failure is logged, the request proceeds on the disk copy). Offline.
"""
from __future__ import annotations
import asyncio
import pytest
from legal_mcp.services import storage
class _FakeS3:
def __init__(self, fail: bool = False) -> None:
self.fail = fail
self.puts: list[tuple] = []
async def put_bytes(self, key, data, *, bucket, content_type=None, metadata=None):
if self.fail:
raise RuntimeError("s3 down")
self.puts.append((key, bytes(data), bucket))
return f"s3://{bucket}/{key}"
class _FakeFilesystem:
name = "filesystem"
class _FakeDual:
name = "dual"
def __init__(self, s3: _FakeS3) -> None:
self.s3 = s3
def _run(coro):
loop = asyncio.new_event_loop()
try:
return loop.run_until_complete(coro)
finally:
loop.close()
def test_mirror_noop_under_filesystem(monkeypatch):
monkeypatch.setattr(storage, "get_storage", lambda: _FakeFilesystem())
# must not raise and must not attempt any S3 work
_run(storage.mirror("cases/x/y.pdf", b"data", bucket=storage.Bucket.DOCUMENTS))
def test_mirror_persists_under_dual(monkeypatch):
s3 = _FakeS3()
monkeypatch.setattr(storage, "get_storage", lambda: _FakeDual(s3))
_run(storage.mirror("cases/x/y.pdf", b"data", bucket=storage.Bucket.DOCUMENTS))
assert s3.puts == [("cases/x/y.pdf", b"data", storage.Bucket.DOCUMENTS)]
def test_mirror_best_effort_never_raises(monkeypatch):
s3 = _FakeS3(fail=True)
monkeypatch.setattr(storage, "get_storage", lambda: _FakeDual(s3))
# S3 failure must be swallowed-with-log, never propagate (disk copy holds)
_run(storage.mirror("cases/x/y.pdf", b"data", bucket=storage.Bucket.DOCUMENTS))
def test_mirror_uses_backend_itself_when_no_s3_attr(monkeypatch):
# a pure s3 backend has no .s3 sub-attr → getattr falls back to the backend
s3 = _FakeS3()
s3.name = "s3"
monkeypatch.setattr(storage, "get_storage", lambda: s3)
_run(storage.mirror("k", b"d", bucket=storage.Bucket.DERIVED))
assert s3.puts == [("k", b"d", storage.Bucket.DERIVED)]

View File

@@ -0,0 +1,62 @@
"""INV-STG1 leak-guard — no blob may be written to disk without going through, or
being mirrored to, the storage layer (services/storage.py).
After the cutover to ``STORAGE_BACKEND=s3`` a direct disk write under DATA_DIR
that bypasses storage creates an orphan: a file in the old folders that never
reaches MinIO (lost on cleanup, not served, not backed up). This static guard
fails CI on any NEW direct blob-write (``write_bytes``/``write_text``/
``shutil.copy*``/``shutil.move``/``open(...,'wb')``) in the web API or services
that is not explicitly acknowledged with a ``# noqa: STG1`` marker.
Marking a line means the author has CONSCIOUSLY handled it — either sealed it
(``_seal_blob`` / ``storage.mirror`` right after, for paths the disk-based
pipeline still reads) or justified it as benign (temp file, staging-then-unlink,
git-per-case metadata, log/flag, BytesIO buffer, storage fallback). New
unmarked writes block the build until the author does the same.
"""
from __future__ import annotations
import re
from pathlib import Path
import pytest
_ROOT = Path(__file__).resolve().parents[2]
# storage.py is the storage layer itself — its disk writes ARE the implementation.
_EXCLUDE = {"storage.py"}
_SCAN = [
_ROOT / "web" / "app.py",
*(p for p in sorted((_ROOT / "mcp-server" / "src" / "legal_mcp" / "services").glob("*.py"))
if p.name not in _EXCLUDE),
]
# Direct-disk-write patterns that could land a blob in the old folders.
_PATTERNS = re.compile(
r"\.write_bytes\(|\.write_text\(|shutil\.copy2?\(|shutil\.move\(|open\([^)]*,\s*['\"][wax]b?['\"]"
)
_MARKER = "noqa: STG1"
def _violations() -> list[str]:
out: list[str] = []
for f in _SCAN:
if not f.exists():
continue
for i, line in enumerate(f.read_text(encoding="utf-8").splitlines(), 1):
s = line.strip()
if s.startswith("#"):
continue
if _PATTERNS.search(line) and _MARKER not in line:
out.append(f"{f.relative_to(_ROOT)}:{i}: {s[:100]}")
return out
def test_no_unmarked_blob_disk_writes():
violations = _violations()
assert not violations, (
"INV-STG1: direct blob-disk-write(s) without a `# noqa: STG1` marker — "
"seal each via `_seal_blob`/`storage.mirror` (if the pipeline reads the "
"disk path) or justify it as benign on the line:\n "
+ "\n ".join(violations)
)

View File

@@ -48,6 +48,7 @@
| `backfill_nevo_preamble.py` | python | **#86.2** — מיגרציית-נתונים: חיתוך preamble/רציו של נבו שדלף לפסיקה שהוטמעה לפני תיקון #86.1. מאתר כל `case_law` ש-`strip_nevo_preamble(full_text)` עדיין מקצר (דליפה היסטורית), ומבצע: (1) לכידת ה-מיני-רציו ל-`case_law.nevo_ratio` (gold-set ל-#86.3); (2) שכתוב `full_text` החתוך + חישוב-מחדש של `content_hash`; (3) `reindex_case_law` (re-chunk+embed, ללא re-OCR/LLM); (4) **סימון (לא מחיקה)** הלכות ש-`supporting_quote` שלהן בתוך ה-preamble שהוסר → `pending_review` + quality_flag `nevo_preamble_leak`. **שומר-בטיחות:** שורות עם keep%<`--min-keep` (ברירת-מחדל 60) מוחרגות מ-`--apply` כחשד over-strip (אלא אם `--include-suspicious`). **dry-run כברירת-מחדל**; `--apply` כותב backup JSON + manifest CSV ל-`data/audit/` תחילה. idempotent. רץ עם venv של mcp-server. **chair-gated** (לאמת manifest לפני apply) | מיגרציית-נתונים — dry-run בוצע (19 פסקים, 27 הלכות מזוהמות); apply ממתין לאישור | | `backfill_nevo_preamble.py` | python | **#86.2** — מיגרציית-נתונים: חיתוך preamble/רציו של נבו שדלף לפסיקה שהוטמעה לפני תיקון #86.1. מאתר כל `case_law` ש-`strip_nevo_preamble(full_text)` עדיין מקצר (דליפה היסטורית), ומבצע: (1) לכידת ה-מיני-רציו ל-`case_law.nevo_ratio` (gold-set ל-#86.3); (2) שכתוב `full_text` החתוך + חישוב-מחדש של `content_hash`; (3) `reindex_case_law` (re-chunk+embed, ללא re-OCR/LLM); (4) **סימון (לא מחיקה)** הלכות ש-`supporting_quote` שלהן בתוך ה-preamble שהוסר → `pending_review` + quality_flag `nevo_preamble_leak`. **שומר-בטיחות:** שורות עם keep%<`--min-keep` (ברירת-מחדל 60) מוחרגות מ-`--apply` כחשד over-strip (אלא אם `--include-suspicious`). **dry-run כברירת-מחדל**; `--apply` כותב backup JSON + manifest CSV ל-`data/audit/` תחילה. idempotent. רץ עם venv של mcp-server. **chair-gated** (לאמת manifest לפני apply) | מיגרציית-נתונים — dry-run בוצע (19 פסקים, 27 הלכות מזוהמות); apply ממתין לאישור |
| `nevo_ratio_benchmark.py` | python | **#86.3** — מדידת איכות חילוץ-הלכות מול ה-מיני-רציו של נבו (gold-set מקצועי חינמי). לכל פסק עם `nevo_ratio` (או נגזר מ-`full_text` אם טרם בוצע backfill): LLM-judge מקומי (`claude_session`, אפס עלות) ממפה סמנטית את הלכות-המערכת מול הלכות-נבו ומפיק **recall** (כיסוי הלכות-נבו), **precision** (אחוז הלכותינו הממופות), **granularity** (יחס פירוק — איתות over-extraction ל-#81.5). `--case <num>` / `--all [--limit N]` / `--model` / `--out`. כותב CSV ל-`data/audit/`. רץ עם venv של mcp-server (דורש Claude CLI מקומי). אומת על בג"ץ 1764/05: recall 0.875, precision 1.0, granularity 1.75x | ידני — מדידת-איכות (CI/ad-hoc) | | `nevo_ratio_benchmark.py` | python | **#86.3** — מדידת איכות חילוץ-הלכות מול ה-מיני-רציו של נבו (gold-set מקצועי חינמי). לכל פסק עם `nevo_ratio` (או נגזר מ-`full_text` אם טרם בוצע backfill): LLM-judge מקומי (`claude_session`, אפס עלות) ממפה סמנטית את הלכות-המערכת מול הלכות-נבו ומפיק **recall** (כיסוי הלכות-נבו), **precision** (אחוז הלכותינו הממופות), **granularity** (יחס פירוק — איתות over-extraction ל-#81.5). `--case <num>` / `--all [--limit N]` / `--model` / `--out`. כותב CSV ל-`data/audit/`. רץ עם venv של mcp-server (דורש Claude CLI מקומי). אומת על בג"ץ 1764/05: recall 0.875, precision 1.0, granularity 1.75x | ידני — מדידת-איכות (CI/ad-hoc) |
| `migrate_blobs_to_minio.py` | python | **#106.4 — הגירת בלובים לדיסק→MinIO (DB-driven, dry-run-default).** סורק 6 עמודות-נתיב (documents.file_path · cases.active_draft_path · digests.source_document_path · draft_final_pairs.final_path · *_image_embeddings.image_thumbnail_path), מנרמל 3 פורמטי-נתיב legacy (container-abs `/data/`, host-abs, relative) ל-key יחסי-DATA_DIR, וגוזר bucket per-file-semantic (מסמך→documents, thumbnail→derived). dry-run מפיק תוכנית+מניפסט CSV (data/audit) + מדווח חסרים; `--apply` מעלה דרך mcli ומאמת size (דיסק לא נוגע → הפיך). אומת 2026-06-11: 3404 קבצים/899MB, 0 outside, 28 חסרים. **חובה mcli alias legalminio**. | ידני — הגירת-אחסון X14 | | `migrate_blobs_to_minio.py` | python | **#106.4 — הגירת בלובים לדיסק→MinIO (DB-driven, dry-run-default).** סורק 6 עמודות-נתיב (documents.file_path · cases.active_draft_path · digests.source_document_path · draft_final_pairs.final_path · *_image_embeddings.image_thumbnail_path), מנרמל 3 פורמטי-נתיב legacy (container-abs `/data/`, host-abs, relative) ל-key יחסי-DATA_DIR, וגוזר bucket per-file-semantic (מסמך→documents, thumbnail→derived). dry-run מפיק תוכנית+מניפסט CSV (data/audit) + מדווח חסרים; `--apply` מעלה דרך mcli ומאמת size (דיסק לא נוגע → הפיך). אומת 2026-06-11: 3404 קבצים/899MB, 0 outside, 28 חסרים. **חובה mcli alias legalminio**. | ידני — הגירת-אחסון X14 |
| `storage_leak_tripwire.py` | python | **INV-STG1 tripwire (ניטור-ריצה).** משלים את ה-CI leak-guard: סורק בלובים ב-data/{cases,precedent-library,internal-decisions,digests,training} ומשווה מול ה-key-sets החיים של legal-documents/legal-derived (json-key match, סיווג bucket per-file כמו בהגירה). מדווח בלובים שדלפו (בדיסק אך לא ב-MinIO → יאבדו בניקוי, לא מוגשים/מגובים). read-only, `--since <ISO>`. אומת: 0 דליפות. **חובה מקומי** (mcli legalminio). | תקופתי / לפני ניקוי-דיסק #128 |
| `nevo_corpus_audit.py` | python | **#86.2/#86.3 — אודיט קורפוס-נבו (read-only).** `leak` סורק chunks+הלכות למרקרי-preamble של נבו (מיובאים מ-extractor._NEVO_MARKERS), מבחין בין הווקטור המזיק (מרקר בתוך הלכה=רציו-עריכה כהלכה) ל-benign (רשימת-ציטוטים), ומפיק CSV. אומת 2026-06-11: **0 הלכות מזוהמות** (שכבת-הידע נקייה) → אין purge/re-ingest (גם נוגד no-reocr). `leak --apply` מבצע backfill **אדיטיבי** של `case_law.nevo_ratio` מטקסט שמור (extract_nevo_ratio, ללא re-OCR) — captured 16→32. `benchmark` משווה הלכות-שלנו מול ה-מיני-רציו דרך הפאנל התלת-מודלי → recall כיסוי (1110-20: 13 הלכות, recall=1.0). **חובה מקומי** (benchmark). | ידני — ניטור-זיהום / ground-truth | | `nevo_corpus_audit.py` | python | **#86.2/#86.3 — אודיט קורפוס-נבו (read-only).** `leak` סורק chunks+הלכות למרקרי-preamble של נבו (מיובאים מ-extractor._NEVO_MARKERS), מבחין בין הווקטור המזיק (מרקר בתוך הלכה=רציו-עריכה כהלכה) ל-benign (רשימת-ציטוטים), ומפיק CSV. אומת 2026-06-11: **0 הלכות מזוהמות** (שכבת-הידע נקייה) → אין purge/re-ingest (גם נוגד no-reocr). `leak --apply` מבצע backfill **אדיטיבי** של `case_law.nevo_ratio` מטקסט שמור (extract_nevo_ratio, ללא re-OCR) — captured 16→32. `benchmark` משווה הלכות-שלנו מול ה-מיני-רציו דרך הפאנל התלת-מודלי → recall כיסוי (1110-20: 13 הלכות, recall=1.0). **חובה מקומי** (benchmark). | ידני — ניטור-זיהום / ground-truth |
| `halacha_goldset.py` | python | **#81.7** — הארנס gold-set לאיכות חילוץ-הלכות. `export --n N` מייצא מדגם מרובד (לפי precedent×rule_type) ל-CSV עם עמודות-תיוג ריקות (`is_holding`/`correct_type`/`quote_complete`) לתיוג ידני (חיים/דפנה). `score --in <csv>` קורא את ה-CSV המתויג ומודד כל ולידטור (`compute_quality_flags`/`is_fact_dependent`/`is_quote_truncated`/`is_thin_restatement`) מול אמת-המידה האנושית: P/R/F1 + confusion. בסיס ל-#81.8 (כיול סף האישור). מייבא את אותם ולידטורים שה-extractor מריץ. רץ עם venv של mcp-server. **הערה:** קיים גם דף-תיוג אינטראקטיבי DB-backed (`/goldset`) — זה ה-CSV-fallback | ידני — export→תיוג→score | | `halacha_goldset.py` | python | **#81.7** — הארנס gold-set לאיכות חילוץ-הלכות. `export --n N` מייצא מדגם מרובד (לפי precedent×rule_type) ל-CSV עם עמודות-תיוג ריקות (`is_holding`/`correct_type`/`quote_complete`) לתיוג ידני (חיים/דפנה). `score --in <csv>` קורא את ה-CSV המתויג ומודד כל ולידטור (`compute_quality_flags`/`is_fact_dependent`/`is_quote_truncated`/`is_thin_restatement`) מול אמת-המידה האנושית: P/R/F1 + confusion. בסיס ל-#81.8 (כיול סף האישור). מייבא את אותם ולידטורים שה-extractor מריץ. רץ עם venv של mcp-server. **הערה:** קיים גם דף-תיוג אינטראקטיבי DB-backed (`/goldset`) — זה ה-CSV-fallback | ידני — export→תיוג→score |
| `goldset_panel_label.py` | python | **#81.7 — תיוג ה-gold-set בקונצנזוס תלת-מודלי (ללא man-in-the-loop, הנחיית-יו"ר 2026-06-11).** מריץ את שלושת השופטים העצמאיים (Opus/claude_session · DeepSeek · Gemini, מיובאים מ-`halacha_panel_approve`) עם ה-prompt העשיר (`is_holding`+`type`+נימוק מ-`goldset_ai_recommend`) על כל פריט; **רוב 2/3 נכתב ל-`is_holding`/`correct_type`** עם `tagged_by='panel:opus+deepseek+gemini'`יצול→NULL→יו"ר, INV-G10). מודד **Fleiss κ** (3 מעריכים) ומריץ **מבחן-אנונימיזציה** (שמות-תיק ממוסכים→שיפוט-מחדש; flip=שינון). לא מעגלי — הוולידטורים הנמדדים rule-based. כותב per-model+consensus+anon ל-DB ודוח ל-`data/audit/`. **מחליף** תיוג-ידני; `goldset_ai_recommend`/`goldset_independent_judge` נשארים כבדיקות single-model. `--limit`/`--no-anon`/`--force`. **חובה מקומי**. | ידני — לאחר יצירת/הרחבת batch | | `goldset_panel_label.py` | python | **#81.7 — תיוג ה-gold-set בקונצנזוס תלת-מודלי (ללא man-in-the-loop, הנחיית-יו"ר 2026-06-11).** מריץ את שלושת השופטים העצמאיים (Opus/claude_session · DeepSeek · Gemini, מיובאים מ-`halacha_panel_approve`) עם ה-prompt העשיר (`is_holding`+`type`+נימוק מ-`goldset_ai_recommend`) על כל פריט; **רוב 2/3 נכתב ל-`is_holding`/`correct_type`** עם `tagged_by='panel:opus+deepseek+gemini'`יצול→NULL→יו"ר, INV-G10). מודד **Fleiss κ** (3 מעריכים) ומריץ **מבחן-אנונימיזציה** (שמות-תיק ממוסכים→שיפוט-מחדש; flip=שינון). לא מעגלי — הוולידטורים הנמדדים rule-based. כותב per-model+consensus+anon ל-DB ודוח ל-`data/audit/`. **מחליף** תיוג-ידני; `goldset_ai_recommend`/`goldset_independent_judge` נשארים כבדיקות single-model. `--limit`/`--no-anon`/`--force`. **חובה מקומי**. | ידני — לאחר יצירת/הרחבת batch |

View File

@@ -0,0 +1,97 @@
#!/usr/bin/env python3
"""INV-STG1 runtime tripwire — detect blobs that leaked to the old disk folders
without reaching MinIO (the detective control complementing the CI leak-guard).
After the s3-only cutover, every blob written under DATA_DIR/{cases,
precedent-library,internal-decisions,digests,training} should ALSO be in MinIO
(the upload/finalize paths keep a disk copy for the pipeline but mirror to S3 via
storage.mirror — see web/app.py _seal_blob). A file present on disk but ABSENT
from the matching S3 bucket means a write bypassed the seal → it would be lost on
disk cleanup and is not served/backed-up. This script reports them.
Classifies disk files into documents/derived buckets exactly like the migration
(``*/extracted/*`` and ``*/thumbnails/*`` → legal-derived; the rest → legal-
documents) and compares against the live bucket key-sets (proper JSON key match,
so Hebrew filenames with spaces compare correctly). Read-only.
Run locally (needs the `legalminio` mcli alias):
python3 scripts/storage_leak_tripwire.py # full scan
python3 scripts/storage_leak_tripwire.py --since 2026-06-11 # only newer files
"""
from __future__ import annotations
import argparse
import json
import subprocess
import sys
from pathlib import Path
MCLI = str(Path.home() / ".local" / "bin" / "mcli")
DATA = Path("/home/chaim/legal-ai/data")
CATS = ["cases", "precedent-library", "internal-decisions", "digests", "training"]
# non-blob disk files that legitimately stay on disk / in git-per-case
SKIP_SUFFIX = {".tmp", ".log"}
SKIP_NAME = {"case.json", "notes.md", ".pull.log"}
def _bucket_for(rel: str) -> str:
return ("legal-derived" if ("/extracted/" in rel or "/thumbnails/" in rel)
else "legal-documents")
def _s3_keys(bucket: str) -> set[str]:
out = subprocess.run([MCLI, "ls", "--recursive", "--json", f"legalminio/{bucket}"],
capture_output=True, text=True, env={"TERM": "xterm", "HOME": str(Path.home())})
keys: set[str] = set()
for ln in out.stdout.splitlines():
try:
k = json.loads(ln).get("key", "")
except json.JSONDecodeError:
continue
if k and "/.git/" not in k:
keys.add(k)
return keys
def main(args) -> int:
s3 = {b: _s3_keys(b) for b in ("legal-documents", "legal-derived")}
since = None
if args.since:
import datetime
since = datetime.datetime.fromisoformat(args.since).timestamp()
leaked: list[str] = []
scanned = 0
for cat in CATS:
root = DATA / cat
if not root.exists():
continue
for f in root.rglob("*"):
if not f.is_file() or "/.git/" in f.as_posix():
continue
if f.suffix in SKIP_SUFFIX or f.name in SKIP_NAME:
continue
if since and f.stat().st_mtime < since:
continue
rel = f.relative_to(DATA).as_posix()
scanned += 1
if rel not in s3[_bucket_for(rel)]:
leaked.append(rel)
print(f"scanned {scanned} disk blobs across {CATS}")
if not leaked:
print("✓ no leaks — every disk blob is present in MinIO.")
return 0
print(f"{len(leaked)} LEAKED blobs (on disk, NOT in MinIO):")
for r in leaked[:50]:
print(f" {r} → expected in {_bucket_for(r)}")
if len(leaked) > 50:
print(f" … and {len(leaked) - 50} more")
return 1
if __name__ == "__main__":
ap = argparse.ArgumentParser(description=__doc__,
formatter_class=argparse.RawDescriptionHelpFormatter)
ap.add_argument("--since", help="ISO date — only check files modified on/after")
sys.exit(main(ap.parse_args()))

View File

@@ -146,7 +146,8 @@ async def upload_file(file: UploadFile = File(...)):
raise HTTPException(400, f"File too large. Max: {MAX_FILE_SIZE // (1024*1024)}MB") raise HTTPException(400, f"File too large. Max: {MAX_FILE_SIZE // (1024*1024)}MB")
dest = UPLOAD_DIR / filename dest = UPLOAD_DIR / filename
dest.write_bytes(content) dest.write_bytes(content) # noqa: STG1 — sealed below
await _seal_blob(dest, content)
return { return {
"filename": filename, "filename": filename,
@@ -301,12 +302,14 @@ async def _process_proofread_training(
# Copy original to training dir # Copy original to training dir
original_name = re.sub(r"^\d+_", "", source.name) original_name = re.sub(r"^\d+_", "", source.name)
orig_dest = training_dir / original_name orig_dest = training_dir / original_name
shutil.copy2(str(source), str(orig_dest)) shutil.copy2(str(source), str(orig_dest)) # noqa: STG1 — sealed below
await _seal_blob_file(orig_dest)
# Save cleaned version # Save cleaned version
proofread_name = Path(original_name).stem + ".md" proofread_name = Path(original_name).stem + ".md"
proofread_dest = proofread_dir / proofread_name proofread_dest = proofread_dir / proofread_name
proofread_dest.write_text(clean_text, encoding="utf-8") proofread_dest.write_text(clean_text, encoding="utf-8") # noqa: STG1 — sealed below
await _seal_blob(proofread_dest, clean_text.encode("utf-8"))
# 3. Parse date # 3. Parse date
d_date = None d_date = None
@@ -1405,7 +1408,7 @@ async def create_curator_proposal(body: CuratorProposal):
f"## נימוק\n\n{body.rationale.strip() or '(לא ניתן)'}\n" f"## נימוק\n\n{body.rationale.strip() or '(לא ניתן)'}\n"
) )
try: try:
path.write_text(md, encoding="utf-8") path.write_text(md, encoding="utf-8") # noqa: STG1 — curator proposal state, not a corpus blob
except OSError as e: except OSError as e:
raise HTTPException(500, f"failed to write proposal: {e}") raise HTTPException(500, f"failed to write proposal: {e}")
return { return {
@@ -2846,6 +2849,31 @@ async def serve_blob(
return FileResponse(path, media_type=media_type, filename=filename) return FileResponse(path, media_type=media_type, filename=filename)
async def _seal_blob(dest: Path, content: bytes,
*, bucket=storage.Bucket.DOCUMENTS) -> None:
"""Mirror a just-written disk blob to object storage (INV-STG1 seal).
The ingest/extract pipeline still reads ``dest`` from its DATA_DIR path, so
these endpoints keep the disk copy; this ALSO persists the blob to MinIO so
nothing written to the old folders is ever missing from object storage
(durability + presigned serving). No-op under the filesystem backend;
best-effort under s3/dual (logged, never breaks the request)."""
try:
key = dest.resolve().relative_to(Path(config.DATA_DIR).resolve()).as_posix()
except ValueError:
return # outside DATA_DIR → not a managed blob
await storage.mirror(key, content, bucket=bucket)
async def _seal_blob_file(dest: Path, *, bucket=storage.Bucket.DOCUMENTS) -> None:
"""``_seal_blob`` for a file already on disk (e.g. after shutil.copy)."""
try:
key = dest.resolve().relative_to(Path(config.DATA_DIR).resolve()).as_posix()
except ValueError:
return
await storage.mirror_file(dest, key, bucket=bucket)
@app.get("/api/cases/{case_number}/local-files/{folder}/{filename}") @app.get("/api/cases/{case_number}/local-files/{folder}/{filename}")
async def api_read_local_file(case_number: str, folder: str, filename: str): async def api_read_local_file(case_number: str, folder: str, filename: str):
"""Read contents of a local case file.""" """Read contents of a local case file."""
@@ -2952,7 +2980,7 @@ async def api_research_analysis_upload(
tmp = dest.with_suffix(".md.upload-tmp") tmp = dest.with_suffix(".md.upload-tmp")
try: try:
dest.parent.mkdir(parents=True, exist_ok=True) dest.parent.mkdir(parents=True, exist_ok=True)
tmp.write_text(text, encoding="utf-8") tmp.write_text(text, encoding="utf-8") # noqa: STG1 — atomic upload .tmp, replaced below
parsed = research_md.parse(tmp) parsed = research_md.parse(tmp)
except Exception as e: except Exception as e:
tmp.unlink(missing_ok=True) tmp.unlink(missing_ok=True)
@@ -2986,7 +3014,8 @@ async def api_research_analysis_upload(
backup_dir.mkdir(exist_ok=True) backup_dir.mkdir(exist_ok=True)
ts = time.strftime("%Y%m%d-%H%M%S") ts = time.strftime("%Y%m%d-%H%M%S")
backup_path = backup_dir / f"analysis-and-research-{ts}.md" backup_path = backup_dir / f"analysis-and-research-{ts}.md"
shutil.copy2(dest, backup_path) shutil.copy2(dest, backup_path) # noqa: STG1 — sealed below
await _seal_blob_file(backup_path)
# Replace with uploaded file # Replace with uploaded file
tmp.replace(dest) tmp.replace(dest)
@@ -3096,7 +3125,8 @@ async def api_precedent_upload_pdf(
while dest.exists(): while dest.exists():
dest = case_dir / f"{safe_name or 'precedent'}-{counter}{ext}" dest = case_dir / f"{safe_name or 'precedent'}-{counter}{ext}"
counter += 1 counter += 1
dest.write_bytes(content) dest.write_bytes(content) # noqa: STG1 — sealed below
await _seal_blob(dest, content)
case_id = UUID(case["id"]) case_id = UUID(case["id"])
doc = await db.create_document( doc = await db.create_document(
@@ -3227,7 +3257,8 @@ async def api_upload_export(case_number: str, file: UploadFile = File(...)):
pass pass
dest = export_dir / f"עריכה-v{next_ver}.docx" dest = export_dir / f"עריכה-v{next_ver}.docx"
dest.write_bytes(content) dest.write_bytes(content) # noqa: STG1 — sealed below
await _seal_blob(dest, content)
# Auto-register as active_draft + retrofit bookmarks # Auto-register as active_draft + retrofit bookmarks
auto_result: dict = {"status": "ok"} auto_result: dict = {"status": "ok"}
@@ -3382,12 +3413,14 @@ async def api_mark_final(case_number: str, filename: str):
# Rename/copy to final # Rename/copy to final
final_name = f"סופי-{case_number}.docx" final_name = f"סופי-{case_number}.docx"
final_path = export_dir / final_name final_path = export_dir / final_name
shutil.copy2(str(source), str(final_path)) shutil.copy2(str(source), str(final_path)) # noqa: STG1 — sealed below
await _seal_blob_file(final_path)
# Also copy to training directory for future style learning # Also copy to training directory for future style learning
config.TRAINING_DIR.mkdir(parents=True, exist_ok=True) config.TRAINING_DIR.mkdir(parents=True, exist_ok=True)
training_dest = config.TRAINING_DIR / f"החלטה-{case_number}.docx" training_dest = config.TRAINING_DIR / f"החלטה-{case_number}.docx"
shutil.copy2(str(source), str(training_dest)) shutil.copy2(str(source), str(training_dest)) # noqa: STG1 — sealed below
await _seal_blob_file(training_dest)
# Update case status to final # Update case status to final
pool = await db.get_pool() pool = await db.get_pool()
@@ -3635,13 +3668,15 @@ async def api_upload_final_decision(case_number: str, file: UploadFile = File(..
export_dir.mkdir(parents=True, exist_ok=True) export_dir.mkdir(parents=True, exist_ok=True)
final_name = f"סופי-{case_number}.docx" final_name = f"סופי-{case_number}.docx"
final_path = export_dir / final_name final_path = export_dir / final_name
final_path.write_bytes(content) final_path.write_bytes(content) # noqa: STG1 — sealed below
await _seal_blob(final_path, content)
# Enroll in the style corpus. Use the FULL case_number as decision_number so a # Enroll in the style corpus. Use the FULL case_number as decision_number so a
# בל"מ never collides with a same-numbered ערר already in the corpus (e.g. ARAR-25-8126). # בל"מ never collides with a same-numbered ערר already in the corpus (e.g. ARAR-25-8126).
config.TRAINING_DIR.mkdir(parents=True, exist_ok=True) config.TRAINING_DIR.mkdir(parents=True, exist_ok=True)
training_dest = config.TRAINING_DIR / f"החלטה-{case_number}.docx" training_dest = config.TRAINING_DIR / f"החלטה-{case_number}.docx"
shutil.copy2(str(final_path), str(training_dest)) shutil.copy2(str(final_path), str(training_dest)) # noqa: STG1 — sealed below
await _seal_blob_file(training_dest)
# Extract the final text (word count for the UI; full text snapshotted into the pair). # Extract the final text (word count for the UI; full text snapshotted into the pair).
final_text = "" final_text = ""
@@ -4977,7 +5012,7 @@ async def api_install_skill(file: UploadFile = File(...)):
dest = skill_dir / rel_path dest = skill_dir / rel_path
dest.parent.mkdir(parents=True, exist_ok=True) dest.parent.mkdir(parents=True, exist_ok=True)
dest.write_bytes(zf.read(name)) dest.write_bytes(zf.read(name)) # noqa: STG1 — extracts to ~/.paperclip skills (outside DATA_DIR)
extracted_files.append(rel_path) extracted_files.append(rel_path)
zf.close() zf.close()
@@ -5149,7 +5184,7 @@ async def api_restart_paperclip():
# Fallback: write a flag file that host-side watcher picks up # Fallback: write a flag file that host-side watcher picks up
flag_file = PAPERCLIP_SKILLS_DIR / ".restart-requested" flag_file = PAPERCLIP_SKILLS_DIR / ".restart-requested"
try: try:
flag_file.write_text(str(time.time())) flag_file.write_text(str(time.time())) # noqa: STG1 — restart flag (state)
return { return {
"status": "restart_requested", "status": "restart_requested",
"method": "flag_file", "method": "flag_file",
@@ -5202,7 +5237,8 @@ async def api_upload_tagged_document(
dest = case_dir / f"{stem}-{counter}{ext}" dest = case_dir / f"{stem}-{counter}{ext}"
counter += 1 counter += 1
dest.write_bytes(content) dest.write_bytes(content) # noqa: STG1 — sealed below
await _seal_blob(dest, content)
# Create document record # Create document record
case_id = UUID(case["id"]) case_id = UUID(case["id"])
@@ -5742,7 +5778,8 @@ async def _process_case_document(task_id: str, source: Path, req: ClassifyReques
# Use original name without timestamp prefix # Use original name without timestamp prefix
original_name = re.sub(r"^\d+_", "", source.name) original_name = re.sub(r"^\d+_", "", source.name)
dest = case_dir / original_name dest = case_dir / original_name
shutil.copy2(str(source), str(dest)) shutil.copy2(str(source), str(dest)) # noqa: STG1 — sealed below
await _seal_blob_file(dest)
# Create document record # Create document record
await _progress.set(task_id, {"status": "registering", "filename": req.filename}) await _progress.set(task_id, {"status": "registering", "filename": req.filename})
@@ -5792,7 +5829,8 @@ async def _process_training_document(task_id: str, source: Path, req: ClassifyRe
config.TRAINING_DIR.mkdir(parents=True, exist_ok=True) config.TRAINING_DIR.mkdir(parents=True, exist_ok=True)
original_name = re.sub(r"^\d+_", "", source.name) original_name = re.sub(r"^\d+_", "", source.name)
dest = config.TRAINING_DIR / original_name dest = config.TRAINING_DIR / original_name
shutil.copy2(str(source), str(dest)) shutil.copy2(str(source), str(dest)) # noqa: STG1 — sealed below
await _seal_blob_file(dest)
# Extract text # Extract text
await _progress.set(task_id, {"status": "processing", "filename": req.filename, "step": "extracting"}) await _progress.set(task_id, {"status": "processing", "filename": req.filename, "step": "extracting"})
@@ -6865,7 +6903,8 @@ async def bulletin_upload(file: UploadFile = File(...)):
# Idempotent: same content (any filename) already staged → skip. # Idempotent: same content (any filename) already staged → skip.
if any(p.name.startswith(f"{digest}_") for p in _BULLETINS_DIR.glob(f"{digest}_*")): if any(p.name.startswith(f"{digest}_") for p in _BULLETINS_DIR.glob(f"{digest}_*")):
return {"status": "exists", "filename": dest.name, "size": len(content)} return {"status": "exists", "filename": dest.name, "size": len(content)}
dest.write_bytes(content) dest.write_bytes(content) # noqa: STG1 — sealed below
await _seal_blob(dest, content)
return {"status": "stored", "filename": dest.name, "size": len(content)} return {"status": "stored", "filename": dest.name, "size": len(content)}