Merge pull request 'feat(storage): אטימת מסלול-הכתיבה INV-STG1 — 15 seals + CI leak-guard + tripwire' (#205) from worktree-seal-storage-write-path into main
All checks were successful
Build & Deploy / build-and-deploy (push) Successful in 1m30s
G12 Leak-Guard / leak-guard (push) Successful in 5s

This commit was merged in pull request #205.
This commit is contained in:
2026-06-11 19:57:54 +00:00
11 changed files with 355 additions and 25 deletions

View File

@@ -146,7 +146,8 @@ async def upload_file(file: UploadFile = File(...)):
raise HTTPException(400, f"File too large. Max: {MAX_FILE_SIZE // (1024*1024)}MB")
dest = UPLOAD_DIR / filename
dest.write_bytes(content)
dest.write_bytes(content) # noqa: STG1 — sealed below
await _seal_blob(dest, content)
return {
"filename": filename,
@@ -301,12 +302,14 @@ async def _process_proofread_training(
# Copy original to training dir
original_name = re.sub(r"^\d+_", "", source.name)
orig_dest = training_dir / original_name
shutil.copy2(str(source), str(orig_dest))
shutil.copy2(str(source), str(orig_dest)) # noqa: STG1 — sealed below
await _seal_blob_file(orig_dest)
# Save cleaned version
proofread_name = Path(original_name).stem + ".md"
proofread_dest = proofread_dir / proofread_name
proofread_dest.write_text(clean_text, encoding="utf-8")
proofread_dest.write_text(clean_text, encoding="utf-8") # noqa: STG1 — sealed below
await _seal_blob(proofread_dest, clean_text.encode("utf-8"))
# 3. Parse date
d_date = None
@@ -1405,7 +1408,7 @@ async def create_curator_proposal(body: CuratorProposal):
f"## נימוק\n\n{body.rationale.strip() or '(לא ניתן)'}\n"
)
try:
path.write_text(md, encoding="utf-8")
path.write_text(md, encoding="utf-8") # noqa: STG1 — curator proposal state, not a corpus blob
except OSError as e:
raise HTTPException(500, f"failed to write proposal: {e}")
return {
@@ -2846,6 +2849,31 @@ async def serve_blob(
return FileResponse(path, media_type=media_type, filename=filename)
async def _seal_blob(
    dest: Path,
    content: bytes,
    *,
    bucket=storage.Bucket.DOCUMENTS,
) -> None:
    """Mirror a blob that was just written to disk into object storage.

    This is the INV-STG1 "seal": the caller keeps its on-disk copy at
    ``dest`` and this helper additionally hands the same bytes to
    ``storage.mirror`` so the blob also exists in the object store.

    The storage key is ``dest`` expressed relative to ``config.DATA_DIR``
    (both resolved), rendered with forward slashes. A ``dest`` that does not
    live under ``DATA_DIR`` is not a managed blob and is silently skipped.

    Args:
        dest: Path of the file that was just written.
        content: The exact bytes that were written to ``dest``.
        bucket: Target bucket, forwarded to ``storage.mirror``.

    NOTE(review): per the original author's note, ``storage.mirror`` is a
    no-op under the filesystem backend and best-effort (logged, never
    raising) under s3/dual. This function adds no error handling of its own,
    so that guarantee lives entirely in ``storage.mirror`` — confirm there.
    """
    try:
        # Resolve both sides so symlinks / relative segments cannot make a
        # path inside DATA_DIR look like it is outside (or vice versa).
        resolved_dest = dest.resolve()
        data_root = Path(config.DATA_DIR).resolve()
        relative = resolved_dest.relative_to(data_root)
    except ValueError:
        # Outside DATA_DIR -> not a managed blob, nothing to seal.
        return

    await storage.mirror(relative.as_posix(), content, bucket=bucket)
async def _seal_blob_file(dest: Path, *, bucket=storage.Bucket.DOCUMENTS) -> None:
    """Seal a blob that is already on disk (e.g. right after ``shutil.copy2``).

    File-based counterpart of ``_seal_blob``: the storage key is ``dest``
    relative to ``config.DATA_DIR`` (both resolved) in POSIX form, and the
    path itself — rather than an in-memory payload — is handed to
    ``storage.mirror_file``. Paths outside ``DATA_DIR`` are skipped silently.

    Args:
        dest: Path of the existing file to mirror.
        bucket: Target bucket, forwarded to ``storage.mirror_file``.
    """
    try:
        resolved_dest = dest.resolve()
        data_root = Path(config.DATA_DIR).resolve()
        relative = resolved_dest.relative_to(data_root)
    except ValueError:
        # Not under DATA_DIR -> not a managed blob.
        return

    # The original (unresolved) path is what gets passed on for reading.
    await storage.mirror_file(dest, relative.as_posix(), bucket=bucket)
@app.get("/api/cases/{case_number}/local-files/{folder}/{filename}")
async def api_read_local_file(case_number: str, folder: str, filename: str):
"""Read contents of a local case file."""
@@ -2952,7 +2980,7 @@ async def api_research_analysis_upload(
tmp = dest.with_suffix(".md.upload-tmp")
try:
dest.parent.mkdir(parents=True, exist_ok=True)
tmp.write_text(text, encoding="utf-8")
tmp.write_text(text, encoding="utf-8") # noqa: STG1 — atomic upload .tmp, replaced below
parsed = research_md.parse(tmp)
except Exception as e:
tmp.unlink(missing_ok=True)
@@ -2986,7 +3014,8 @@ async def api_research_analysis_upload(
backup_dir.mkdir(exist_ok=True)
ts = time.strftime("%Y%m%d-%H%M%S")
backup_path = backup_dir / f"analysis-and-research-{ts}.md"
shutil.copy2(dest, backup_path)
shutil.copy2(dest, backup_path) # noqa: STG1 — sealed below
await _seal_blob_file(backup_path)
# Replace with uploaded file
tmp.replace(dest)
@@ -3096,7 +3125,8 @@ async def api_precedent_upload_pdf(
while dest.exists():
dest = case_dir / f"{safe_name or 'precedent'}-{counter}{ext}"
counter += 1
dest.write_bytes(content)
dest.write_bytes(content) # noqa: STG1 — sealed below
await _seal_blob(dest, content)
case_id = UUID(case["id"])
doc = await db.create_document(
@@ -3227,7 +3257,8 @@ async def api_upload_export(case_number: str, file: UploadFile = File(...)):
pass
dest = export_dir / f"עריכה-v{next_ver}.docx"
dest.write_bytes(content)
dest.write_bytes(content) # noqa: STG1 — sealed below
await _seal_blob(dest, content)
# Auto-register as active_draft + retrofit bookmarks
auto_result: dict = {"status": "ok"}
@@ -3382,12 +3413,14 @@ async def api_mark_final(case_number: str, filename: str):
# Rename/copy to final
final_name = f"סופי-{case_number}.docx"
final_path = export_dir / final_name
shutil.copy2(str(source), str(final_path))
shutil.copy2(str(source), str(final_path)) # noqa: STG1 — sealed below
await _seal_blob_file(final_path)
# Also copy to training directory for future style learning
config.TRAINING_DIR.mkdir(parents=True, exist_ok=True)
training_dest = config.TRAINING_DIR / f"החלטה-{case_number}.docx"
shutil.copy2(str(source), str(training_dest))
shutil.copy2(str(source), str(training_dest)) # noqa: STG1 — sealed below
await _seal_blob_file(training_dest)
# Update case status to final
pool = await db.get_pool()
@@ -3635,13 +3668,15 @@ async def api_upload_final_decision(case_number: str, file: UploadFile = File(..
export_dir.mkdir(parents=True, exist_ok=True)
final_name = f"סופי-{case_number}.docx"
final_path = export_dir / final_name
final_path.write_bytes(content)
final_path.write_bytes(content) # noqa: STG1 — sealed below
await _seal_blob(final_path, content)
# Enroll in the style corpus. Use the FULL case_number as decision_number so a
# בל"מ never collides with a same-numbered ערר already in the corpus (e.g. ARAR-25-8126).
config.TRAINING_DIR.mkdir(parents=True, exist_ok=True)
training_dest = config.TRAINING_DIR / f"החלטה-{case_number}.docx"
shutil.copy2(str(final_path), str(training_dest))
shutil.copy2(str(final_path), str(training_dest)) # noqa: STG1 — sealed below
await _seal_blob_file(training_dest)
# Extract the final text (word count for the UI; full text snapshotted into the pair).
final_text = ""
@@ -4977,7 +5012,7 @@ async def api_install_skill(file: UploadFile = File(...)):
dest = skill_dir / rel_path
dest.parent.mkdir(parents=True, exist_ok=True)
dest.write_bytes(zf.read(name))
dest.write_bytes(zf.read(name)) # noqa: STG1 — extracts to ~/.paperclip skills (outside DATA_DIR)
extracted_files.append(rel_path)
zf.close()
@@ -5149,7 +5184,7 @@ async def api_restart_paperclip():
# Fallback: write a flag file that host-side watcher picks up
flag_file = PAPERCLIP_SKILLS_DIR / ".restart-requested"
try:
flag_file.write_text(str(time.time()))
flag_file.write_text(str(time.time())) # noqa: STG1 — restart flag (state)
return {
"status": "restart_requested",
"method": "flag_file",
@@ -5202,7 +5237,8 @@ async def api_upload_tagged_document(
dest = case_dir / f"{stem}-{counter}{ext}"
counter += 1
dest.write_bytes(content)
dest.write_bytes(content) # noqa: STG1 — sealed below
await _seal_blob(dest, content)
# Create document record
case_id = UUID(case["id"])
@@ -5742,7 +5778,8 @@ async def _process_case_document(task_id: str, source: Path, req: ClassifyReques
# Use original name without timestamp prefix
original_name = re.sub(r"^\d+_", "", source.name)
dest = case_dir / original_name
shutil.copy2(str(source), str(dest))
shutil.copy2(str(source), str(dest)) # noqa: STG1 — sealed below
await _seal_blob_file(dest)
# Create document record
await _progress.set(task_id, {"status": "registering", "filename": req.filename})
@@ -5792,7 +5829,8 @@ async def _process_training_document(task_id: str, source: Path, req: ClassifyRe
config.TRAINING_DIR.mkdir(parents=True, exist_ok=True)
original_name = re.sub(r"^\d+_", "", source.name)
dest = config.TRAINING_DIR / original_name
shutil.copy2(str(source), str(dest))
shutil.copy2(str(source), str(dest)) # noqa: STG1 — sealed below
await _seal_blob_file(dest)
# Extract text
await _progress.set(task_id, {"status": "processing", "filename": req.filename, "step": "extracting"})
@@ -6865,7 +6903,8 @@ async def bulletin_upload(file: UploadFile = File(...)):
# Idempotent: same content (any filename) already staged → skip.
if any(p.name.startswith(f"{digest}_") for p in _BULLETINS_DIR.glob(f"{digest}_*")):
return {"status": "exists", "filename": dest.name, "size": len(content)}
dest.write_bytes(content)
dest.write_bytes(content) # noqa: STG1 — sealed below
await _seal_blob(dest, content)
return {"status": "stored", "filename": dest.name, "size": len(content)}