From 1f42a39ce4bf634b59b229c42648a4e8b20bd450 Mon Sep 17 00:00:00 2001
From: Chaim <chaim@marcus-law.co.il>
Date: Mon, 8 Jun 2026 08:05:25 +0000
Subject: [PATCH] =?UTF-8?q?feat(storage):=20X14=20Phase=202b=20=E2=80=94?=
 =?UTF-8?q?=20route=20extracted-text=20+=20async=20DOCX=20exports=20throug?=
 =?UTF-8?q?h=20storage.py?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Continue the write-site rewiring onto the unified storage layer (INV-STG1):
- services/processor.py: extracted-text .txt → DERIVED bucket (a derived
  artifact; the DB column is the source of truth per INV-STG5, so the write
  stays non-fatal)
- services/docx_exporter.py (export_decision): DOCX → DOCUMENTS bucket via
  BytesIO → put_bytes, with a fallback to a direct disk write when the caller
  passes an output_path outside DATA_DIR
- services/analysis_docx_exporter.py (build_analysis_docx): same BytesIO →
  put_bytes pattern, with no disk fallback needed because out_path is always
  under DATA_DIR

Under the default STORAGE_BACKEND=filesystem the bytes land at the exact
legacy path (put_bytes → DATA_DIR/key), so behaviour is unchanged. The
disk-reading bits that must stay for now (export_dir glob in _next_version)
are kept; storage-native versioning is a cutover concern.

Still on disk (sync call-sites, follow-up Phase 2c): docx_reviser
(track-changes), docx_retrofit backup, and multimodal thumbnails (rendered in
a to_thread). git-tracked text (case.json/notes/research-md/draft-md) stays on
disk by design (INV-STG7).

tests: 38 storage + docx tests green (incl. test_export_qa_gate /
test_docx_exporter_bookmarks which exercise the real export path); 242
collected, no import breakage.

Keeps G2; advances INV-STG1. Spec: docs/spec/X14-storage-minio.md.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .../services/analysis_docx_exporter.py         | 16 +++++++++++++---
 .../src/legal_mcp/services/docx_exporter.py    | 18 +++++++++++++++---
 mcp-server/src/legal_mcp/services/processor.py | 18 ++++++++++++------
 3 files changed, 40 insertions(+), 12 deletions(-)

diff --git a/mcp-server/src/legal_mcp/services/analysis_docx_exporter.py b/mcp-server/src/legal_mcp/services/analysis_docx_exporter.py
index 86bebf1..c0b92cc 100644
--- a/mcp-server/src/legal_mcp/services/analysis_docx_exporter.py
+++ b/mcp-server/src/legal_mcp/services/analysis_docx_exporter.py
@@ -21,6 +21,7 @@ Output: data/cases/{case_number}/exports/ניתוח-משפטי-v{N}.docx
 
 from __future__ import annotations
 
+import io
 import re
 from pathlib import Path
 from typing import Any
@@ -34,7 +35,7 @@ from docx.text.paragraph import Paragraph
 from docx.text.run import Run
 
 from legal_mcp import config
-from legal_mcp.services import db, research_md
+from legal_mcp.services import db, research_md, storage
 
 
 def _mark_run_rtl(run: Run) -> None:
@@ -494,10 +495,19 @@ async def build_analysis_docx(case_number: str) -> Path:
                 continue
             _emit_content_line(doc, raw)
 
-    # Save versioned
+    # Save versioned through the storage layer (INV-STG1). export_dir.mkdir and
+    # the glob in _next_version still touch disk (correct under filesystem/dual;
+    # storage-native versioning is a cutover concern). out_path is always under
+    # DATA_DIR, so the filesystem backend writes the same path as before.
     export_dir = case_dir / "exports"
     export_dir.mkdir(parents=True, exist_ok=True)
     version = _next_version(export_dir)
     out_path = export_dir / f"ניתוח-משפטי-v{version}.docx"
-    doc.save(str(out_path))
+    buf = io.BytesIO()
+    doc.save(buf)
+    await storage.put_bytes(
+        out_path.relative_to(config.DATA_DIR).as_posix(), buf.getvalue(),
+        bucket=storage.Bucket.DOCUMENTS,
+        content_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+    )
     return out_path
diff --git a/mcp-server/src/legal_mcp/services/docx_exporter.py b/mcp-server/src/legal_mcp/services/docx_exporter.py
index 2f9bbfc..a4466c8 100644
--- a/mcp-server/src/legal_mcp/services/docx_exporter.py
+++ b/mcp-server/src/legal_mcp/services/docx_exporter.py
@@ -5,6 +5,7 @@
 
 from __future__ import annotations
 
+import io
 import logging
 import re
 from datetime import date
@@ -17,7 +18,7 @@ from docx.oxml import OxmlElement
 from docx.oxml.ns import qn
 
 from legal_mcp import config
-from legal_mcp.services import db
+from legal_mcp.services import db, storage
 
 logger = logging.getLogger(__name__)
 
@@ -474,8 +475,19 @@ async def export_decision(
                 pass
         output_path = str(export_dir / f"{prefix}-v{next_ver}.docx")
 
-    Path(output_path).parent.mkdir(parents=True, exist_ok=True)
-    doc.save(output_path)
+    # Persist through the storage layer (INV-STG1). Under the filesystem
+    # backend the bytes land at output_path exactly as before; a caller-
+    # provided path outside DATA_DIR falls back to a direct disk write.
+    buf = io.BytesIO()
+    doc.save(buf)
+    data = buf.getvalue()
+    _docx_ctype = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
+    try:
+        key = Path(output_path).resolve().relative_to(Path(config.DATA_DIR).resolve()).as_posix()
+        await storage.put_bytes(key, data, bucket=storage.Bucket.DOCUMENTS, content_type=_docx_ctype)
+    except ValueError:
+        Path(output_path).parent.mkdir(parents=True, exist_ok=True)
+        Path(output_path).write_bytes(data)
     logger.info("DOCX exported (mode=%s): %s", mode, output_path)
     return output_path
 
diff --git a/mcp-server/src/legal_mcp/services/processor.py b/mcp-server/src/legal_mcp/services/processor.py
index c1aec5c..d9eb704 100644
--- a/mcp-server/src/legal_mcp/services/processor.py
+++ b/mcp-server/src/legal_mcp/services/processor.py
@@ -8,7 +8,9 @@ from pathlib import Path
 from uuid import UUID
 
 from legal_mcp import config
-from legal_mcp.services import chunker, db, embeddings, extractor, references_extractor
+from legal_mcp.services import (
+    chunker, db, embeddings, extractor, references_extractor, storage,
+)
 
 logger = logging.getLogger(__name__)
 
@@ -40,13 +42,17 @@ async def process_document(document_id: UUID, case_id: UUID) -> dict:
             page_count=page_count,
         )
 
-        # Save extracted text to documents/extracted/ directory
+        # Save extracted text (a DERIVED artifact — the DB column holds the
+        # source of truth, INV-STG5) through the storage layer (INV-STG1).
+        # Non-fatal: the .txt is a convenience copy, the pipeline reads the DB.
         original_path = Path(doc["file_path"])
-        extracted_dir = original_path.parent.parent / "extracted"
-        extracted_dir.mkdir(parents=True, exist_ok=True)
-        txt_path = extracted_dir / (original_path.stem + ".txt")
+        txt_path = original_path.parent.parent / "extracted" / (original_path.stem + ".txt")
         try:
-            txt_path.write_text(text, encoding="utf-8")
+            await storage.put_bytes(
+                txt_path.relative_to(config.DATA_DIR).as_posix(),
+                text.encode("utf-8"), bucket=storage.Bucket.DERIVED,
+                content_type="text/plain; charset=utf-8",
+            )
             logger.info("Saved extracted text to %s", txt_path)
         except Exception as e:
             logger.warning("Failed to save text file (non-fatal): %s", e)