feat(storage): X14 Phase 2b — route extracted-text + async DOCX exports through storage.py
Continue the write-site rewiring onto the unified storage layer (INV-STG1): - services/processor.py: extracted-text .txt → DERIVED bucket (a derived artifact; the DB column is the source of truth per INV-STG5, so the write stays non-fatal) - services/docx_exporter.py (export_decision): DOCX → DOCUMENTS bucket via BytesIO → put_bytes, with a fallback to a direct disk write when the caller passes an output_path outside DATA_DIR - services/analysis_docx_exporter.py (build_analysis_docx): same pattern; out_path is always under DATA_DIR Under the default STORAGE_BACKEND=filesystem the bytes land at the exact legacy path (put_bytes → DATA_DIR/key), so behaviour is unchanged. The disk-reading bits that must stay for now (export_dir glob in _next_version) are kept; storage-native versioning is a cutover concern. Still on disk (sync call-sites, follow-up Phase 2c): docx_reviser (track-changes), docx_retrofit backup, and multimodal thumbnails (rendered in a to_thread). git-tracked text (case.json/notes/research-md/draft-md) stays on disk by design (INV-STG7). tests: 38 storage + docx tests green (incl. test_export_qa_gate / test_docx_exporter_bookmarks which exercise the real export path); 242 collected, no import breakage. Keeps G2; advances INV-STG1. Spec: docs/spec/X14-storage-minio.md. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -21,6 +21,7 @@ Output: data/cases/{case_number}/exports/ניתוח-משפטי-v{N}.docx
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import io
|
||||
import re
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
@@ -34,7 +35,7 @@ from docx.text.paragraph import Paragraph
|
||||
from docx.text.run import Run
|
||||
|
||||
from legal_mcp import config
|
||||
from legal_mcp.services import db, research_md
|
||||
from legal_mcp.services import db, research_md, storage
|
||||
|
||||
|
||||
def _mark_run_rtl(run: Run) -> None:
|
||||
@@ -494,10 +495,19 @@ async def build_analysis_docx(case_number: str) -> Path:
|
||||
continue
|
||||
_emit_content_line(doc, raw)
|
||||
|
||||
# Save versioned
|
||||
# Save versioned through the storage layer (INV-STG1). export_dir.mkdir +
|
||||
# the glob in _next_version still read disk (correct under filesystem/dual;
|
||||
# storage-native versioning is a cutover concern). out_path is always under
|
||||
# DATA_DIR, so the bytes land exactly where they did before.
|
||||
export_dir = case_dir / "exports"
|
||||
export_dir.mkdir(parents=True, exist_ok=True)
|
||||
version = _next_version(export_dir)
|
||||
out_path = export_dir / f"ניתוח-משפטי-v{version}.docx"
|
||||
doc.save(str(out_path))
|
||||
buf = io.BytesIO()
|
||||
doc.save(buf)
|
||||
await storage.put_bytes(
|
||||
out_path.relative_to(config.DATA_DIR).as_posix(), buf.getvalue(),
|
||||
bucket=storage.Bucket.DOCUMENTS,
|
||||
content_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
||||
)
|
||||
return out_path
|
||||
|
||||
@@ -5,6 +5,7 @@
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import io
|
||||
import logging
|
||||
import re
|
||||
from datetime import date
|
||||
@@ -17,7 +18,7 @@ from docx.oxml import OxmlElement
|
||||
from docx.oxml.ns import qn
|
||||
|
||||
from legal_mcp import config
|
||||
from legal_mcp.services import db
|
||||
from legal_mcp.services import db, storage
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -474,8 +475,19 @@ async def export_decision(
|
||||
pass
|
||||
output_path = str(export_dir / f"{prefix}-v{next_ver}.docx")
|
||||
|
||||
Path(output_path).parent.mkdir(parents=True, exist_ok=True)
|
||||
doc.save(output_path)
|
||||
# Persist through the storage layer (INV-STG1). Under the filesystem
|
||||
# backend the bytes land at output_path exactly as before; a caller-
|
||||
# provided path outside DATA_DIR falls back to a direct disk write.
|
||||
buf = io.BytesIO()
|
||||
doc.save(buf)
|
||||
data = buf.getvalue()
|
||||
_docx_ctype = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
|
||||
try:
|
||||
key = Path(output_path).resolve().relative_to(Path(config.DATA_DIR).resolve()).as_posix()
|
||||
await storage.put_bytes(key, data, bucket=storage.Bucket.DOCUMENTS, content_type=_docx_ctype)
|
||||
except ValueError:
|
||||
Path(output_path).parent.mkdir(parents=True, exist_ok=True)
|
||||
Path(output_path).write_bytes(data)
|
||||
logger.info("DOCX exported (mode=%s): %s", mode, output_path)
|
||||
return output_path
|
||||
|
||||
|
||||
@@ -8,7 +8,9 @@ from pathlib import Path
|
||||
from uuid import UUID
|
||||
|
||||
from legal_mcp import config
|
||||
from legal_mcp.services import chunker, db, embeddings, extractor, references_extractor
|
||||
from legal_mcp.services import (
|
||||
chunker, db, embeddings, extractor, references_extractor, storage,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -40,13 +42,17 @@ async def process_document(document_id: UUID, case_id: UUID) -> dict:
|
||||
page_count=page_count,
|
||||
)
|
||||
|
||||
# Save extracted text to documents/extracted/ directory
|
||||
# Save extracted text (a DERIVED artifact — the DB column holds the
|
||||
# source of truth, INV-STG5) through the storage layer (INV-STG1).
|
||||
# Non-fatal: the .txt is a convenience copy, the pipeline reads the DB.
|
||||
original_path = Path(doc["file_path"])
|
||||
extracted_dir = original_path.parent.parent / "extracted"
|
||||
extracted_dir.mkdir(parents=True, exist_ok=True)
|
||||
txt_path = extracted_dir / (original_path.stem + ".txt")
|
||||
txt_path = original_path.parent.parent / "extracted" / (original_path.stem + ".txt")
|
||||
try:
|
||||
txt_path.write_text(text, encoding="utf-8")
|
||||
await storage.put_bytes(
|
||||
txt_path.relative_to(config.DATA_DIR).as_posix(),
|
||||
text.encode("utf-8"), bucket=storage.Bucket.DERIVED,
|
||||
content_type="text/plain; charset=utf-8",
|
||||
)
|
||||
logger.info("Saved extracted text to %s", txt_path)
|
||||
except Exception as e:
|
||||
logger.warning("Failed to save text file (non-fatal): %s", e)
|
||||
|
||||
Reference in New Issue
Block a user