feat(storage): X14 Phase 2a — route source-document writes through storage.py
Rewire the source-document staging writes onto the unified storage layer (INV-STG1), replacing direct shutil.copy2 calls:

- tools/documents.py: case originals + training-corpus uploads
- services/ingest.py: _stage_file (now async) — covers precedent-library, internal-decisions, and digests (the canonical intake helper)
- services/digest_library.py: awaits the now-async _stage_file

Each write goes through storage.put_file(..., bucket=DOCUMENTS) with the DATA_DIR-relative key; the Hebrew original filename rides as object metadata (INV-STG2), and the content-type is guessed from the extension. DB path columns are unchanged (still the absolute dest) — object_key backfill is Phase 3.

Under the default STORAGE_BACKEND=filesystem the bytes land at the exact legacy on-disk location (put_file → shutil.copy2 to DATA_DIR/key), so this is zero behaviour change in prod. The shutil import is dropped where it is now unused.

tests: +2 staging regression tests (file lands under DATA_DIR at the legacy path); 20 storage + 22 ingest tests green; 242 collected with no import breakage.

Derived/export write sites (thumbnails, extracted text, DOCX exports) are Phase 2b. Keeps G2; advances INV-STG1. Spec: docs/spec/X14-storage-minio.md.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -4,12 +4,12 @@ from __future__ import annotations
|
||||
|
||||
import hashlib
|
||||
import json
|
||||
import shutil
|
||||
import mimetypes
|
||||
from pathlib import Path
|
||||
from uuid import UUID
|
||||
|
||||
from legal_mcp import config
|
||||
from legal_mcp.services import audit, db, git_sync, processor
|
||||
from legal_mcp.services import audit, db, git_sync, processor, storage
|
||||
from legal_mcp.tools.envelope import empty, err, ok # GAP-48: SSoT envelope
|
||||
|
||||
|
||||
@@ -50,11 +50,14 @@ async def document_upload(
|
||||
"idempotent_existing": True,
|
||||
}, message=f"הקובץ כבר הועלה לתיק {case_number} (זהה ב-hash) — מוחזר הקיים, ללא עיבוד מחדש.")
|
||||
|
||||
# Copy file to case directory
|
||||
case_dir = config.find_case_dir(case_number) / "documents" / "originals"
|
||||
case_dir.mkdir(parents=True, exist_ok=True)
|
||||
dest = case_dir / source.name
|
||||
shutil.copy2(str(source), str(dest))
|
||||
# Stage the original through the unified storage layer (INV-STG1).
|
||||
dest = config.find_case_dir(case_number) / "documents" / "originals" / source.name
|
||||
await storage.put_file(
|
||||
source, dest.relative_to(config.DATA_DIR).as_posix(),
|
||||
bucket=storage.Bucket.DOCUMENTS,
|
||||
content_type=mimetypes.guess_type(source.name)[0],
|
||||
metadata={"filename": source.name},
|
||||
)
|
||||
|
||||
# For auto classification, start with "reference" — will be updated after processing
|
||||
initial_doc_type = doc_type if doc_type != "auto" else "reference"
|
||||
@@ -156,10 +159,14 @@ async def document_upload_training(
|
||||
}
|
||||
subdir = _SUBTYPE_DIRS.get(appeal_subtype, "")
|
||||
training_dest = config.TRAINING_DIR / subdir if subdir else config.TRAINING_DIR
|
||||
training_dest.mkdir(parents=True, exist_ok=True)
|
||||
dest = training_dest / source.name
|
||||
if source.resolve() != dest.resolve():
|
||||
shutil.copy2(str(source), str(dest))
|
||||
await storage.put_file(
|
||||
source, dest.relative_to(config.DATA_DIR).as_posix(),
|
||||
bucket=storage.Bucket.DOCUMENTS,
|
||||
content_type=mimetypes.guess_type(source.name)[0],
|
||||
metadata={"filename": source.name},
|
||||
)
|
||||
|
||||
# Extract text and strip Nevo preamble
|
||||
text, page_count, _ = await extractor.extract_text(str(dest))
|
||||
|
||||
Reference in New Issue
Block a user