# legal-ai/scripts/migrate_blobs_to_minio.py

#!/usr/bin/env python3
"""#106.4 — migrate binary blobs (PDF/DOCX/thumbnails) from disk to MinIO.

DB-DRIVEN, NOT a wholesale ``mc mirror``: the bucket is chosen per-file-semantic
(source/draft → documents, thumbnail → derived), so the migration walks the DB
path columns and uploads each referenced file to its correct (bucket, key). The
key is the DATA_DIR-relative POSIX path (storage.normalize_key), matching how the
write-wiring (#106.3) and read-wiring (#106.5) resolve keys.

⚠️ The legacy path columns are INCONSISTENT (audited 2026-06-11): three formats
coexist — container-absolute ``/data/…``, host-absolute
``/home/chaim/legal-ai/data/…``, and DATA_DIR-relative ``digests/…``. This script
normalises all three to a host path + a clean key. Files it cannot locate are
reported, never silently skipped.

Buckets (X14 §3.1):
  documents → originals, drafts/exports, digests sources, finals (finals promote
              to immutable only at #106.7).
  derived   → page thumbnails.

DRY-RUN by default: prints the full plan (per table/bucket: found / missing /
bytes) and writes a CSV manifest to data/audit/. Touches NOTHING. ``--apply``
uploads via the configured MinIO client (mcli alias from --mc-alias), verifying
size after each PUT; the disk is never modified, so a re-run is idempotent and
the migration is reversible (wipe the buckets, flip STORAGE_BACKEND back).

DB path-column normalisation to clean keys is a SEPARATE, later step (reads still
use the legacy paths until #106.5 deploys) — this script only moves bytes.

    cd ~/legal-ai/mcp-server
    .venv/bin/python ../scripts/migrate_blobs_to_minio.py            # dry-run plan
    .venv/bin/python ../scripts/migrate_blobs_to_minio.py --apply --mc-alias legalminio
"""
from __future__ import annotations

import argparse
import asyncio
import csv
import subprocess
from datetime import datetime, timezone
from pathlib import Path

from legal_mcp import config
from legal_mcp.services import db

# Resolved once at import so to_key() can compare it against fully-resolved host
# paths (to_key() calls host.resolve().relative_to(DATA_DIR)).
DATA_DIR = Path(config.DATA_DIR).resolve()
_CONTAINER_DATA = "/data/"  # the container bind-mount target seen in legacy rows

# (table, column, bucket) — only columns that actually exist (audited 2026-06-11).
# The third element is the LOGICAL bucket name; BUCKET_ENV maps it to the
# configured bucket name. Table/column names are interpolated into the SELECT in
# main(), so this list must only ever contain trusted, hard-coded identifiers.
SOURCES = [
    ("documents", "file_path", "documents"),
    ("cases", "active_draft_path", "documents"),
    ("digests", "source_document_path", "documents"),
    ("draft_final_pairs", "final_path", "documents"),
    ("document_image_embeddings", "image_thumbnail_path", "derived"),
    ("precedent_image_embeddings", "image_thumbnail_path", "derived"),
]
# Logical bucket name → bucket name taken from config; looked up by main() when
# uploading. "immutable" is not referenced by any SOURCES row in this file.
BUCKET_ENV = {
    "documents": config.MINIO_BUCKET_DOCUMENTS,
    "derived": config.MINIO_BUCKET_DERIVED,
    "immutable": config.MINIO_BUCKET_IMMUTABLE,
}

# Served-but-NOT-DB-tracked file categories (the #106.5 cutover-prerequisite the
# tri-model panel flagged): the 4 FileResponse endpoints serve these from case
# dirs, but no DB column references them, so the DB-driven pass misses them. All
# go to the documents bucket; keys are DATA_DIR-relative (same scheme). Globs are
# relative to DATA_DIR.
# Each pattern ends in a single "*", so only direct children of the listed
# directories match; iter_untracked() additionally drops non-files.
UNTRACKED_GLOBS = [
    "cases/*/documents/research/*",
    "cases/*/documents/proofread/*",
    "cases/*/drafts/*",
    "cases/*/exports/*",
    "training/proofread/*",
]


def iter_untracked():
    """Lazily walk UNTRACKED_GLOBS under DATA_DIR.

    Yields:
        ``(pattern, host_path)`` pairs — the glob pattern that matched (used as
        the manifest label) and the matching path. Only regular files are
        yielded; directories that match a pattern are dropped.
    """
    for glob_pattern in UNTRACKED_GLOBS:
        matches = DATA_DIR.glob(glob_pattern)
        yield from (
            (glob_pattern, candidate)
            for candidate in matches
            if candidate.is_file()
        )


def resolve_host(stored: str) -> Path | None:
    """Normalise one stored path (3 legacy formats) to a host filesystem path.

    Args:
        stored: the raw value of a DB path column (may be ``None``/blank).

    Returns:
        ``None`` for a blank value; otherwise a path built as follows:

        * container-absolute (``/data/…``) → re-rooted under ``DATA_DIR``;
        * any other absolute path → returned unchanged (host-absolute);
        * relative path → joined onto ``DATA_DIR``.

        The result is NOT checked for existence or for being inside
        ``DATA_DIR``; callers do that.
    """
    s = (stored or "").strip()
    if not s:
        return None
    if s.startswith(_CONTAINER_DATA):                 # container-absolute /data/…
        # Strip any extra leading slashes ("/data//x"): joining an absolute
        # segment onto DATA_DIR would silently discard DATA_DIR altogether.
        remainder = s[len(_CONTAINER_DATA):].lstrip("/")
        return DATA_DIR / remainder
    p = Path(s)
    if p.is_absolute():                               # host-absolute
        return p
    return DATA_DIR / s                               # DATA_DIR-relative


def to_key(host: Path) -> str | None:
    """Derive the object key for a host path.

    The key is the fully-resolved path expressed relative to ``DATA_DIR`` with
    forward slashes. Returns ``None`` when the resolved path does not live under
    ``DATA_DIR`` (``relative_to`` raises ``ValueError`` in that case).
    """
    try:
        relative = host.resolve().relative_to(DATA_DIR)
    except ValueError:
        return None
    return relative.as_posix()


async def main(args: argparse.Namespace) -> int:
    """Plan (dry-run) or perform (``--apply``) the disk → MinIO blob migration.

    Builds a work-list either from the DB path columns listed in ``SOURCES`` or,
    with ``--untracked``, from the ``UNTRACKED_GLOBS`` filesystem scan; classifies
    each entry as outside-DATA_DIR / missing / found; uploads found files when
    ``--apply`` is set; then writes a CSV manifest under ``<DATA_DIR>/audit`` and
    prints a per-bucket summary.

    Args:
        args: parsed CLI namespace; reads ``apply``, ``untracked`` and
            ``mc_alias``.

    Returns:
        Process exit status: ``1`` when at least one upload failed (only
        possible with ``--apply``), otherwise ``0``. Missing / outside-DATA_DIR
        entries are reported but do not change the exit status.
    """
    pool = await db.get_pool()
    ts = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
    audit = Path(config.DATA_DIR) / "audit"
    audit.mkdir(parents=True, exist_ok=True)
    manifest = audit / f"minio-migration-plan-{ts}.csv"

    totals = {"found": 0, "missing": 0, "outside": 0, "bytes": 0, "uploaded": 0, "failed": 0}
    per_bucket: dict = {}
    rows_out = []

    # Build the work-list: DB-tracked columns (default) or the filesystem scan of
    # served-but-untracked files (--untracked, the #106.5 cutover-prerequisite).
    # host is None when resolve_host() judged the stored value blank.
    items: list[tuple[str, str, str, Path | None]] = []  # (label, stored, bucket, host)
    if args.untracked:
        for label, host in iter_untracked():
            items.append((label, str(host), "documents", host))
    else:
        for table, col, bucket in SOURCES:
            try:
                # table/col come from the hard-coded SOURCES list, never from input.
                rows = await pool.fetch(
                    f"SELECT DISTINCT {col} AS v FROM {table} WHERE COALESCE({col},'') <> ''")
            except Exception as e:  # noqa: BLE001
                # Best-effort per source: a failing query is reported and skipped
                # so the remaining tables are still planned.
                print(f"  {table}.{col}: SKIP ({str(e)[:60]})")
                continue
            for r in rows:
                items.append((f"{table}.{col}", r["v"], bucket, resolve_host(r["v"])))

    for label, stored, bucket, host in items:
        b = per_bucket.setdefault(bucket, {"found": 0, "missing": 0, "bytes": 0})
        key = to_key(host) if host else None
        if host is None or key is None:
            # Blank value or a path that resolves outside DATA_DIR: no key exists.
            totals["outside"] += 1
            rows_out.append([label, "", bucket, stored, "", "OUTSIDE_DATA_DIR", 0])
            continue
        if not host.exists():
            totals["missing"] += 1
            b["missing"] += 1
            rows_out.append([label, "", bucket, stored, key, "MISSING", 0])
            continue
        size = host.stat().st_size
        totals["found"] += 1
        totals["bytes"] += size
        b["found"] += 1
        b["bytes"] += size
        status = "PLANNED"
        if args.apply:
            ok = _upload(args.mc_alias, BUCKET_ENV[bucket], key, host, size)
            status = "UPLOADED" if ok else "FAILED"
            totals["uploaded" if ok else "failed"] += 1
        rows_out.append([label, "", bucket, stored, key, status, size])

    with manifest.open("w", encoding="utf-8", newline="") as f:
        w = csv.writer(f)
        w.writerow(["source", "_", "bucket", "stored_path", "key", "status", "bytes"])
        w.writerows(rows_out)

    print(f"\n{'APPLY' if args.apply else 'DRY-RUN'} — blob migration plan")
    print("=" * 56)
    for bucket, b in sorted(per_bucket.items()):
        print(f"  {bucket:10} found={b['found']:5} missing={b['missing']:4} "
              f"bytes={b['bytes']/1e6:.1f}MB")
    print("-" * 56)
    print(f"  TOTAL found={totals['found']} missing={totals['missing']} "
          f"outside-DATA_DIR={totals['outside']} bytes={totals['bytes']/1e6:.1f}MB")
    if args.apply:
        print(f"  uploaded={totals['uploaded']} failed={totals['failed']}")
    print(f"\nmanifest → {manifest}")
    if totals["missing"] or totals["outside"]:
        print("⚠ some referenced files are missing/outside DATA_DIR — review the "
              "manifest BEFORE --apply; they will not migrate.")
    if totals["failed"]:
        # A partial migration must not look like success to a wrapper script.
        print(f"✖ {totals['failed']} upload(s) FAILED — see the manifest, then re-run --apply.")
        return 1
    return 0


def _upload(alias: str, bucket: str, key: str, host: Path, size: int) -> bool:
    """Upload one file via mcli and verify the remote size. Disk untouched.

    Args:
        alias: mcli alias of the MinIO deployment.
        bucket: destination bucket name.
        key: object key inside the bucket.
        host: local file to upload.
        size: expected object size in bytes (the local file size).

    Returns:
        ``True`` only when ``mcli cp`` succeeded AND ``mcli stat --json`` reports
        a ``size`` field exactly equal to ``size``. Every failure path prints a
        one-line reason and returns ``False``; nothing is raised.
    """
    import json  # function-scope: only the verification step needs it

    target = f"{alias}/{bucket}/{key}"
    try:
        subprocess.run(["mcli", "cp", "-q", str(host), target],
                       check=True, capture_output=True, timeout=300)
        out = subprocess.run(["mcli", "stat", "--json", target],
                             capture_output=True, text=True, timeout=60)
    except subprocess.CalledProcessError as e:
        # Prefer mcli's own stderr over the generic "returned non-zero" message.
        raw = e.stderr or b""
        detail = raw.decode("utf-8", "replace") if isinstance(raw, bytes) else str(raw)
        detail = detail.strip() or str(e)
        print(f"    upload FAILED {key}: {detail[:80]}")
        return False
    except Exception as e:  # noqa: BLE001
        # Deliberately broad: a missing mcli binary or a timeout must mark this
        # one file FAILED without aborting the whole run.
        print(f"    upload FAILED {key}: {str(e)[:80]}")
        return False

    if out.returncode != 0:
        print(f"    upload FAILED {key}: stat exit {out.returncode}")
        return False

    # Compare the parsed "size" field exactly. A substring search over the raw
    # JSON would accept e.g. 123 against a remote size of 1234, or digits that
    # merely occur inside an etag/timestamp.
    remote_size = None
    for chunk in (out.stdout, *out.stdout.splitlines()):
        try:
            doc = json.loads(chunk)
        except ValueError:
            continue
        if isinstance(doc, dict) and isinstance(doc.get("size"), int):
            remote_size = doc["size"]
            break
    if remote_size != size:
        print(f"    upload FAILED {key}: remote size {remote_size} != local {size}")
        return False
    return True


if __name__ == "__main__":
    # CLI entry point: build the parser, run the async main(), and exit with
    # whatever status it returns. Without --apply the run is a dry-run plan.
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument(
        "--apply",
        action="store_true",
        help="upload (default: dry-run plan only)",
    )
    parser.add_argument(
        "--mc-alias",
        default="legalminio",
        help="mcli alias for MinIO",
    )
    parser.add_argument(
        "--untracked",
        action="store_true",
        help="migrate served-but-NOT-DB-tracked files (research/proofread/"
             "drafts/exports) instead of the DB columns (#106.5 prerequisite)",
    )
    cli_args = parser.parse_args()
    exit_status = asyncio.run(main(cli_args))
    raise SystemExit(exit_status)