From a13fc76c494b638e1410440d618c6ed57b62cd6b Mon Sep 17 00:00:00 2001
From: Chaim <chaim@marcus-law.co.il>
Date: Thu, 11 Jun 2026 17:48:38 +0000
Subject: [PATCH] =?UTF-8?q?feat(storage):=20#106.5=20prereq=20=E2=80=94=20?=
 =?UTF-8?q?migrate=20served-but-untracked=20files=20(--untracked)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

מצב --untracked לסקריפט ההגירה: סורק את ה-filesystem לקטגוריות שה-4 endpoints מגישים
אך אינן רשומות בשום עמודת-DB (research/*, proofread/*, drafts/*, exports/*, training/
proofread/*) → מעלה ל-legal-documents עם אותו key יחסי-DATA_DIR. זהו תנאי-הסף שהפאנל
התלת-מודלי זיהה: בלי הקבצים האלה ב-MinIO, cutover ל-s3-only היה מחזיר 404 על הגשתם.

dry-run אומת: 144 קבצים / 83.9MB, 0 חסרים, 0 outside. הפיך (העתקה אדיטיבית, דיסק שלם).
refactor קטן: הלולאה הראשית עובדת על work-list אחיד (DB-tracked או filesystem-scan).

invariants: G2 (אותו key/bucket scheme) · INV-STG1/3 · INV-G10 (dry-run/הפיך, אפס שינוי
בייצור — רק העלאה לדליות; cutover עדיין נעול-אדם).

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 scripts/migrate_blobs_to_minio.py | 95 +++++++++++++++++++++----------
 1 file changed, 64 insertions(+), 31 deletions(-)

diff --git a/scripts/migrate_blobs_to_minio.py b/scripts/migrate_blobs_to_minio.py
index a26db25..d0e223e 100644
--- a/scripts/migrate_blobs_to_minio.py
+++ b/scripts/migrate_blobs_to_minio.py
@@ -61,6 +61,27 @@ BUCKET_ENV = {
     "immutable": config.MINIO_BUCKET_IMMUTABLE,
 }
 
+# Served-but-NOT-DB-tracked file categories (the #106.5 cutover-prerequisite the
+# tri-model panel flagged): the 4 FileResponse endpoints serve these from case
+# dirs, but no DB column references them, so the DB-driven pass misses them. All
+# go to the documents bucket; keys are DATA_DIR-relative (same scheme). Globs are
+# relative to DATA_DIR.
+UNTRACKED_GLOBS = [
+    "cases/*/documents/research/*",
+    "cases/*/documents/proofread/*",
+    "cases/*/drafts/*",
+    "cases/*/exports/*",
+    "training/proofread/*",
+]
+
+
+def iter_untracked():
+    """Yield (label, host_path) for served files not referenced by any DB column."""
+    for pattern in UNTRACKED_GLOBS:
+        for host in DATA_DIR.glob(pattern):
+            if host.is_file():
+                yield pattern, host
+
 
 def resolve_host(stored: str) -> Path | None:
     """Normalise one stored path (3 legacy formats) to a host filesystem path."""
@@ -94,41 +115,50 @@ async def main(args: argparse.Namespace) -> int:
     per_bucket: dict = {}
     rows_out = []
 
-    for table, col, bucket in SOURCES:
-        try:
-            rows = await pool.fetch(
-                f"SELECT DISTINCT {col} AS v FROM {table} WHERE COALESCE({col},'') <> ''")
-        except Exception as e:  # noqa: BLE001
-            print(f"  {table}.{col}: SKIP ({str(e)[:60]})")
-            continue
+    # Build the work-list: DB-tracked columns (default) or the filesystem scan of
+    # served-but-untracked files (--untracked, the #106.5 cutover-prerequisite).
+    items: list[tuple[str, str, str, Path]] = []  # (label, stored, bucket, host)
+    if args.untracked:
+        for label, host in iter_untracked():
+            items.append((label, str(host), "documents", host))
+    else:
+        for table, col, bucket in SOURCES:
+            try:
+                rows = await pool.fetch(
+                    f"SELECT DISTINCT {col} AS v FROM {table} WHERE COALESCE({col},'') <> ''")
+            except Exception as e:  # noqa: BLE001
+                print(f"  {table}.{col}: SKIP ({str(e)[:60]})")
+                continue
+            for r in rows:
+                items.append((f"{table}.{col}", r["v"], bucket, resolve_host(r["v"])))
+
+    for label, stored, bucket, host in items:
         b = per_bucket.setdefault(bucket, {"found": 0, "missing": 0, "bytes": 0})
-        for r in rows:
-            host = resolve_host(r["v"])
-            key = to_key(host) if host else None
-            if host is None or key is None:
-                totals["outside"] += 1
-                rows_out.append([table, col, bucket, r["v"], "", "OUTSIDE_DATA_DIR", 0])
-                continue
-            if not host.exists():
-                totals["missing"] += 1
-                b["missing"] += 1
-                rows_out.append([table, col, bucket, r["v"], key, "MISSING", 0])
-                continue
-            size = host.stat().st_size
-            totals["found"] += 1
-            totals["bytes"] += size
-            b["found"] += 1
-            b["bytes"] += size
-            status = "PLANNED"
-            if args.apply:
-                ok = _upload(args.mc_alias, BUCKET_ENV[bucket], key, host, size)
-                status = "UPLOADED" if ok else "FAILED"
-                totals["uploaded" if ok else "failed"] += 1
-            rows_out.append([table, col, bucket, r["v"], key, status, size])
+        key = to_key(host) if host else None
+        if host is None or key is None:
+            totals["outside"] += 1
+            rows_out.append([label, "", bucket, stored, "", "OUTSIDE_DATA_DIR", 0])
+            continue
+        if not host.exists():
+            totals["missing"] += 1
+            b["missing"] += 1
+            rows_out.append([label, "", bucket, stored, key, "MISSING", 0])
+            continue
+        size = host.stat().st_size
+        totals["found"] += 1
+        totals["bytes"] += size
+        b["found"] += 1
+        b["bytes"] += size
+        status = "PLANNED"
+        if args.apply:
+            ok = _upload(args.mc_alias, BUCKET_ENV[bucket], key, host, size)
+            status = "UPLOADED" if ok else "FAILED"
+            totals["uploaded" if ok else "failed"] += 1
+        rows_out.append([label, "", bucket, stored, key, status, size])
 
     with manifest.open("w", encoding="utf-8", newline="") as f:
         w = csv.writer(f)
-        w.writerow(["table", "column", "bucket", "stored_path", "key", "status", "bytes"])
+        w.writerow(["source", "_", "bucket", "stored_path", "key", "status", "bytes"])
         w.writerows(rows_out)
 
     print(f"\n{'APPLY' if args.apply else 'DRY-RUN'} — blob migration plan")
@@ -167,4 +197,7 @@ if __name__ == "__main__":
                                  formatter_class=argparse.RawDescriptionHelpFormatter)
     ap.add_argument("--apply", action="store_true", help="upload (default: dry-run plan only)")
     ap.add_argument("--mc-alias", default="legalminio", help="mcli alias for MinIO")
+    ap.add_argument("--untracked", action="store_true",
+                    help="migrate served-but-NOT-DB-tracked files (research/proofread/"
+                         "drafts/exports) instead of the DB columns (#106.5 prerequisite)")
     raise SystemExit(asyncio.run(main(ap.parse_args())))