Merge pull request 'feat(storage): #106.5 prereq — הגירת קבצים מוגשים לא-מתויגי-DB (--untracked)' (#199) from worktree-minio-migrate-untracked into main

2026-06-11 17:48:57 +00:00
parent 2e2234ec27 a13fc76c49
commit 2e0cfd8d94
1 changed files with 64 additions and 31 deletions
--- a/scripts/migrate_blobs_to_minio.py
+++ b/scripts/migrate_blobs_to_minio.py
@@ -61,6 +61,27 @@ BUCKET_ENV = {
    "immutable": config.MINIO_BUCKET_IMMUTABLE,
 }
 # Served-but-NOT-DB-tracked file categories (the #106.5 cutover-prerequisite the
 # tri-model panel flagged): the 4 FileResponse endpoints serve these from case
 # dirs, but no DB column references them, so the DB-driven pass misses them. All
 # go to the documents bucket; keys are DATA_DIR-relative (same scheme). Globs are
 # relative to DATA_DIR.
 # NOTE: every pattern ends in a single "*", which matches exactly one path
 # component, so only direct children of the listed directories are matched;
 # files inside nested subdirectories are not picked up by these patterns.
 UNTRACKED_GLOBS = [
    "cases/*/documents/research/*",
    "cases/*/documents/proofread/*",
    "cases/*/drafts/*",
    "cases/*/exports/*",
    "training/proofread/*",
 ]
 def iter_untracked():
    """Yield ``(label, host_path)`` for served files not referenced by any DB column.

    Each pattern in ``UNTRACKED_GLOBS`` is expanded relative to ``DATA_DIR``,
    and the pattern string itself is used as the label. Only entries for which
    ``is_file()`` is true are yielded, so directories that happen to match a
    pattern are skipped.

    Matches are sorted within each pattern: ``glob`` yields entries in
    directory-listing order, which is not guaranteed to be stable, and a
    sorted work-list lets repeated runs over the same tree produce their
    output rows in the same order.
    """
    for pattern in UNTRACKED_GLOBS:
        # sorted() materialises the matches of one pattern at a time only.
        for host in sorted(DATA_DIR.glob(pattern)):
            if host.is_file():
                yield pattern, host
 def resolve_host(stored: str) -> Path | None:
    """Normalise one stored path (3 legacy formats) to a host filesystem path."""
@@ -94,6 +115,13 @@ async def main(args: argparse.Namespace) -> int:
    per_bucket: dict = {}
    rows_out = []
    # Build the work-list: DB-tracked columns (default) or the filesystem scan of
    # served-but-untracked files (--untracked, the #106.5 cutover-prerequisite).
    items: list[tuple[str, str, str, Path]] = []  # (label, stored, bucket, host)
    if args.untracked:
        for label, host in iter_untracked():
            items.append((label, str(host), "documents", host))
    else:
        for table, col, bucket in SOURCES:
            try:
                rows = await pool.fetch(
@@ -101,18 +129,20 @@ async def main(args: argparse.Namespace) -> int:
            except Exception as e:  # noqa: BLE001
                print(f"  {table}.{col}: SKIP ({str(e)[:60]})")
                continue
        b = per_bucket.setdefault(bucket, {"found": 0, "missing": 0, "bytes": 0})
            for r in rows:
-            host = resolve_host(r["v"])
+                items.append((f"{table}.{col}", r["v"], bucket, resolve_host(r["v"])))
    for label, stored, bucket, host in items:
        b = per_bucket.setdefault(bucket, {"found": 0, "missing": 0, "bytes": 0})
        key = to_key(host) if host else None
        if host is None or key is None:
            totals["outside"] += 1
-                rows_out.append([table, col, bucket, r["v"], "", "OUTSIDE_DATA_DIR", 0])
+            rows_out.append([label, "", bucket, stored, "", "OUTSIDE_DATA_DIR", 0])
            continue
        if not host.exists():
            totals["missing"] += 1
            b["missing"] += 1
-                rows_out.append([table, col, bucket, r["v"], key, "MISSING", 0])
+            rows_out.append([label, "", bucket, stored, key, "MISSING", 0])
            continue
        size = host.stat().st_size
        totals["found"] += 1
@@ -124,11 +154,11 @@ async def main(args: argparse.Namespace) -> int:
            ok = _upload(args.mc_alias, BUCKET_ENV[bucket], key, host, size)
            status = "UPLOADED" if ok else "FAILED"
            totals["uploaded" if ok else "failed"] += 1
-            rows_out.append([table, col, bucket, r["v"], key, status, size])
+        rows_out.append([label, "", bucket, stored, key, status, size])
    with manifest.open("w", encoding="utf-8", newline="") as f:
        w = csv.writer(f)
-        w.writerow(["table", "column", "bucket", "stored_path", "key", "status", "bytes"])
+        w.writerow(["source", "_", "bucket", "stored_path", "key", "status", "bytes"])
        w.writerows(rows_out)
    print(f"\n{'APPLY' if args.apply else 'DRY-RUN'} — blob migration plan")
@@ -167,4 +197,7 @@ if __name__ == "__main__":
                                 formatter_class=argparse.RawDescriptionHelpFormatter)
    ap.add_argument("--apply", action="store_true", help="upload (default: dry-run plan only)")
    ap.add_argument("--mc-alias", default="legalminio", help="mcli alias for MinIO")
    ap.add_argument("--untracked", action="store_true",
                    help="migrate served-but-NOT-DB-tracked files (research/proofread/"
                         "drafts/exports) instead of the DB columns (#106.5 prerequisite)")
    raise SystemExit(asyncio.run(main(ap.parse_args())))