fix(storage): ASCII-encode S3 object metadata — s3-only upload 500 על שמות-קובץ עבריים (INV-STG2) #237

Merged
chaim merged 1 commit from worktree-s3-meta-ascii into main 2026-06-12 11:31:09 +00:00
2 changed files with 62 additions and 1 deletion
Showing only changes of commit 15e4af595a - Show all commits

View File

@@ -88,6 +88,22 @@ def normalize_key(key: str | Path) -> str:
return posix.as_posix().lstrip("/") return posix.as_posix().lstrip("/")
def _ascii_metadata(value) -> str:
"""Coerce an S3 user-metadata value to ASCII.
S3/MinIO object metadata must be ASCII (botocore raises ParamValidationError
otherwise). The only non-ASCII value we attach is the original Hebrew
filename (``ingest._stage_file`` → ``metadata={"filename": ...}``), so a
Hebrew name like ``"יומון 5167 - 11.6.26.pdf"`` would 500 every s3-only
upload. Percent-encode non-ASCII losslessly (recover with
``urllib.parse.unquote``) while leaving plain-ASCII values readable."""
s = str(value)
if s.isascii():
return s
from urllib.parse import quote
return quote(s)
class StorageBackend: class StorageBackend:
"""Abstract backend. All methods are async except the cheap path helpers.""" """Abstract backend. All methods are async except the cheap path helpers."""
@@ -247,7 +263,7 @@ class S3Backend(StorageBackend):
if content_type: if content_type:
extra["ContentType"] = content_type extra["ContentType"] = content_type
if metadata: if metadata:
extra["Metadata"] = {kk: str(vv) for kk, vv in metadata.items()} extra["Metadata"] = {kk: _ascii_metadata(vv) for kk, vv in metadata.items()}
async with self._client() as s3: async with self._client() as s3:
await s3.put_object(Bucket=_bucket_name(bucket), Key=k, Body=data, **extra) await s3.put_object(Bucket=_bucket_name(bucket), Key=k, Body=data, **extra)
return f"s3://{_bucket_name(bucket)}/{k}" return f"s3://{_bucket_name(bucket)}/{k}"

View File

@@ -79,3 +79,48 @@ def test_put_bytes_sync_roundtrip(_tmp_datadir):
src_key = "cases/1/exports/x.docx" src_key = "cases/1/exports/x.docx"
storage.put_bytes_sync(src_key, b"PK\x03\x04zip", bucket=storage.Bucket.DOCUMENTS) storage.put_bytes_sync(src_key, b"PK\x03\x04zip", bucket=storage.Bucket.DOCUMENTS)
assert (_tmp_datadir / src_key).read_bytes() == b"PK\x03\x04zip" assert (_tmp_datadir / src_key).read_bytes() == b"PK\x03\x04zip"
def test_ascii_metadata_encodes_hebrew():
    """Hebrew filenames must be turned into recoverable ASCII metadata.

    S3 object metadata has to be ASCII (botocore enforces this). The original
    Hebrew filename that ingest._stage_file attaches must therefore come back
    as ASCII and decode to the same name again (regression: s3-only
    digest/document upload returned 500).
    """
    from urllib.parse import unquote

    hebrew_filename = "יומון 5167 - 11.6.26.pdf"
    encoded = storage._ascii_metadata(hebrew_filename)
    assert encoded.isascii()
    assert unquote(encoded) == hebrew_filename

    # Plain ASCII is expected to pass through untouched so it stays readable.
    ascii_filename = "digest_ab12.pdf"
    assert storage._ascii_metadata(ascii_filename) == ascii_filename
def test_s3_put_bytes_sends_ascii_metadata(monkeypatch):
    """S3Backend.put_bytes must pass ASCII-only Metadata to put_object.

    Reproduces the failure path: a Hebrew filename in ``metadata`` has to
    reach put_object as an ASCII-only mapping (so no ParamValidationError),
    and the original name must still be recoverable from it.
    """
    from urllib.parse import unquote

    hebrew_filename = "יומון 5167 - 11.6.26.pdf"
    put_object_kwargs = {}

    class _RecordingS3:
        # Stand-in for the S3 client: records what put_object was called with.
        async def put_object(self, **kwargs):
            put_object_kwargs.update(kwargs)
            return {}

    class _ClientContext:
        # Async context manager yielding the recording client.
        async def __aenter__(self):
            return _RecordingS3()

        async def __aexit__(self, *exc):
            return False

    def _fake_client(**kw):
        return _ClientContext()

    backend = storage.S3Backend()
    monkeypatch.setattr(backend, "_client", _fake_client)
    monkeypatch.setattr(storage, "_bucket_name", lambda b: "documents")

    upload = backend.put_bytes(
        "digests/incoming/ab12_x.pdf",
        b"%PDF-1.4",
        bucket=storage.Bucket.DOCUMENTS,
        metadata={"filename": hebrew_filename},
    )
    run(upload)

    sent_metadata = put_object_kwargs["Metadata"]
    for sent_value in sent_metadata.values():
        assert sent_value.isascii()
    assert unquote(sent_metadata["filename"]) == hebrew_filename