fix(storage): ASCII-encode S3 object metadata — s3-only upload 500 על שמות-קובץ עבריים (INV-STG2) #237
@@ -88,6 +88,22 @@ def normalize_key(key: str | Path) -> str:
|
||||
return posix.as_posix().lstrip("/")
|
||||
|
||||
|
||||
def _ascii_metadata(value) -> str:
    """Coerce an S3 user-metadata value to an ASCII string.

    S3/MinIO object metadata must be ASCII (botocore raises
    ParamValidationError otherwise). The only non-ASCII value we attach is the
    original Hebrew filename (``ingest._stage_file`` →
    ``metadata={"filename": ...}``), so a Hebrew name like
    ``"יומון 5167 - 11.6.26.pdf"`` would 500 every s3-only upload.

    Encoding rule:

    * A value that is plain ASCII and contains no ``%`` is returned unchanged,
      so ordinary names stay human-readable.
    * Any other value is percent-encoded with ``urllib.parse.quote`` (UTF-8).

    With this rule ``urllib.parse.unquote`` is an exact inverse for every
    string this function returns, including ASCII values that themselves
    contain a literal ``%``.

    Args:
        value: The metadata value; it is converted with ``str()`` first.

    Returns:
        An ASCII-only string that ``urllib.parse.unquote`` maps back to
        ``str(value)``.
    """
    from urllib.parse import quote

    text = str(value)
    # A literal "%" has to be escaped even when the value is already ASCII:
    # passing "a%20b.pdf" through verbatim would make a reader that unquotes
    # the metadata recover "a b.pdf" instead of the original name.
    if text.isascii() and "%" not in text:
        return text
    return quote(text)
|
||||
|
||||
|
||||
class StorageBackend:
|
||||
"""Abstract backend. All methods are async except the cheap path helpers."""
|
||||
|
||||
@@ -247,7 +263,7 @@ class S3Backend(StorageBackend):
|
||||
if content_type:
|
||||
extra["ContentType"] = content_type
|
||||
if metadata:
|
||||
extra["Metadata"] = {kk: str(vv) for kk, vv in metadata.items()}
|
||||
extra["Metadata"] = {kk: _ascii_metadata(vv) for kk, vv in metadata.items()}
|
||||
async with self._client() as s3:
|
||||
await s3.put_object(Bucket=_bucket_name(bucket), Key=k, Body=data, **extra)
|
||||
return f"s3://{_bucket_name(bucket)}/{k}"
|
||||
|
||||
@@ -79,3 +79,48 @@ def test_put_bytes_sync_roundtrip(_tmp_datadir):
|
||||
src_key = "cases/1/exports/x.docx"
|
||||
storage.put_bytes_sync(src_key, b"PK\x03\x04zip", bucket=storage.Bucket.DOCUMENTS)
|
||||
assert (_tmp_datadir / src_key).read_bytes() == b"PK\x03\x04zip"
|
||||
|
||||
|
||||
def test_ascii_metadata_encodes_hebrew():
    """A Hebrew filename must be encoded to ASCII and stay recoverable.

    S3 object metadata must be ASCII (botocore enforces this). The Hebrew
    original filename — the value ingest._stage_file attaches — has to come
    back ASCII and losslessly decodable with ``urllib.parse.unquote``
    (regression: s3-only digest/document upload 500).
    """
    from urllib.parse import unquote

    hebrew_name = "יומון 5167 - 11.6.26.pdf"
    encoded = storage._ascii_metadata(hebrew_name)

    assert encoded.isascii()
    assert unquote(encoded) == hebrew_name

    # Plain ASCII passes through untouched, which keeps it readable.
    ascii_name = "digest_ab12.pdf"
    assert storage._ascii_metadata(ascii_name) == ascii_name
|
||||
|
||||
|
||||
def test_s3_put_bytes_sends_ascii_metadata(monkeypatch):
    """Reproduce the failure path through ``S3Backend.put_bytes``.

    With a Hebrew filename in ``metadata``, the backend must hand
    ``put_object`` an ASCII-only ``Metadata`` mapping (no
    ParamValidationError), and the filename must decode back to the original.
    """
    from urllib.parse import unquote

    hebrew_name = "יומון 5167 - 11.6.26.pdf"
    sent_kwargs = {}

    class _RecordingS3:
        # Stand-in for the S3 client: records the put_object keyword args.
        async def put_object(self, **kwargs):
            sent_kwargs.update(kwargs)
            return {}

    class _RecordingClientCtx:
        # Async context manager that yields the recording client.
        async def __aenter__(self):
            return _RecordingS3()

        async def __aexit__(self, *exc):
            return False

    s3_backend = storage.S3Backend()
    monkeypatch.setattr(s3_backend, "_client", lambda **kw: _RecordingClientCtx())
    monkeypatch.setattr(storage, "_bucket_name", lambda b: "documents")

    upload = s3_backend.put_bytes(
        "digests/incoming/ab12_x.pdf",
        b"%PDF-1.4",
        bucket=storage.Bucket.DOCUMENTS,
        metadata={"filename": hebrew_name},
    )
    run(upload)

    sent_metadata = sent_kwargs["Metadata"]
    for sent_value in sent_metadata.values():
        assert sent_value.isascii()
    assert unquote(sent_metadata["filename"]) == hebrew_name
|
||||
|
||||
Reference in New Issue
Block a user