Merge pull request 'fix(storage): ASCII-encode S3 object metadata — s3-only upload 500 על שמות-קובץ עבריים (INV-STG2)' (#237) from worktree-s3-meta-ascii into main
This commit was merged in pull request #237.
This commit is contained in:
@@ -88,6 +88,22 @@ def normalize_key(key: str | Path) -> str:
|
|||||||
return posix.as_posix().lstrip("/")
|
return posix.as_posix().lstrip("/")
|
||||||
|
|
||||||
|
|
||||||
|
def _ascii_metadata(value: object) -> str:
    """Coerce an S3 user-metadata value to a reversible ASCII string.

    S3/MinIO object metadata must be ASCII (botocore raises
    ParamValidationError otherwise). The only non-ASCII value we attach is
    the original Hebrew filename (``ingest._stage_file`` →
    ``metadata={"filename": ...}``), so a Hebrew name like
    ``"יומון 5167 - 11.6.26.pdf"`` would 500 every s3-only upload.

    Args:
        value: Any metadata value; it is stringified with ``str()`` first.

    Returns:
        ``str(value)`` unchanged when it is pure ASCII and contains no
        ``%``; otherwise its UTF-8 percent-encoding as produced by
        ``urllib.parse.quote``. In both cases
        ``urllib.parse.unquote(result) == str(value)``.
    """
    # Local import keeps this helper self-contained within the module.
    from urllib.parse import quote

    text = str(value)
    # A literal "%" must be escaped as well: readers recover the value with
    # ``unquote``, which would otherwise turn an untouched ASCII name such as
    # "50%25 off.pdf" into "50% off.pdf". Plain names stay readable.
    if text.isascii() and "%" not in text:
        return text
    return quote(text)
|
||||||
|
|
||||||
|
|
||||||
class StorageBackend:
|
class StorageBackend:
|
||||||
"""Abstract backend. All methods are async except the cheap path helpers."""
|
"""Abstract backend. All methods are async except the cheap path helpers."""
|
||||||
|
|
||||||
@@ -247,7 +263,7 @@ class S3Backend(StorageBackend):
|
|||||||
if content_type:
|
if content_type:
|
||||||
extra["ContentType"] = content_type
|
extra["ContentType"] = content_type
|
||||||
if metadata:
|
if metadata:
|
||||||
extra["Metadata"] = {kk: str(vv) for kk, vv in metadata.items()}
|
extra["Metadata"] = {kk: _ascii_metadata(vv) for kk, vv in metadata.items()}
|
||||||
async with self._client() as s3:
|
async with self._client() as s3:
|
||||||
await s3.put_object(Bucket=_bucket_name(bucket), Key=k, Body=data, **extra)
|
await s3.put_object(Bucket=_bucket_name(bucket), Key=k, Body=data, **extra)
|
||||||
return f"s3://{_bucket_name(bucket)}/{k}"
|
return f"s3://{_bucket_name(bucket)}/{k}"
|
||||||
|
|||||||
@@ -79,3 +79,48 @@ def test_put_bytes_sync_roundtrip(_tmp_datadir):
|
|||||||
src_key = "cases/1/exports/x.docx"
|
src_key = "cases/1/exports/x.docx"
|
||||||
storage.put_bytes_sync(src_key, b"PK\x03\x04zip", bucket=storage.Bucket.DOCUMENTS)
|
storage.put_bytes_sync(src_key, b"PK\x03\x04zip", bucket=storage.Bucket.DOCUMENTS)
|
||||||
assert (_tmp_datadir / src_key).read_bytes() == b"PK\x03\x04zip"
|
assert (_tmp_datadir / src_key).read_bytes() == b"PK\x03\x04zip"
|
||||||
|
|
||||||
|
|
||||||
|
def test_ascii_metadata_encodes_hebrew():
    """Hebrew filenames must survive the ASCII coercion losslessly.

    S3 object metadata has to be ASCII (botocore enforces it). The original
    filename that ingest._stage_file attaches can be Hebrew, so the helper
    must return an ASCII string that ``unquote`` turns back into the exact
    original (regression: s3-only digest/document upload 500).
    """
    from urllib.parse import unquote

    hebrew_name = "יומון 5167 - 11.6.26.pdf"
    encoded = storage._ascii_metadata(hebrew_name)

    assert encoded.isascii()
    assert unquote(encoded) == hebrew_name

    # A plain-ASCII value is returned untouched so it stays readable.
    plain_name = "digest_ab12.pdf"
    assert storage._ascii_metadata(plain_name) == plain_name
|
||||||
|
|
||||||
|
|
||||||
|
def test_s3_put_bytes_sends_ascii_metadata(monkeypatch):
    """S3Backend.put_bytes must pass ASCII-only Metadata to put_object.

    Reproduces the failure path: a Hebrew filename in ``metadata`` has to
    reach the (stubbed) S3 client as an ASCII-only mapping, so botocore has
    nothing to reject with ParamValidationError, and the original name must
    still be recoverable with ``unquote``.
    """
    from urllib.parse import unquote

    hebrew_name = "יומון 5167 - 11.6.26.pdf"
    seen_kwargs = {}

    class _StubS3:
        # Records the keyword arguments put_object was called with.
        async def put_object(self, **kwargs):
            seen_kwargs.update(kwargs)
            return {}

    class _StubClientCtx:
        # Async context manager standing in for the real client factory.
        async def __aenter__(self):
            return _StubS3()

        async def __aexit__(self, *exc):
            return False

    backend = storage.S3Backend()
    monkeypatch.setattr(backend, "_client", lambda **kw: _StubClientCtx())
    monkeypatch.setattr(storage, "_bucket_name", lambda b: "documents")

    upload = backend.put_bytes(
        "digests/incoming/ab12_x.pdf",
        b"%PDF-1.4",
        bucket=storage.Bucket.DOCUMENTS,
        metadata={"filename": hebrew_name},
    )
    run(upload)

    sent_meta = seen_kwargs["Metadata"]
    assert all(v.isascii() for v in sent_meta.values())
    assert unquote(sent_meta["filename"]) == hebrew_name
|
||||||
|
|||||||
Reference in New Issue
Block a user