From 15e4af595a7bd88c3104bfa67883fb43d49994ac Mon Sep 17 00:00:00 2001 From: Chaim Date: Fri, 12 Jun 2026 11:30:25 +0000 Subject: [PATCH] =?UTF-8?q?fix(storage):=20ASCII-encode=20S3=20object=20me?= =?UTF-8?q?tadata=20=E2=80=94=20s3-only=20upload=20500=20on=20Hebrew=20fil?= =?UTF-8?q?enames=20(INV-STG2)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit תיקון: כל העלאת קובץ עם שם עברי נכשלה ב-500 תחת backend s3-only. השורש: `ingest._stage_file` מצרף את שם-הקובץ המקורי כ-S3 object metadata (`metadata={"filename": src.name}`), ו-`S3Backend.put_bytes` העביר אותו כמו-שהוא ל-`put_object`. botocore אוכף ASCII-only על S3 metadata → ParamValidationError → 500. שם עברי כמו "יומון 5167 - 11.6.26.pdf" שבר כל upload. נחשף ב-cutover ל-s3-only (2026-06-11): קליטת היומונים (וגם כל מסמך/פסיקה עם שם עברי) הפסיקה לעבוד; היומון האחרון שנקלט (5165, 9.6) היה לפני ה-cutover. התיקון (נרמול-במקור, G1; בשכבת-האחסון היחידה, INV-STG2): - `_ascii_metadata` מקודד ערכי-metadata לא-ASCII ב-percent-encoding (lossless, שחזור עם urllib.parse.unquote); ASCII רגיל עובר ללא שינוי (קריאוּת). - `S3Backend.put_bytes` מחיל אותו על כל ערכי ה-Metadata. בדיקות: test_ascii_metadata_encodes_hebrew (helper) + test_s3_put_bytes_sends_ascii_metadata (משחזר את מסלול-הכשל מול fake put_object). 16 עוברות בקובץ. Invariants: מקיים G1 (נרמול-במקור, לא תיקון-בקריאה), INV-STG2 (שם-קובץ עברי כ-metadata ולא ככ-key), G2 (אין מסלול-אחסון מקביל — תיקון ה-choke-point היחיד). Co-Authored-By: Claude Opus 4.8 (1M context) --- mcp-server/src/legal_mcp/services/storage.py | 18 +++++++- mcp-server/tests/test_storage_staging.py | 45 ++++++++++++++++++++ 2 files changed, 62 insertions(+), 1 deletion(-) diff --git a/mcp-server/src/legal_mcp/services/storage.py b/mcp-server/src/legal_mcp/services/storage.py index 38d577b..ec146b8 100644 --- a/mcp-server/src/legal_mcp/services/storage.py +++ b/mcp-server/src/legal_mcp/services/storage.py @@ -88,6 +88,22 @@ def normalize_key(key: str | Path) -> str: return posix.as_posix().lstrip("/") +def _ascii_metadata(value) -> str: + """Coerce an S3 user-metadata value to ASCII. + + S3/MinIO object metadata must be ASCII (botocore raises ParamValidationError + otherwise). The only non-ASCII value we attach is the original Hebrew + filename (``ingest._stage_file`` → ``metadata={"filename": ...}``), so a + Hebrew name like ``"יומון 5167 - 11.6.26.pdf"`` would 500 every s3-only + upload. Percent-encode non-ASCII losslessly (recover with + ``urllib.parse.unquote``) while leaving plain-ASCII values readable.""" + s = str(value) + if s.isascii(): + return s + from urllib.parse import quote + return quote(s) + + class StorageBackend: """Abstract backend. All methods are async except the cheap path helpers.""" @@ -247,7 +263,7 @@ class S3Backend(StorageBackend): if content_type: extra["ContentType"] = content_type if metadata: - extra["Metadata"] = {kk: str(vv) for kk, vv in metadata.items()} + extra["Metadata"] = {kk: _ascii_metadata(vv) for kk, vv in metadata.items()} async with self._client() as s3: await s3.put_object(Bucket=_bucket_name(bucket), Key=k, Body=data, **extra) return f"s3://{_bucket_name(bucket)}/{k}" diff --git a/mcp-server/tests/test_storage_staging.py b/mcp-server/tests/test_storage_staging.py index 7768d1a..d347468 100644 --- a/mcp-server/tests/test_storage_staging.py +++ b/mcp-server/tests/test_storage_staging.py @@ -79,3 +79,48 @@ def test_put_bytes_sync_roundtrip(_tmp_datadir): src_key = "cases/1/exports/x.docx" storage.put_bytes_sync(src_key, b"PK\x03\x04zip", bucket=storage.Bucket.DOCUMENTS) assert (_tmp_datadir / src_key).read_bytes() == b"PK\x03\x04zip" + + +def test_ascii_metadata_encodes_hebrew(): + """S3 object metadata must be ASCII (botocore enforces). A Hebrew original + filename — the value ingest._stage_file attaches — must come back ASCII and + losslessly recoverable (regression: s3-only digest/document upload 500).""" + from urllib.parse import unquote + name = "יומון 5167 - 11.6.26.pdf" + out = storage._ascii_metadata(name) + assert out.isascii() + assert unquote(out) == name + # plain ASCII passes through untouched (readability) + assert storage._ascii_metadata("digest_ab12.pdf") == "digest_ab12.pdf" + + +def test_s3_put_bytes_sends_ascii_metadata(monkeypatch): + """Reproduce the failure path: S3Backend.put_bytes with a Hebrew filename in + metadata must hand put_object an ASCII-only Metadata mapping (no + ParamValidationError).""" + captured = {} + + class _FakeS3: + async def put_object(self, **kwargs): + captured.update(kwargs) + return {} + + class _FakeClientCtx: + async def __aenter__(self): + return _FakeS3() + async def __aexit__(self, *exc): + return False + + backend = storage.S3Backend() + monkeypatch.setattr(backend, "_client", lambda **kw: _FakeClientCtx()) + monkeypatch.setattr(storage, "_bucket_name", lambda b: "documents") + + run(backend.put_bytes( + "digests/incoming/ab12_x.pdf", b"%PDF-1.4", + bucket=storage.Bucket.DOCUMENTS, + metadata={"filename": "יומון 5167 - 11.6.26.pdf"}, + )) + meta = captured["Metadata"] + assert all(v.isascii() for v in meta.values()) + from urllib.parse import unquote + assert unquote(meta["filename"]) == "יומון 5167 - 11.6.26.pdf"