fix(ingest): read staged file via storage.ensure_local — s3-only upload 500 (INV-STG1)
All checks were successful
G12 Leak-Guard / leak-guard (pull_request) Successful in 5s

תיקון: העלאת פסיקה/החלטת-ועדה (precedent-library + internal-decisions) נכשלה
תחת backend s3-only עם "Package not found at '/data/...docx'" / "Converted file
not found". השורש: ‎`ingest._stage_file` כותב את הקובץ דרך ‎`storage.put_file`
ומחזיר נתיב‎-DATA_DIR, אבל תחת s3-only ה‎-blob נכתב רק ל‎-MinIO ואין עותק בדיסק —
ואז הצינור קרא את הנתיב ישירות מהדיסק (extract_text) → קובץ לא קיים. מסלול
תיקי‎-המקרה לא נפגע כי הוא שומר עותק‎-דיסק + mirror_file; רק מסלול ‎_stage_file
המשותף קרא את ה‎-key כאילו הוא על הדיסק.

התיקון (נרמול‎-במקור, G1; קריאה דרך שכבת‎-האחסון, INV-STG1):
- ‎`_stage_file` מחזיר עכשיו את ה‎-KEY (נתיב יחסי‎-DATA_DIR), לא Path.
- ‎`ingest_document` ו‎-‎`digest_library` מאתרים נתיב‎-קריאה מקומי דרך
  ‎`storage.ensure_local` (עותק‎-דיסק תחת filesystem/dual; הורדה ל‎-temp תחת
  s3-only) ומנקים את ה‎-temp ב‎-finally — בלי דליפה ל‎-/tmp.
- מולטימודל (PDF) קורא את אותו נתיב מקומי מאומת.

בדיקות: test_unified_ingest::test_ingest_reads_via_ensure_local_when_no_disk_copy
מדמה backend ללא עותק‎-דיסק ומוודא שהצינור משלים (נכשל מול הקוד הישן). 55 עוברות.

Invariants: מקיים INV-STG1 (קריאה/כתיבה רק דרך שכבת‎-האחסון), G1 (נרמול‎-במקור,
לא תיקון‎-בקריאה), G2 (אין מסלול מקביל — תיקון הצינור הקנוני).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-06-12 07:32:04 +00:00
parent ca1a0ddaac
commit 4f7c3733e2
4 changed files with 180 additions and 89 deletions

View File

@@ -30,20 +30,24 @@ def test_stage_file_lands_under_datadir(_tmp_datadir):
src.parent.mkdir(parents=True)
src.write_bytes(b"%PDF-1.4 ...")
root = _tmp_datadir / "precedent-library"
dest = run(ingest._stage_file(src, root, "court_ruling"))
# dest is under the staging subdir, prefixed with a uuid, original suffix kept
# _stage_file returns the DATA_DIR-relative storage KEY (what the DB stores);
# the caller resolves a local path via storage.ensure_local.
key = run(ingest._stage_file(src, root, "court_ruling"))
assert isinstance(key, str)
assert key.startswith("precedent-library/court_ruling/")
assert key.endswith(".pdf")
# under the filesystem backend the key maps to the exact legacy on-disk path
dest = _tmp_datadir / key
assert dest.parent == root / "court_ruling"
assert dest.exists()
assert dest.read_bytes() == b"%PDF-1.4 ..."
assert dest.suffix == ".pdf"
# and the key is DATA_DIR-relative (what the DB column will store)
assert dest.relative_to(_tmp_datadir).as_posix().startswith("precedent-library/court_ruling/")
def test_stage_file_default_subdir(_tmp_datadir):
    """Staging with an empty third argument lands the file under ``other``.

    ``ingest._stage_file`` is awaited through ``run`` and its result is used
    as a key relative to ``_tmp_datadir``; joining the two gives the staged
    file's on-disk location, which must sit in ``digests/other`` and exist.
    """
    src = _tmp_datadir / "x.docx"
    src.write_bytes(b"doc")
    # Stage exactly once: a second call would write another copy and its
    # result would be discarded anyway.
    key = run(ingest._stage_file(src, _tmp_datadir / "digests", ""))
    dest = _tmp_datadir / key
    assert dest.parent == _tmp_datadir / "digests" / "other"
    assert dest.exists()

View File

@@ -172,3 +172,58 @@ def test_display_name_fallback_uses_canonical_id(patched, tmp_path):
case_number="8046/24", text="t", chair_name="x", practice_area="betterment_levy"))
kind, kw = patched["create"][0]
assert kw["case_name"] == "8046/24", "missing case_name falls back to canonical id"
def test_ingest_reads_via_ensure_local_when_no_disk_copy(patched, tmp_path, monkeypatch):
    """Regression test: ingest must work when the backend keeps no disk copy.

    With the s3-only backend a staged key has NO file on disk, so reading the
    DATA_DIR path directly produced a 500 ('Package not found at …'). Ingest
    is expected to obtain a readable local path via storage.ensure_local
    (a temp download) and to remove that file when it is done.
    """
    from legal_mcp.services import storage

    blobs: dict[str, bytes] = {}

    class _MemBackend(storage.StorageBackend):
        """Dict-backed backend that never reports a local path (s3-only stand-in)."""

        name = "mem"

        async def put_bytes(self, key, data, *, bucket=storage.Bucket.DOCUMENTS,
                            content_type=None, metadata=None):
            payload = bytes(data)
            blobs[storage.normalize_key(key)] = payload
            return f"mem://{key}"

        async def put_file(self, src, key, *, bucket=storage.Bucket.DOCUMENTS,
                           content_type=None, metadata=None):
            payload = Path(src).read_bytes()
            blobs[storage.normalize_key(key)] = payload
            return f"mem://{key}"

        async def get_bytes(self, key, *, bucket=storage.Bucket.DOCUMENTS):
            return blobs[storage.normalize_key(key)]

        async def exists(self, key, *, bucket=storage.Bucket.DOCUMENTS):
            return storage.normalize_key(key) in blobs

        def local_path(self, key, *, bucket=storage.Bucket.DOCUMENTS):
            # The s3-only condition: there is nothing on disk to point at.
            return None

    monkeypatch.setattr(storage, "_singleton", _MemBackend())

    # Replace extract_text with a stand-in that really READS the path it gets,
    # so the test fails unless ingest hands over an existing, readable file
    # instead of a phantom DATA_DIR path.
    read_paths = []

    async def _extract_reads(path):
        read_paths.append(path)
        assert Path(path).read_bytes() == b"%PDF-1.4 fake"
        return ("full decision text", 1, [0])

    monkeypatch.setattr(extractor, "extract_text", _extract_reads)

    result = _run(
        precedent_library.ingest_precedent(
            file_path=_make_pdf(tmp_path),
            citation="עע\"מ 1234/20",
            practice_area="rishuy_uvniya",
            source_type="court_ruling",
        )
    )

    assert result["status"] == "completed"
    # The extractor ran, and the temp download is gone afterwards (no /tmp leak).
    assert read_paths
    assert not Path(read_paths[0]).exists()