fix(ingest): read staged file via storage.ensure_local — s3-only upload 500 (INV-STG1)
All checks were successful
G12 Leak-Guard / leak-guard (pull_request) Successful in 5s
All checks were successful
G12 Leak-Guard / leak-guard (pull_request) Successful in 5s
תיקון: העלאת פסיקה/החלטת-ועדה (precedent-library + internal-decisions) נכשלה תחת backend s3-only עם "Package not found at '/data/...docx'" / "Converted file not found". השורש: `ingest._stage_file` כותב את הקובץ דרך `storage.put_file` ומחזיר נתיב-DATA_DIR, אבל תחת s3-only ה-blob נכתב רק ל-MinIO ואין עותק בדיסק — ואז הצינור קרא את הנתיב ישירות מהדיסק (extract_text) → קובץ לא קיים. מסלול תיקי-המקרה לא נפגע כי הוא שומר עותק-דיסק + mirror_file; רק מסלול _stage_file המשותף קרא את ה-key כאילו הוא על הדיסק. התיקון (נרמול-במקור, G1; קריאה דרך שכבת-האחסון, INV-STG1): - `_stage_file` מחזיר עכשיו את ה-KEY (נתיב יחסי-DATA_DIR), לא Path. - `ingest_document` ו-`digest_library` מאתרים נתיב-קריאה מקומי דרך `storage.ensure_local` (עותק-דיסק תחת filesystem/dual; הורדה ל-temp תחת s3-only) ומנקים את ה-temp ב-finally — בלי דליפה ל-/tmp. - מולטימודל (PDF) קורא את אותו נתיב מקומי מאומת. בדיקות: test_unified_ingest::test_ingest_reads_via_ensure_local_when_no_disk_copy מדמה backend ללא עותק-דיסק ומוודא שהצינור משלים (נכשל מול הקוד הישן). 55 עוברות. Invariants: מקיים INV-STG1 (קריאה/כתיבה רק דרך שכבת-האחסון), G1 (נרמול-במקור, לא תיקון-בקריאה), G2 (אין מסלול מקביל — תיקון הצינור הקנוני). Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -30,20 +30,24 @@ def test_stage_file_lands_under_datadir(_tmp_datadir):
|
||||
src.parent.mkdir(parents=True)
|
||||
src.write_bytes(b"%PDF-1.4 ...")
|
||||
root = _tmp_datadir / "precedent-library"
|
||||
dest = run(ingest._stage_file(src, root, "court_ruling"))
|
||||
# dest is under the staging subdir, prefixed with a uuid, original suffix kept
|
||||
# _stage_file returns the DATA_DIR-relative storage KEY (what the DB stores);
|
||||
# the caller resolves a local path via storage.ensure_local.
|
||||
key = run(ingest._stage_file(src, root, "court_ruling"))
|
||||
assert isinstance(key, str)
|
||||
assert key.startswith("precedent-library/court_ruling/")
|
||||
assert key.endswith(".pdf")
|
||||
# under the filesystem backend the key maps to the exact legacy on-disk path
|
||||
dest = _tmp_datadir / key
|
||||
assert dest.parent == root / "court_ruling"
|
||||
assert dest.exists()
|
||||
assert dest.read_bytes() == b"%PDF-1.4 ..."
|
||||
assert dest.suffix == ".pdf"
|
||||
# and the key is DATA_DIR-relative (what the DB column will store)
|
||||
assert dest.relative_to(_tmp_datadir).as_posix().startswith("precedent-library/court_ruling/")
|
||||
|
||||
|
||||
def test_stage_file_default_subdir(_tmp_datadir):
    """Staging with an empty subdir argument lands the file under ``other``."""
    src = _tmp_datadir / "x.docx"
    src.write_bytes(b"doc")
    # _stage_file returns the DATA_DIR-relative storage key (a str), so the
    # file is staged exactly once and the on-disk location is derived from
    # that key rather than from a second staging call.
    key = run(ingest._stage_file(src, _tmp_datadir / "digests", ""))
    dest = _tmp_datadir / key
    assert dest.parent == _tmp_datadir / "digests" / "other"
    assert dest.exists()
|
||||
|
||||
|
||||
@@ -172,3 +172,58 @@ def test_display_name_fallback_uses_canonical_id(patched, tmp_path):
|
||||
case_number="8046/24", text="t", chair_name="x", practice_area="betterment_levy"))
|
||||
kind, kw = patched["create"][0]
|
||||
assert kw["case_name"] == "8046/24", "missing case_name falls back to canonical id"
|
||||
|
||||
|
||||
def test_ingest_reads_via_ensure_local_when_no_disk_copy(patched, tmp_path, monkeypatch):
    """Regression guard for ingest when the staged key has no on-disk file.

    With a backend that keeps no local copy (the s3-only situation), reading
    the DATA_DIR path directly used to fail ('Package not found at …').
    Ingest has to obtain a readable local path through storage.ensure_local
    (a temp download) and remove that temp file once it is done.
    """
    from legal_mcp.services import storage

    # Every blob the fake backend has been asked to persist, by normalised key.
    blobs: dict[str, bytes] = {}

    def _slot(key):
        # Normalise at call time, exactly as each backend method needs it.
        return storage.normalize_key(key)

    class _MemBackend(storage.StorageBackend):
        """Dictionary-backed storage that never exposes a local path."""

        name = "mem"

        async def put_bytes(self, key, data, *, bucket=storage.Bucket.DOCUMENTS,
                            content_type=None, metadata=None):
            payload = bytes(data)
            blobs[_slot(key)] = payload
            return f"mem://{key}"

        async def put_file(self, src, key, *, bucket=storage.Bucket.DOCUMENTS,
                           content_type=None, metadata=None):
            payload = Path(src).read_bytes()
            blobs[_slot(key)] = payload
            return f"mem://{key}"

        async def get_bytes(self, key, *, bucket=storage.Bucket.DOCUMENTS):
            return blobs[_slot(key)]

        async def exists(self, key, *, bucket=storage.Bucket.DOCUMENTS):
            return _slot(key) in blobs

        def local_path(self, key, *, bucket=storage.Bucket.DOCUMENTS):
            # Nothing lives on disk for this backend.
            return None

    monkeypatch.setattr(storage, "_singleton", _MemBackend())

    # The stand-in extractor really opens whatever path it receives, so the
    # test only passes when ingest hands over an existing, readable file.
    read_paths = []

    async def _extract_reads(path):
        read_paths.append(path)
        content = Path(path).read_bytes()
        assert content == b"%PDF-1.4 fake"
        return ("full decision text", 1, [0])

    monkeypatch.setattr(extractor, "extract_text", _extract_reads)

    pdf_file = _make_pdf(tmp_path)
    result = _run(
        precedent_library.ingest_precedent(
            file_path=pdf_file,
            citation="עע\"מ 1234/20",
            practice_area="rishuy_uvniya",
            source_type="court_ruling",
        )
    )
    assert result["status"] == "completed"

    # The extractor was invoked, and its temp download is gone afterwards.
    assert read_paths
    assert not Path(read_paths[0]).exists()
|
||||
|
||||
Reference in New Issue
Block a user