From 9ae2d47d03fb80a9f0221451a3f10a4150e01909 Mon Sep 17 00:00:00 2001 From: Chaim Date: Sat, 30 May 2026 19:09:37 +0000 Subject: [PATCH] test(ingest): failing tests for unified pipeline (FU-1) Co-Authored-By: Claude Sonnet 4.6 --- mcp-server/tests/test_unified_ingest.py | 169 ++++++++++++++++++++++++ 1 file changed, 169 insertions(+) create mode 100644 mcp-server/tests/test_unified_ingest.py diff --git a/mcp-server/tests/test_unified_ingest.py b/mcp-server/tests/test_unified_ingest.py new file mode 100644 index 0000000..08bbd1e --- /dev/null +++ b/mcp-server/tests/test_unified_ingest.py @@ -0,0 +1,169 @@ +"""FU-1: unified ingest pipeline tests (offline, all I/O monkeypatched). + +Proves both intake types flow through services.ingest.ingest_document and that +the canonical pipeline is symmetric: BOTH metadata and halacha extraction are +queued for BOTH types (GAP-02 regression), enum validation applies to both +(GAP-04), multimodal is gated by flag+PDF not by intake type (GAP-05), and the +external citation guard is preserved. +""" +from __future__ import annotations + +import asyncio +from pathlib import Path +from uuid import uuid4 + +import pytest + +from legal_mcp import config +from legal_mcp.services import db, embeddings, chunker, extractor +from legal_mcp.services import ingest, precedent_library, internal_decisions + + +def _run(coro): + return asyncio.run(coro) + + +class _Chunk: + def __init__(self, i): + self.chunk_index = i + self.content = f"chunk-{i}" + self.section_type = "body" + self.page_number = 1 + self.role = "child" + self.local_id = f"c{i}" + self.parent_local_id = None + + +@pytest.fixture() +def patched(monkeypatch, tmp_path): + """Patch every I/O boundary. Record queue + create calls.""" + calls = {"metadata": [], "halacha": [], "create": [], "chunks": [], "pages": []} + + async def _extract_text(path): + return ("full decision text", 2, [0, 100]) + + def _strip(text): + return text + + def _chunk(text, page_offsets=None): + return [_Chunk(0), _Chunk(1)] + + async def _embed(texts, input_type="document"): + return [[0.0] * 8 for _ in texts] + + async def _store_chunks(cid, dicts): + calls["chunks"].append((cid, len(dicts))) + return len(dicts) + + async def _create_external(**kw): + calls["create"].append(("external", kw)) + return {"id": uuid4()} + + async def _create_internal(**kw): + calls["create"].append(("internal", kw)) + return {"id": uuid4()} + + async def _req_meta(cid): + calls["metadata"].append(cid) + + async def _req_hal(cid): + calls["halacha"].append(cid) + + async def _set_status(cid, status): + return None + + monkeypatch.setattr(extractor, "extract_text", _extract_text) + monkeypatch.setattr(extractor, "strip_nevo_preamble", _strip) + monkeypatch.setattr(chunker, "chunk_document", _chunk) + monkeypatch.setattr(embeddings, "embed_texts", _embed) + monkeypatch.setattr(db, "store_precedent_chunks", _store_chunks) + monkeypatch.setattr(db, "create_external_case_law", _create_external) + monkeypatch.setattr(db, "create_internal_committee_decision", _create_internal) + monkeypatch.setattr(db, "request_metadata_extraction", _req_meta) + monkeypatch.setattr(db, "request_halacha_extraction", _req_hal) + monkeypatch.setattr(db, "set_case_law_extraction_status", _set_status) + monkeypatch.setattr(db, "set_case_law_halacha_status", _set_status) + # Force flat chunking + multimodal OFF unless a test flips it. + monkeypatch.setattr(config, "PARENT_DOC_RETRIEVAL_ENABLED", False) + monkeypatch.setattr(config, "MULTIMODAL_ENABLED", False) + return calls + + +def _make_pdf(tmp_path) -> str: + p = tmp_path / "decision.pdf" + p.write_bytes(b"%PDF-1.4 fake") + return str(p) + + +def test_internal_queues_BOTH_metadata_and_halacha(patched, tmp_path): + """GAP-02 regression: the internal path must queue metadata too.""" + _run(internal_decisions.ingest_internal_decision( + case_number="8046/24", text="decision text", chair_name="דפנה תמיר", + district="ירושלים", practice_area="betterment_levy", + )) + assert len(patched["metadata"]) == 1, "internal path must queue metadata (GAP-02)" + assert len(patched["halacha"]) == 1 + + +def test_external_queues_both(patched, tmp_path): + _run(precedent_library.ingest_precedent( + file_path=_make_pdf(tmp_path), citation="עע\"מ 1234/20", + practice_area="rishuy_uvniya", source_type="court_ruling", + )) + assert len(patched["metadata"]) == 1 + assert len(patched["halacha"]) == 1 + + +def test_both_types_go_through_ingest_document(patched, tmp_path, monkeypatch): + seen = [] + real = ingest.ingest_document + + async def _spy(spec, **kw): + seen.append(spec.source_kind) + return await real(spec, **kw) + + monkeypatch.setattr(ingest, "ingest_document", _spy) + _run(internal_decisions.ingest_internal_decision( + case_number="8046/24", text="t", chair_name="דפנה תמיר", practice_area="betterment_levy")) + _run(precedent_library.ingest_precedent( + file_path=_make_pdf(tmp_path), citation="עע\"מ 1/20", practice_area="rishuy_uvniya")) + assert seen == ["internal_committee", "external_upload"] + + +def test_enum_validation_rejects_bad_practice_area_internal(patched, tmp_path): + """GAP-04: internal path must validate enums like the external one.""" + with pytest.raises(ValueError, match="practice_area"): + _run(internal_decisions.ingest_internal_decision( + case_number="8046/24", text="t", chair_name="x", practice_area="bogus")) + + +def test_enum_validation_rejects_bad_practice_area_external(patched, tmp_path): + with pytest.raises(ValueError, match="practice_area"): + _run(precedent_library.ingest_precedent( + file_path=_make_pdf(tmp_path), citation="עע\"מ 1/20", practice_area="bogus")) + + +def test_external_citation_guard_still_blocks_arar(patched, tmp_path): + with pytest.raises(ValueError, match="ערר"): + _run(precedent_library.ingest_precedent( + file_path=_make_pdf(tmp_path), citation="ערר 1234/24")) + + +def test_internal_text_path_works_without_file(patched): + out = _run(internal_decisions.ingest_internal_decision( + case_number="8046/24", text="t", chair_name="x", practice_area="betterment_levy")) + assert out["status"] == "completed" + assert out["case_law_id"] + + +def test_internal_requires_file_or_text(patched): + with pytest.raises(ValueError, match="file_path or text"): + _run(internal_decisions.ingest_internal_decision( + case_number="8046/24", chair_name="x", practice_area="betterment_levy")) + + +def test_display_name_fallback_uses_canonical_id(patched, tmp_path): + _run(internal_decisions.ingest_internal_decision( + case_number="8046/24", text="t", chair_name="x", practice_area="betterment_levy")) + kind, kw = patched["create"][0] + assert kw["case_name"] == "8046/24", "missing case_name falls back to canonical id"