Files
legal-ai/mcp-server/tests/test_unified_ingest.py
2026-05-30 19:09:37 +00:00

170 lines
6.3 KiB
Python

"""FU-1: unified ingest pipeline tests (offline, all I/O monkeypatched).
Proves both intake types flow through services.ingest.ingest_document and that
the canonical pipeline is symmetric: BOTH metadata and halacha extraction are
queued for BOTH types (GAP-02 regression), enum validation applies to both
(GAP-04), multimodal is gated by flag+PDF not by intake type (GAP-05), and the
external citation guard is preserved.
"""
from __future__ import annotations
import asyncio
from pathlib import Path
from uuid import uuid4
import pytest
from legal_mcp import config
from legal_mcp.services import db, embeddings, chunker, extractor
from legal_mcp.services import ingest, precedent_library, internal_decisions
def _run(coro):
return asyncio.run(coro)
class _Chunk:
def __init__(self, i):
self.chunk_index = i
self.content = f"chunk-{i}"
self.section_type = "body"
self.page_number = 1
self.role = "child"
self.local_id = f"c{i}"
self.parent_local_id = None
@pytest.fixture()
def patched(monkeypatch, tmp_path):
    """Replace every I/O boundary with an in-memory stub and return the call log.

    The returned dict maps a bucket name to a list:

    * ``metadata`` / ``halacha`` -- the id passed to each queue request.
    * ``create`` -- one ``(kind, kwargs)`` tuple per create call, where
      ``kind`` is ``"external"`` or ``"internal"``.
    * ``chunks`` -- one ``(id, chunk_count)`` tuple per chunk-store call.
    * ``pages`` -- created empty; none of the stubs defined here write to it.
    """
    log = {
        bucket: []
        for bucket in ("metadata", "halacha", "create", "chunks", "pages")
    }

    async def _fake_extract(path):
        # Fixed triple regardless of *path*; presumably
        # (text, page_count, page_offsets) -- confirm against the extractor.
        return ("full decision text", 2, [0, 100])

    def _identity(text):
        # Preamble stripping is a no-op here.
        return text

    def _two_chunks(text, page_offsets=None):
        # Always exactly two stub chunks, whatever the input text.
        return [_Chunk(0), _Chunk(1)]

    async def _zero_vectors(texts, input_type="document"):
        # One 8-element zero vector per input text.
        return [[0.0] * 8 for _ in texts]

    async def _record_chunks(cid, dicts):
        stored = len(dicts)
        log["chunks"].append((cid, stored))
        return stored

    def _creator(kind):
        """Build a create-stub that tags its log entry with *kind*."""

        async def _create(**kw):
            log["create"].append((kind, kw))
            return {"id": uuid4()}

        return _create

    def _queue_recorder(bucket):
        """Build a queue-stub that appends the received id to *bucket*."""

        async def _record(cid):
            log[bucket].append(cid)

        return _record

    async def _noop_status(cid, status):
        return None

    # (target module, attribute name, replacement), applied in this order.
    replacements = (
        (extractor, "extract_text", _fake_extract),
        (extractor, "strip_nevo_preamble", _identity),
        (chunker, "chunk_document", _two_chunks),
        (embeddings, "embed_texts", _zero_vectors),
        (db, "store_precedent_chunks", _record_chunks),
        (db, "create_external_case_law", _creator("external")),
        (db, "create_internal_committee_decision", _creator("internal")),
        (db, "request_metadata_extraction", _queue_recorder("metadata")),
        (db, "request_halacha_extraction", _queue_recorder("halacha")),
        (db, "set_case_law_extraction_status", _noop_status),
        (db, "set_case_law_halacha_status", _noop_status),
        # Flat chunking and multimodal stay OFF unless a test flips them.
        (config, "PARENT_DOC_RETRIEVAL_ENABLED", False),
        (config, "MULTIMODAL_ENABLED", False),
    )
    for target, attr_name, replacement in replacements:
        monkeypatch.setattr(target, attr_name, replacement)

    return log
def _make_pdf(tmp_path) -> str:
p = tmp_path / "decision.pdf"
p.write_bytes(b"%PDF-1.4 fake")
return str(p)
def test_internal_queues_BOTH_metadata_and_halacha(patched, tmp_path):
    """GAP-02 regression: the internal path must queue metadata too."""
    request = dict(
        case_number="8046/24",
        text="decision text",
        chair_name="דפנה תמיר",
        district="ירושלים",
        practice_area="betterment_levy",
    )
    _run(internal_decisions.ingest_internal_decision(**request))

    # Exactly one request of each kind must have been recorded.
    metadata_requests = len(patched["metadata"])
    halacha_requests = len(patched["halacha"])
    assert metadata_requests == 1, "internal path must queue metadata (GAP-02)"
    assert halacha_requests == 1
def test_external_queues_both(patched, tmp_path):
    """External ingest records one metadata and one halacha queue request."""
    request = dict(
        file_path=_make_pdf(tmp_path),
        citation="עע\"מ 1234/20",
        practice_area="rishuy_uvniya",
        source_type="court_ruling",
    )
    _run(precedent_library.ingest_precedent(**request))

    metadata_requests = len(patched["metadata"])
    halacha_requests = len(patched["halacha"])
    assert metadata_requests == 1
    assert halacha_requests == 1
def test_both_types_go_through_ingest_document(patched, tmp_path, monkeypatch):
    """Both entry points must pass through ``ingest.ingest_document``.

    A spy wraps the real function, logs each spec's ``source_kind``, then
    delegates, so the pipeline itself still runs against the patched I/O.
    """
    observed_kinds = []
    original_ingest = ingest.ingest_document

    async def _recording_ingest(spec, **kw):
        observed_kinds.append(spec.source_kind)
        delegated = await original_ingest(spec, **kw)
        return delegated

    monkeypatch.setattr(ingest, "ingest_document", _recording_ingest)

    # Internal intake first, then external: the expected list is ordered.
    internal_request = dict(
        case_number="8046/24",
        text="t",
        chair_name="דפנה תמיר",
        practice_area="betterment_levy",
    )
    _run(internal_decisions.ingest_internal_decision(**internal_request))

    external_request = dict(
        file_path=_make_pdf(tmp_path),
        citation="עע\"מ 1/20",
        practice_area="rishuy_uvniya",
    )
    _run(precedent_library.ingest_precedent(**external_request))

    assert observed_kinds == ["internal_committee", "external_upload"]
def test_enum_validation_rejects_bad_practice_area_internal(patched, tmp_path):
    """GAP-04: internal path must validate enums like the external one."""
    invalid_request = dict(
        case_number="8046/24",
        text="t",
        chair_name="x",
        practice_area="bogus",
    )
    # The call itself stays inside the context manager so any ValueError it
    # raises is the one being matched.
    with pytest.raises(ValueError, match="practice_area"):
        _run(internal_decisions.ingest_internal_decision(**invalid_request))
def test_enum_validation_rejects_bad_practice_area_external(patched, tmp_path):
    """External ingest raises ValueError for an unknown ``practice_area``."""
    invalid_request = dict(
        file_path=_make_pdf(tmp_path),
        citation="עע\"מ 1/20",
        practice_area="bogus",
    )
    with pytest.raises(ValueError, match="practice_area"):
        _run(precedent_library.ingest_precedent(**invalid_request))
def test_external_citation_guard_still_blocks_arar(patched, tmp_path):
    """External ingest must still raise ValueError for an 'arar' citation."""
    blocked_request = dict(
        file_path=_make_pdf(tmp_path),
        citation="ערר 1234/24",
    )
    with pytest.raises(ValueError, match="ערר"):
        _run(precedent_library.ingest_precedent(**blocked_request))
def test_internal_text_path_works_without_file(patched):
    """Internal ingest given only ``text`` (no file) reports completion."""
    request = dict(
        case_number="8046/24",
        text="t",
        chair_name="x",
        practice_area="betterment_levy",
    )
    result = _run(internal_decisions.ingest_internal_decision(**request))

    assert result["status"] == "completed"
    # Any truthy id is accepted; the patched create stubs return a fresh UUID.
    assert result["case_law_id"]
def test_internal_requires_file_or_text(patched):
    """Internal ingest with neither ``file_path`` nor ``text`` raises ValueError."""
    incomplete_request = dict(
        case_number="8046/24",
        chair_name="x",
        practice_area="betterment_levy",
    )
    with pytest.raises(ValueError, match="file_path or text"):
        _run(internal_decisions.ingest_internal_decision(**incomplete_request))
def test_display_name_fallback_uses_canonical_id(patched, tmp_path):
    """With no ``case_name`` supplied, the create call receives the case number."""
    request = dict(
        case_number="8046/24",
        text="t",
        chair_name="x",
        practice_area="betterment_levy",
    )
    _run(internal_decisions.ingest_internal_decision(**request))

    # Only the kwargs of the first recorded create call are inspected.
    first_create = patched["create"][0]
    _kind, create_kwargs = first_create
    assert create_kwargs["case_name"] == "8046/24", "missing case_name falls back to canonical id"