diff --git a/mcp-server/tests/test_idempotent_ingest.py b/mcp-server/tests/test_idempotent_ingest.py new file mode 100644 index 0000000..04b5f08 --- /dev/null +++ b/mcp-server/tests/test_idempotent_ingest.py @@ -0,0 +1,110 @@ +"""FU-2a: idempotent ingest + write-time normalization + searchable flag. + +Offline tests for the *pure* pieces (canonical normalization, completeness +predicate) and ingest wiring. The real ON CONFLICT upsert is verified by a +DB smoke test against localhost:5433 (see plan Task 6), since it requires a +live Postgres partial unique index. +""" +from __future__ import annotations + +import asyncio +from uuid import uuid4 + +import pytest + +from legal_mcp.services import db, ingest + + +def _run(coro): + return asyncio.run(coro) + + +# ── GAP-06: canonical normalization (pure, deterministic) ────────────── +@pytest.mark.parametrize("raw,expected", [ + ("ערר 8137/24", "8137-24"), + (" עע\"מ 1/20 ", "1-20"), + ("8126-03-25", "8126-03-25"), # month segment preserved + ("בל\"מ 1010-01-25", "1010-01-25"), + ("8047/23", "8047-23"), +]) +def test_canonical_case_number(raw, expected): + assert db._canonical_case_number(raw) == expected + + +def test_canonical_does_not_invent_month(): + # No month in input → none added (X1 §1). + assert db._canonical_case_number("8126/24") == "8126-24" + + +# ── GAP-13: completeness predicate (pure) ────────────────────────────── +def _complete_row(): + return { + "case_number": "8047-23", "case_name": "פלוני נ' הוועדה", + "practice_area": "rishuy_uvniya", "source_kind": "internal_committee", + "extraction_status": "completed", "headnote": "תקציר", + "summary": "", "subject_tags": [], + } + + +def test_compute_searchable_true_when_complete(): + assert db._compute_searchable(_complete_row(), has_embedded_chunk=True) is True + + +def test_compute_searchable_false_without_embedded_chunk(): + assert db._compute_searchable(_complete_row(), has_embedded_chunk=False) is False + + +def test_compute_searchable_false_without_metadata(): + row = _complete_row() + row["headnote"] = ""; row["summary"] = ""; row["subject_tags"] = [] + assert db._compute_searchable(row, has_embedded_chunk=True) is False + + +def test_compute_searchable_false_when_extraction_incomplete(): + row = _complete_row(); row["extraction_status"] = "pending" + assert db._compute_searchable(row, has_embedded_chunk=True) is False + + +def test_compute_searchable_false_without_core_fields(): + row = _complete_row(); row["practice_area"] = "" + assert db._compute_searchable(row, has_embedded_chunk=True) is False + + +# ── ingest wires in recompute_searchable (both types) ────────────────── +def test_ingest_calls_recompute_searchable(monkeypatch, tmp_path): + calls = {"recompute": [], "meta": [], "hal": []} + + async def _extract_text(path): return ("text", 1, [0]) + monkeypatch.setattr(ingest.extractor, "extract_text", _extract_text) + monkeypatch.setattr(ingest.extractor, "strip_nevo_preamble", lambda t: t) + monkeypatch.setattr(ingest.chunker, "chunk_document", + lambda t, page_offsets=None: [type("C", (), { + "chunk_index": 0, "content": "c", "section_type": "b", + "page_number": 1})()]) + + async def _embed(texts, input_type="document"): return [[0.0] * 8 for _ in texts] + monkeypatch.setattr(ingest.embeddings, "embed_texts", _embed) + + async def _store(cid, dicts): return len(dicts) + monkeypatch.setattr(ingest.db, "store_precedent_chunks", _store) + + async def _create_internal(**kw): return {"id": uuid4()} + monkeypatch.setattr(ingest.db, "create_internal_committee_decision", _create_internal) + + async def _noop(*a, **k): return None + monkeypatch.setattr(ingest.db, "set_case_law_extraction_status", _noop) + monkeypatch.setattr(ingest.db, "set_case_law_halacha_status", _noop) + monkeypatch.setattr(ingest.db, "request_metadata_extraction", + lambda cid: calls["meta"].append(cid) or _noop()) + monkeypatch.setattr(ingest.db, "request_halacha_extraction", + lambda cid: calls["hal"].append(cid) or _noop()) + + async def _recompute(cid): calls["recompute"].append(cid) + monkeypatch.setattr(ingest.db, "recompute_searchable", _recompute) + monkeypatch.setattr(ingest.config, "PARENT_DOC_RETRIEVAL_ENABLED", False) + monkeypatch.setattr(ingest.config, "MULTIMODAL_ENABLED", False) + + from legal_mcp.services import internal_decisions + _run(internal_decisions.ingest_internal_decision( + case_number="8047/23", text="t", chair_name="x", practice_area="rishuy_uvniya")) + assert len(calls["recompute"]) == 1, "ingest must recompute searchable after success"