test(ingest): failing tests for idempotent ingest + searchable (FU-2a)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-05-30 20:41:34 +00:00
parent a16f8cd933
commit bcd226ac1a

View File

@@ -0,0 +1,110 @@
"""FU-2a: idempotent ingest + write-time normalization + searchable flag.
Offline tests for the *pure* pieces (canonical normalization, completeness
predicate) and ingest wiring. The real ON CONFLICT upsert is verified by a
DB smoke test against localhost:5433 (see plan Task 6), since it requires a
live Postgres partial unique index.
"""
from __future__ import annotations
import asyncio
from uuid import uuid4
import pytest
from legal_mcp.services import db, ingest
def _run(coro):
    """Execute the coroutine *coro* via ``asyncio.run`` and return its result."""
    outcome = asyncio.run(coro)
    return outcome
# ── GAP-06: canonical normalization (pure, deterministic) ──────────────
@pytest.mark.parametrize(
    ("raw", "expected"),
    [
        ("ערר 8137/24", "8137-24"),
        (" עע\"מ 1/20 ", "1-20"),
        ("8126-03-25", "8126-03-25"),  # month segment preserved
        ("בל\"מ 1010-01-25", "1010-01-25"),
        ("8047/23", "8047-23"),
    ],
)
def test_canonical_case_number(raw, expected):
    """Check that ``db._canonical_case_number`` maps *raw* to *expected*."""
    canonical = db._canonical_case_number(raw)
    assert canonical == expected
def test_canonical_does_not_invent_month():
    """A two-segment input must stay two-segment after normalization."""
    # No month in input → none added (X1 §1).
    canonical = db._canonical_case_number("8126/24")
    assert canonical == "8126-24"
# ── GAP-13: completeness predicate (pure) ──────────────────────────────
def _complete_row():
    """Return a fresh row dict used as the "complete" baseline by the tests.

    A new dict (and a new ``subject_tags`` list) is built on every call so
    individual tests can mutate their copy freely.
    """
    identity = {
        "case_number": "8047-23",
        "case_name": "פלוני נ' הוועדה",
        "practice_area": "rishuy_uvniya",
        "source_kind": "internal_committee",
    }
    extraction = {
        "extraction_status": "completed",
        "headnote": "תקציר",
        "summary": "",
        "subject_tags": [],
    }
    # Merge order keeps the keys in the same sequence as a single literal.
    return {**identity, **extraction}
def test_compute_searchable_true_when_complete():
    """The baseline row plus an embedded chunk yields ``True``."""
    row = _complete_row()
    verdict = db._compute_searchable(row, has_embedded_chunk=True)
    assert verdict is True
def test_compute_searchable_false_without_embedded_chunk():
    """The baseline row without an embedded chunk yields ``False``."""
    row = _complete_row()
    verdict = db._compute_searchable(row, has_embedded_chunk=False)
    assert verdict is False
def test_compute_searchable_false_without_metadata():
    """Blank headnote, summary and subject tags together yield ``False``."""
    row = _complete_row()
    # Set all three fields explicitly so the test does not depend on the
    # baseline row's defaults.
    row.update(headnote="", summary="", subject_tags=[])
    verdict = db._compute_searchable(row, has_embedded_chunk=True)
    assert verdict is False
def test_compute_searchable_false_when_extraction_incomplete():
    """An ``extraction_status`` of ``"pending"`` yields ``False``."""
    row = {**_complete_row(), "extraction_status": "pending"}
    verdict = db._compute_searchable(row, has_embedded_chunk=True)
    assert verdict is False
def test_compute_searchable_false_without_core_fields():
    """An empty ``practice_area`` yields ``False``."""
    row = _complete_row()
    row["practice_area"] = ""
    verdict = db._compute_searchable(row, has_embedded_chunk=True)
    assert verdict is False
# ── ingest wires in recompute_searchable (both types) ──────────────────
def test_ingest_calls_recompute_searchable(monkeypatch, tmp_path):
    """Ingesting an internal decision must call ``recompute_searchable`` once.

    The extractor, chunker, embeddings and db functions listed in the patch
    table below are replaced with in-memory stubs (through the references
    held by the ``ingest`` module), and both config flags are switched off.
    Only the number of ``recompute_searchable`` calls is asserted.
    """
    calls = {"recompute": [], "meta": [], "hal": []}

    # ── stubs ──────────────────────────────────────────────────────────
    async def _extract_text(path):
        return ("text", 1, [0])

    def _strip_preamble(t):
        return t

    def _chunk_document(t, page_offsets=None):
        # Minimal stand-in exposing the four chunk attributes set here.
        chunk_cls = type("C", (), {
            "chunk_index": 0, "content": "c", "section_type": "b",
            "page_number": 1})
        return [chunk_cls()]

    async def _embed(texts, input_type="document"):
        return [[0.0] * 8 for _ in texts]

    async def _store(cid, dicts):
        return len(dicts)

    async def _create_internal(**kw):
        return {"id": uuid4()}

    async def _noop(*a, **k):
        return None

    def _request_meta(cid):
        # Record synchronously at call time, then hand back an awaitable.
        calls["meta"].append(cid)
        return _noop()

    def _request_hal(cid):
        calls["hal"].append(cid)
        return _noop()

    async def _recompute(cid):
        calls["recompute"].append(cid)

    # ── wiring ─────────────────────────────────────────────────────────
    patches = [
        (ingest.extractor, "extract_text", _extract_text),
        (ingest.extractor, "strip_nevo_preamble", _strip_preamble),
        (ingest.chunker, "chunk_document", _chunk_document),
        (ingest.embeddings, "embed_texts", _embed),
        (ingest.db, "store_precedent_chunks", _store),
        (ingest.db, "create_internal_committee_decision", _create_internal),
        (ingest.db, "set_case_law_extraction_status", _noop),
        (ingest.db, "set_case_law_halacha_status", _noop),
        (ingest.db, "request_metadata_extraction", _request_meta),
        (ingest.db, "request_halacha_extraction", _request_hal),
        (ingest.db, "recompute_searchable", _recompute),
        (ingest.config, "PARENT_DOC_RETRIEVAL_ENABLED", False),
        (ingest.config, "MULTIMODAL_ENABLED", False),
    ]
    for target, attribute, replacement in patches:
        monkeypatch.setattr(target, attribute, replacement)

    # Imported only after the stubs are installed, as in the original flow.
    from legal_mcp.services import internal_decisions

    ingestion = internal_decisions.ingest_internal_decision(
        case_number="8047/23", text="t", chair_name="x",
        practice_area="rishuy_uvniya")
    _run(ingestion)

    recompute_count = len(calls["recompute"])
    assert recompute_count == 1, "ingest must recompute searchable after success"