test(ingest): failing tests for idempotent ingest + searchable (FU-2a)
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
110
mcp-server/tests/test_idempotent_ingest.py
Normal file
110
mcp-server/tests/test_idempotent_ingest.py
Normal file
@@ -0,0 +1,110 @@
"""FU-2a: idempotent ingest + write-time normalization + searchable flag.

Offline tests for the *pure* pieces (canonical normalization, completeness
predicate) and ingest wiring. The real ON CONFLICT upsert is verified by a
DB smoke test against localhost:5433 (see plan Task 6), since it requires a
live Postgres partial unique index.
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
from uuid import uuid4
|
||||
|
||||
import pytest
|
||||
|
||||
from legal_mcp.services import db, ingest
|
||||
|
||||
|
||||
def _run(coro):
|
||||
return asyncio.run(coro)
|
||||
|
||||
|
||||
# ── GAP-06: canonical normalization (pure, deterministic) ──────────────
|
||||
@pytest.mark.parametrize(
    ("raw", "expected"),
    [
        ("ערר 8137/24", "8137-24"),
        (" עע\"מ 1/20 ", "1-20"),
        ("8126-03-25", "8126-03-25"),  # month segment preserved
        ("בל\"מ 1010-01-25", "1010-01-25"),
        ("8047/23", "8047-23"),
    ],
)
def test_canonical_case_number(raw, expected):
    """Each raw case-number string must map to its expected canonical form."""
    canonical = db._canonical_case_number(raw)
    assert canonical == expected
|
||||
|
||||
|
||||
def test_canonical_does_not_invent_month():
    """An input without a month segment must not gain one (X1 §1)."""
    canonical = db._canonical_case_number("8126/24")
    assert canonical == "8126-24"
|
||||
|
||||
|
||||
# ── GAP-13: completeness predicate (pure) ──────────────────────────────
|
||||
def _complete_row():
|
||||
return {
|
||||
"case_number": "8047-23", "case_name": "פלוני נ' הוועדה",
|
||||
"practice_area": "rishuy_uvniya", "source_kind": "internal_committee",
|
||||
"extraction_status": "completed", "headnote": "תקציר",
|
||||
"summary": "", "subject_tags": [],
|
||||
}
|
||||
|
||||
|
||||
def test_compute_searchable_true_when_complete():
    """The baseline row with an embedded chunk is reported as searchable."""
    row = _complete_row()
    searchable = db._compute_searchable(row, has_embedded_chunk=True)
    assert searchable is True
|
||||
|
||||
|
||||
def test_compute_searchable_false_without_embedded_chunk():
    """The baseline row is not searchable when no chunk has been embedded."""
    row = _complete_row()
    searchable = db._compute_searchable(row, has_embedded_chunk=False)
    assert searchable is False
|
||||
|
||||
|
||||
def test_compute_searchable_false_without_metadata():
    """A row with empty headnote, summary and subject_tags is not searchable."""
    row = _complete_row()
    # Blank all three descriptive fields explicitly so the test does not
    # depend on which of them the baseline row happens to populate.
    row["headnote"] = ""
    row["summary"] = ""
    row["subject_tags"] = []
    assert db._compute_searchable(row, has_embedded_chunk=True) is False
|
||||
|
||||
|
||||
def test_compute_searchable_false_when_extraction_incomplete():
    """A row whose extraction_status is "pending" is not searchable."""
    row = _complete_row()
    row["extraction_status"] = "pending"
    assert db._compute_searchable(row, has_embedded_chunk=True) is False
|
||||
|
||||
|
||||
def test_compute_searchable_false_without_core_fields():
    """A row with an empty practice_area is not searchable."""
    row = _complete_row()
    row["practice_area"] = ""
    assert db._compute_searchable(row, has_embedded_chunk=True) is False
|
||||
|
||||
|
||||
# ── ingest wires in recompute_searchable (both types) ──────────────────
|
||||
def test_ingest_calls_recompute_searchable(monkeypatch, tmp_path):
    """Ingesting an internal decision must call ``recompute_searchable`` once.

    The extractor, chunker, embeddings, db and config attributes patched below
    are replaced with in-memory stubs, then
    ``internal_decisions.ingest_internal_decision`` is run and the number of
    recorded ``recompute_searchable`` calls is asserted.

    NOTE(review): ``tmp_path`` is requested but not used in this test.
    NOTE(review): ``calls["meta"]`` and ``calls["hal"]`` are recorded but never
    asserted — confirm whether assertions on them are intended.
    """
    calls = {"recompute": [], "meta": [], "hal": []}

    # ── extraction / chunking / embedding stubs ──
    async def _extract_text(path):
        return ("text", 1, [0])

    def _chunk_document(t, page_offsets=None):
        # A single throwaway object exposing the four chunk attributes.
        chunk_cls = type("C", (), {
            "chunk_index": 0,
            "content": "c",
            "section_type": "b",
            "page_number": 1,
        })
        return [chunk_cls()]

    async def _embed(texts, input_type="document"):
        return [[0.0] * 8 for _ in texts]

    monkeypatch.setattr(ingest.extractor, "extract_text", _extract_text)
    monkeypatch.setattr(ingest.extractor, "strip_nevo_preamble", lambda t: t)
    monkeypatch.setattr(ingest.chunker, "chunk_document", _chunk_document)
    monkeypatch.setattr(ingest.embeddings, "embed_texts", _embed)

    # ── db stubs ──
    async def _store(cid, dicts):
        return len(dicts)

    async def _create_internal(**kw):
        return {"id": uuid4()}

    async def _noop(*a, **k):
        return None

    # The two request_* stubs are plain functions on purpose: the call is
    # recorded as soon as the stub is invoked, and the returned coroutine is
    # left for the caller to await.
    def _request_metadata(cid):
        calls["meta"].append(cid)
        return _noop()

    def _request_halacha(cid):
        calls["hal"].append(cid)
        return _noop()

    async def _recompute(cid):
        calls["recompute"].append(cid)

    monkeypatch.setattr(ingest.db, "store_precedent_chunks", _store)
    monkeypatch.setattr(
        ingest.db, "create_internal_committee_decision", _create_internal)
    monkeypatch.setattr(ingest.db, "set_case_law_extraction_status", _noop)
    monkeypatch.setattr(ingest.db, "set_case_law_halacha_status", _noop)
    monkeypatch.setattr(
        ingest.db, "request_metadata_extraction", _request_metadata)
    monkeypatch.setattr(
        ingest.db, "request_halacha_extraction", _request_halacha)
    monkeypatch.setattr(ingest.db, "recompute_searchable", _recompute)

    # ── config flags switched off for this test ──
    monkeypatch.setattr(ingest.config, "PARENT_DOC_RETRIEVAL_ENABLED", False)
    monkeypatch.setattr(ingest.config, "MULTIMODAL_ENABLED", False)

    # Imported here rather than at module top, as in the original test.
    from legal_mcp.services import internal_decisions

    _run(internal_decisions.ingest_internal_decision(
        case_number="8047/23",
        text="t",
        chair_name="x",
        practice_area="rishuy_uvniya",
    ))

    assert len(calls["recompute"]) == 1, "ingest must recompute searchable after success"
|
||||
Reference in New Issue
Block a user