111 lines
4.6 KiB
Python
111 lines
4.6 KiB
Python
"""FU-2a: idempotent ingest + write-time normalization + searchable flag.
|
|
|
|
Offline tests for the *pure* pieces (canonical normalization, completeness
|
|
predicate) and ingest wiring. The real ON CONFLICT upsert is verified by a
|
|
DB smoke test against localhost:5433 (see plan Task 6), since it requires a
|
|
live Postgres partial unique index.
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import asyncio
|
|
from uuid import uuid4
|
|
|
|
import pytest
|
|
|
|
from legal_mcp.services import db, ingest
|
|
|
|
|
|
def _run(coro):
    """Execute *coro* to completion via ``asyncio.run`` and return its result.

    Lets the synchronous tests in this module drive coroutine functions.
    """
    outcome = asyncio.run(coro)
    return outcome
|
|
|
|
|
|
# ── GAP-06: canonical normalization (pure, deterministic) ──────────────
|
|
@pytest.mark.parametrize(
    ("raw", "expected"),
    [
        ("ערר 8137/24", "8137-24"),
        (" עע\"מ 1/20 ", "1-20"),
        ("8126-03-25", "8126-03-25"),  # month segment preserved
        ("בל\"מ 1010-01-25", "1010-01-25"),
        ("8047/23", "8047-23"),
    ],
)
def test_canonical_case_number(raw, expected):
    """Each raw case-number string must map to its expected canonical form."""
    canonical = db._canonical_case_number(raw)
    assert canonical == expected
|
|
|
|
|
|
def test_canonical_does_not_invent_month():
    """An input with no month segment must come back without one (X1 §1)."""
    # Only number and year go in, so only number and year may come out.
    canonical = db._canonical_case_number("8126/24")
    assert canonical == "8126-24"
|
|
|
|
|
|
# ── GAP-13: completeness predicate (pure) ──────────────────────────────
|
|
def _complete_row():
    """Return a fresh baseline row dict for the ``_compute_searchable`` tests.

    Of the three descriptive fields, only ``headnote`` is non-empty;
    ``summary`` is an empty string and ``subject_tags`` an empty list.
    A new dict is built on every call, so callers may mutate it freely.
    """
    row = {
        "case_number": "8047-23",
        "case_name": "פלוני נ' הוועדה",
        "practice_area": "rishuy_uvniya",
        "source_kind": "internal_committee",
        "extraction_status": "completed",
        "headnote": "תקציר",
        "summary": "",
        "subject_tags": [],
    }
    return row
|
|
|
|
|
|
def test_compute_searchable_true_when_complete():
    """The baseline row together with an embedded chunk yields exactly ``True``."""
    row = _complete_row()
    verdict = db._compute_searchable(row, has_embedded_chunk=True)
    assert verdict is True
|
|
|
|
|
|
def test_compute_searchable_false_without_embedded_chunk():
    """The baseline row without an embedded chunk yields exactly ``False``."""
    row = _complete_row()
    verdict = db._compute_searchable(row, has_embedded_chunk=False)
    assert verdict is False
|
|
|
|
|
|
def test_compute_searchable_false_without_metadata():
    """With headnote, summary and subject tags all empty the result is ``False``."""
    row = _complete_row()
    # Blank all three descriptive fields in a single step.
    row.update(headnote="", summary="", subject_tags=[])
    verdict = db._compute_searchable(row, has_embedded_chunk=True)
    assert verdict is False
|
|
|
|
|
|
def test_compute_searchable_false_when_extraction_incomplete():
    """A row whose ``extraction_status`` is ``"pending"`` yields ``False``."""
    # Same baseline, with only the extraction status overridden.
    row = {**_complete_row(), "extraction_status": "pending"}
    verdict = db._compute_searchable(row, has_embedded_chunk=True)
    assert verdict is False
|
|
|
|
|
|
def test_compute_searchable_false_without_core_fields():
    """Blanking ``practice_area`` on the baseline row yields ``False``."""
    row = _complete_row()
    row["practice_area"] = ""
    verdict = db._compute_searchable(row, has_embedded_chunk=True)
    assert verdict is False
|
|
|
|
|
|
# ── ingest wires in recompute_searchable (both types) ──────────────────
|
|
def test_ingest_calls_recompute_searchable(monkeypatch, tmp_path):
    """Ingesting an internal decision must call ``recompute_searchable`` once.

    The extractor, chunker, embeddings and db attributes reached through the
    ``ingest`` module are replaced with in-memory stubs before
    ``internal_decisions.ingest_internal_decision`` is run. The
    ``tmp_path`` fixture is requested but not used in the body.
    """
    recorded = {"recompute": [], "meta": [], "hal": []}

    # ── text extraction ────────────────────────────────────────────────
    async def _extract_text(path):
        return ("text", 1, [0])

    def _strip_preamble(t):
        return t

    monkeypatch.setattr(ingest.extractor, "extract_text", _extract_text)
    monkeypatch.setattr(ingest.extractor, "strip_nevo_preamble", _strip_preamble)

    # ── chunking: a single stand-in chunk object ───────────────────────
    def _chunk_document(t, page_offsets=None):
        # Throwaway class whose class attributes play the role of chunk fields.
        chunk_cls = type("C", (), {
            "chunk_index": 0,
            "content": "c",
            "section_type": "b",
            "page_number": 1,
        })
        return [chunk_cls()]

    monkeypatch.setattr(ingest.chunker, "chunk_document", _chunk_document)

    # ── embeddings: one 8-dim zero vector per input text ───────────────
    async def _embed(texts, input_type="document"):
        return [[0.0] * 8 for _ in texts]

    monkeypatch.setattr(ingest.embeddings, "embed_texts", _embed)

    # ── db writes ──────────────────────────────────────────────────────
    async def _store(cid, dicts):
        return len(dicts)

    monkeypatch.setattr(ingest.db, "store_precedent_chunks", _store)

    async def _create_internal(**kw):
        return {"id": uuid4()}

    monkeypatch.setattr(ingest.db, "create_internal_committee_decision", _create_internal)

    async def _noop(*a, **k):
        return None

    monkeypatch.setattr(ingest.db, "set_case_law_extraction_status", _noop)
    monkeypatch.setattr(ingest.db, "set_case_law_halacha_status", _noop)

    # The request_* stubs log the id synchronously, then hand back a
    # coroutine object produced by _noop() as their return value.
    def _request_metadata(cid):
        recorded["meta"].append(cid)
        return _noop()

    def _request_halacha(cid):
        recorded["hal"].append(cid)
        return _noop()

    monkeypatch.setattr(ingest.db, "request_metadata_extraction", _request_metadata)
    monkeypatch.setattr(ingest.db, "request_halacha_extraction", _request_halacha)

    async def _recompute(cid):
        recorded["recompute"].append(cid)

    monkeypatch.setattr(ingest.db, "recompute_searchable", _recompute)

    # ── feature flags off ──────────────────────────────────────────────
    monkeypatch.setattr(ingest.config, "PARENT_DOC_RETRIEVAL_ENABLED", False)
    monkeypatch.setattr(ingest.config, "MULTIMODAL_ENABLED", False)

    # Imported here, after all patches are installed, as in the original.
    from legal_mcp.services import internal_decisions

    ingest_coro = internal_decisions.ingest_internal_decision(
        case_number="8047/23",
        text="t",
        chair_name="x",
        practice_area="rishuy_uvniya",
    )
    _run(ingest_coro)

    recompute_count = len(recorded["recompute"])
    assert recompute_count == 1, "ingest must recompute searchable after success"
|