90 lines
3.0 KiB
Python
90 lines
3.0 KiB
Python
"""FU-3: re-index on content change (offline, monkeypatched I/O)."""
|
|
from __future__ import annotations
|
|
|
|
import asyncio
|
|
from uuid import uuid4
|
|
|
|
import pytest
|
|
|
|
from legal_mcp.services import db, ingest
|
|
|
|
|
|
def _run(coro):
    """Drive the coroutine *coro* to completion and return its result.

    Lets the synchronous test functions in this module call async code
    through ``asyncio.run``.
    """
    return asyncio.run(coro)
|
|
|
|
|
|
# ── content_hash is deterministic ──────────────────────────────────────
|
|
def test_content_hash_deterministic():
    """Hashing the same text twice yields the same 64-character digest."""
    h1 = db._content_hash("פסק דין כלשהו")
    h2 = db._content_hash("פסק דין כלשהו")
    # Separate asserts so a failure says which property broke.
    assert h1 == h2
    assert len(h1) == 64  # sha256 hex
|
|
|
|
|
|
def test_content_hash_empty_is_blank():
    """Both the empty string and ``None`` hash to the empty string."""
    assert db._content_hash("") == ""
    assert db._content_hash(None) == ""
|
|
|
|
|
|
def test_content_hash_changes_with_text():
    """Two different input texts produce two different hashes."""
    assert db._content_hash("alpha") != db._content_hash("beta")
|
|
|
|
|
|
# ── mark_indexed copies content_hash → indexed_hash ─────────────────────
|
|
def test_mark_indexed_executes_update(monkeypatch):
    """``db.mark_indexed`` issues a query naming both hash columns for the id.

    ``db.get_pool`` is replaced with a fake whose connection records the
    query text and positional arguments passed to ``execute``.
    """
    seen = {}

    class _Conn:
        """Fake connection that is also its own async context manager."""

        async def execute(self, q, *a):
            # Keep only the most recent call; the assertions below read it.
            seen["q"] = q
            seen["args"] = a

        async def __aenter__(self):
            return self

        async def __aexit__(self, *a):
            return False  # do not suppress exceptions from the block

    class _Pool:
        """Fake pool handing out a fresh ``_Conn`` on every acquire."""

        def acquire(self):
            return _Conn()

    async def _pool():
        return _Pool()

    monkeypatch.setattr(db, "get_pool", _pool)

    cid = uuid4()
    _run(db.mark_indexed(cid))

    # Fail with a readable message instead of a KeyError if nothing ran.
    assert seen, "mark_indexed never called execute()"
    assert "indexed_hash" in seen["q"]
    assert "content_hash" in seen["q"]
    assert seen["args"][0] == cid
|
|
|
|
|
|
# ── reindex_case_law re-embeds from stored text, no extractor/LLM ───────
|
|
def test_reindex_case_law_uses_stored_text(monkeypatch):
    """Re-indexing passes the stored ``full_text`` on and marks the row indexed.

    ``get_case_law``, ``_chunk_embed_store`` and ``mark_indexed`` are all
    stubbed, so the test runs offline and only observes how
    ``ingest.reindex_case_law`` wires them together.
    """
    cid = uuid4()
    calls = {"chunk_embed_store": [], "mark_indexed": []}

    async def _get_case_law(x):
        return {"id": cid, "full_text": "טקסט שמור של ההחלטה"}

    monkeypatch.setattr(ingest.db, "get_case_law", _get_case_law)

    # Parameter names are kept as-is in case the caller passes keywords.
    async def _ces(case_law_id, text, page_offsets, page_count, progress):
        calls["chunk_embed_store"].append((case_law_id, text))
        return 5

    monkeypatch.setattr(ingest, "_chunk_embed_store", _ces)

    async def _mark(x):
        calls["mark_indexed"].append(x)

    monkeypatch.setattr(ingest.db, "mark_indexed", _mark)

    out = _run(ingest.reindex_case_law(cid))

    # The stub's return value (5) must surface as the chunk count.
    assert out["chunks"] == 5
    assert out["reindexed"] is True
    assert calls["chunk_embed_store"][0][1] == "טקסט שמור של ההחלטה"
    assert calls["mark_indexed"] == [cid]
|
|
|
|
|
|
def test_reindex_case_law_missing_row_raises(monkeypatch):
    """A ``None`` lookup result raises ``ValueError`` matching "not found"."""

    async def _none(x):
        return None

    monkeypatch.setattr(ingest.db, "get_case_law", _none)

    with pytest.raises(ValueError, match="not found"):
        _run(ingest.reindex_case_law(uuid4()))
|
|
|
|
|
|
def test_reindex_case_law_empty_text_raises(monkeypatch):
    """A row whose ``full_text`` is only whitespace raises ``ValueError``.

    The error message must match "no stored full_text".
    """

    async def _empty(x):
        return {"id": uuid4(), "full_text": " "}

    monkeypatch.setattr(ingest.db, "get_case_law", _empty)

    with pytest.raises(ValueError, match="no stored full_text"):
        _run(ingest.reindex_case_law(uuid4()))
|