test(reindex): failing tests for content-hash re-index (FU-3)

This commit is contained in:
2026-05-30 22:02:16 +00:00
parent 8b3f191c8b
commit e522555b1a

View File

@@ -0,0 +1,82 @@
"""FU-3: re-index on content change (offline, monkeypatched I/O)."""
from __future__ import annotations
import asyncio
from uuid import uuid4
import pytest
from legal_mcp.services import db, ingest
def _run(coro):
    """Run *coro* to completion via ``asyncio.run`` and return its result.

    Lets the synchronous test functions in this module call async code.
    """
    outcome = asyncio.run(coro)
    return outcome
# ── content_hash is deterministic ──────────────────────────────────────
def test_content_hash_deterministic():
    """Hashing the same text twice gives the same 64-character result."""
    sample = "פסק דין כלשהו"
    first = db._content_hash(sample)
    second = db._content_hash(sample)
    assert first == second
    assert len(first) == 64  # sha256 hex
def test_content_hash_empty_is_blank():
    """Both the empty string and ``None`` hash to the empty string."""
    for absent in ("", None):
        assert db._content_hash(absent) == ""
def test_content_hash_changes_with_text():
    """Two different input texts must not hash to the same value."""
    alpha_hash = db._content_hash("alpha")
    beta_hash = db._content_hash("beta")
    assert alpha_hash != beta_hash
# ── mark_indexed copies content_hash → indexed_hash ─────────────────────
def test_mark_indexed_executes_update(monkeypatch):
    """``db.mark_indexed`` executes a query mentioning both hash names.

    ``db.get_pool`` is replaced with an in-memory stub, and the query text and
    positional arguments passed to ``execute`` are captured for inspection.
    """
    captured = {}

    class _StubConnection:
        """Async-context-manager connection recording the last ``execute``."""

        async def __aenter__(self):
            return self

        async def __aexit__(self, *a):
            # False means exceptions raised inside the ``async with`` propagate.
            return False

        async def execute(self, q, *a):
            captured["q"] = q
            captured["args"] = a

    class _StubPool:
        """Pool whose ``acquire`` hands out a new stub connection."""

        def acquire(self):
            return _StubConnection()

    async def _fake_get_pool():
        return _StubPool()

    monkeypatch.setattr(db, "get_pool", _fake_get_pool)

    case_id = uuid4()
    _run(db.mark_indexed(case_id))

    issued_query = captured["q"]
    assert "indexed_hash" in issued_query
    assert "content_hash" in issued_query
    # The id must be the first positional argument bound to the query.
    assert captured["args"][0] == case_id
# ── reindex_case_law re-embeds from stored text, no extractor/LLM ───────
def test_reindex_case_law_uses_stored_text(monkeypatch):
    """``ingest.reindex_case_law`` re-embeds the row's stored ``full_text``.

    The row lookup, ``_chunk_embed_store`` and ``mark_indexed`` are all
    replaced with recording stubs, so the test checks only how they are wired
    together and what the call returns.
    """
    case_id = uuid4()
    stored_text = "טקסט שמור של ההחלטה"
    recorded = {"chunk_embed_store": [], "mark_indexed": []}

    async def _get_case_law(x):
        return {"id": case_id, "full_text": stored_text}

    async def _ces(case_law_id, text, page_offsets, page_count, progress):
        recorded["chunk_embed_store"].append((case_law_id, text))
        return 5  # stubbed chunk count, expected back under "chunks"

    async def _mark(x):
        recorded["mark_indexed"].append(x)

    monkeypatch.setattr(ingest.db, "get_case_law", _get_case_law)
    monkeypatch.setattr(ingest, "_chunk_embed_store", _ces)
    monkeypatch.setattr(ingest.db, "mark_indexed", _mark)

    result = _run(ingest.reindex_case_law(case_id))

    assert result["chunks"] == 5
    assert result["reindexed"] is True
    _, embedded_text = recorded["chunk_embed_store"][0]
    assert embedded_text == stored_text
    assert recorded["mark_indexed"] == [case_id]
def test_reindex_case_law_missing_row_raises(monkeypatch):
    """A lookup returning ``None`` raises ``ValueError`` matching "not found"."""

    async def _no_row(x):
        return None

    monkeypatch.setattr(ingest.db, "get_case_law", _no_row)

    unknown_id = uuid4()
    with pytest.raises(ValueError, match="not found"):
        _run(ingest.reindex_case_law(unknown_id))