"""Regression test for GAP-10 / INV-RET1: corpus separation on every query path.

Corpus separation must be enforced on EVERY precedent-library query path —
including the halacha sub-query.

Bug: ``search_precedent_library_semantic`` and
``search_precedent_library_lexical`` filtered the *chunk* sub-query by
``cl.source_kind`` but NOT the *halacha* sub-query. So an external
(``source_kind='external_upload'``) search leaked internal-committee halachot,
and an internal search leaked external-ruling halachot — a cross-corpus
contamination of the rule-level results.

Fix: the same ``cl.source_kind = '{source_kind}'`` predicate that gates the
chunk query now also gates the halacha query, in BOTH functions.

This test runs fully OFFLINE — it monkeypatches ``db.get_pool`` with a fake
pool that captures every SQL string passed to ``fetch`` instead of hitting
Postgres. It asserts the captured halacha SQL carries the source_kind
predicate identical to the chunk SQL.
"""

from __future__ import annotations

import asyncio

import pytest

from legal_mcp.services import db

# Substrings used to tell the two captured sub-queries apart.
_HALACHA_MARKER = "FROM halachot h"
_CHUNK_MARKER = "FROM precedent_chunks pc"


class _FakePool:
    """Captures SQL passed to ``fetch``; returns no rows."""

    def __init__(self) -> None:
        # Every SQL string handed to ``fetch``, in call order.
        self.queries: list[str] = []

    async def fetch(self, sql: str, *args) -> list:  # noqa: ANN002
        """Record ``sql`` and return an empty result set.

        Positional bind arguments are accepted and ignored.
        """
        self.queries.append(sql)
        return []


def _first_query_containing(queries: list[str], marker: str, label: str) -> str:
    """Return the first captured query containing ``marker``.

    Fails the running test with an explicit message (rather than letting a
    bare ``StopIteration`` escape) when no captured query matches, so a search
    path that never issues the expected sub-query is reported clearly.
    """
    for sql in queries:
        if marker in sql:
            return sql
    pytest.fail(
        f"no {label} query was captured (expected SQL containing {marker!r}); "
        f"{len(queries)} query(ies) were issued"
    )


def _classify(queries: list[str]) -> tuple[str, str]:
    """Return (halacha_sql, chunk_sql) from the captured queries."""
    halacha = _first_query_containing(queries, _HALACHA_MARKER, "halacha")
    chunk = _first_query_containing(queries, _CHUNK_MARKER, "chunk")
    return halacha, chunk


def _assert_halacha_scoped_like_chunk(queries: list[str], source_kind: str) -> None:
    """Assert both captured sub-queries carry the ``source_kind`` predicate.

    The chunk assertion is a precondition; the halacha assertion is the actual
    GAP-10 regression check. Shared by the semantic and lexical tests so the
    two cannot drift apart.
    """
    halacha_sql, chunk_sql = _classify(queries)
    predicate = f"cl.source_kind = '{source_kind}'"
    assert predicate in chunk_sql, "chunk query must be source_kind-scoped (precondition)"
    assert predicate in halacha_sql, (
        "halacha query MUST carry the same source_kind predicate as the "
        "chunk query — otherwise cross-corpus halacha leakage (GAP-10)"
    )


@pytest.fixture()
def fake_pool(monkeypatch: pytest.MonkeyPatch) -> _FakePool:
    """Replace ``db.get_pool`` with a coroutine returning a capturing fake pool."""
    pool = _FakePool()

    async def _get_pool() -> _FakePool:
        return pool

    monkeypatch.setattr(db, "get_pool", _get_pool)
    return pool


@pytest.mark.parametrize("source_kind", ["external_upload", "internal_committee"])
def test_semantic_halacha_query_is_source_kind_scoped(
    fake_pool: _FakePool, source_kind: str
) -> None:
    """Semantic search: the halacha sub-query is scoped like the chunk sub-query."""
    asyncio.run(
        db.search_precedent_library_semantic(
            query_embedding=[0.0] * 8,
            source_kind=source_kind,
            include_halachot=True,
            limit=5,
        )
    )
    _assert_halacha_scoped_like_chunk(fake_pool.queries, source_kind)


@pytest.mark.parametrize("source_kind", ["external_upload", "internal_committee"])
def test_lexical_halacha_query_is_source_kind_scoped(
    fake_pool: _FakePool, source_kind: str
) -> None:
    """Lexical search: the halacha sub-query is scoped like the chunk sub-query."""
    asyncio.run(
        db.search_precedent_library_lexical(
            query="zoning setback",
            source_kind=source_kind,
            include_halachot=True,
            limit=5,
        )
    )
    _assert_halacha_scoped_like_chunk(fake_pool.queries, source_kind)