# legal-ai/mcp-server/tests/test_precedent_corpus_isolation.py

"""Regression test for GAP-10 / INV-RET1: corpus separation enforced on
EVERY precedent-library query path — including the halacha sub-query.

Bug: ``search_precedent_library_semantic`` and
``search_precedent_library_lexical`` filtered the *chunk* sub-query by
``cl.source_kind`` but NOT the *halacha* sub-query. So an external
(``source_kind='external_upload'``) search leaked internal-committee
halachot, and an internal search leaked external-ruling halachot — a
cross-corpus contamination of the rule-level results.

Fix: the same ``cl.source_kind = '<kind>'`` predicate that gates the
chunk query now also gates the halacha query, in BOTH functions.

This test runs fully OFFLINE — it monkeypatches ``db.get_pool`` with a
fake pool that captures every SQL string passed to ``fetch`` instead of
hitting Postgres. It asserts the captured halacha SQL carries the
source_kind predicate identical to the chunk SQL.
"""

from __future__ import annotations

import asyncio

import pytest

from legal_mcp.services import db


class _FakePool:
    """Captures SQL passed to ``fetch``; returns no rows."""

    def __init__(self) -> None:
        self.queries: list[str] = []

    async def fetch(self, sql: str, *args) -> list:  # noqa: ANN002
        self.queries.append(sql)
        return []


def _classify(queries: list[str]) -> tuple[str, str]:
    """Return (halacha_sql, chunk_sql) from the captured queries."""
    halacha = next(q for q in queries if "FROM halachot h" in q)
    chunk = next(q for q in queries if "FROM precedent_chunks pc" in q)
    return halacha, chunk


@pytest.fixture()
def fake_pool(monkeypatch: pytest.MonkeyPatch) -> _FakePool:
    """Patch ``db.get_pool`` to hand out one capturing ``_FakePool``.

    The fixture value is the very pool the patched ``get_pool`` resolves to,
    so a test can inspect ``queries`` after calling into ``db``.
    """
    capturing_pool = _FakePool()

    async def _fake_get_pool() -> _FakePool:
        # Coroutine function; every call resolves to the same pool instance.
        return capturing_pool

    monkeypatch.setattr(db, "get_pool", _fake_get_pool)
    return capturing_pool


@pytest.mark.parametrize("source_kind", ["external_upload", "internal_committee"])
def test_semantic_halacha_query_is_source_kind_scoped(
    fake_pool: _FakePool, source_kind: str
) -> None:
    """Semantic search must scope its halacha sub-query by ``source_kind``.

    Runs ``db.search_precedent_library_semantic`` with halachot included,
    then checks the SQL captured by ``fake_pool``: the chunk query must
    carry the predicate (precondition) and so must the halacha query.
    """
    search = db.search_precedent_library_semantic(
        query_embedding=[0.0] * 8,
        source_kind=source_kind,
        include_halachot=True,
        limit=5,
    )
    asyncio.run(search)

    expected_predicate = f"cl.source_kind = '{source_kind}'"
    halacha_sql, chunk_sql = _classify(fake_pool.queries)

    # Precondition first: the chunk query was already scoped before the fix.
    assert expected_predicate in chunk_sql, (
        "chunk query must be source_kind-scoped (precondition)"
    )
    # The actual regression check for GAP-10.
    assert expected_predicate in halacha_sql, (
        "halacha query MUST carry the same source_kind predicate as the "
        "chunk query — otherwise cross-corpus halacha leakage (GAP-10)"
    )


@pytest.mark.parametrize("source_kind", ["external_upload", "internal_committee"])
def test_lexical_halacha_query_is_source_kind_scoped(
    fake_pool: _FakePool, source_kind: str
) -> None:
    """Lexical search must scope its halacha sub-query by ``source_kind``.

    Runs ``db.search_precedent_library_lexical`` with halachot included,
    then checks the SQL captured by ``fake_pool``: the chunk query must
    carry the predicate (precondition) and so must the halacha query.
    """
    search = db.search_precedent_library_lexical(
        query="zoning setback",
        source_kind=source_kind,
        include_halachot=True,
        limit=5,
    )
    asyncio.run(search)

    expected_predicate = f"cl.source_kind = '{source_kind}'"
    halacha_sql, chunk_sql = _classify(fake_pool.queries)

    # Precondition first: the chunk query was already scoped before the fix.
    assert expected_predicate in chunk_sql, (
        "chunk query must be source_kind-scoped (precondition)"
    )
    # The actual regression check for GAP-10.
    assert expected_predicate in halacha_sql, (
        "halacha query MUST carry the same source_kind predicate as the "
        "chunk query — otherwise cross-corpus halacha leakage (GAP-10)"
    )