"""Regression test for GAP-10 / INV-RET1: corpus separation enforced on
EVERY precedent-library query path — including the halacha sub-query.

Bug: ``search_precedent_library_semantic`` and
``search_precedent_library_lexical`` filtered the *chunk* sub-query by
``cl.source_kind`` but NOT the *halacha* sub-query. So an external
(``source_kind='external_upload'``) search leaked internal-committee
halachot, and an internal search leaked external-ruling halachot — a
cross-corpus contamination of the rule-level results.

Fix: the same ``cl.source_kind = '<kind>'`` predicate that gates the
chunk query now also gates the halacha query, in BOTH functions.

This test runs fully OFFLINE — it monkeypatches ``db.get_pool`` with a
fake pool that captures every SQL string passed to ``fetch`` instead of
hitting Postgres. It asserts the captured halacha SQL carries the
source_kind predicate identical to the chunk SQL.
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import asyncio
|
|
|
|
import pytest
|
|
|
|
from legal_mcp.services import db
|
|
|
|
|
|
class _FakePool:
    """Stand-in for the database pool that records SQL instead of running it.

    Every ``fetch`` call appends its SQL text to ``queries`` (in call order)
    and returns an empty row list, so no database connection is needed.
    """

    def __init__(self) -> None:
        # SQL strings exactly as received by ``fetch``, oldest first.
        self.queries: list[str] = []

    async def fetch(self, sql: str, *args) -> list:  # noqa: ANN002
        """Record ``sql`` and return no rows.

        Positional bind parameters (``*args``) are accepted so the call
        signature matches the real pool's, but they are ignored.
        """
        self.queries.append(sql)
        return []
|
|
|
|
|
|
def _classify(queries: list[str]) -> tuple[str, str]:
    """Return ``(halacha_sql, chunk_sql)`` picked out of the captured queries.

    The halacha query is the first captured statement containing
    ``FROM halachot h``; the chunk query is the first one containing
    ``FROM precedent_chunks pc``.

    Raises:
        AssertionError: if either statement was never captured. This replaces
            the opaque ``StopIteration`` a bare ``next()`` would raise, so a
            failing test says which query is missing and what was captured.
    """
    halacha = next((q for q in queries if "FROM halachot h" in q), None)
    chunk = next((q for q in queries if "FROM precedent_chunks pc" in q), None)
    if halacha is None:
        raise AssertionError(
            f"no halacha query ('FROM halachot h') was captured; got {queries!r}"
        )
    if chunk is None:
        raise AssertionError(
            f"no chunk query ('FROM precedent_chunks pc') was captured; got {queries!r}"
        )
    return halacha, chunk
|
|
|
|
|
|
@pytest.fixture()
def fake_pool(monkeypatch: pytest.MonkeyPatch) -> _FakePool:
    """Swap ``db.get_pool`` for a coroutine returning a fresh ``_FakePool``.

    Returns the fake pool so the test can inspect ``pool.queries`` afterwards.
    The replacement is installed through ``monkeypatch``.
    """
    pool = _FakePool()

    # ``get_pool`` is replaced by an ``async def`` so that callers which
    # ``await db.get_pool()`` receive the fake pool.
    # NOTE(review): assumes the search functions look ``get_pool`` up on the
    # ``db`` module at call time — confirm against ``legal_mcp.services.db``.
    async def _get_pool() -> _FakePool:
        return pool

    monkeypatch.setattr(db, "get_pool", _get_pool)
    return pool
|
|
|
|
|
|
@pytest.mark.parametrize("source_kind", ["external_upload", "internal_committee"])
def test_semantic_halacha_query_is_source_kind_scoped(
    fake_pool: _FakePool, source_kind: str
) -> None:
    """Semantic search must scope the halacha SQL by ``source_kind``.

    Runs ``db.search_precedent_library_semantic`` against the fake pool and
    checks that both the chunk query and the halacha query contain the
    ``cl.source_kind = '<kind>'`` predicate for the requested corpus.
    """
    # Placeholder embedding: the fake pool never executes the SQL, so only
    # the generated query text matters here.
    asyncio.run(
        db.search_precedent_library_semantic(
            query_embedding=[0.0] * 8,
            source_kind=source_kind,
            include_halachot=True,
            limit=5,
        )
    )
    halacha_sql, chunk_sql = _classify(fake_pool.queries)
    predicate = f"cl.source_kind = '{source_kind}'"
    # Precondition first: if the chunk query is not scoped, the comparison
    # below would be meaningless.
    assert predicate in chunk_sql, "chunk query must be source_kind-scoped (precondition)"
    assert predicate in halacha_sql, (
        "halacha query MUST carry the same source_kind predicate as the "
        "chunk query — otherwise cross-corpus halacha leakage (GAP-10)"
    )
|
|
|
|
|
|
@pytest.mark.parametrize("source_kind", ["external_upload", "internal_committee"])
def test_lexical_halacha_query_is_source_kind_scoped(
    fake_pool: _FakePool, source_kind: str
) -> None:
    """Lexical search must scope the halacha SQL by ``source_kind``.

    Runs ``db.search_precedent_library_lexical`` against the fake pool and
    checks that both the chunk query and the halacha query contain the
    ``cl.source_kind = '<kind>'`` predicate for the requested corpus.
    """
    # The query text is arbitrary: the fake pool never executes the SQL, so
    # only the generated query text matters here.
    asyncio.run(
        db.search_precedent_library_lexical(
            query="zoning setback",
            source_kind=source_kind,
            include_halachot=True,
            limit=5,
        )
    )
    halacha_sql, chunk_sql = _classify(fake_pool.queries)
    predicate = f"cl.source_kind = '{source_kind}'"
    # Precondition first: if the chunk query is not scoped, the comparison
    # below would be meaningless.
    assert predicate in chunk_sql, "chunk query must be source_kind-scoped (precondition)"
    assert predicate in halacha_sql, (
        "halacha query MUST carry the same source_kind predicate as the "
        "chunk query — otherwise cross-corpus halacha leakage (GAP-10)"
    )
|