From 1af689a969918b2ba38812c5703dc4f442b3d9c4 Mon Sep 17 00:00:00 2001 From: Chaim Date: Sat, 30 May 2026 17:46:59 +0000 Subject: [PATCH] =?UTF-8?q?fix(retrieval):=20enforce=20source=5Fkind=20on?= =?UTF-8?q?=20halacha=5Ffilters=20=E2=80=94=20close=20cross-corpus=20leak?= =?UTF-8?q?=20(GAP-10,=20INV-RET1)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: Claude Opus 4.8 --- mcp-server/src/legal_mcp/services/db.py | 10 +- .../tests/test_precedent_corpus_isolation.py | 97 +++++++++++++++++++ 2 files changed, 105 insertions(+), 2 deletions(-) create mode 100644 mcp-server/tests/test_precedent_corpus_isolation.py diff --git a/mcp-server/src/legal_mcp/services/db.py b/mcp-server/src/legal_mcp/services/db.py index a3f22ae..540ff1a 100644 --- a/mcp-server/src/legal_mcp/services/db.py +++ b/mcp-server/src/legal_mcp/services/db.py @@ -3165,7 +3165,10 @@ async def search_precedent_library_semantic( of halacha review status. """ pool = await get_pool() - halacha_filters = ["h.review_status IN ('approved', 'published')"] + halacha_filters = [ + "h.review_status IN ('approved', 'published')", + f"cl.source_kind = '{source_kind}'", + ] chunk_filters = [f"cl.source_kind = '{source_kind}'"] h_params: list = [query_embedding, limit] c_params: list = [query_embedding, limit] @@ -3398,7 +3401,10 @@ async def search_precedent_library_lexical( return [] pool = await get_pool() - halacha_filters = ["h.review_status IN ('approved', 'published')"] + halacha_filters = [ + "h.review_status IN ('approved', 'published')", + f"cl.source_kind = '{source_kind}'", + ] chunk_filters = [f"cl.source_kind = '{source_kind}'"] # $1 = query, $2 = limit. Filters append starting at $3. 
h_params: list = [query, limit] diff --git a/mcp-server/tests/test_precedent_corpus_isolation.py b/mcp-server/tests/test_precedent_corpus_isolation.py new file mode 100644 index 0000000..5d3d809 --- /dev/null +++ b/mcp-server/tests/test_precedent_corpus_isolation.py @@ -0,0 +1,97 @@ +"""Regression test for GAP-10 / INV-RET1: corpus separation enforced on +EVERY precedent-library query path — including the halacha sub-query. + +Bug: ``search_precedent_library_semantic`` and +``search_precedent_library_lexical`` filtered the *chunk* sub-query by +``cl.source_kind`` but NOT the *halacha* sub-query. So an external +(``source_kind='external_upload'``) search leaked internal-committee +halachot, and an internal search leaked external-ruling halachot — a +cross-corpus contamination of the rule-level results. + +Fix: the same ``cl.source_kind = '{source_kind}'`` predicate that gates the +chunk query now also gates the halacha query, in BOTH functions. + +This test runs fully OFFLINE — it monkeypatches ``db.get_pool`` with a +fake pool that captures every SQL string passed to ``fetch`` instead of +hitting Postgres. It asserts the captured halacha SQL carries the +source_kind predicate identical to the chunk SQL. 
+""" + +from __future__ import annotations + +import asyncio + +import pytest + +from legal_mcp.services import db + + +class _FakePool: + """Captures SQL passed to ``fetch``; returns no rows.""" + + def __init__(self) -> None: + self.queries: list[str] = [] + + async def fetch(self, sql: str, *args) -> list: # noqa: ANN002 + self.queries.append(sql) + return [] + + +def _classify(queries: list[str]) -> tuple[str, str]: + """Return (halacha_sql, chunk_sql) from the captured queries.""" + halacha = next(q for q in queries if "FROM halachot h" in q) + chunk = next(q for q in queries if "FROM precedent_chunks pc" in q) + return halacha, chunk + + +@pytest.fixture() +def fake_pool(monkeypatch: pytest.MonkeyPatch) -> _FakePool: + pool = _FakePool() + + async def _get_pool() -> _FakePool: + return pool + + monkeypatch.setattr(db, "get_pool", _get_pool) + return pool + + +@pytest.mark.parametrize("source_kind", ["external_upload", "internal_committee"]) +def test_semantic_halacha_query_is_source_kind_scoped( + fake_pool: _FakePool, source_kind: str +) -> None: + asyncio.run( + db.search_precedent_library_semantic( + query_embedding=[0.0] * 8, + source_kind=source_kind, + include_halachot=True, + limit=5, + ) + ) + halacha_sql, chunk_sql = _classify(fake_pool.queries) + predicate = f"cl.source_kind = '{source_kind}'" + assert predicate in chunk_sql, "chunk query must be source_kind-scoped (precondition)" + assert predicate in halacha_sql, ( + "halacha query MUST carry the same source_kind predicate as the " + "chunk query — otherwise cross-corpus halacha leakage (GAP-10)" + ) + + +@pytest.mark.parametrize("source_kind", ["external_upload", "internal_committee"]) +def test_lexical_halacha_query_is_source_kind_scoped( + fake_pool: _FakePool, source_kind: str +) -> None: + asyncio.run( + db.search_precedent_library_lexical( + query="zoning setback", + source_kind=source_kind, + include_halachot=True, + limit=5, + ) + ) + halacha_sql, chunk_sql = _classify(fake_pool.queries) + 
predicate = f"cl.source_kind = '{source_kind}'" + assert predicate in chunk_sql, "chunk query must be source_kind-scoped (precondition)" + assert predicate in halacha_sql, ( + "halacha query MUST carry the same source_kind predicate as the " + "chunk query — otherwise cross-corpus halacha leakage (GAP-10)" + )