diff --git a/mcp-server/src/legal_mcp/tools/search.py b/mcp-server/src/legal_mcp/tools/search.py index 0288cbb..f1390ce 100644 --- a/mcp-server/src/legal_mcp/tools/search.py +++ b/mcp-server/src/legal_mcp/tools/search.py @@ -7,7 +7,7 @@ import logging import time from uuid import UUID -from legal_mcp.services import db, embeddings, hybrid_search, telemetry +from legal_mcp.services import db, embeddings, hybrid_search, practice_area as pa, telemetry logger = logging.getLogger(__name__) @@ -30,7 +30,9 @@ async def search_decisions( appeal_subtype: סוג ערר לסינון (building_permit/betterment_levy/compensation_197) case_number: אם סופק, ה-practice_area/subtype יוסקו אוטומטית מהתיק """ - # Auto-resolve practice_area from case_number if available + # Auto-resolve practice_area from case_number if available (GAP-12 / INV-RET1): + # explicit practice_area wins; otherwise derive from the case so the search is + # scoped to the case's legal domain. Case-less search stays cross-domain. resolved_case_id: UUID | None = None if case_number and not practice_area: case = await db.get_case_by_number(case_number) @@ -42,6 +44,22 @@ async def search_decisions( except (KeyError, ValueError, TypeError): resolved_case_id = None + # Case row had no practice_area — fall back to deriving from the + # case-number prefix (1xxx/8xxx/9xxx). Returns "" for unknown prefixes. + if not practice_area: + practice_area = pa.derive_domain_practice_area(case_number) + + # Still undeterminable: a case is present but we cannot scope the + # search to its domain. This is a data anomaly — BLOCK rather than + # silently running a cross-domain search for a specific case. + if not practice_area: + return ( + f"שגיאה: לא ניתן לקבוע את התחום המשפטי (practice_area) של תיק " + f"{case_number}. לתיק אין practice_area מוגדר ולא ניתן להסיק אותו " + f"ממספר התיק. זוהי אנומליית נתונים — נא להגדיר את ה-practice_area " + f"של התיק (למשל דרך case_update) לפני הרצת חיפוש מסונן לתיק זה." + ) + if not practice_area: logger.warning( "search_decisions called without practice_area filter — " diff --git a/mcp-server/tests/test_search_domain_scope.py b/mcp-server/tests/test_search_domain_scope.py new file mode 100644 index 0000000..66968f7 --- /dev/null +++ b/mcp-server/tests/test_search_domain_scope.py @@ -0,0 +1,144 @@ +"""Domain-scope tests for search_decisions (GAP-12 / INV-RET1). + +Policy under test (see CLAUDE.md + tools/search.py): + 1. explicit practice_area -> used as-is, search runs; + 2. case_number + case.practice_area set -> use case value, runs; + 3. case_number + empty case.practice_area but derivable prefix (8xxx) + -> derive domain, runs; + 4. case_number present but UNDETERMINABLE (case.practice_area empty AND + prefix not 1/8/9) -> BLOCK (return Hebrew error, hybrid search + NEVER called); + 5. no case_number, no practice_area -> warn + proceed (runs). + +All DB / embedding / hybrid-search / telemetry calls are monkeypatched so +the test runs fully offline with no live Postgres or model. +""" + +from __future__ import annotations + +import asyncio +import json +from uuid import uuid4 + +import pytest + +from legal_mcp.services import db, embeddings, hybrid_search, telemetry +from legal_mcp.tools import search as search_tool + + +def _run(coro): + return asyncio.run(coro) + + +@pytest.fixture() +def patched(monkeypatch: pytest.MonkeyPatch) -> dict: + """Patch all I/O boundaries. Record what hybrid_search received. + + ``calls["hybrid"]`` is appended to ONLY when the real search runs, so + asserting ``calls["hybrid"] == []`` proves the search was blocked. + """ + calls: dict = {"hybrid": [], "cases": {}} + + async def _embed_query(query: str): + return [0.0] * 8 + + async def _search_documents_hybrid(**kwargs): + calls["hybrid"].append(kwargs) + # one synthetic hit so the formatting path is exercised + return [ + { + "score": 0.9, + "case_number": "X", + "document_title": "doc", + "section_type": "facts", + "page_number": 1, + "content": "hit", + "match_type": "text", + "image_thumbnail_path": None, + } + ] + + async def _get_case_by_number(case_number: str): + return calls["cases"].get(case_number) + + def _log_search_bg(**kwargs): + return None + + monkeypatch.setattr(embeddings, "embed_query", _embed_query) + monkeypatch.setattr( + hybrid_search, "search_documents_hybrid", _search_documents_hybrid + ) + monkeypatch.setattr(db, "get_case_by_number", _get_case_by_number) + monkeypatch.setattr(telemetry, "log_search_bg", _log_search_bg) + return calls + + +def test_explicit_practice_area_used(patched: dict) -> None: + out = _run( + search_tool.search_decisions( + query="זכויות בנייה", practice_area="betterment_levy" + ) + ) + assert len(patched["hybrid"]) == 1 + assert patched["hybrid"][0]["practice_area"] == "betterment_levy" + # explicit value must not trigger a case lookup + assert patched["cases"] == {} + # ran -> JSON result, not an error string + assert json.loads(out)[0]["content"] == "hit" + + +def test_case_practice_area_used(patched: dict) -> None: + patched["cases"]["8126/25"] = { + "id": str(uuid4()), + "practice_area": "betterment_levy", + "appeal_subtype": "betterment_levy", + } + out = _run( + search_tool.search_decisions(query="היטל", case_number="8126/25") + ) + assert len(patched["hybrid"]) == 1 + assert patched["hybrid"][0]["practice_area"] == "betterment_levy" + assert json.loads(out)[0]["content"] == "hit" + + +def test_case_empty_practice_area_derived_from_prefix(patched: dict) -> None: + # case row exists but practice_area is empty -> derive from 8xxx prefix + patched["cases"]["8126/25"] = { + "id": str(uuid4()), + "practice_area": "", + "appeal_subtype": "", + } + out = _run( + search_tool.search_decisions(query="היטל", case_number="8126/25") + ) + assert len(patched["hybrid"]) == 1 + assert patched["hybrid"][0]["practice_area"] == "betterment_levy" + assert json.loads(out)[0]["content"] == "hit" + + +def test_case_undeterminable_is_blocked(patched: dict) -> None: + # case exists, empty practice_area, and prefix is NOT 1/8/9 -> block + patched["cases"]["7777/25"] = { + "id": str(uuid4()), + "practice_area": "", + "appeal_subtype": "", + } + out = _run( + search_tool.search_decisions(query="משהו", case_number="7777/25") + ) + # hybrid search must NOT have been called + assert patched["hybrid"] == [] + # returns a Hebrew error string, not JSON + assert out.startswith("שגיאה") + assert "7777/25" in out + with pytest.raises(json.JSONDecodeError): + json.loads(out) + + +def test_no_case_no_practice_area_proceeds(patched: dict) -> None: + # exploratory / chat search: cross-domain, must NOT be blocked + out = _run(search_tool.search_decisions(query="חיפוש חופשי")) + assert len(patched["hybrid"]) == 1 + assert patched["hybrid"][0]["practice_area"] is None + assert patched["cases"] == {} + assert json.loads(out)[0]["content"] == "hit"