"""Regression tests for Stage-A corpus integrity fixes (TaskMaster #30, #31). These tests document the bugs that were closed in Stage A so they don't regress quietly. Each test maps to a real bug or constraint: 1. DB CHECK ``cases_practice_area_check`` rejects the legacy ``'appeals_committee'`` value — only domain values (rishuy_uvniya / betterment_levy / compensation_197) and ``''`` are allowed. (Bug: many ``cases`` rows stored ``'appeals_committee'`` instead of the domain.) 2. DB CHECK ``case_law_internal_chair_check`` and ``case_law_internal_district_check`` reject internal_committee rows with empty chair_name/district. (Bug: 6 records had source_kind='external_upload' but were really internal committee decisions; the flip to internal_committee in Stage A.2 surfaced the missing chair/district fields.) 3. DB CHECK ``case_law_external_arar_check`` rejects external_upload rows whose case_number starts with ``"ערר"`` or ``"בל\\"מ"`` — committee decisions must go through internal_decision_upload, not precedent_library_upload. (Bug: the legacy upload path stored everything as external_upload, including appeal-committee decisions; the citation guard now redirects them.) 4. MCP tool ``precedent_library_upload`` returns an ``_err`` envelope when the citation starts with ``"ערר"`` (citation guard, not DB constraint — fires before INSERT to surface a helpful error). These tests connect to the live local Postgres (port 5433) — they do not mock asyncpg. Run with:: pytest mcp-server/tests/test_corpus_constraints.py -v If you don't have ``DATABASE_URL`` set, the tests are skipped. 
""" from __future__ import annotations import asyncio import json import os from uuid import uuid4 import asyncpg import pytest def _dsn() -> str | None: return ( os.environ.get("DATABASE_URL") or os.environ.get("LEGAL_AI_DATABASE_URL") or "postgresql://legal_ai:od0ASJZFYibOlWK59krLvvETmgqwlXe8@localhost:5433/legal_ai" ) @pytest.fixture() def dsn() -> str: d = _dsn() if not d: pytest.skip("No DATABASE_URL set; skipping live-DB regression tests") return d @pytest.fixture() def event_loop(): """Provide a fresh event loop per test so asyncpg doesn't leak across cases.""" loop = asyncio.new_event_loop() try: yield loop finally: loop.close() def _run(loop, coro): return loop.run_until_complete(coro) # ── 1. cases.practice_area CHECK ───────────────────────────────────── def test_cases_rejects_appeals_committee_practice_area(dsn: str, event_loop) -> None: """``cases.practice_area = 'appeals_committee'`` must violate the CHECK.""" async def attempt() -> None: conn = await asyncpg.connect(dsn) try: with pytest.raises(asyncpg.exceptions.CheckViolationError): await conn.execute( """INSERT INTO cases (id, case_number, title, practice_area) VALUES ($1, $2, $3, $4)""", uuid4(), f"TEST-{uuid4().hex[:8]}", "regression-test", "appeals_committee", ) finally: await conn.close() _run(event_loop, attempt()) def test_cases_accepts_domain_practice_area(dsn: str, event_loop) -> None: """Sanity check: rishuy_uvniya / betterment_levy / compensation_197 + empty string must be accepted.""" async def attempt() -> None: conn = await asyncpg.connect(dsn) try: tx = conn.transaction() await tx.start() try: for value in ("rishuy_uvniya", "betterment_levy", "compensation_197", ""): await conn.execute( """INSERT INTO cases (id, case_number, title, practice_area) VALUES ($1, $2, $3, $4)""", uuid4(), f"TEST-{uuid4().hex[:8]}", f"regression-{value or 'empty'}", value, ) finally: await tx.rollback() finally: await conn.close() _run(event_loop, attempt()) # ── 2. 
# case_law internal_committee chair/district CHECK ─────────────


def test_case_law_internal_requires_chair_and_district(dsn: str, event_loop) -> None:
    """``case_law`` rows with ``source_kind='internal_committee'`` must have
    non-empty ``chair_name`` AND ``district``."""

    async def attempt(district: str, chair_name: str, case_name: str) -> None:
        """Insert one internal_committee row and expect a CHECK violation."""
        conn = await asyncpg.connect(dsn)
        try:
            # Always roll back: if the CHECK ever regresses, the INSERT
            # succeeds, and without the rollback the test row would be
            # committed to the live database.
            tx = conn.transaction()
            await tx.start()
            try:
                with pytest.raises(asyncpg.exceptions.CheckViolationError):
                    await conn.execute(
                        """INSERT INTO case_law
                               (id, case_number, case_name, source_kind,
                                district, chair_name)
                           VALUES ($1, $2, $3, $4, $5, $6)""",
                        uuid4(),
                        f"ערר {uuid4().hex[:6]}",
                        case_name,
                        "internal_committee",
                        district,
                        chair_name,
                    )
            finally:
                await tx.rollback()
        finally:
            await conn.close()

    # District present, chair missing.
    _run(event_loop, attempt("ירושלים", "", "test internal w/o chair"))
    # Chair present, district missing.
    _run(event_loop, attempt("", "עו\"ד דפנה תמיר", "test internal w/o district"))


# ── 3.
# case_law external_upload + ערר citation CHECK ────────────────


def test_case_law_external_upload_rejects_arar_citation(dsn: str, event_loop) -> None:
    """``case_law`` rows with ``source_kind='external_upload'`` cannot have a
    ``case_number`` that starts with ``"ערר"`` or ``"בל\"מ"`` — those are
    committee decisions and must use ``source_kind='internal_committee'``."""

    async def attempt(case_number: str, case_name: str) -> None:
        """Insert one external_upload row and expect a CHECK violation."""
        conn = await asyncpg.connect(dsn)
        try:
            # Always roll back: if the CHECK ever regresses, the INSERT
            # succeeds, and without the rollback the test row would be
            # committed to the live database.
            tx = conn.transaction()
            await tx.start()
            try:
                with pytest.raises(asyncpg.exceptions.CheckViolationError):
                    await conn.execute(
                        """INSERT INTO case_law (id, case_number, case_name, source_kind)
                           VALUES ($1, $2, $3, $4)""",
                        uuid4(),
                        case_number,
                        case_name,
                        "external_upload",
                    )
            finally:
                await tx.rollback()
        finally:
            await conn.close()

    _run(event_loop, attempt("ערר 1170/24 חיים נ' ועדה", "test external arar"))
    _run(event_loop, attempt('בל"מ 1234/25 פלוני', "test external balam"))


# ── 4. MCP precedent_library_upload citation guard ──────────────────


def test_mcp_precedent_upload_rejects_arar_citation() -> None:
    """The MCP tool ``precedent_library_upload`` must short-circuit citations
    that start with ``"ערר"`` / ``"בל\"מ"`` and return an ``_err`` envelope
    (a helpful message redirecting to ``internal_decision_upload``), without
    touching the DB."""
    from legal_mcp.tools import precedent_library as tools

    async def call(citation: str) -> dict:
        # file_path won't be touched because the guard fires first.
        return json.loads(
            await tools.precedent_library_upload(
                file_path="/nonexistent",
                citation=citation,
            )
        )

    # This test takes no fixtures, so it owns its loop and closes it itself.
    loop = asyncio.new_event_loop()
    try:
        for citation in (
            "ערר 1170/24 חיים נ' ועדה",
            'בל"מ 1234/25 פלוני',
            "ARAR 8126-25 ב. קרן-נכסים",
        ):
            result = loop.run_until_complete(call(citation))
            assert "error" in result, (
                f"expected guard to reject {citation!r}, got {result!r}"
            )
            # The error message should mention internal_decision_upload so
            # the caller knows the alternative path.
            assert "internal_decision_upload" in result["error"], (
                f"error message should redirect to internal_decision_upload, "
                f"got {result['error']!r}"
            )
    finally:
        loop.close()


def test_practice_area_module_invariants() -> None:
    """Quick guard that the ``practice_area`` service module exposes the
    helpers tools and tests depend on, and that derivation is consistent with
    the case-number convention (1xxx/8xxx/9xxx)."""
    from legal_mcp.services import practice_area as pa

    # Domain mapping is consistent with the case-number prefix convention.
    assert pa.derive_domain_practice_area("1170") == "rishuy_uvniya"
    assert pa.derive_domain_practice_area("8126/25") == "betterment_levy"
    assert pa.derive_domain_practice_area("9001") == "compensation_197"
    assert pa.derive_domain_practice_area("ARAR-25-8126") == "betterment_levy"

    # Unparseable input → empty (caller decides fallback).
    assert pa.derive_domain_practice_area("foo") == ""
    assert pa.derive_domain_practice_area("") == ""

    # Empty practice_area is valid (DB allows it as 'unclassified').
    # These calls pass as long as validate() does not raise.
    pa.validate("", "unknown")
    pa.validate("rishuy_uvniya", "building_permit")
    pa.validate("betterment_levy", "betterment_levy")

    # appeals_committee (axis A) is still recognised for backward-compat.
    pa.validate("appeals_committee", "building_permit")

    # is_override returns False when subtype matches derivation.
    assert pa.is_override("1170", "rishuy_uvniya", "building_permit") is False
    assert pa.is_override("8126", "betterment_levy", "betterment_levy") is False