# legal-ai/mcp-server/tests/test_metadata_extract_failure_status.py

"""Regression test for #138 — metadata extraction must distinguish a transient
failure (a Gemini hiccup even though the row has text) from a permanently empty
result (no text to extract). Conflating both as 'no_metadata' let the drain
settle the row to 'completed' and silently strand it with empty metadata.

``extract_and_apply`` returns:
  * ``extraction_failed`` when ``extract_metadata`` yields nothing BUT the row
    has full_text → retryable.
  * ``no_metadata`` when the row has no text → genuinely nothing to do.

Runs fully OFFLINE — monkeypatches ``extract_metadata`` and ``db.get_case_law``.
"""

from __future__ import annotations

import asyncio
from uuid import uuid4

import pytest

from legal_mcp.services import db, precedent_metadata_extractor as pme


def _run(coro):
    """Drive *coro* to completion on a private event loop and return its result.

    A fresh loop is created for every call and is closed afterwards, even when
    the coroutine raises.
    """
    event_loop = asyncio.new_event_loop()
    try:
        result = event_loop.run_until_complete(coro)
    finally:
        # Always release the loop so repeated calls do not accumulate open loops.
        event_loop.close()
    return result


@pytest.fixture()
def empty_extract(monkeypatch: pytest.MonkeyPatch):
    """Swap ``pme.extract_metadata`` for an async stub that always returns ``{}``.

    The patch is installed through ``monkeypatch``, so it is undone when the
    requesting test finishes.
    """

    async def _yield_nothing(_cid):
        # The argument is ignored; every call produces an empty result.
        return {}

    monkeypatch.setattr(pme, "extract_metadata", _yield_nothing)


def test_empty_result_with_text_is_transient_failure(empty_extract, monkeypatch):
    """Empty extraction for a record that has ``full_text`` → ``extraction_failed``."""

    async def _row_with_text(_cid):
        # Stubbed record whose full_text is non-empty.
        return {"full_text": "פסק דין ארוך עם תוכן ממשי לחילוץ"}

    monkeypatch.setattr(db, "get_case_law", _row_with_text)

    result = _run(pme.extract_and_apply(uuid4()))
    status = result["status"]
    assert status == "extraction_failed", result


def test_empty_result_without_text_is_no_metadata(empty_extract, monkeypatch):
    """Empty extraction for a record with empty ``full_text`` → ``no_metadata``."""

    async def _row_without_text(_cid):
        # Stubbed record whose full_text is the empty string.
        return {"full_text": ""}

    monkeypatch.setattr(db, "get_case_law", _row_without_text)

    result = _run(pme.extract_and_apply(uuid4()))
    status = result["status"]
    assert status == "no_metadata", result


def test_missing_record_is_no_metadata(empty_extract, monkeypatch):
    """Empty extraction when ``db.get_case_law`` returns ``None`` → ``no_metadata``."""

    async def _no_such_row(_cid):
        # Stubbed lookup that finds no record at all.
        return None

    monkeypatch.setattr(db, "get_case_law", _no_such_row)

    result = _run(pme.extract_and_apply(uuid4()))
    status = result["status"]
    assert status == "no_metadata", result