diff --git a/mcp-server/src/legal_mcp/services/precedent_library.py b/mcp-server/src/legal_mcp/services/precedent_library.py index 218c89d..1aa8012 100644 --- a/mcp-server/src/legal_mcp/services/precedent_library.py +++ b/mcp-server/src/legal_mcp/services/precedent_library.py @@ -291,10 +291,16 @@ async def process_pending_extractions(kind: str = "metadata", limit: int = 20) - if result.get("status") == "extraction_failed": await db.set_case_law_halacha_status(cid, "failed") await db.clear_extraction_request(cid, kind=kind) + elif result.get("status") == "extraction_failed": + # metadata transient failure (Gemini hiccup despite full text) — + # do NOT settle 'completed' or the row is silently stranded with + # empty metadata and the drain never revisits it (#138). Revert + # to 'pending' (the queue timestamp is preserved) so it re-drains. + await db.set_case_law_metadata_status(cid, "pending") else: - # metadata — set terminal 'completed' status (also clears the - # request timestamp) so the UI badge settles instead of - # lingering on 'processing'. + # metadata success / no_changes / no_metadata(no text) — set + # terminal 'completed' (also clears the request timestamp) so the + # UI badge settles instead of lingering on 'processing'. await db.set_case_law_metadata_status(cid, "completed") processed += 1 results.append({ diff --git a/mcp-server/src/legal_mcp/services/precedent_metadata_extractor.py b/mcp-server/src/legal_mcp/services/precedent_metadata_extractor.py index 1a689d5..32f37e9 100644 --- a/mcp-server/src/legal_mcp/services/precedent_metadata_extractor.py +++ b/mcp-server/src/legal_mcp/services/precedent_metadata_extractor.py @@ -428,7 +428,20 @@ async def extract_and_apply( """Convenience wrapper: extract → merge into row → return summary.""" suggested = await extract_metadata(case_law_id) if not suggested: - return {"status": "no_metadata", "fields": []} + # Empty result has two very different meanings (#138): the precedent has + # NO text to extract from (permanent — nothing the queue can ever do), vs + # the Gemini call FAILED despite the row having full text (transient — a + # key/network/rate-limit hiccup that a retry can recover). Conflating + # them as 'no_metadata' let the drain settle the row to 'completed' on a + # transient failure, silently stranding it with empty metadata. Branch on + # whether text was actually present so the caller can retry the transient + # case and only settle the genuinely-empty one. + record = await db.get_case_law(case_law_id) + has_text = bool(((record or {}).get("full_text") or "").strip()) + return { + "status": "extraction_failed" if has_text else "no_metadata", + "fields": [], + } result = await apply_to_record(case_law_id, suggested, overwrite_case_number=overwrite_case_number) if result["updated"]: await db.recompute_searchable(case_law_id) diff --git a/mcp-server/tests/test_metadata_extract_failure_status.py b/mcp-server/tests/test_metadata_extract_failure_status.py new file mode 100644 index 0000000..4833293 --- /dev/null +++ b/mcp-server/tests/test_metadata_extract_failure_status.py @@ -0,0 +1,63 @@ +"""Regression test for #138 — metadata extraction must distinguish a transient +failure (Gemini hiccup despite the row having text) from a permanent empty +(no text to extract). Conflating them as 'no_metadata' let the drain settle the +row to 'completed' and silently strand it with empty metadata. + +``extract_and_apply`` returns: + * ``extraction_failed`` when ``extract_metadata`` yields nothing BUT the row + has full_text → retryable. + * ``no_metadata`` when the row has no text → genuinely nothing to do. + +Runs fully OFFLINE — monkeypatches ``extract_metadata`` and ``db.get_case_law``. +""" + +from __future__ import annotations + +import asyncio +from uuid import uuid4 + +import pytest + +from legal_mcp.services import db, precedent_metadata_extractor as pme + + +def _run(coro): + loop = asyncio.new_event_loop() + try: + return loop.run_until_complete(coro) + finally: + loop.close() + + +@pytest.fixture() +def empty_extract(monkeypatch: pytest.MonkeyPatch): + async def _empty(_cid): + return {} + monkeypatch.setattr(pme, "extract_metadata", _empty) + + +def test_empty_result_with_text_is_transient_failure(empty_extract, monkeypatch): + async def _rec(_cid): + return {"full_text": "פסק דין ארוך עם תוכן ממשי לחילוץ"} + monkeypatch.setattr(db, "get_case_law", _rec) + + out = _run(pme.extract_and_apply(uuid4())) + assert out["status"] == "extraction_failed", out + + +def test_empty_result_without_text_is_no_metadata(empty_extract, monkeypatch): + async def _rec(_cid): + return {"full_text": ""} + monkeypatch.setattr(db, "get_case_law", _rec) + + out = _run(pme.extract_and_apply(uuid4())) + assert out["status"] == "no_metadata", out + + +def test_missing_record_is_no_metadata(empty_extract, monkeypatch): + async def _none(_cid): + return None + monkeypatch.setattr(db, "get_case_law", _none) + + out = _run(pme.extract_and_apply(uuid4())) + assert out["status"] == "no_metadata", out