From 77817a46ad6804c792a5ddfd2e4ac47f514b711b Mon Sep 17 00:00:00 2001
From: Chaim <chaim@marcus-law.co.il>
Date: Mon, 15 Jun 2026 03:34:43 +0000
Subject: [PATCH] =?UTF-8?q?fix(metadata):=20=D7=9C=D7=90=20=D7=9C=D7=94?=
 =?UTF-8?q?=D7=AA=D7=99=D7=99=D7=A9=D7=91=20'completed'=20=D7=91=D7=9B?=
 =?UTF-8?q?=D7=A9=D7=9C-=D7=97=D7=99=D7=9C=D7=95=D7=A5-Gemini=20=D7=97?=
 =?UTF-8?q?=D7=95=D7=9C=D7=A3=20(#138)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

צד-המטא-דאטה (precedent_metadata_extractor) קרא רק GEMINI_API_KEY בעוד בסביבה
קיים GOOGLE_GEMINI_API_KEY — תוקן ב-PR #255 (fallback). הבאג המשני שנותר: כש-
extract_and_apply החזיר 'no_metadata' (כשל-Gemini), מסלול-הדריינר
process_pending_extractions התיישב ל-metadata_status='completed' ללא-תנאי, כך
שהרשומה ננטשה בשקט עם מטא ריק והדריינר לא חזר אליה (נצפה: da2d9ccb '4491-02-21',
5fabdac5 '14306-09-23' — completed אך court/date/summary ריקים).

תיקון (G1 — אבחנת-מקור):
- extract_and_apply מבדיל תוצאה-ריקה: יש full_text → 'extraction_failed' (חולף,
  בר-retry); אין full_text → 'no_metadata' (אין מה לחלץ).
- process_pending_extractions (metadata): 'extraction_failed' → חוזר ל-'pending'
  (משמר את חותם-התור) במקום להתיישב 'completed'. retry-loop הקיים מנסה שוב,
  ואחרי-מיצוי הרשומה נשארת בתור. מסלול reextract_metadata כבר עקבי (חוזר pending
  על כל מה שאינו completed/no_changes).

תיקון-נתון (בוצע ידנית דרך כלי-MCP precedent_extract_metadata): da2d9ccb +
5fabdac5 חולצו-מחדש בהצלחה (court/date/summary/headnote/tags מלאים). 0 נותרו
external 'completed-but-empty'.

הערה: מפתח-Gemini אינו נדרש ב-Coolify — המחלץ רץ רק מקומית (precedent_library →
extract_and_apply, host ~/.env עם GOOGLE_GEMINI_API_KEY); app.py מייבא רק את
הקבוע PLACEHOLDER_PENDING_EXTRACTION, לא את פונקציית-החילוץ.

בדיקות: test_metadata_extract_failure_status (transient/permanent/missing). כל
335 בדיקות mcp עוברות. guards נקיים.

Invariants: G1 (אבחנת-מקור, לא התיישבות-בקריאה), INV-G3/X16 (עמידות — בר-retry),
G12 (leak-guard נקי).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .../legal_mcp/services/precedent_library.py   | 12 +++-
 .../services/precedent_metadata_extractor.py  | 15 ++++-
 .../test_metadata_extract_failure_status.py   | 63 +++++++++++++++++++
 3 files changed, 86 insertions(+), 4 deletions(-)
 create mode 100644 mcp-server/tests/test_metadata_extract_failure_status.py

diff --git a/mcp-server/src/legal_mcp/services/precedent_library.py b/mcp-server/src/legal_mcp/services/precedent_library.py
index 218c89d..1aa8012 100644
--- a/mcp-server/src/legal_mcp/services/precedent_library.py
+++ b/mcp-server/src/legal_mcp/services/precedent_library.py
@@ -291,10 +291,16 @@ async def process_pending_extractions(kind: str = "metadata", limit: int = 20) -
                 if result.get("status") == "extraction_failed":
                     await db.set_case_law_halacha_status(cid, "failed")
                 await db.clear_extraction_request(cid, kind=kind)
+            elif result.get("status") == "extraction_failed":
+                # metadata transient failure (Gemini hiccup despite full text) —
+                # do NOT settle 'completed' or the row is silently stranded with
+                # empty metadata and the drain never revisits it (#138). Revert
+                # to 'pending' (the queue timestamp is preserved) so it re-drains.
+                await db.set_case_law_metadata_status(cid, "pending")
             else:
-                # metadata — set terminal 'completed' status (also clears the
-                # request timestamp) so the UI badge settles instead of
-                # lingering on 'processing'.
+                # metadata success / no_changes / no_metadata(no text) — set
+                # terminal 'completed' (also clears the request timestamp) so the
+                # UI badge settles instead of lingering on 'processing'.
                 await db.set_case_law_metadata_status(cid, "completed")
             processed += 1
             results.append({
diff --git a/mcp-server/src/legal_mcp/services/precedent_metadata_extractor.py b/mcp-server/src/legal_mcp/services/precedent_metadata_extractor.py
index 1a689d5..32f37e9 100644
--- a/mcp-server/src/legal_mcp/services/precedent_metadata_extractor.py
+++ b/mcp-server/src/legal_mcp/services/precedent_metadata_extractor.py
@@ -428,7 +428,20 @@ async def extract_and_apply(
     """Convenience wrapper: extract → merge into row → return summary."""
     suggested = await extract_metadata(case_law_id)
     if not suggested:
-        return {"status": "no_metadata", "fields": []}
+        # Empty result has two very different meanings (#138): the precedent has
+        # NO text to extract from (permanent — nothing the queue can ever do), vs
+        # the Gemini call FAILED despite the row having full text (transient — a
+        # key/network/rate-limit hiccup that a retry can recover). Conflating
+        # them as 'no_metadata' let the drain settle the row to 'completed' on a
+        # transient failure, silently stranding it with empty metadata. Branch on
+        # whether text was actually present so the caller can retry the transient
+        # case and only settle the genuinely-empty one.
+        record = await db.get_case_law(case_law_id)
+        has_text = bool(((record or {}).get("full_text") or "").strip())
+        return {
+            "status": "extraction_failed" if has_text else "no_metadata",
+            "fields": [],
+        }
     result = await apply_to_record(case_law_id, suggested, overwrite_case_number=overwrite_case_number)
     if result["updated"]:
         await db.recompute_searchable(case_law_id)
diff --git a/mcp-server/tests/test_metadata_extract_failure_status.py b/mcp-server/tests/test_metadata_extract_failure_status.py
new file mode 100644
index 0000000..4833293
--- /dev/null
+++ b/mcp-server/tests/test_metadata_extract_failure_status.py
@@ -0,0 +1,63 @@
+"""Regression test for #138 — metadata extraction must distinguish a transient
+failure (Gemini hiccup despite the row having text) from a permanent empty
+(no text to extract). Conflating them as 'no_metadata' let the drain settle the
+row to 'completed' and silently strand it with empty metadata.
+
+``extract_and_apply`` returns:
+  * ``extraction_failed`` when ``extract_metadata`` yields nothing BUT the row
+    has full_text → retryable.
+  * ``no_metadata`` when the row has no text → genuinely nothing to do.
+
+Runs fully OFFLINE — monkeypatches ``extract_metadata`` and ``db.get_case_law``.
+"""
+
+from __future__ import annotations
+
+import asyncio
+from uuid import uuid4
+
+import pytest
+
+from legal_mcp.services import db, precedent_metadata_extractor as pme
+
+
+def _run(coro):
+    loop = asyncio.new_event_loop()
+    try:
+        return loop.run_until_complete(coro)
+    finally:
+        loop.close()
+
+
+@pytest.fixture()
+def empty_extract(monkeypatch: pytest.MonkeyPatch):
+    async def _empty(_cid):
+        return {}
+    monkeypatch.setattr(pme, "extract_metadata", _empty)
+
+
+def test_empty_result_with_text_is_transient_failure(empty_extract, monkeypatch):
+    async def _rec(_cid):
+        return {"full_text": "פסק דין ארוך עם תוכן ממשי לחילוץ"}
+    monkeypatch.setattr(db, "get_case_law", _rec)
+
+    out = _run(pme.extract_and_apply(uuid4()))
+    assert out["status"] == "extraction_failed", out
+
+
+def test_empty_result_without_text_is_no_metadata(empty_extract, monkeypatch):
+    async def _rec(_cid):
+        return {"full_text": ""}
+    monkeypatch.setattr(db, "get_case_law", _rec)
+
+    out = _run(pme.extract_and_apply(uuid4()))
+    assert out["status"] == "no_metadata", out
+
+
+def test_missing_record_is_no_metadata(empty_extract, monkeypatch):
+    async def _none(_cid):
+        return None
+    monkeypatch.setattr(db, "get_case_law", _none)
+
+    out = _run(pme.extract_and_apply(uuid4()))
+    assert out["status"] == "no_metadata", out
-- 
2.49.1