feat(extraction): precedent metadata via Gemini Flash + scheduled drainer

The /precedents metadata queue was stuck — 24 rows requested, nothing draining them — and the agentic claude CLI hit error_max_turns on what is a single structured text→JSON task (slow + flaky). Metadata extraction is bounded extraction, the wrong fit for an agentic loop.

- gemini_session.py: query_json drop-in (gemini-2.5-flash, JSON mode, httpx — no new SDK dep). Reads GEMINI_API_KEY (~/.env; SoT Infisical nautilus:/external-apis/gemini). Host-side only — no LLM from the container.
- precedent_metadata_extractor: claude_session.query_json → gemini_session. Validated live: rich, accurate fields (case_name/summary/appeal_subtype/tags).
- process_pending_extractions: kind-aware cooldown — metadata 2s (Gemini, fast), halacha keeps 30s (Claude rate limits).
- drain_metadata_queue.py + legal-metadata-drain.config.cjs (pm2 cron */15) so the queue never clogs again. SCRIPTS.md.
- X8 INV-FP5 updated: per-task engine choice (Gemini=bounded metadata, claude_session=agentic halacha), both host-side, single canonical queue (G2).

Agentic/voice-sensitive work (writing, analysis, halacha) stays on claude_session (Daphna's subscription). Gemini cost ≈ $0.10/1M tokens — negligible.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-08 05:13:49 +00:00
parent cc9adc5c1f
commit d95a36f310
7 changed files with 202 additions and 9 deletions
--- a/mcp-server/src/legal_mcp/services/gemini_session.py
+++ b/mcp-server/src/legal_mcp/services/gemini_session.py
@@ -0,0 +1,97 @@
+"""Gemini structured-output helper — a drop-in for ``claude_session.query_json``
+for BOUNDED extraction tasks (text → JSON).
+
+Why a second LLM path: metadata extraction is a single structured call (fill
+case_name/summary/headnote/tags from a verdict's text), not an agentic loop. The
+``claude -p`` CLI behind ``claude_session`` is agentic — it reaches for tools and
+hits ``error_max_turns`` on a task that should be one shot — so it was slow and
+flaky for the precedent metadata queue. Gemini Flash with JSON mode
+(``responseMimeType: application/json``) is the right tool: one call, schema-
+clean JSON, fast, and ~$0.10/1M tokens (negligible for this volume).
+
+Scope: **bounded extraction only** (precedent metadata). The agentic, voice-
+sensitive work — decision writing, analysis, halacha extraction — stays on
+``claude_session`` (Daphna's subscription, zero API cost). This is a deliberate
+per-task provider choice, not a wholesale move off Claude.
+
+Key: ``GEMINI_API_KEY`` (host ~/.env; SoT Infisical nautilus:/external-apis/gemini
+as ``GOOGLE_GEMINI_API_KEY``). Model: ``GEMINI_MODEL`` (default gemini-2.5-flash).
+Direct REST via httpx — no extra SDK dependency.
+"""
+
+from __future__ import annotations
+
+import json
+import logging
+import os
+
+import httpx
+
+# Module-level logger, named after this module.
+logger = logging.getLogger(__name__)
+
+# Root of the Generative Language REST API (v1beta); request URLs are built on it.
+_BASE = "https://generativelanguage.googleapis.com/v1beta"
+# Both defaults below are read from the environment once, at import time, so
+# changing GEMINI_MODEL / GEMINI_TIMEOUT_S afterwards does not affect them.
+_DEFAULT_MODEL = os.environ.get("GEMINI_MODEL", "gemini-2.5-flash")
+# Per-request timeout in seconds. A non-numeric GEMINI_TIMEOUT_S makes float()
+# raise ValueError while this module is being imported.
+_DEFAULT_TIMEOUT = float(os.environ.get("GEMINI_TIMEOUT_S", "120"))
+
+
+class GeminiError(RuntimeError):
+    """Gemini API call failed or returned an unexpected shape.
+
+    Raised within this module for a missing ``GEMINI_API_KEY``, a transport
+    error, a non-200 HTTP status, a blocked candidate, a response lacking the
+    expected candidate text, or candidate text that is not valid JSON.
+    """
+
+
+def _api_key() -> str:
+    """Return the Gemini API key taken from the ``GEMINI_API_KEY`` env var.
+
+    Surrounding whitespace is stripped. The variable is looked up on every
+    call rather than cached.
+
+    Raises:
+        GeminiError: the variable is unset, empty, or whitespace-only.
+    """
+    configured = os.environ.get("GEMINI_API_KEY", "").strip()
+    if configured:
+        return configured
+    raise GeminiError(
+        "GEMINI_API_KEY אינו מוגדר (host ~/.env / Infisical "
+        "nautilus:/external-apis/gemini)."
+    )
+
+
+async def query_json(
+    prompt: str,
+    timeout: float | int = _DEFAULT_TIMEOUT,
+    *,
+    system: str | None = None,
+    model: str | None = None,
+    # Accepted for drop-in parity with claude_session.query_json; ignored here.
+    effort: str | None = None,
+    tools: str | None = None,
+) -> dict | list | None:
+    """Make one JSON-mode ``generateContent`` call and return the parsed JSON.
+
+    Drop-in for ``claude_session.query_json``. Every failure is reported as
+    ``GeminiError`` (the caller treats that like any extraction failure —
+    recorded, never silently wrong).
+
+    Args:
+        prompt: User message, sent as the single ``user`` content part.
+        timeout: Timeout in seconds handed to the httpx client.
+        system: Optional system instruction; omitted from the request if falsy.
+        model: Model name; falls back to ``_DEFAULT_MODEL`` if falsy.
+        effort: Ignored; kept for signature parity.
+        tools: Ignored; kept for signature parity.
+
+    Returns:
+        Whatever ``json.loads`` yields for the first candidate's first text
+        part (``None`` only when the model emitted a literal JSON ``null``).
+
+    Raises:
+        GeminiError: missing API key, transport error, non-200 status, a body
+            that is not JSON, a blocked prompt or candidate, a response without
+            candidate text, or candidate text that is not valid JSON.
+    """
+    model = model or _DEFAULT_MODEL
+    body: dict = {
+        "contents": [{"role": "user", "parts": [{"text": prompt}]}],
+        "generationConfig": {
+            "responseMimeType": "application/json",
+            "temperature": 0,
+        },
+    }
+    if system:
+        body["system_instruction"] = {"parts": [{"text": system}]}
+
+    url = f"{_BASE}/models/{model}:generateContent"
+    # Send the key in a header rather than as ``?key=``: a query parameter is
+    # part of the request URL, which httpx writes to its INFO log and which
+    # proxies record, so the secret would leak there.
+    headers = {"x-goog-api-key": _api_key()}
+    try:
+        async with httpx.AsyncClient(timeout=float(timeout)) as client:
+            resp = await client.post(url, headers=headers, json=body)
+    except httpx.HTTPError as e:
+        raise GeminiError(f"Gemini request failed: {e}") from e
+    if resp.status_code != 200:
+        raise GeminiError(f"Gemini HTTP {resp.status_code}: {resp.text[:200]}")
+
+    # A 200 with a non-JSON body (e.g. an intermediary's HTML page) must still
+    # surface as GeminiError, not as a raw decoding error.
+    try:
+        data = resp.json()
+    except ValueError as e:
+        raise GeminiError(f"Gemini unexpected response: {resp.text[:200]}") from e
+
+    # AttributeError covers an envelope or candidate that is not a JSON object.
+    try:
+        candidates = data.get("candidates") or []
+        if not candidates:
+            # No candidates at all: report a prompt-level block explicitly.
+            feedback = data.get("promptFeedback") or {}
+            block_reason = feedback.get("blockReason")
+            if block_reason:
+                raise GeminiError(
+                    f"Gemini blocked prompt: blockReason={block_reason}"
+                )
+        cand = candidates[0] if candidates else {}
+        finish_reason = cand.get("finishReason")
+    except (AttributeError, KeyError, IndexError, TypeError) as e:
+        raise GeminiError(f"Gemini unexpected response: {str(data)[:200]}") from e
+
+    # Surface an explicit safety/finish block rather than returning empty.
+    if finish_reason in ("SAFETY", "RECITATION", "PROHIBITED_CONTENT"):
+        raise GeminiError(f"Gemini blocked output: finishReason={finish_reason}")
+    try:
+        text = cand["content"]["parts"][0]["text"]
+    except (KeyError, IndexError, TypeError) as e:
+        raise GeminiError(f"Gemini unexpected response: {str(data)[:200]}") from e
+    try:
+        return json.loads(text)
+    except (json.JSONDecodeError, TypeError) as e:
+        raise GeminiError(f"Gemini returned non-JSON: {str(text)[:200]}") from e
--- a/mcp-server/src/legal_mcp/services/precedent_library.py
+++ b/mcp-server/src/legal_mcp/services/precedent_library.py
@@ -15,6 +15,7 @@ from __future__ import annotations

 import asyncio
 import logging
+import os
 from pathlib import Path
 from typing import Awaitable, Callable
 from uuid import UUID
@@ -179,6 +180,9 @@ async def reextract_halachot(
 # precedent into a 429 storm. Observed 2026-05-03: 1110/20 succeeded with 9
 # halachot, 317/10 immediately after returned silent no_halachot.
 INTER_PRECEDENT_COOLDOWN_SEC = 30
+# Metadata extraction is on Gemini (fast, high rate limits) — a brief spacer is
+# enough; the 30s above is for the Claude-backed halacha path.
+METADATA_COOLDOWN_SEC = float(os.environ.get("METADATA_COOLDOWN_SEC", "2"))

 # How many times to retry a precedent that came back as 'extraction_failed'
 # (i.e. >50% chunks crashed). Each retry uses a longer cooldown.
@@ -226,11 +230,14 @@ async def process_pending_extractions(kind: str = "metadata", limit: int = 20) -
            cid, effort=config.HALACHA_BULK_EXTRACT_EFFORT,
        )

+    # Metadata extraction runs on Gemini (high rate limits, fast) — the long
+    # cooldown is only needed for halacha (Claude/Anthropic rate limits).
+    cooldown = METADATA_COOLDOWN_SEC if kind == "metadata" else INTER_PRECEDENT_COOLDOWN_SEC
    results: list[dict] = []
    processed = 0
    for idx, row in enumerate(pending):
        if idx > 0:
-            await asyncio.sleep(INTER_PRECEDENT_COOLDOWN_SEC)
+            await asyncio.sleep(cooldown)
        cid = UUID(str(row["id"]))
        attempts = 0
        result: dict = {}
--- a/mcp-server/src/legal_mcp/services/precedent_metadata_extractor.py
+++ b/mcp-server/src/legal_mcp/services/precedent_metadata_extractor.py
@@ -19,7 +19,7 @@ from datetime import date as date_type
 from uuid import UUID

 from legal_mcp.config import parse_llm_json
-from legal_mcp.services import claude_session, db
+from legal_mcp.services import db, gemini_session

 logger = logging.getLogger(__name__)

@@ -150,7 +150,10 @@ async def extract_metadata(case_law_id: UUID | str) -> dict:
    )

    try:
-        result = await claude_session.query_json(
+        # Bounded structured extraction → Gemini Flash (JSON mode). The agentic
+        # claude CLI hit error_max_turns on this single-shot task; see
+        # gemini_session.py. Voice-sensitive/agentic work stays on claude_session.
+        result = await gemini_session.query_json(
            user_msg, system=METADATA_EXTRACTION_PROMPT,
        )
    except Exception as e: