# legal-ai/scripts/drain_metadata_queue.py

"""Drain the precedent metadata-extraction queue.

Calls ``process_pending_extractions(kind='metadata')`` in batches until the
queue is empty (two consecutive zero-progress rounds). Metadata extraction runs
on **Gemini Flash** (structured JSON) — fast and reliable, unlike the agentic
claude CLI which hit ``error_max_turns`` on this bounded task. A no-op (fast)
when the queue is empty.

Host-only (reads GEMINI_API_KEY + POSTGRES_URL from ~/.env via legal_mcp.config).
Scheduled by ``legal-metadata-drain`` (pm2 cron); also runnable by hand:

    mcp-server/.venv/bin/python scripts/drain_metadata_queue.py [batch]
"""

import asyncio
import os
import sys

sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "mcp-server", "src"))

from legal_mcp.services import precedent_library as pl


async def main() -> int:
    """Drain the metadata-extraction queue in batches and report progress.

    The batch size is read from ``sys.argv[1]`` (default 10) and passed as
    ``limit`` to ``pl.process_pending_extractions(kind="metadata")``. Rounds
    repeat until two consecutive rounds report zero processed items; every
    round prints a summary line plus one line per entry in ``results``.

    Returns:
        0 once the drain loop finishes, or 2 when the batch argument is not
        a positive integer (in which case nothing is processed).
    """
    max_empty_rounds = 2  # one confirmation round after the first empty one
    retry_delay_s = 3

    raw_batch = sys.argv[1] if len(sys.argv) > 1 else "10"
    try:
        batch = int(raw_batch)
    except ValueError:
        batch = None
    if batch is None or batch <= 0:
        # Fail with a usage-style error instead of a traceback or a
        # meaningless non-positive limit.
        print(f"error: batch must be a positive integer, got {raw_batch!r}",
              file=sys.stderr, flush=True)
        return 2

    total = 0
    empty_rounds = 0
    rnd = 0
    while empty_rounds < max_empty_rounds:
        rnd += 1
        out = await pl.process_pending_extractions(kind="metadata", limit=batch)
        # Treat a missing or null count / result list as "nothing this round".
        processed = out.get("processed") or 0
        total += processed
        print(f"[round {rnd}] processed={processed} total_pending={out.get('total_pending', 0)} "
              f"status={out.get('status')}", flush=True)
        for r in out.get("results") or []:
            print(f"    {str(r.get('case_number',''))[:42]}: {r.get('status')}", flush=True)
        if processed:
            empty_rounds = 0
            continue
        empty_rounds += 1
        # Pause only when another round will follow; sleeping after the final
        # empty round would just delay the exit.
        if empty_rounds < max_empty_rounds:
            await asyncio.sleep(retry_delay_s)
    print(f"===DONE=== metadata extracted (cumulative cases handled={total})", flush=True)
    return 0


if __name__ == "__main__":
    # Run the async drain loop and use its return value as the exit status.
    raise SystemExit(asyncio.run(main()))