"""Reconcile stale ``metadata_extraction_status='pending'`` rows (G1). The column defaults to 'pending', but only ``source_kind='external_upload'`` rows with extractable text genuinely need the Gemini metadata drain. Internal committee decisions carry deterministic metadata (never Gemini) and cited_only stubs have no text to extract — both linger forever as phantom backlog that the drain (which scans ``metadata_extraction_requested_at IS NOT NULL``) can never clear, inflating the /operations "ממתין" counter. This settles each row to a truthful terminal state at the source: - internal_committee → 'completed' (metadata is deterministic, out of Gemini pipeline) - external_upload w/ name+summary → 'completed' (already filled) - external_upload w/ text but missing name/summary → stamp requested_at (real work → drain picks it up) - cited_only (no text) → 'skipped' (terminal; nothing to extract) Idempotent and re-runnable (a healthy DB reports all-zero). The companion source fix lives in db.create_internal_committee_decision (inserts 'completed' directly) so internal rows never re-enter this state. Host-only (reads POSTGRES_URL from ~/.env via legal_mcp.config). Run by hand: mcp-server/.venv/bin/python scripts/reconcile_metadata_status.py """ import asyncio import os import sys sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "mcp-server", "src")) from legal_mcp.services import db async def main() -> int: pool = await db.get_pool() internal = await pool.execute( "UPDATE case_law SET metadata_extraction_status = 'completed' " "WHERE source_kind = 'internal_committee' " "AND metadata_extraction_status = 'pending'" ) external_done = await pool.execute( "UPDATE case_law SET metadata_extraction_status = 'completed' " "WHERE source_kind = 'external_upload' " "AND metadata_extraction_status = 'pending' " "AND coalesce(case_name, '') <> '' AND coalesce(summary, '') <> ''" ) external_requeued = await pool.execute( "UPDATE case_law SET metadata_extraction_requested_at = now() " "WHERE source_kind = 'external_upload' " "AND metadata_extraction_status = 'pending' " "AND coalesce(full_text, '') <> '' " "AND (coalesce(case_name, '') = '' OR coalesce(summary, '') = '') " "AND metadata_extraction_requested_at IS NULL" ) cited = await pool.execute( "UPDATE case_law SET metadata_extraction_status = 'skipped' " "WHERE source_kind = 'cited_only' " "AND metadata_extraction_status = 'pending'" ) def n(tag: str) -> str: try: return tag.split()[-1] except (AttributeError, IndexError): return "?" print(f"internal_committee → completed : {n(internal)}") print(f"external_upload → completed : {n(external_done)}") print(f"external_upload → requeued : {n(external_requeued)}") print(f"cited_only → skipped : {n(cited)}") rows = await pool.fetch( "SELECT coalesce(metadata_extraction_status,'NULL') s, count(*) n " "FROM case_law GROUP BY 1 ORDER BY 2 DESC" ) print("\nresulting metadata_extraction_status distribution:") for r in rows: print(f" {r['s']:<12} {r['n']}") return 0 if __name__ == "__main__": sys.exit(asyncio.run(main()))