diff --git a/mcp-server/src/legal_mcp/services/db.py b/mcp-server/src/legal_mcp/services/db.py
index 7cfb28f..824f081 100644
--- a/mcp-server/src/legal_mcp/services/db.py
+++ b/mcp-server/src/legal_mcp/services/db.py
@@ -3401,12 +3401,20 @@ async def create_internal_committee_decision(
subject_tags, summary, full_text,
source_kind, source_type, document_id,
extraction_status, halacha_extraction_status,
+ metadata_extraction_status,
practice_area, appeal_subtype, is_binding, proceeding_type, content_hash
) VALUES (
$1, $2, $3, $4, $5, $6,
$7, $8, $9,
'internal_committee', 'appeals_committee', $10,
'processing', 'pending',
+ -- Internal committee decisions carry deterministic metadata
+ -- computed from the case record (see app.py uniform-citation
+ -- builder); the Gemini metadata extractor is tuned for EXTERNAL
+ -- rulings and returns no_metadata for these. Settle the metadata
+ -- status to 'completed' at the source so these rows never linger
+ -- as phantom 'pending' backlog the drain can never clear.
+ 'completed',
$11, $12, $13, $14, $15
)
ON CONFLICT (case_number, proceeding_type)
@@ -3428,6 +3436,8 @@ async def create_internal_committee_decision(
document_id = COALESCE(EXCLUDED.document_id, case_law.document_id),
extraction_status = 'processing',
halacha_extraction_status = 'pending',
+ -- Keep metadata settled on re-upsert (deterministic, never Gemini).
+ metadata_extraction_status = 'completed',
content_hash = EXCLUDED.content_hash
RETURNING *
""",
diff --git a/scripts/SCRIPTS.md b/scripts/SCRIPTS.md
index bc76be3..f14b1ef 100644
--- a/scripts/SCRIPTS.md
+++ b/scripts/SCRIPTS.md
@@ -28,6 +28,7 @@
| `legal-court-fetch-drain.config.cjs` | pm2/js | **תזמון שעתי של `drain_court_fetch.py`** (cron `17 * * * *`, `COURT_FETCH_DRAIN_CRON` לעקיפה) — הופך את לולאת יומון→אחזור→קליטה ל-fully-autonomous. `autorestart:false` (one-shot per tick). דורש `legal-court-fetch-service` רץ. התקנה: `pm2 start scripts/legal-court-fetch-drain.config.cjs && pm2 save`. | pm2 cron (host-side) |
| `drain_metadata_queue.py` | python | **ריקון תור חילוץ-המטא של הפסיקה** — `process_pending_extractions(kind='metadata')` ב-batches עד ריק. רץ על **Gemini Flash** (structured JSON, `gemini_session`) — מהיר ואמין, במקום ה-claude CLI ה-agentic שפגע ב-`error_max_turns`. no-op מהיר כשריק. הרצה ידנית: `mcp-server/.venv/bin/python scripts/drain_metadata_queue.py [batch]`. | דרך `legal-metadata-drain.config.cjs` (pm2 cron) |
| `legal-metadata-drain.config.cjs` | pm2/js | **תזמון כל 15 דק' של `drain_metadata_queue.py`** (cron `*/15 * * * *`, `METADATA_DRAIN_CRON` לעקיפה) — מונע סתימה של תור חילוץ-המטא ב-/precedents. דורש `GEMINI_API_KEY` ב-`~/.env`. התקנה: `pm2 start scripts/legal-metadata-drain.config.cjs && pm2 save`. | pm2 cron (host-side) |
+| `reconcile_metadata_status.py` | python | **נרמול `metadata_extraction_status` תקוע (G1)** — שורות עם ברירת-המחדל `'pending'` שאינן בצנרת-Gemini נערמות כ-backlog-רפאים שהדריינר (סורק `*_requested_at IS NOT NULL`) לעולם לא מנקה ומנפח את מונה "ממתין" ב-/operations. מיישב כל שורה למצב-אמת במקור: `internal_committee`→`completed` (מטא דטרמיניסטי, מחוץ ל-Gemini), `external_upload` מלא→`completed`, `external_upload` עם טקסט וחסר שם/תקציר→חותם `requested_at` (הדריינר יטפל), `cited_only` (אין טקסט)→`skipped`. אידמפוטנטי. תיקון-המקור הנלווה ב-`db.create_internal_committee_decision`. הרצה: `mcp-server/.venv/bin/python scripts/reconcile_metadata_status.py`. | חד-פעמי / re-runnable כהגנת-drift |
| `auto-sync-cases.sh` | bash | סנכרון תיקי ערר ל-Gitea — רץ כל דקה | `* * * * *` (cron) |
| `backup-db.sh` | bash | גיבוי PostgreSQL יומי ל-`data/backups/` (gzip) | לתזמן: `0 2 * * *` |
| `restore-db.sh` | bash | שחזור DB מגיבוי (companion ל-backup-db.sh) | ידני |
diff --git a/scripts/reconcile_metadata_status.py b/scripts/reconcile_metadata_status.py
new file mode 100644
index 0000000..c604716
--- /dev/null
+++ b/scripts/reconcile_metadata_status.py
@@ -0,0 +1,83 @@
+"""Reconcile stale ``metadata_extraction_status='pending'`` rows (G1).
+
+The column defaults to 'pending', but only ``source_kind='external_upload'``
+rows with extractable text genuinely need the Gemini metadata drain. Internal
+committee decisions carry deterministic metadata (never Gemini) and cited_only
+stubs have no text to extract — both linger forever as phantom backlog that the
+drain (which scans ``metadata_extraction_requested_at IS NOT NULL``) can never
+clear, inflating the /operations "ממתין" counter.
+
+This settles each row to a truthful state at the source (terminal where possible):
+ - internal_committee → 'completed' (metadata is deterministic, out of Gemini pipeline)
+ - external_upload w/ name+summary → 'completed' (already filled)
+ - external_upload w/ text but missing name/summary → stamp requested_at (real work → drain picks it up)
+ - cited_only (no text) → 'skipped' (terminal; nothing to extract)
+
+Idempotent and re-runnable (a healthy DB reports all-zero). The companion source
+fix lives in db.create_internal_committee_decision (inserts 'completed' directly)
+so internal rows never re-enter this state.
+
+Host-only (reads POSTGRES_URL from ~/.env via legal_mcp.config). Run by hand:
+ mcp-server/.venv/bin/python scripts/reconcile_metadata_status.py
+"""
+
+import asyncio
+import os
+import sys
+
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "mcp-server", "src"))
+
+from legal_mcp.services import db
+
+
+async def main() -> int:
+    """Apply the four reconcile UPDATEs in order, print a report, and return exit code 0."""
+    pool = await db.get_pool()
+    reconcile_sql = (
+        # internal_committee rows still 'pending' are settled to 'completed'.
+        "UPDATE case_law SET metadata_extraction_status = 'completed' "
+        "WHERE source_kind = 'internal_committee' "
+        "AND metadata_extraction_status = 'pending'",
+        # external_upload rows whose case_name and summary are both non-empty: already filled.
+        "UPDATE case_law SET metadata_extraction_status = 'completed' "
+        "WHERE source_kind = 'external_upload' "
+        "AND metadata_extraction_status = 'pending' "
+        "AND coalesce(case_name, '') <> '' AND coalesce(summary, '') <> ''",
+        # external_upload rows with text but a missing name/summary: only stamp requested_at.
+        "UPDATE case_law SET metadata_extraction_requested_at = now() "
+        "WHERE source_kind = 'external_upload' "
+        "AND metadata_extraction_status = 'pending' "
+        "AND coalesce(full_text, '') <> '' "
+        "AND (coalesce(case_name, '') = '' OR coalesce(summary, '') = '') "
+        "AND metadata_extraction_requested_at IS NULL",
+        # cited_only rows still 'pending' are marked 'skipped'.
+        "UPDATE case_law SET metadata_extraction_status = 'skipped' "
+        "WHERE source_kind = 'cited_only' "
+        "AND metadata_extraction_status = 'pending'",
+    )
+    internal, done, requeued, cited = [await pool.execute(sql) for sql in reconcile_sql]
+
+    def affected(status: str) -> str:
+        """Return the last whitespace-separated token of *status*, or "?" if there is none."""
+        try:
+            return status.split()[-1]
+        except (AttributeError, IndexError):
+            return "?"
+
+    print(f"internal_committee → completed : {affected(internal)}")
+    print(f"external_upload → completed : {affected(done)}")
+    print(f"external_upload → requeued : {affected(requeued)}")
+    print(f"cited_only → skipped : {affected(cited)}")
+
+    distribution = await pool.fetch(
+        "SELECT coalesce(metadata_extraction_status,'NULL') s, count(*) n "
+        "FROM case_law GROUP BY 1 ORDER BY 2 DESC"
+    )
+    print("\nresulting metadata_extraction_status distribution:")
+    for row in distribution:
+        print(f" {row['s']:<12} {row['n']}")
+    return 0
+
+
+if __name__ == "__main__":  # script entry point: main()'s return value becomes the exit code
+    raise SystemExit(asyncio.run(main()))
diff --git a/web-ui/src/app/operations/page.tsx b/web-ui/src/app/operations/page.tsx
index fd7f184..9777748 100644
--- a/web-ui/src/app/operations/page.tsx
+++ b/web-ui/src/app/operations/page.tsx
@@ -295,10 +295,10 @@ function UniformStats({ p }: { p: PipelineStats }) {
title="ממתינים שנדרשו במפורש לעיבוד — אלה שה-drain הבא יטפל בהם"
/>
diff --git a/web-ui/src/lib/api/operations.ts b/web-ui/src/lib/api/operations.ts
index d884cd5..57af3d4 100644
--- a/web-ui/src/lib/api/operations.ts
+++ b/web-ui/src/lib/api/operations.ts
@@ -32,7 +32,7 @@ export type IngestedRow = {
/** The uniform per-pipeline shape every background drain reports. */
export type PipelineStats = {
- pending: number; // backlog: rows not yet processed (status default)
+ pending: number; // awaiting processing (status='pending') — not necessarily in the active queue
processing: number; // being worked right now
done: number; // completed
failed: number; // terminal failures (court_fetch folds in 'manual')