diff --git a/mcp-server/src/legal_mcp/services/db.py b/mcp-server/src/legal_mcp/services/db.py
index 7cfb28f..824f081 100644
--- a/mcp-server/src/legal_mcp/services/db.py
+++ b/mcp-server/src/legal_mcp/services/db.py
@@ -3401,12 +3401,20 @@ async def create_internal_committee_decision(
                     subject_tags, summary, full_text,
                     source_kind, source_type, document_id,
                     extraction_status, halacha_extraction_status,
+                    metadata_extraction_status,
                     practice_area, appeal_subtype, is_binding,
                     proceeding_type, content_hash
                 ) VALUES (
                     $1, $2, $3, $4, $5, $6, $7, $8, $9,
                     'internal_committee', 'appeals_committee', $10,
                     'processing', 'pending',
+                    -- Internal committee decisions carry deterministic metadata
+                    -- computed from the case record (see app.py uniform-citation
+                    -- builder); the Gemini metadata extractor is tuned for EXTERNAL
+                    -- rulings and returns no_metadata for these. Settle the metadata
+                    -- status to 'completed' at the source so these rows never linger
+                    -- as phantom 'pending' backlog the drain can never clear.
+                    'completed',
                     $11, $12, $13, $14, $15
                 )
                 ON CONFLICT (case_number, proceeding_type)
@@ -3428,6 +3436,8 @@ async def create_internal_committee_decision(
                     document_id = COALESCE(EXCLUDED.document_id, case_law.document_id),
                     extraction_status = 'processing',
                     halacha_extraction_status = 'pending',
+                    -- Keep metadata settled on re-upsert (deterministic, never Gemini).
+                    metadata_extraction_status = 'completed',
                     content_hash = EXCLUDED.content_hash
                 RETURNING *
             """,
diff --git a/scripts/SCRIPTS.md b/scripts/SCRIPTS.md
index bc76be3..f14b1ef 100644
--- a/scripts/SCRIPTS.md
+++ b/scripts/SCRIPTS.md
@@ -28,6 +28,7 @@
 | `legal-court-fetch-drain.config.cjs` | pm2/js | **תזמון שעתי של `drain_court_fetch.py`** (cron `17 * * * *`, `COURT_FETCH_DRAIN_CRON` לעקיפה) — הופך את לולאת יומון→אחזור→קליטה ל-fully-autonomous. `autorestart:false` (one-shot per tick). דורש `legal-court-fetch-service` רץ. התקנה: `pm2 start scripts/legal-court-fetch-drain.config.cjs && pm2 save`. | pm2 cron (host-side) |
 | `drain_metadata_queue.py` | python | **ריקון תור חילוץ-המטא של הפסיקה** — `process_pending_extractions(kind='metadata')` ב-batches עד ריק. רץ על **Gemini Flash** (structured JSON, `gemini_session`) — מהיר ואמין, במקום ה-claude CLI ה-agentic שפגע ב-`error_max_turns`. no-op מהיר כשריק. הרצה ידנית: `mcp-server/.venv/bin/python scripts/drain_metadata_queue.py [batch]`. | דרך `legal-metadata-drain.config.cjs` (pm2 cron) |
 | `legal-metadata-drain.config.cjs` | pm2/js | **תזמון כל 15 דק' של `drain_metadata_queue.py`** (cron `*/15 * * * *`, `METADATA_DRAIN_CRON` לעקיפה) — מונע סתימה של תור חילוץ-המטא ב-/precedents. דורש `GEMINI_API_KEY` ב-`~/.env`. התקנה: `pm2 start scripts/legal-metadata-drain.config.cjs && pm2 save`. | pm2 cron (host-side) |
+| `reconcile_metadata_status.py` | python | **נרמול `metadata_extraction_status` תקוע (G1)** — שורות עם ברירת-המחדל `'pending'` שאינן בצנרת-Gemini נערמות כ-backlog-רפאים שהדריינר (סורק `*_requested_at IS NOT NULL`) לעולם לא מנקה ומנפח את מונה "ממתין" ב-/operations. מיישב כל שורה למצב-אמת במקור: `internal_committee`→`completed` (מטא דטרמיניסטי, מחוץ ל-Gemini), `external_upload` מלא→`completed`, `external_upload` עם טקסט וחסר שם/תקציר→חותם `requested_at` (הדריינר יטפל), `cited_only` (אין טקסט)→`skipped`. אידמפוטנטי. תיקון-המקור הנלווה ב-`db.create_internal_committee_decision`. הרצה: `mcp-server/.venv/bin/python scripts/reconcile_metadata_status.py`. | חד-פעמי / re-runnable כהגנת-drift |
 | `auto-sync-cases.sh` | bash | סנכרון תיקי ערר ל-Gitea — רץ כל דקה | `* * * * *` (cron) |
 | `backup-db.sh` | bash | גיבוי PostgreSQL יומי ל-`data/backups/` (gzip) | לתזמן: `0 2 * * *` |
 | `restore-db.sh` | bash | שחזור DB מגיבוי (companion ל-backup-db.sh) | ידני |
diff --git a/scripts/reconcile_metadata_status.py b/scripts/reconcile_metadata_status.py
new file mode 100644
index 0000000..c604716
--- /dev/null
+++ b/scripts/reconcile_metadata_status.py
@@ -0,0 +1,100 @@
+"""Reconcile stale ``metadata_extraction_status='pending'`` rows (G1).
+
+The column defaults to 'pending', but only ``source_kind='external_upload'``
+rows with extractable text genuinely need the Gemini metadata drain. Internal
+committee decisions carry deterministic metadata (never Gemini) and cited_only
+stubs have no text to extract — both linger forever as phantom backlog that the
+drain (which scans ``metadata_extraction_requested_at IS NOT NULL``) can never
+clear, inflating the /operations "ממתין" counter.
+
+This settles each row to a truthful terminal state at the source:
+  - internal_committee → 'completed' (metadata is deterministic, out of Gemini pipeline)
+  - external_upload w/ name+summary, never requested → 'completed' (already filled)
+  - external_upload w/ text but missing name/summary → stamp requested_at (real work → drain picks it up)
+  - cited_only (no text) → 'skipped' (terminal; nothing to extract)
+
+external_upload rows already stamped with ``metadata_extraction_requested_at`` are
+in the drain's scan (not phantom backlog), so they are left for the drain.
+
+Idempotent and re-runnable (a healthy DB reports all-zero). The companion source
+fix lives in db.create_internal_committee_decision (inserts 'completed' directly)
+so internal rows never re-enter this state.
+
+Host-only (reads POSTGRES_URL from ~/.env via legal_mcp.config). Run by hand:
+    mcp-server/.venv/bin/python scripts/reconcile_metadata_status.py
+"""
+
+import asyncio
+import os
+import sys
+
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "mcp-server", "src"))
+
+from legal_mcp.services import db
+
+
+async def main() -> int:
+    """Settle phantom 'pending' rows, then print what changed.
+
+    Runs the four reconciliation UPDATEs in order, prints the number of rows
+    each one touched, and finishes with the resulting status distribution.
+
+    Returns:
+        0 on success (used as the process exit code).
+    """
+    pool = await db.get_pool()
+
+    internal = await pool.execute(
+        "UPDATE case_law SET metadata_extraction_status = 'completed' "
+        "WHERE source_kind = 'internal_committee' "
+        "AND metadata_extraction_status = 'pending'"
+    )
+    # Only settle rows nobody queued: a row that already carries requested_at
+    # is in the drain's scan, so it is left for the drain (the requeue rule
+    # below guards on the same column for the same reason).
+    external_done = await pool.execute(
+        "UPDATE case_law SET metadata_extraction_status = 'completed' "
+        "WHERE source_kind = 'external_upload' "
+        "AND metadata_extraction_status = 'pending' "
+        "AND coalesce(case_name, '') <> '' AND coalesce(summary, '') <> '' "
+        "AND metadata_extraction_requested_at IS NULL"
+    )
+    external_requeued = await pool.execute(
+        "UPDATE case_law SET metadata_extraction_requested_at = now() "
+        "WHERE source_kind = 'external_upload' "
+        "AND metadata_extraction_status = 'pending' "
+        "AND coalesce(full_text, '') <> '' "
+        "AND (coalesce(case_name, '') = '' OR coalesce(summary, '') = '') "
+        "AND metadata_extraction_requested_at IS NULL"
+    )
+    cited = await pool.execute(
+        "UPDATE case_law SET metadata_extraction_status = 'skipped' "
+        "WHERE source_kind = 'cited_only' "
+        "AND metadata_extraction_status = 'pending'"
+    )
+
+    def n(tag: str) -> str:
+        """Return the last token of a command tag (presumably ``UPDATE <count>``)."""
+        try:
+            return tag.split()[-1]
+        except (AttributeError, IndexError):
+            # Not a string, or an empty tag: show a placeholder instead of failing.
+            return "?"
+
+    print(f"internal_committee → completed : {n(internal)}")
+    print(f"external_upload → completed : {n(external_done)}")
+    print(f"external_upload → requeued : {n(external_requeued)}")
+    print(f"cited_only → skipped : {n(cited)}")
+
+    rows = await pool.fetch(
+        "SELECT coalesce(metadata_extraction_status,'NULL') s, count(*) n "
+        "FROM case_law GROUP BY 1 ORDER BY 2 DESC"
+    )
+    print("\nresulting metadata_extraction_status distribution:")
+    for r in rows:
+        print(f" {r['s']:<12} {r['n']}")
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(asyncio.run(main()))
diff --git a/web-ui/src/app/operations/page.tsx b/web-ui/src/app/operations/page.tsx
index fd7f184..9777748 100644
--- a/web-ui/src/app/operations/page.tsx
+++ b/web-ui/src/app/operations/page.tsx
@@ -295,10 +295,10 @@ function UniformStats({ p }: { p: PipelineStats }) {
           title="ממתינים שנדרשו במפורש לעיבוד — אלה שה-drain הבא יטפל בהם"
         />
diff --git a/web-ui/src/lib/api/operations.ts b/web-ui/src/lib/api/operations.ts
index d884cd5..57af3d4 100644
--- a/web-ui/src/lib/api/operations.ts
+++ b/web-ui/src/lib/api/operations.ts
@@ -32,7 +32,7 @@ export type IngestedRow = {
 
 /** The uniform per-pipeline shape every background drain reports. */
 export type PipelineStats = {
-  pending: number; // backlog: rows not yet processed (status default)
+  pending: number; // awaiting processing (status='pending') — not necessarily in the active queue
   processing: number; // being worked right now
   done: number; // completed
   failed: number; // terminal failures (court_fetch folds in 'manual')