Merge pull request 'fix(ops): ייבוש backlog-רפאים של חילוץ-מטא — נרמול-במקור (G1)' (#210) from worktree-metadata-backlog-fix into main
Some checks failed
Build & Deploy / build-and-deploy (push) Has been cancelled
G12 Leak-Guard / leak-guard (push) Has been cancelled

This commit was merged in pull request #210.
This commit is contained in:
2026-06-11 22:11:08 +00:00
5 changed files with 97 additions and 3 deletions

View File

@@ -3401,12 +3401,20 @@ async def create_internal_committee_decision(
subject_tags, summary, full_text,
source_kind, source_type, document_id,
extraction_status, halacha_extraction_status,
metadata_extraction_status,
practice_area, appeal_subtype, is_binding, proceeding_type, content_hash
) VALUES (
$1, $2, $3, $4, $5, $6,
$7, $8, $9,
'internal_committee', 'appeals_committee', $10,
'processing', 'pending',
-- Internal committee decisions carry deterministic metadata
-- computed from the case record (see app.py uniform-citation
-- builder); the Gemini metadata extractor is tuned for EXTERNAL
-- rulings and returns no_metadata for these. Settle the metadata
-- status to 'completed' at the source so these rows never linger
-- as phantom 'pending' backlog the drain can never clear.
'completed',
$11, $12, $13, $14, $15
)
ON CONFLICT (case_number, proceeding_type)
@@ -3428,6 +3436,8 @@ async def create_internal_committee_decision(
document_id = COALESCE(EXCLUDED.document_id, case_law.document_id),
extraction_status = 'processing',
halacha_extraction_status = 'pending',
-- Keep metadata settled on re-upsert (deterministic, never Gemini).
metadata_extraction_status = 'completed',
content_hash = EXCLUDED.content_hash
RETURNING *
""",

View File

@@ -28,6 +28,7 @@
| `legal-court-fetch-drain.config.cjs` | pm2/js | **תזמון שעתי של `drain_court_fetch.py`** (cron `17 * * * *`, `COURT_FETCH_DRAIN_CRON` לעקיפה) — הופך את לולאת יומון→אחזור→קליטה ל-fully-autonomous. `autorestart:false` (one-shot per tick). דורש `legal-court-fetch-service` רץ. התקנה: `pm2 start scripts/legal-court-fetch-drain.config.cjs && pm2 save`. | pm2 cron (host-side) |
| `drain_metadata_queue.py` | python | **ריקון תור חילוץ-המטא של הפסיקה** — `process_pending_extractions(kind='metadata')` ב-batches עד ריק. רץ על **Gemini Flash** (structured JSON, `gemini_session`) — מהיר ואמין, במקום ה-claude CLI ה-agentic שפגע ב-`error_max_turns`. no-op מהיר כשריק. הרצה ידנית: `mcp-server/.venv/bin/python scripts/drain_metadata_queue.py [batch]`. | דרך `legal-metadata-drain.config.cjs` (pm2 cron) |
| `legal-metadata-drain.config.cjs` | pm2/js | **תזמון כל 15 דק' של `drain_metadata_queue.py`** (cron `*/15 * * * *`, `METADATA_DRAIN_CRON` לעקיפה) — מונע סתימה של תור חילוץ-המטא ב-/precedents. דורש `GEMINI_API_KEY` ב-`~/.env`. התקנה: `pm2 start scripts/legal-metadata-drain.config.cjs && pm2 save`. | pm2 cron (host-side) |
| `reconcile_metadata_status.py` | python | **נרמול `metadata_extraction_status` תקוע (G1)** — שורות עם ברירת-המחדל `'pending'` שאינן בצנרת-Gemini נערמות כ-backlog-רפאים שהדריינר (סורק `*_requested_at IS NOT NULL`) לעולם לא מנקה ומנפח את מונה "ממתין" ב-/operations. מיישב כל שורה למצב-אמת במקור: `internal_committee`→`completed` (מטא דטרמיניסטי, מחוץ ל-Gemini), `external_upload` מלא→`completed`, `external_upload` עם טקסט וחסר שם/תקציר→חותם `requested_at` (הדריינר יטפל), `cited_only` (אין טקסט)→`skipped`. אידמפוטנטי. תיקון-המקור הנלווה ב-`db.create_internal_committee_decision`. הרצה: `mcp-server/.venv/bin/python scripts/reconcile_metadata_status.py`. | חד-פעמי / re-runnable כהגנת-drift |
| `auto-sync-cases.sh` | bash | סנכרון תיקי ערר ל-Gitea — רץ כל דקה | `* * * * *` (cron) |
| `backup-db.sh` | bash | גיבוי PostgreSQL יומי ל-`data/backups/` (gzip) | לתזמן: `0 2 * * *` |
| `restore-db.sh` | bash | שחזור DB מגיבוי (companion ל-backup-db.sh) | ידני |

View File

@@ -0,0 +1,83 @@
"""Reconcile stale ``metadata_extraction_status='pending'`` rows (G1).
The column defaults to 'pending', but only ``source_kind='external_upload'``
rows with extractable text genuinely need the Gemini metadata drain. Internal
committee decisions carry deterministic metadata (never Gemini) and cited_only
stubs have no text to extract — both linger forever as phantom backlog that the
drain (which scans ``metadata_extraction_requested_at IS NOT NULL``) can never
clear, inflating the /operations "ממתין" counter.
This settles each row to a truthful terminal state at the source:
- internal_committee → 'completed' (metadata is deterministic, out of Gemini pipeline)
- external_upload w/ name+summary → 'completed' (already filled)
- external_upload w/ text but missing name/summary → stamp requested_at (real work → drain picks it up)
- cited_only (no text) → 'skipped' (terminal; nothing to extract)
Idempotent and re-runnable (a healthy DB reports all-zero). The companion source
fix lives in db.create_internal_committee_decision (inserts 'completed' directly)
so internal rows never re-enter this state.
Host-only (reads POSTGRES_URL from ~/.env via legal_mcp.config). Run by hand:
mcp-server/.venv/bin/python scripts/reconcile_metadata_status.py
"""
import asyncio
import os
import sys
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "mcp-server", "src"))
from legal_mcp.services import db
async def main() -> int:
    """Settle stale ``'pending'`` metadata rows and print what changed.

    Runs four UPDATE statements against ``case_law`` in a fixed order using
    the pool returned by ``db.get_pool()``, prints one count line per
    statement, then prints the resulting ``metadata_extraction_status``
    distribution.

    Returns:
        Always ``0``.
    """
    pool = await db.get_pool()

    # (report label, SQL) pairs, executed in exactly this order. Every
    # statement is restricted to rows whose status is still 'pending'.
    reconcile_steps = (
        (
            "internal_committee → completed : ",
            "UPDATE case_law SET metadata_extraction_status = 'completed' "
            "WHERE source_kind = 'internal_committee' "
            "AND metadata_extraction_status = 'pending'",
        ),
        (
            "external_upload → completed : ",
            "UPDATE case_law SET metadata_extraction_status = 'completed' "
            "WHERE source_kind = 'external_upload' "
            "AND metadata_extraction_status = 'pending' "
            "AND coalesce(case_name, '') <> '' AND coalesce(summary, '') <> ''",
        ),
        (
            # Status stays 'pending' here; only the request timestamp is
            # stamped, and only where it is not already set.
            "external_upload → requeued : ",
            "UPDATE case_law SET metadata_extraction_requested_at = now() "
            "WHERE source_kind = 'external_upload' "
            "AND metadata_extraction_status = 'pending' "
            "AND coalesce(full_text, '') <> '' "
            "AND (coalesce(case_name, '') = '' OR coalesce(summary, '') = '') "
            "AND metadata_extraction_requested_at IS NULL",
        ),
        (
            "cited_only → skipped : ",
            "UPDATE case_law SET metadata_extraction_status = 'skipped' "
            "WHERE source_kind = 'cited_only' "
            "AND metadata_extraction_status = 'pending'",
        ),
    )

    # Run every statement before reporting anything, so a failing UPDATE
    # aborts the script without a partial summary on stdout.
    status_tags = []
    for _label, statement in reconcile_steps:
        status_tags.append(await pool.execute(statement))

    def affected_rows(status_tag: str) -> str:
        """Return the last whitespace-separated token of *status_tag*.

        Presumably the driver hands back a command tag such as
        ``"UPDATE 5"`` (TODO confirm); anything that is not a non-empty
        string is reported as ``"?"``.
        """
        try:
            return status_tag.split()[-1]
        except (AttributeError, IndexError):
            return "?"

    for (label, _statement), status_tag in zip(reconcile_steps, status_tags):
        print(f"{label}{affected_rows(status_tag)}")

    distribution = await pool.fetch(
        "SELECT coalesce(metadata_extraction_status,'NULL') s, count(*) n "
        "FROM case_law GROUP BY 1 ORDER BY 2 DESC"
    )
    print("\nresulting metadata_extraction_status distribution:")
    for record in distribution:
        status, count = record['s'], record['n']
        print(f" {status:<12} {count}")

    return 0
# Script entry point: run the async reconcile pass and use main()'s return
# value as the process exit code.
if __name__ == "__main__":
    sys.exit(asyncio.run(main()))

View File

@@ -295,10 +295,10 @@ function UniformStats({ p }: { p: PipelineStats }) {
title="ממתינים שנדרשו במפורש לעיבוד — אלה שה-drain הבא יטפל בהם"
/>
<StatTile
label="ממתין (בקלוג)"
label="ממתין"
value={p.pending}
tone="muted"
title="כל הפריטים שטרם עובדו (ברירת-מחדל) — לאו דווקא בתור הפעיל"
title="פריטים שטרם עובדו וממתינים לעיבוד — לאו דווקא בתור הפעיל"
/>
<StatTile label="בעיבוד" value={p.processing} tone="amber" />
<StatTile label="הושלם" value={p.done} tone="green" />

View File

@@ -32,7 +32,7 @@ export type IngestedRow = {
/** The uniform per-pipeline shape every background drain reports. */
export type PipelineStats = {
pending: number; // backlog: rows not yet processed (status default)
pending: number; // awaiting processing (status='pending') — not necessarily in the active queue
processing: number; // being worked right now
done: number; // completed
failed: number; // terminal failures (court_fetch folds in 'manual')