From 4fce9d503fdd1ad4e9e3d78fb3e5e432a0860617 Mon Sep 17 00:00:00 2001 From: Chaim Date: Sun, 31 May 2026 14:12:45 +0000 Subject: [PATCH] =?UTF-8?q?feat(migration):=20FU-2c=20=E2=80=94=20reconcil?= =?UTF-8?q?e=20external=20case=5Flaw=20identifiers=20(GAP-08,=20#68)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit External court precedents stored the full citation (designator + docket + parties + Nevo date) inside case_number, violating INV-ID2/G1 (citation as identifier). Chair decision 2026-05-31 (Option A): canonical external case_number = proceeding-designator + docket, '/' preserved (court convention, not X1's '/'→'-'); parties/court/date → citation_formatted. scripts/fu2c_reconcile_external_case_numbers.py — deterministic dry-run → chair-review → apply, mirroring FU-2b: - extracts designator+docket; flags split into BLOCKING (MISMATCH / CIT_NO_DOCKET / DESIG_MISMATCH / DUP_CHECK / NO_DOCKET) vs ADVISORY (NO_CITATION — case_number fix still deterministic, missing citation is a separate gap), so advisory rows apply while uncertain identity does not. - --overrides CSV (id,proposed_canonical,citation_formatted,reason) for audited chair adjudication of blocking rows. - apply scoped to source_kind='external_upload' (task target) while keeping cited_only/nevo_seed in the reconciliation VIEW so DUP_CHECK spans the full external unique space; pre-flight collision guard before every UPDATE. Applied to production 2026-05-31: 21 case_number normalized + 3 citation_formatted reconciled (D = consolidated Supreme Court judgment לויתן/קלמנוביץ → lead docket 25226-04-25; 2×C empty citations composed from metadata). אהוד שפר עע"מ 317/10 deferred — cross-source duplicate with an existing cited_only reference (collision guard held; → #70). 49 cited_only records out of scope → new task #70 (committee-form NNNN-NN dockets the extractor misses, dedup, unresolvable "ערר אדלר"). Extraction + gating verified offline on all 24 records. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- .taskmaster/tasks/tasks.json | 35 +- scripts/SCRIPTS.md | 1 + .../fu2c_reconcile_external_case_numbers.py | 340 ++++++++++++++++++ 3 files changed, 366 insertions(+), 10 deletions(-) create mode 100644 scripts/fu2c_reconcile_external_case_numbers.py diff --git a/.taskmaster/tasks/tasks.json b/.taskmaster/tasks/tasks.json index 47c8bfe..a832d0c 100644 --- a/.taskmaster/tasks/tasks.json +++ b/.taskmaster/tasks/tasks.json @@ -2300,9 +2300,9 @@ "id": "66", "title": "[FU-8a] מחסומי-תהליך→קוד: enforce sync + Paperclip-access guard (pure-code)", "description": "אכיפת cross-company sync (--verify יוצא non-zero על drift; adapter_type mismatch = drift לא silent skip) + fitness-function שחוסם גישת-Paperclip לא-מאושרת (raw http / INSERT agent_wakeup_requests).", - "details": "מכסה GAP-21,22. מספק INV-MC1/INT1/INT3. severity: High. סוג: pure-code. GAP-23 (חיווט ספ→סוכנים) הופרד ל-#69 (משנה התנהגות-ייצור).", + "details": "מכסה GAP-21,22. מספק INV-MC1/INT1/INT3. severity: High. סוג: pure-code. GAP-23 (חיווט ספ→סוכנים) הופרד ל-#69 (משנה התנהגות-ייצור). | DONE 2026-05-31 PR#16: --verify drift-gate (exit≠0) + Paperclip-access fitness function. 
GAP-23→#69.", "testStrategy": "", - "status": "pending", + "status": "done", "dependencies": [], "priority": "medium", "subtasks": [ @@ -2312,7 +2312,7 @@ "description": "sync ידני ולא-נאכף; adapter_type-mismatch מדולג בשקט (sync...py:387-389).", "dependencies": [], "details": "INV-MC1", - "status": "pending", + "status": "done", "testStrategy": "", "parentId": "66" }, @@ -2322,7 +2322,7 @@ "description": "אין אילוץ-schema נגד INSERT ישיר ל-agent_wakeup_requests; אין linter נגד httpx/curl גולמי.", "dependencies": [], "details": "INV-INT1/INT3", - "status": "pending", + "status": "done", "testStrategy": "", "parentId": "66" } @@ -2367,14 +2367,15 @@ "id": "68", "title": "[FU-2c] תיאום מזהי external_upload (case_number↔citation_formatted)", "description": "פסיקה חיצונית: case_number מחזיק ציטוט מלא; citation_formatted לא תמיד תואם (נמצאה סתירה 25226-04-25 מול 1975/24). דורש קודם תיקון סתירות citation_formatted↔case_number, ואז הכרעה אם docket מחולץ הופך ל-case_number או שהציטוט נשאר המזהה.", - "details": "מקור: בדיקת DB 2026-05-31 (FU-2b scoping). 22/24 external עם ציטוט ב-case_number; citation_formatted נוצר בנפרד (LLM) ולא אמין כ-ground truth. שונה מ-internal (שם 0 סתירות). דורש סקירת-יו\"ר פר-רשומה. severity: Medium. סוג: data-migration + chair. תלוי בהחלטה: האם זהות external = ציטוט (FU-1) או docket מנורמל (INV-ID2). מופרד מ-FU-2b לפי החלטת chaim 2026-05-31.", + "details": "מקור: בדיקת DB 2026-05-31 (FU-2b scoping). 22/24 external עם ציטוט ב-case_number; citation_formatted נוצר בנפרד (LLM) ולא אמין כ-ground truth. שונה מ-internal (שם 0 סתירות). דורש סקירת-יו\"ר פר-רשומה. severity: Medium. סוג: data-migration + chair. תלוי בהחלטה: האם זהות external = ציטוט (FU-1) או docket מנורמל (INV-ID2). מופרד מ-FU-2b לפי החלטת chaim 2026-05-31. | APPLIED 2026-05-31: chair decision Option A (designator+docket, '/' kept). 21 external_upload case_number normalized + 3 citation_formatted fixed (D=לויתן/קלמנוביץ consolidated→25226-04-25; 2×C empty-citation composed). 
אהוד שפר עע\"מ 317/10 deferred (cross-source dup w/ cited_only → #70). collision-guard: 0. Backups data/audit/fu2c-backup-20260531T140943Z.csv. cited_only(49)→#70.", "testStrategy": "", - "status": "pending", + "status": "done", "dependencies": [ "67" ], "priority": "medium", - "subtasks": [] + "subtasks": [], + "updatedAt": "2026-05-31T14:11:37.689Z" }, { "id": "69", @@ -2399,13 +2400,27 @@ "parentId": "69" } ] + }, + { + "id": "70", + "title": "[FU-2c-b] תיאום + dedup של cited_only (49 רשומות) + אהוד שפר cross-source", + "description": "המשך ל-FU-2c (#68). ה-dry-run של תיאום-המזהים החיצוני חשף 49 רשומות source_kind='cited_only' (הפניות-ציטוט שחולצו מהחלטות) שלא היו בהיקף #68. דורשות נרמול נפרד: צורות-ועדה כמו 'ערר 1093-19' (NNNN-NN) שה-extractor הנוכחי לא תופס (NO_DOCKET), 'בש\"א 2487-14', dups, ו-'ערר אדלר' בלתי-פתיר (ללא מספר). בנוסף: dedup חוצה-source של אהוד שפר — external_upload 'עע\"מ 317/10 אהוד שפר' מול cited_only קיים 'עע\"מ 317/10' (אותו תיק; ה-collision-guard מנע התנגשות ב-uq_case_law_external_number, ה-external_upload נשאר עם case_number מנופח עד הכרעה).", + "details": "מקור: dry-run FU-2c 2026-05-31 (data/audit/fu2c-reconciliation-20260531T140632Z.{csv,md}). 73 רשומות <> internal_committee = 24 external_upload (טופלו ב-#68) + 49 cited_only. מתוך ה-cited_only: ~17 will_change (refs בצורת בית-משפט), 6 NO_DOCKET (ערר NNNN-NN + ערר אדלר), 5+ DUP_CHECK. דרוש: (1) הרחבת _DOCKET_RE לצורת-ועדה NNNN-NN; (2) הכרעה אם cited_only refs מקבלים נרמול מלא או נשארים כ-display; (3) dedup חוצה-source (cited_only שהפך ל-external_upload → מיזוג/הסרה, ראה precedent_link_cases/precedent_unlink_cases); (4) 'ערר אדלר' — סגירה ידנית. severity: Medium. סוג: data-migration + chair. 
הסקריפט scripts/fu2c_reconcile_external_case_numbers.py כבר מסנן apply ל-external_upload בלבד ומשאיר cited_only בשדה-ראייה לזיהוי-dup.", + "testStrategy": "אחרי תיקון: 0 NO_DOCKET ב-cited_only (פרט ל-ערר אדלר המתועד); אין case_number כפול בין external_upload ל-cited_only; אהוד שפר עע\"מ 317/10 = רשומה אחת.", + "status": "pending", + "dependencies": [ + "68" + ], + "priority": "medium", + "subtasks": [], + "updatedAt": "2026-05-31T14:11:27.861937+00:00" } ], "metadata": { "version": "1.0.0", - "lastModified": "2026-05-30T18:30:11.522Z", - "taskCount": 66, - "completedCount": 55, + "lastModified": "2026-05-31T14:11:37.689Z", + "taskCount": 70, + "completedCount": 62, "tags": [ "legal-ai" ] diff --git a/scripts/SCRIPTS.md b/scripts/SCRIPTS.md index 53933c0..4a8f936 100644 --- a/scripts/SCRIPTS.md +++ b/scripts/SCRIPTS.md @@ -14,6 +14,7 @@ | `fix_paperclipai_skills_drift.py` | python | סקריפט חד-פעמי (בוצע 2026-05-04) שניקה drift על `paperclipai/*` skills בין CMP ל-CMPA. הסיר `paperclip-dev` מכל 14 הסוכנים, ודאג ש-`paperclip-converting-plans-to-tasks` קיים רק על CEO ו-analyst. תומך `--apply` (ברירת מחדל: dry-run). דורש `PAPERCLIP_BOARD_API_KEY`. נשמר לרפרנס למקרה שhdrift חוזר. | חד-פעמי (בוצע) | | `test_retrieval_by_name.py` | python | בדיקת אחזור-לפי-שם (#52/RC-A) — מאמת ש`search_precedent_library`/`search_internal_decisions` מדרגים את ההחלטה עצמה (אגסי) מעל מי שמצטט אותה, + רגרסיות לשאילתות מהותיות. הרצה: `DOTENV_PATH=/home/chaim/.env DATA_DIR=.../data mcp-server/.venv/bin/python scripts/test_retrieval_by_name.py` (exit 0 = עבר). | ידני אחרי שינוי שכבת חיפוש | | `fu2b_reconcile_internal_case_numbers.py` | python | **FU-2b (GAP-07/08) — תיאום `case_number` של `internal_committee`** מציטוט-מלא למספר-בסיס קנוני (X1: trim·prefix-strip·`/`→`-`, חודש נשמר). דטרמיניסטי (token יחיד; 0/>1 → flag). `--dry-run` (ברירת-מחדל) מפיק טבלת-תיאום ל-`data/audit/fu2b-reconciliation-*.{csv,md}` עם flags (DUP_CHECK / PROC_MISMATCH / MISMATCH). 
`--apply --approved ` מגבה ואז מעדכן רק שורות שאושרו ע"י היו"ר. scope: internal בלבד (external → #68). FK-safe. | חד-פעמי, **chair-gated** (apply רק אחרי אישור דפנה) | +| `fu2c_reconcile_external_case_numbers.py` | python | **FU-2c (GAP-08, #68) — תיאום `case_number` של פסיקה חיצונית** (`source_kind <> internal_committee`) מציטוט-מלא לצורה קנונית **מציין-הליך + docket** (החלטת-יו"ר 2026-05-31, Option A: `/` נשמר, *לא* `-`; תואם db.py:369 ו-INV-ID2). דטרמיניסטי (designator+docket; 0/>1 docket → flag). `--dry-run` (ברירת-מחדל) מפיק `data/audit/fu2c-reconciliation-*.{csv,md}` עם flags (MISMATCH / NO_CITATION / CIT_NO_DOCKET / DESIG_MISMATCH / DUP_CHECK). `--apply --approved ` מגבה ואז מעדכן שורות לא-חוסמות (כולל ADVISORY/NO_CITATION). `--overrides ` (id,proposed_canonical,reason) פותח שורות-חוסמות בהכרעת-יו"ר מפורשת (למשל פס"ד מאוחד — ראה `data/audit/fu2c-overrides.csv` לרשומת לויתן/קלמנוביץ). לוגיקת-החילוץ + פיצול flags אומתו offline על 24 רשומות. scope: external בלבד (internal = FU-2b). FK-safe. | חד-פעמי, **chair-gated** (apply רק אחרי אישור דפנה) | | `auto-sync-cases.sh` | bash | סנכרון תיקי ערר ל-Gitea — רץ כל דקה | `* * * * *` (cron) | | `backup-db.sh` | bash | גיבוי PostgreSQL יומי ל-`data/backups/` (gzip) | לתזמן: `0 2 * * *` | | `restore-db.sh` | bash | שחזור DB מגיבוי (companion ל-backup-db.sh) | ידני | diff --git a/scripts/fu2c_reconcile_external_case_numbers.py b/scripts/fu2c_reconcile_external_case_numbers.py new file mode 100644 index 0000000..8bf9e7e --- /dev/null +++ b/scripts/fu2c_reconcile_external_case_numbers.py @@ -0,0 +1,340 @@ +#!/usr/bin/env python3 +"""FU-2c — reconcile external case_law case_number → canonical designator+docket. + +External court precedents stored the FULL citation (proceeding designator + +docket + district qualifier + parties + Nevo date) inside case_number. 
The +canonical form (chair decision 2026-05-31, Option A; consistent with db.py:369 +comment "עע\"מ 3975/22" and INV-ID2/X1) is **proceeding-designator + docket +only**, with court-docket '/' PRESERVED (court convention, NOT normalized to +'-'). Parties / court-name / district-qualifier / Nevo-date live in +citation_formatted (the display field), never in the identifier. + + Current : עמ"נ (מינהליים ת"א) 14306-09-23 עדינה בולקינד נ' הוועדה... (נבו 11.2.2024) + Target : עמ"נ 14306-09-23 (same leading designator + docket; '/' kept where present) + +DETERMINISTIC — no LLM. The docket is the single docket-shaped token; the +designator is the leading proceeding token (kept as-is, never rewritten). +0 dockets, >1 distinct dockets, or a citation whose docket disagrees with +case_number are BLOCKING flags: surfaced for chair review, never guessed/auto-applied. +An empty citation is flagged too, but only as ADVISORY — the case_number fix is +still deterministic and is applied. + +Scope: source_kind <> 'internal_committee' (external_upload / cited_only / +nevo_seed) — the external partial-unique space (X1 §2, uq_case_law_external_number). +internal_committee was handled by FU-2b. FK-safe: all case_law FKs reference +case_law.id (UUID), not case_number. + +Usage (must use the mcp-server venv — asyncpg/pgvector vendored there): + PY=/home/chaim/legal-ai/mcp-server/.venv/bin/python + + # Dry-run (default): builds the reconciliation table for chair review. 
+ $PY scripts/fu2c_reconcile_external_case_numbers.py + + # Apply ONLY chair-approved rows (after Dafna's review), backup first: + $PY scripts/fu2c_reconcile_external_case_numbers.py --apply \ + --approved data/audit/fu2c-reconciliation-.csv +""" +from __future__ import annotations + +import argparse +import asyncio +import csv +import os +import re +import sys +from collections import Counter +from datetime import datetime, timezone +from pathlib import Path + +REPO_ROOT = Path(__file__).resolve().parent.parent +sys.path.insert(0, str(REPO_ROOT / "mcp-server" / "src")) + +if "POSTGRES_URL" not in os.environ: + os.environ["POSTGRES_URL"] = ( + f"postgres://{os.environ.get('POSTGRES_USER','legal_ai')}:" + f"{os.environ.get('POSTGRES_PASSWORD','')}@" + f"{os.environ.get('POSTGRES_HOST','127.0.0.1')}:" + f"{os.environ.get('POSTGRES_PORT','5433')}/" + f"{os.environ.get('POSTGRES_DB','legal_ai')}" + ) + +AUDIT_DIR = REPO_ROOT / "data" / "audit" + +# Docket shapes: district-court admin (NNNNN-NN-NN) or classic court (NNNN/YY). +_DOCKET_RE = re.compile(r"\d{3,6}-\d{1,2}-\d{2}|\d{1,6}/\d{2}") +# Layout / RTL-LTR / bracket marks that wrap citations from OCR + LLM output. +_MARKS = dict.fromkeys(map(ord, "‎‏‪‫‬‭‮"), None) +# Gershayim variants: ASCII " and Hebrew punctuation ״ (U+05F4), ' and ׳ (U+05F3). + + +def _clean(s: str) -> str: + """Drop RTL/LTR marks and the ‏(‏ ‏)‏ wrappers OCR sprays around qualifiers.""" + return (s or "").translate(_MARKS).replace("‏", "").strip() + + +def _designator_eq(a: str, b: str) -> bool: + """בר\"מ ≡ בר\"ם (final-mem variant); normalize gershayim before compare.""" + norm = lambda x: (x or "").replace("״", '"').replace("׳", "'").replace("ם", "מ").strip() + return norm(a) == norm(b) + + +def _extract(case_number: str) -> tuple[str | None, str | None, str]: + """Return (designator, docket, flag). flag ∈ {OK, NO_DOCKET, MULTI_DOCKET}. + + designator = leading proceeding token (בג"ץ / עע"מ / בר"מ / עמ"נ / עת"מ / ע"א …). 
+ docket = the single docket-shaped token ('/' preserved per chair decision). + Canonical = f"{designator} {docket}". 0 or >1 distinct dockets → flag (chair). + """ + cn = _clean(case_number) + dockets = _DOCKET_RE.findall(cn) + distinct = list(dict.fromkeys(dockets)) + if not distinct: + return None, None, "NO_DOCKET" + docket = distinct[0] + m = _DOCKET_RE.search(cn) + prefix = cn[: m.start()].strip() + # designator is the first whitespace token of the prefix (a parenthesised + # district qualifier, if any, comes after it and is dropped). + designator = prefix.split()[0] if prefix.split() else "" + flag = "OK" if len(distinct) == 1 else "MULTI_DOCKET" + return designator or None, docket, flag + + +def _citation_docket(citation_formatted: str) -> str | None: + """First docket-shaped token inside the formatted citation, if any.""" + m = _DOCKET_RE.search(_clean(citation_formatted)) + return m.group() if m else None + + +def _consistency(docket: str | None, citation_formatted: str) -> str: + """OK if case_number docket matches citation docket; MISMATCH if they differ; + NO_CITATION if citation empty; CIT_NO_DOCKET if citation has no docket token.""" + if not _clean(citation_formatted): + return "NO_CITATION" + if not docket: + return "NO_DOCKET" + cd = _citation_docket(citation_formatted) + if cd is None: + return "CIT_NO_DOCKET" + return "OK" if cd == docket else "MISMATCH" + + +async def _build_reconciliation() -> list[dict]: + from legal_mcp.services import db + pool = await db.get_pool() + async with pool.acquire() as conn: + rows = await conn.fetch( + "SELECT id, case_number, source_kind, coalesce(court,'') AS court, " + "coalesce(citation_formatted,'') AS cf " + "FROM case_law WHERE source_kind <> 'internal_committee' " + "ORDER BY source_kind, case_number") + out: list[dict] = [] + for r in rows: + designator, docket, flag = _extract(r["case_number"]) + canonical = f"{designator} {docket}" if designator and docket else (docket or "") + cons = 
_consistency(docket, r["cf"]) + cd = _citation_docket(r["cf"]) + cit_desig = _clean(r["cf"]).split()[0] if _clean(r["cf"]) else "" + desig_flag = "" + if designator and cit_desig and cd == docket and not _designator_eq(designator, cit_desig): + desig_flag = "DESIG_MISMATCH" + changes = bool(canonical) and canonical != _clean(r["case_number"]) + out.append({ + "id": str(r["id"]), + "source_kind": r["source_kind"], + "current_case_number": r["case_number"], + "proposed_canonical": canonical, + "court": r["court"], + "citation_formatted": r["cf"], + "extract_flag": flag, + "consistency": cons, + "desig_flag": desig_flag, + "will_change": "yes" if changes else "no", + }) + canon_counts = Counter(d["proposed_canonical"] for d in out if d["proposed_canonical"]) + for d in out: + d["dup_check"] = "DUP_CHECK" if (d["proposed_canonical"] and canon_counts[d["proposed_canonical"]] > 1) else "" + return out + + +def _ts() -> str: + return datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ") + + +# BLOCKING flags forbid auto-apply (identity is uncertain/corrupt — chair must adjudicate). +# ADVISORY flags are surfaced for review but do NOT block the case_number fix, because the +# docket extraction is still deterministic and unambiguous (e.g. NO_CITATION = the display +# citation is missing, an orthogonal backfill gap — it does not make the docket wrong). 
+def _is_blocking(r: dict) -> bool: + return bool(r["extract_flag"] != "OK" or r["consistency"] in {"MISMATCH", "CIT_NO_DOCKET"} + or r["desig_flag"] or r["dup_check"]) + + +def _is_flagged(r: dict) -> bool: + """Anything worth showing the chair (blocking + advisory NO_CITATION).""" + return _is_blocking(r) or r["consistency"] == "NO_CITATION" + + +def _write_table(rows: list[dict], ts: str) -> tuple[Path, Path]: + AUDIT_DIR.mkdir(parents=True, exist_ok=True) + csv_path = AUDIT_DIR / f"fu2c-reconciliation-{ts}.csv" + md_path = AUDIT_DIR / f"fu2c-reconciliation-{ts}.md" + cols = ["id", "source_kind", "current_case_number", "proposed_canonical", "court", + "citation_formatted", "extract_flag", "consistency", "desig_flag", "dup_check", "will_change"] + with csv_path.open("w", newline="", encoding="utf-8") as f: + w = csv.DictWriter(f, fieldnames=cols) + w.writeheader() + w.writerows(rows) + changing = [r for r in rows if r["will_change"] == "yes"] + flagged = [r for r in rows if _is_flagged(r)] + with md_path.open("w", encoding="utf-8") as f: + f.write(f"# FU-2c — טבלת-תיאום מזהים חיצוניים (case_law non-internal) — {ts}\n\n") + f.write(f"- סה\"כ רשומות: {len(rows)}\n- ישתנו: {len(changing)}\n- מסומנות לסקירה: {len(flagged)}\n\n") + f.write("## דורש הכרעת-יו\"ר (flags)\n\n") + f.write("BLOCK = חוסם auto-apply (זהות לא-ודאית); ADVISORY = תיקון case_number בטוח, פער נלווה.\n\n") + f.write("| current_case_number | proposed_canonical | flags | gate |\n|---|---|---|---|\n") + for r in flagged: + fl = " ".join(x for x in [ + r["extract_flag"] if r["extract_flag"] != "OK" else "", + r["consistency"] if r["consistency"] in {"MISMATCH", "NO_CITATION", "CIT_NO_DOCKET"} else "", + r["desig_flag"], r["dup_check"]] if x) + gate = "BLOCK" if _is_blocking(r) else "ADVISORY" + f.write(f"| {r['current_case_number'][:55]} | {r['proposed_canonical']} | {fl} | {gate} |\n") + f.write("\n## שינויים שיוחלו ב-apply (will_change=yes, לא-חוסם — כולל ADVISORY)\n\n") + f.write("| 
current_case_number | → proposed_canonical |\n|---|---|\n") + for r in changing: + if _is_blocking(r): + continue + f.write(f"| {r['current_case_number'][:60]} | {r['proposed_canonical']} |\n") + return csv_path, md_path + + +def _load_overrides(overrides_csv: Path | None) -> dict[str, dict]: + """Chair per-record adjudication of BLOCKING rows. id → {canonical, citation, reason}. + + Columns: id, proposed_canonical, reason (required); citation_formatted (optional — + when present, the record's display citation is reconciled too). Each row is an + explicit, audited chair decision that unblocks one record (e.g. a consolidated + judgment whose lead docket the deterministic extractor cannot choose on its own).""" + if overrides_csv is None: + return {} + out: dict[str, dict] = {} + with overrides_csv.open(encoding="utf-8") as f: + for r in csv.DictReader(f): + cid, canon = r.get("id"), (r.get("proposed_canonical") or "").strip() + if cid and canon: + out[cid] = { + "canonical": canon, + "citation": (r.get("citation_formatted") or "").strip(), + "reason": (r.get("reason") or "").strip(), + } + return out + + +async def _apply(approved_csv: Path, overrides_csv: Path | None, ts: str) -> dict: + from legal_mcp.services import db + overrides = _load_overrides(overrides_csv) + with approved_csv.open(encoding="utf-8") as f: + all_rows = [r for r in csv.DictReader(f) if r.get("will_change") == "yes"] + # Decide the target per row. SCOPE: apply only to source_kind='external_upload' + # (the reviewed FU-2c target, task #68) OR an explicit chair override. cited_only / + # nevo_seed stay in the reconciliation VIEW (so DUP_CHECK spans the full external + # unique space) but are NOT migrated here — they are a separate, unreviewed category. + # Per row: non-blocking → proposed_canonical (NO_CITATION is advisory, still safe); + # blocking → only via override. An override may also carry citation_formatted. 
+ plan: list[dict] = [] # {id, canonical, citation|None, source} + skipped_blocking: list[str] = [] + skipped_out_of_scope = 0 + for r in all_rows: + in_scope = r.get("source_kind") == "external_upload" or r["id"] in overrides + if not in_scope: + skipped_out_of_scope += 1 + continue + blocking = _is_blocking({ + "extract_flag": r.get("extract_flag", "OK"), "consistency": r.get("consistency", ""), + "desig_flag": r.get("desig_flag", ""), "dup_check": r.get("dup_check", ""), + }) + if r["id"] in overrides: + ov = overrides[r["id"]] + plan.append({"id": r["id"], "canonical": ov["canonical"], + "citation": ov["citation"] or None, "source": f"override:{ov['reason']}"}) + elif not blocking and r.get("proposed_canonical"): + plan.append({"id": r["id"], "canonical": r["proposed_canonical"], "citation": None, "source": "auto"}) + elif blocking: + skipped_blocking.append(r["id"]) + if not plan: + return {"applied": 0, "note": "no applicable rows", "skipped_blocking": skipped_blocking, + "skipped_out_of_scope": skipped_out_of_scope} + AUDIT_DIR.mkdir(parents=True, exist_ok=True) + backup = AUDIT_DIR / f"fu2c-backup-{ts}.csv" + pool = await db.get_pool() + applied = 0 + cit_applied = 0 + collisions: list[str] = [] + with backup.open("w", newline="", encoding="utf-8") as bf: + bw = csv.writer(bf) + bw.writerow(["id", "old_case_number", "new_case_number", "old_citation", "new_citation", "source"]) + async with pool.acquire() as conn: + for p in plan: + rec = await conn.fetchrow( + "SELECT case_number, coalesce(citation_formatted,'') AS cf FROM case_law WHERE id=$1", p["id"]) + if rec is None: + continue + # Pre-flight collision guard: the external unique index spans ALL + # source_kind<>'internal_committee'. Skip if another row already holds + # the target value, rather than letting the UPDATE raise (e.g. a cited_only + # reference that pre-existed the uploaded precedent → needs dedup, not migrate). 
+ if p["canonical"] != rec["case_number"]: + clash = await conn.fetchval( + "SELECT id FROM case_law WHERE case_number=$1 " + "AND source_kind <> 'internal_committee' AND id <> $2", p["canonical"], p["id"]) + if clash is not None: + collisions.append(f"{p['id']}→{p['canonical']} (clash {clash})") + continue + new_cit = p["citation"] if p["citation"] is not None else rec["cf"] + bw.writerow([p["id"], rec["case_number"], p["canonical"], rec["cf"], new_cit, p["source"]]) + if p["citation"] is not None: + await conn.execute( + "UPDATE case_law SET case_number=$2, citation_formatted=$3 WHERE id=$1 " + "AND source_kind <> 'internal_committee'", + p["id"], p["canonical"], p["citation"]) + cit_applied += 1 + else: + await conn.execute( + "UPDATE case_law SET case_number=$2 WHERE id=$1 " + "AND source_kind <> 'internal_committee'", + p["id"], p["canonical"]) + applied += 1 + return {"applied": applied, "citations_fixed": cit_applied, "overrides": len(overrides), + "skipped_blocking": skipped_blocking, "skipped_out_of_scope": skipped_out_of_scope, + "collisions": collisions, "backup": str(backup)} + + +async def main() -> int: + parser = argparse.ArgumentParser(description="FU-2c external case_number reconciliation") + parser.add_argument("--apply", action="store_true", help="apply approved changes (default: dry-run)") + parser.add_argument("--approved", type=str, help="path to chair-approved CSV (required with --apply)") + parser.add_argument("--overrides", type=str, help="optional CSV (id,proposed_canonical,reason) of " + "chair-adjudicated BLOCKING rows to unblock (e.g. 
consolidated-judgment lead docket)") + args = parser.parse_args() + ts = _ts() + + if not args.apply: + rows = await _build_reconciliation() + csv_path, md_path = _write_table(rows, ts) + changing = sum(1 for r in rows if r["will_change"] == "yes") + flagged = sum(1 for r in rows if _is_flagged(r)) + print(f"DRY-RUN: {len(rows)} rows | will_change={changing} | flagged={flagged}") + print(f" table: {md_path}") + print(f" csv: {csv_path}") + print("Review the table with the chair, then run --apply --approved .") + return 0 + + if not args.approved: + print("ERROR: --apply requires --approved (the chair-reviewed table).", file=sys.stderr) + return 2 + result = await _apply(Path(args.approved), Path(args.overrides) if args.overrides else None, ts) + print(f"APPLIED: {result}") + return 0 + + +if __name__ == "__main__": + sys.exit(asyncio.run(main()))