From c8344342a81a6f00c9a4a78275086ee8870e135a Mon Sep 17 00:00:00 2001 From: Chaim Date: Mon, 8 Jun 2026 10:57:57 +0000 Subject: [PATCH] =?UTF-8?q?fix(style-panel):=20idempotency=20+=20dedup=20?= =?UTF-8?q?=E2=80=94=20re-running=20--apply=20never=20duplicates=20lessons?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit style_lesson_panel.py: before writing 2/2-keep lessons, skip any whose normalized lesson_text already exists on the corpus (any source), and collapse duplicates within a run. Makes the run-learning button safe to click repeatedly (the curator may re-run the pipeline) — it converges instead of piling up duplicate decision_lessons. Verified on בל"מ 8126-03-25: re-running --apply with 7 existing lessons wrote 0 ("1 כפילויות דולגו"), count stayed 7. Invariants: INV-LRN1/G10 unchanged (proposals only, manual fold). Co-Authored-By: Claude Opus 4.8 (1M context) --- scripts/style_lesson_panel.py | 38 +++++++++++++++++++++++++++++++---- 1 file changed, 34 insertions(+), 4 deletions(-) diff --git a/scripts/style_lesson_panel.py b/scripts/style_lesson_panel.py index eaa1da5..2533298 100644 --- a/scripts/style_lesson_panel.py +++ b/scripts/style_lesson_panel.py @@ -199,6 +199,23 @@ async def _resolve_corpus_id(decision_number: str) -> str | None: return str(row["id"]) if row else None +def _norm(text: str) -> str: + """Normalize a lesson for dedup — collapse whitespace, strip.""" + return " ".join((text or "").split()) + + +async def _existing_lesson_texts(corpus_id: str) -> set[str]: + """Normalized lesson_texts already attached to this corpus (any source) — + so re-running --apply is idempotent and never duplicates a lesson.""" + pool = await db.get_pool() + async with pool.acquire() as conn: + rows = await conn.fetch( + "SELECT lesson_text FROM decision_lessons WHERE style_corpus_id = $1", + UUID(corpus_id), + ) + return {_norm(r["lesson_text"]) for r in rows} + + async def _load_pair(args) -> dict | None: if args.pair_id: return await db.get_draft_final_pair(UUID(args.pair_id)) @@ -281,6 +298,19 @@ async def main(args: argparse.Namespace) -> int: return 1 keeps = [r for r in results if r["_verdict"] == "agree_yes" and _lesson_text(r["_change"])] + + # Idempotency / dedup — skip keeps already attached to the corpus (any source), + # and collapse duplicates WITHIN this run. Re-running --apply writes nothing new. + existing = await _existing_lesson_texts(corpus_id) + fresh, seen = [], set(existing) + for r in keeps: + n = _norm(_lesson_text(r["_change"])) + if n in seen: + continue + seen.add(n) + fresh.append(r) + skipped_dup = len(keeps) - len(fresh) + ts = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ") audit = Path(__file__).resolve().parent.parent / "data" / "audit" audit.mkdir(parents=True, exist_ok=True) @@ -288,12 +318,12 @@ async def main(args: argparse.Namespace) -> int: with backup.open("w", encoding="utf-8", newline="") as f: w = csv.writer(f) w.writerow(["corpus_id", "category", "source", "lesson_text"]) - for r in keeps: + for r in fresh: w.writerow([corpus_id, _category(r["_change"]), "panel:deepseek+gemini", _lesson_text(r["_change"])]) written = 0 - for r in keeps: + for r in fresh: await db.add_decision_lesson( UUID(corpus_id), lesson_text=_lesson_text(r["_change"]), @@ -305,8 +335,8 @@ async def main(args: argparse.Namespace) -> int: chair = cc["split"] + cc["incomplete"] print(f"\nAPPLIED (reversible): wrote {written} decision_lesson proposals " - f"(source=panel:deepseek+gemini) · {chair} escalated to chair · " - f"{len(substance)} substance skipped") + f"(source=panel:deepseek+gemini) · {skipped_dup} כפילויות דולגו · " + f"{chair} escalated to chair · {len(substance)} substance skipped") print(f"backup → {backup}") print("NB: fold into SKILL.md / legal-decision-lessons.md stays a manual chair gate (INV-G10).") return 0