# NOTE(review): this patch arrived with its line breaks collapsed. The line breaks and the
# indentation of context lines below were reconstructed from the hunk counts — confirm before applying.
diff --git a/scripts/style_lesson_panel.py b/scripts/style_lesson_panel.py
index eaa1da5..2533298 100644
--- a/scripts/style_lesson_panel.py
+++ b/scripts/style_lesson_panel.py
@@ -199,6 +199,23 @@ async def _resolve_corpus_id(decision_number: str) -> str | None:
     return str(row["id"]) if row else None
 
 
+def _norm(text: str) -> str:
+    """Dedup key for a lesson: whitespace runs become one space, ends stripped; None -> empty."""
+    return " ".join((text or "").split())
+
+
+async def _existing_lesson_texts(corpus_id: str) -> set[str]:
+    """Return the _norm()-normalized lesson_text of every decision_lessons row whose
+    style_corpus_id is corpus_id (no filter on source); main() uses it to skip repeats."""
+    pool = await db.get_pool()
+    async with pool.acquire() as conn:
+        rows = await conn.fetch(
+            "SELECT lesson_text FROM decision_lessons WHERE style_corpus_id = $1",
+            UUID(corpus_id),
+        )
+    return {_norm(r["lesson_text"]) for r in rows}
+
+
 async def _load_pair(args) -> dict | None:
     if args.pair_id:
         return await db.get_draft_final_pair(UUID(args.pair_id))
@@ -281,6 +298,19 @@ async def main(args: argparse.Namespace) -> int:
         return 1
 
     keeps = [r for r in results if r["_verdict"] == "agree_yes" and _lesson_text(r["_change"])]
+
+    # Dedup before writing: skip keeps whose normalized text is already stored for this corpus
+    # (any source) or was already seen in this run; only `fresh` is backed up and inserted.
+    existing = await _existing_lesson_texts(corpus_id)
+    fresh, seen = [], set(existing)
+    for r in keeps:
+        n = _norm(_lesson_text(r["_change"]))
+        if n in seen:
+            continue
+        seen.add(n)
+        fresh.append(r)
+    skipped_dup = len(keeps) - len(fresh)
+
     ts = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
     audit = Path(__file__).resolve().parent.parent / "data" / "audit"
     audit.mkdir(parents=True, exist_ok=True)
@@ -288,12 +318,12 @@ async def main(args: argparse.Namespace) -> int:
     with backup.open("w", encoding="utf-8", newline="") as f:
         w = csv.writer(f)
         w.writerow(["corpus_id", "category", "source", "lesson_text"])
-        for r in keeps:
+        for r in fresh:
             w.writerow([corpus_id, _category(r["_change"]), "panel:deepseek+gemini",
                         _lesson_text(r["_change"])])
 
     written = 0
-    for r in keeps:
+    for r in fresh:
         await db.add_decision_lesson(
             UUID(corpus_id),
             lesson_text=_lesson_text(r["_change"]),
@@ -305,8 +335,8 @@ async def main(args: argparse.Namespace) -> int:
 
     chair = cc["split"] + cc["incomplete"]
     print(f"\nAPPLIED (reversible): wrote {written} decision_lesson proposals "
-          f"(source=panel:deepseek+gemini) · {chair} escalated to chair · "
-          f"{len(substance)} substance skipped")
+          f"(source=panel:deepseek+gemini) · {skipped_dup} כפילויות דולגו · "
+          f"{chair} escalated to chair · {len(substance)} substance skipped")
     print(f"backup → {backup}")
     print("NB: fold into SKILL.md / legal-decision-lessons.md stays a manual chair gate (INV-G10).")
     return 0