From 2e20e27e171f9638e1d7d8cb6d5646990a6c8dbb Mon Sep 17 00:00:00 2001 From: Chaim Date: Sat, 6 Jun 2026 18:10:01 +0000 Subject: [PATCH] =?UTF-8?q?feat(style-acq=20T1-T3):=20=D7=A7=D7=95=D7=A8?= =?UTF-8?q?=D7=A4=D7=95=D7=A1-=D7=93=D7=95=D7=92=D7=9E=D7=90=D7=95=D7=AA?= =?UTF-8?q?=20=D7=A9=D7=9C=20=D7=93=D7=A4=D7=A0=D7=94=20=D7=9C=D7=9B=D7=95?= =?UTF-8?q?=D7=AA=D7=91=20(style=5Fexemplars)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ממלא את ערוץ-הדוגמאות (B) של מערכת רכישת-הסגנון: הכותב מאחזר פסקאות-בלוק אמיתיות של דפנה בזמן כתיבה, ממוקדות section+outcome+practice_area. T1 — תשתית + backfill: - SCHEMA_V27: טבלת style_exemplars (purpose-built — בלי תיקים מזויפים בשרשרת decision_paragraphs). decision_number/source/section/outcome/practice_area+embedding. - db: insert/delete/search_style_exemplars + count_style_exemplars. - scripts/backfill_style_exemplars.py: מפצל קורפוס דפנה (style_corpus + internal_committee) לסעיפים→פסקאות, embed, שמירה. אידמפוטנטי, dry-run/apply. T2 — אחזור ממוקד: - search_style_exemplars(section, outcome, practice_area) — section=hard filter, outcome/practice_area=soft. block_writer._build_precedents_context ממפה block→section ומאחזר (ראשי), לצד הנתיב הישן (משלים). T3 — contrastive/adapt: - הדוגמאות מתויגות "מבנה/קול בלבד — התאם, אל תעתיק תוכן"; פסקה מלאה (1100 תווים). INV-LRN5 (טוהר — סגנון בלבד). G11. הרצת backfill --apply בנפרד. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- .../src/legal_mcp/services/block_writer.py | 28 +++- mcp-server/src/legal_mcp/services/db.py | 104 +++++++++++++- scripts/SCRIPTS.md | 1 + scripts/backfill_style_exemplars.py | 131 ++++++++++++++++++ 4 files changed, 261 insertions(+), 3 deletions(-) create mode 100644 scripts/backfill_style_exemplars.py diff --git a/mcp-server/src/legal_mcp/services/block_writer.py b/mcp-server/src/legal_mcp/services/block_writer.py index ede0ad0..fd8c8d0 100644 --- a/mcp-server/src/legal_mcp/services/block_writer.py +++ b/mcp-server/src/legal_mcp/services/block_writer.py @@ -725,19 +725,43 @@ async def _build_precedents_context( style_parts: list[str] = [] caselaw_parts: list[str] = [] case_law_ids: list[str] = [] + # block → golden-ratio section, for targeted exemplar retrieval (T2) + _BLOCK_SECTION = { + "block-vav": "background", "block-zayin": "claims", + "block-yod": "discussion", "block-yod-alef": "summary", + } try: case = await db.get_case(case_id) case_number = case.get("case_number", "") if case else "" subject = case.get("subject", "") if case else "" + practice_area = case.get("practice_area", "") if case else "" + decision = await db.get_decision_by_case(case_id) + outcome = (decision or {}).get("outcome", "") query = f"דיון משפטי בנושא {subject}" if subject else "דיון משפטי ועדת ערר" query_emb = await embeddings.embed_query(query) + section = _BLOCK_SECTION.get(block_id) - # Stream 1: paragraph_embeddings — Dafna's own prose (STYLE exemplars, not content) + # Stream 1a (PRIMARY): Dafna's own block-level prose from her corpus + # (style_exemplars) — matched by section + outcome + practice_area (T2/T3). 
+ if section: + exemplars = await db.search_style_exemplars( + query_embedding=query_emb, section=section, + outcome=outcome or None, practice_area=practice_area or None, limit=6, + ) + exemplars = [e for e in exemplars if e.get("decision_number", "") != case_number] + for e in exemplars[:4]: + style_parts.append( + f"[דוגמת-סגנון (מבנה/קול בלבד — התאם, אל תעתיק תוכן) — " + f"{e.get('decision_number', '?')}, {section}, " + f"outcome={e.get('outcome') or '—'}]\n{e['paragraph_text'][:1100]}" + ) + + # Stream 1b: paragraphs from pipeline cases (legacy path; may be empty) para_results = await db.search_similar_paragraphs( query_embedding=query_emb, limit=10, block_type="block-yod", ) para_results = [r for r in para_results if r.get("case_number", "") != case_number] - for r in para_results[:4]: + for r in para_results[:2]: style_parts.append( f"[דוגמת-סגנון — החלטת {r.get('case_number', '?')} " f"{r.get('case_title', '')}, בלוק {r.get('block_type', '')}]\n{r['content'][:500]}" diff --git a/mcp-server/src/legal_mcp/services/db.py b/mcp-server/src/legal_mcp/services/db.py index 34ad703..1c10ca8 100644 --- a/mcp-server/src/legal_mcp/services/db.py +++ b/mcp-server/src/legal_mcp/services/db.py @@ -1204,6 +1204,28 @@ CREATE INDEX IF NOT EXISTS idx_draft_final_pairs_case ON draft_final_pairs(case_ CREATE INDEX IF NOT EXISTS idx_draft_final_pairs_status ON draft_final_pairs(status); """ +SCHEMA_V27_SQL = """ +-- style_exemplars (T1-T3): block-level paragraphs from Dafna's OWN decisions +-- (style_corpus + internal_committee finals), embedded for retrieval as +-- style exemplars at write-time. Purpose-built so we DON'T fabricate synthetic +-- cases just to reuse decision_paragraphs. INV-LRN5: style material only — the +-- writer is told to adapt structure/voice, copy only boilerplate, never substance. 
+CREATE TABLE IF NOT EXISTS style_exemplars ( + id UUID PRIMARY KEY DEFAULT uuid_generate_v4(), + decision_number TEXT DEFAULT '', + source TEXT DEFAULT '', -- style_corpus | internal_committee + practice_area TEXT DEFAULT '', + outcome TEXT DEFAULT '', -- rejection | partial_acceptance | full_acceptance | '' + section TEXT DEFAULT 'other', -- background | claims | discussion | summary | other + paragraph_text TEXT NOT NULL, + word_count INTEGER DEFAULT 0, + embedding vector(1024), + created_at TIMESTAMPTZ DEFAULT now() +); +CREATE INDEX IF NOT EXISTS idx_style_exemplars_section ON style_exemplars(section); +CREATE INDEX IF NOT EXISTS idx_style_exemplars_decision ON style_exemplars(decision_number, source); +""" + async def _run_schema_migrations(pool: asyncpg.Pool) -> None: async with pool.acquire() as conn: @@ -1234,7 +1256,8 @@ async def _run_schema_migrations(pool: asyncpg.Pool) -> None: await conn.execute(SCHEMA_V24_SQL) await conn.execute(SCHEMA_V25_SQL) await conn.execute(SCHEMA_V26_SQL) - logger.info("Database schema initialized (v1-v26)") + await conn.execute(SCHEMA_V27_SQL) + logger.info("Database schema initialized (v1-v27)") async def init_schema() -> None: @@ -2329,6 +2352,85 @@ async def list_draft_final_pairs(status: str | None = None, limit: int = 200) -> return [dict(r) for r in rows] +async def insert_style_exemplar( + decision_number: str, source: str, practice_area: str, outcome: str, + section: str, paragraph_text: str, word_count: int, embedding: list[float], +) -> None: + """Insert one block-level style exemplar (T1 backfill).""" + pool = await get_pool() + async with pool.acquire() as conn: + await conn.execute( + """INSERT INTO style_exemplars + (decision_number, source, practice_area, outcome, section, + paragraph_text, word_count, embedding) + VALUES ($1, $2, $3, $4, $5, $6, $7, $8)""", + decision_number, source, practice_area, outcome, section, + paragraph_text, word_count, str(embedding), + ) + + +async def 
delete_style_exemplars(decision_number: str, source: str) -> int:
+    """Delete every style_exemplars row of one (decision_number, source) pair.
+
+    Used by the backfill to clear a decision's exemplars before re-inserting them.
+    Returns the row count parsed from the command status string, or 0 when the
+    status cannot be parsed.
+    """
+    pool = await get_pool()
+    async with pool.acquire() as conn:
+        res = await conn.execute(
+            "DELETE FROM style_exemplars WHERE decision_number = $1 AND source = $2",
+            decision_number, source,
+        )
+    try:
+        return int(res.split()[-1])
+    except (ValueError, IndexError):
+        return 0
+
+
+async def search_style_exemplars(
+    query_embedding: list[float],
+    section: str | None = None,
+    outcome: str | None = None,
+    practice_area: str | None = None,
+    limit: int = 6,
+) -> list[dict]:
+    """Retrieve Dafna's own block-level paragraphs as STYLE exemplars (T2).
+
+    ``section`` is the hard filter. ``outcome`` and ``practice_area`` are soft
+    filters: they narrow the search to rows carrying the same tag (or no tag at
+    all), and they are dropped in a second query when they leave nothing, so on
+    their own they never zero-out the result.
+
+    Returns at most ``limit`` dicts with the keys decision_number, source,
+    section, outcome, practice_area, paragraph_text, word_count and score,
+    ordered by ``embedding <=> query`` (closest first); score is ``1 - distance``.
+    """
+    # Bind the vector as text, exactly like insert_style_exemplar does with
+    # str(embedding); a raw Python list is not bound the same way.
+    base_params: list = [str(query_embedding), limit]
+
+    # (column, value, is_soft) — column names are fixed literals, never user input.
+    hard = [("section", section, False)] if section else []
+    soft = [
+        (column, value, True)
+        for column, value in (("outcome", outcome), ("practice_area", practice_area))
+        if value
+    ]
+    # First attempt: hard + soft filters. Fallback: the hard filter alone.
+    attempts = [hard + soft, hard] if soft else [hard]
+
+    pool = await get_pool()
+    rows: list = []
+    async with pool.acquire() as conn:
+        for filters in attempts:
+            params = list(base_params)
+            clauses = []
+            for column, value, is_soft in filters:
+                params.append(value)
+                ref = f"${len(params)}"
+                clauses.append(
+                    f"({column} = {ref} OR {column} = '')" if is_soft else f"{column} = {ref}"
+                )
+            where = f"WHERE {' AND '.join(clauses)}" if clauses else ""
+            sql = f"""
+                SELECT decision_number, source, section, outcome, practice_area,
+                       paragraph_text, word_count,
+                       1 - (embedding <=> $1) AS score
+                FROM style_exemplars
+                {where}
+                ORDER BY embedding <=> $1
+                LIMIT $2
+            """
+            rows = await conn.fetch(sql, *params)
+            if rows:
+                break
+    return [dict(r) for r in rows]
+
+
+async def count_style_exemplars() -> dict:
+    """Coverage check for the backfill."""
+    pool = await get_pool()
+    async with pool.acquire() as conn:
+        total = await conn.fetchval("SELECT count(*) FROM style_exemplars")
+        by_section = await conn.fetch(
+            "SELECT 
section, count(*) AS n FROM style_exemplars GROUP BY section ORDER BY n DESC" + ) + decisions = await conn.fetchval( + "SELECT count(DISTINCT decision_number) FROM style_exemplars" + ) + return {"total": total, "decisions": decisions, "by_section": [dict(r) for r in by_section]} + + async def upsert_style_pattern( pattern_type: str, pattern_text: str, diff --git a/scripts/SCRIPTS.md b/scripts/SCRIPTS.md index cca23bc..76f4ef3 100644 --- a/scripts/SCRIPTS.md +++ b/scripts/SCRIPTS.md @@ -45,6 +45,7 @@ | `backfill_multimodal_precedents.py` | python | Backfill voyage-multimodal-3 page embeddings על רשומות `case_law` (external_upload + internal_committee) שחסרות `precedent_image_embeddings`. בונה אינדקס קבצים מ-`data/precedent-library/` ו-`data/internal-decisions/`, מנסה התאמה לפי tokens של מספרי תיק (כולל parts-match לפורמטים שונים של Nevo doc-id). מדלג על רשומות בלי קובץ-מקור או עם MD בלבד (PyMuPDF לא מרנדר MD). תומך `--dry-run` (default) / `--apply` / `--only external_upload\|internal_committee` / `--limit N`. רץ בקונטיינר (יש `/data` + Voyage env). **הופעל 2026-05-26**: 70 חסרים → 26 backfilled (503 pages, ~$0.21 voyage tokens), 44 אין-קובץ-מקור. ניתן להריץ שוב אחרי שיועלו עוד PDF/DOCX לספרייה | ידני | | `monitor_halacha_quality.py` | python | מנטר איכות חילוץ הלכות. בודק drift של `avg(confidence)` בין baseline היסטורי לחלון אחרון. מחזיר JSON מטריקות + alert ב-stderr אם drift > threshold (ברירת מחדל 5%). 2 סדרות: trusted (approved+published) ו-all_extracted. תומך `--window N` / `--threshold X` / `--min-sample N` / `--silent` / `--exit-on-alert`. רץ ב-container או מקומית עם `mcp-server/.venv` (אין תלות ב-LLM, רק SQL). **תזמון מומלץ**: `0 8 * * 1` (יום ראשון 08:00, שבועי) | `0 8 * * 1` (לתזמן) | | `audit_training_corpus.py` | python | audit של `style_corpus` — לכל החלטה: שדות מטא-דאטה מאוכלסים (`summary`/`outcome`/`key_principles`/`appeal_subtype`/`subject_categories`), קישור ל-`documents` (FK + chunks + embeddings). 
מפיק `data/audit/corpus-YYYY-MM-DD.json` + summary בקונסול. דרוש `POSTGRES_URL` או POSTGRES_*. אין תלויות חיצוניות מלבד asyncpg. **רץ מהמכונה המקומית** (לא קונטיינר) — חיבור ישיר ל-Postgres :5433 | ידני / קדם-עבודה לפני enrichment של מטא-דאטה | +| `backfill_style_exemplars.py` | python | **T1 (style-acquisition)** — מאכלס `style_exemplars` מקורפוס דפנה (`style_corpus` + `internal_committee` chair=דפנה): מפצל לסעיפים (`chunker._split_into_sections`) → פסקאות (25-450 מילים) → embed (Voyage) → שמירה עם `section`/`outcome`/`practice_area`. מאפשר לכותב לאחזר פסקאות-בלוק אמיתיות של דפנה (T2/T3). מקור-סגנון בלבד (INV-LRN5). אידמפוטנטי (מנקה per-decision). `--dry-run` (default) / `--apply`. דורש POSTGRES_URL + Voyage. **רץ מקומית** (venv). | ידני (`python scripts/backfill_style_exemplars.py --apply`) | ## תיקיית `.archive/` — סקריפטים שהושלמו diff --git a/scripts/backfill_style_exemplars.py b/scripts/backfill_style_exemplars.py new file mode 100644 index 0000000..3d9f28f --- /dev/null +++ b/scripts/backfill_style_exemplars.py @@ -0,0 +1,131 @@ +#!/usr/bin/env python3 +"""T1 — אכלוס style_exemplars מקורפוס דפנה (style_corpus + internal_committee). + +מפצל כל החלטה של דפנה לסעיפים (chunker._split_into_sections), ומכל סעיף לפסקאות, +מטמיע (Voyage) ושומר ב-style_exemplars עם section/outcome/practice_area — כדי +שהכותב יוכל לאחזר פסקאות-בלוק אמיתיות של דפנה בזמן כתיבה (T2/T3). + +מקור-סגנון בלבד (INV-LRN5) — לא מהות. אידמפוטנטי: מנקה לכל decision לפני הכנסה. + +שימוש: + python3 scripts/backfill_style_exemplars.py # dry-run (סופר בלבד) + python3 scripts/backfill_style_exemplars.py --apply # מטמיע ושומר + +דורש POSTGRES_URL + מפתח Voyage בסביבה (כמו שאר ה-MCP). 
+"""
+from __future__ import annotations
+
+import argparse
+import asyncio
+import logging
+
+from legal_mcp.services import db, embeddings
+from legal_mcp.services.chunker import _split_into_sections
+
+logging.basicConfig(level=logging.INFO, format="%(message)s")
+log = logging.getLogger("backfill_exemplars")
+
+# chunker section_type → style_exemplars.section
+_SECTION_MAP = {
+    "facts": "background",
+    "appellant_claims": "claims",
+    "respondent_claims": "claims",
+    "legal_analysis": "discussion",
+    "conclusion": "summary",
+    "ruling": "summary",
+    "intro": "other",
+    "other": "other",
+}
+
+MIN_WORDS = 25  # skip tiny fragments
+MAX_WORDS = 450  # skip over-long blobs (likely un-split)
+MAX_PER_SECTION = 15
+
+
+def _paragraphs(section_text: str) -> list[str]:
+    """Split a section into paragraph units (blank-line separated; fall back to lines).
+
+    Keeps only units of MIN_WORDS..MAX_WORDS words (inclusive) and returns at most
+    MAX_PER_SECTION of them, in document order.
+    """
+    raw = [p.strip() for p in section_text.split("\n\n")]
+    if len(raw) <= 1:
+        # No blank-line separators at all: treat every line as a candidate unit.
+        raw = [p.strip() for p in section_text.split("\n")]
+    kept = [p for p in raw if MIN_WORDS <= len(p.split()) <= MAX_WORDS]
+    return kept[:MAX_PER_SECTION]
+
+
+async def _gather_sources() -> list[dict]:
+    """Collect the source decisions: style_corpus rows plus internal_committee case_law rows.
+
+    Each returned dict has decision_number, source, full_text, outcome and
+    practice_area; NULL columns become "". case_law rows are limited to those whose
+    chair_name matches the LIKE pattern below, and they carry no outcome.
+    """
+    pool = await db.get_pool()
+    rows: list[dict] = []
+    async with pool.acquire() as conn:
+        sc = await conn.fetch(
+            "SELECT decision_number, full_text, outcome, practice_area "
+            "FROM style_corpus WHERE full_text <> ''"
+        )
+        for r in sc:
+            rows.append({
+                "decision_number": r["decision_number"] or "",
+                "source": "style_corpus",
+                "full_text": r["full_text"],
+                "outcome": r["outcome"] or "",
+                "practice_area": r["practice_area"] or "",
+            })
+        ic = await conn.fetch(
+            "SELECT case_number, full_text, practice_area FROM case_law "
+            "WHERE source_kind = 'internal_committee' AND coalesce(chair_name,'') LIKE '%דפנה%' "
+            "AND coalesce(full_text,'') <> ''"
+        )
+        for r in ic:
+            rows.append({
+                "decision_number": r["case_number"] or "",
+                "source": "internal_committee",
+                "full_text": r["full_text"],
+                "outcome": "",
+                "practice_area": r["practice_area"] or "",
+            })
+    return rows
+
+
+async def main(apply: bool) -> None:
+    """Split every source decision into paragraph units and, with ``apply``, store them.
+
+    With ``apply=False`` (dry-run) only the per-decision and total paragraph counts
+    are logged. With ``apply=True`` each decision's paragraphs are embedded, the
+    rows already stored for that (decision_number, source) are cleared once, and
+    the new rows are inserted.
+
+    Raises:
+        RuntimeError: the embedder returned a different number of vectors than
+            the number of paragraphs it was given.
+    """
+    sources = await _gather_sources()
+    log.info("מקורות: %d החלטות של דפנה (style_corpus + internal_committee)", len(sources))
+
+    total_paras = 0
+    # (decision_number, source) keys already cleared during this run. Rows that
+    # share a key (e.g. several rows whose decision_number is empty) must not wipe
+    # the exemplars an earlier row of the same run has just inserted.
+    cleared: set[tuple[str, str]] = set()
+    for src in sources:
+        units: list[tuple[str, str]] = [  # (section, paragraph)
+            (_SECTION_MAP.get(section_type, "other"), para)
+            for section_type, section_text in _split_into_sections(src["full_text"])
+            for para in _paragraphs(section_text)
+        ]
+        if not units:
+            continue
+        total_paras += len(units)
+        log.info(" %-14s %-16s → %d פסקאות", src["source"], src["decision_number"], len(units))
+        if not apply:
+            continue
+
+        # Embed BEFORE deleting: if the embedding call fails, the rows currently
+        # stored for this decision are still there.
+        vecs = list(
+            await embeddings.embed_texts([para for _, para in units], input_type="document")
+        )
+        if len(vecs) != len(units):
+            # zip() would silently drop the unmatched paragraphs.
+            raise RuntimeError(
+                f"embed_texts returned {len(vecs)} vectors for {len(units)} paragraphs "
+                f"({src['source']} {src['decision_number']})"
+            )
+
+        key = (src["decision_number"], src["source"])
+        if key not in cleared:
+            await db.delete_style_exemplars(*key)
+            cleared.add(key)
+        for (section, para), vec in zip(units, vecs):
+            await db.insert_style_exemplar(
+                decision_number=src["decision_number"], source=src["source"],
+                practice_area=src["practice_area"], outcome=src["outcome"],
+                section=section, paragraph_text=para, word_count=len(para.split()),
+                embedding=vec,
+            )
+
+    if apply:
+        cov = await db.count_style_exemplars()
+        log.info("הושלם. style_exemplars: %s", cov)
+    else:
+        log.info("dry-run: %d פסקאות יוטמעו. הרץ --apply לביצוע.", total_paras)
+
+
+if __name__ == "__main__":
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--apply", action="store_true", help="embed + insert (default: dry-run)")
+    args = ap.parse_args()
+    asyncio.run(main(args.apply))