feat(style-acq T1-T3): קורפוס-דוגמאות של דפנה לכותב (style_exemplars)

ממלא את ערוץ-הדוגמאות (B) של מערכת רכישת-הסגנון: הכותב מאחזר פסקאות-בלוק אמיתיות של דפנה בזמן כתיבה, ממוקדות section+outcome+practice_area.

T1 — תשתית + backfill:
- SCHEMA_V27: טבלת style_exemplars (purpose-built — בלי תיקים מזויפים בשרשרת decision_paragraphs). decision_number/source/section/outcome/practice_area+embedding.
- db: insert/delete/search_style_exemplars + count_style_exemplars.
- scripts/backfill_style_exemplars.py: מפצל קורפוס דפנה (style_corpus + internal_committee) לסעיפים→פסקאות, embed, שמירה. אידמפוטנטי, dry-run/apply.

T2 — אחזור ממוקד:
- search_style_exemplars(section, outcome, practice_area) — section=hard filter, outcome/practice_area=soft. block_writer._build_precedents_context ממפה block→section ומאחזר (ראשי), לצד הנתיב הישן (משלים).

T3 — contrastive/adapt:
- הדוגמאות מתויגות "מבנה/קול בלבד — התאם, אל תעתיק תוכן"; פסקה מלאה (1100 תווים).

INV-LRN5 (טוהר — סגנון בלבד). G11. הרצת backfill --apply בנפרד.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-06 18:10:01 +00:00
parent a3451775fa
commit 2e20e27e17
4 changed files with 261 additions and 3 deletions
--- a/mcp-server/src/legal_mcp/services/block_writer.py
+++ b/mcp-server/src/legal_mcp/services/block_writer.py
@@ -725,19 +725,43 @@ async def _build_precedents_context(
    style_parts: list[str] = []
    caselaw_parts: list[str] = []
    case_law_ids: list[str] = []
+    # block → golden-ratio section, for targeted exemplar retrieval (T2)
+    _BLOCK_SECTION = {
+        "block-vav": "background", "block-zayin": "claims",
+        "block-yod": "discussion", "block-yod-alef": "summary",
+    }
    try:
        case = await db.get_case(case_id)
        case_number = case.get("case_number", "") if case else ""
        subject = case.get("subject", "") if case else ""
+        practice_area = case.get("practice_area", "") if case else ""
+        decision = await db.get_decision_by_case(case_id)
+        outcome = (decision or {}).get("outcome", "")
        query = f"דיון משפטי בנושא {subject}" if subject else "דיון משפטי ועדת ערר"
        query_emb = await embeddings.embed_query(query)
+        section = _BLOCK_SECTION.get(block_id)

-        # Stream 1: paragraph_embeddings — Dafna's own prose (STYLE exemplars, not content)
+        # Stream 1a (PRIMARY): Dafna's own block-level prose from her corpus
+        # (style_exemplars) — matched by section + outcome + practice_area (T2/T3).
+        if section:
+            exemplars = await db.search_style_exemplars(
+                query_embedding=query_emb, section=section,
+                outcome=outcome or None, practice_area=practice_area or None, limit=6,
+            )
+            exemplars = [e for e in exemplars if e.get("decision_number", "") != case_number]
+            for e in exemplars[:4]:
+                style_parts.append(
+                    f"[דוגמת-סגנון (מבנה/קול בלבד — התאם, אל תעתיק תוכן) — "
+                    f"{e.get('decision_number', '?')}, {section}, "
+                    f"outcome={e.get('outcome') or '—'}]\n{e['paragraph_text'][:1100]}"
+                )
+
+        # Stream 1b: paragraphs from pipeline cases (legacy path; may be empty)
        para_results = await db.search_similar_paragraphs(
            query_embedding=query_emb, limit=10, block_type="block-yod",
        )
        para_results = [r for r in para_results if r.get("case_number", "") != case_number]
-        for r in para_results[:4]:
+        for r in para_results[:2]:
            style_parts.append(
                f"[דוגמת-סגנון — החלטת {r.get('case_number', '?')} "
                f"{r.get('case_title', '')}, בלוק {r.get('block_type', '')}]\n{r['content'][:500]}"
--- a/mcp-server/src/legal_mcp/services/db.py
+++ b/mcp-server/src/legal_mcp/services/db.py
@@ -1204,6 +1204,28 @@ CREATE INDEX IF NOT EXISTS idx_draft_final_pairs_case ON draft_final_pairs(case_
 CREATE INDEX IF NOT EXISTS idx_draft_final_pairs_status ON draft_final_pairs(status);
 """

+# Schema migration v27: creates the style_exemplars table plus two indexes
+# (one on section, one on decision_number + source). Every statement uses
+# IF NOT EXISTS, so executing this script again is safe.
+# NOTE(review): no index on `embedding` is created here, so similarity
+# ordering over this table presumably scans all candidate rows — confirm that
+# is acceptable for the expected corpus size.
+SCHEMA_V27_SQL = """
+-- style_exemplars (T1-T3): block-level paragraphs from Dafna's OWN decisions
+-- (style_corpus + internal_committee finals), embedded for retrieval as
+-- style exemplars at write-time. Purpose-built so we DON'T fabricate synthetic
+-- cases just to reuse decision_paragraphs. INV-LRN5: style material only — the
+-- writer is told to adapt structure/voice, copy only boilerplate, never substance.
+CREATE TABLE IF NOT EXISTS style_exemplars (
+    id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
+    decision_number TEXT DEFAULT '',
+    source TEXT DEFAULT '',                 -- style_corpus | internal_committee
+    practice_area TEXT DEFAULT '',
+    outcome TEXT DEFAULT '',                -- rejection | partial_acceptance | full_acceptance | ''
+    section TEXT DEFAULT 'other',           -- background | claims | discussion | summary | other
+    paragraph_text TEXT NOT NULL,
+    word_count INTEGER DEFAULT 0,
+    embedding vector(1024),
+    created_at TIMESTAMPTZ DEFAULT now()
+);
+CREATE INDEX IF NOT EXISTS idx_style_exemplars_section ON style_exemplars(section);
+CREATE INDEX IF NOT EXISTS idx_style_exemplars_decision ON style_exemplars(decision_number, source);
+"""
+

 async def _run_schema_migrations(pool: asyncpg.Pool) -> None:
    async with pool.acquire() as conn:
@@ -1234,7 +1256,8 @@ async def _run_schema_migrations(pool: asyncpg.Pool) -> None:
        await conn.execute(SCHEMA_V24_SQL)
        await conn.execute(SCHEMA_V25_SQL)
        await conn.execute(SCHEMA_V26_SQL)
-    logger.info("Database schema initialized (v1-v26)")
+        await conn.execute(SCHEMA_V27_SQL)
+    logger.info("Database schema initialized (v1-v27)")


 async def init_schema() -> None:
@@ -2329,6 +2352,85 @@ async def list_draft_final_pairs(status: str | None = None, limit: int = 200) ->
    return [dict(r) for r in rows]


+async def insert_style_exemplar(
+    decision_number: str, source: str, practice_area: str, outcome: str,
+    section: str, paragraph_text: str, word_count: int, embedding: list[float],
+) -> None:
+    """Insert one block-level style exemplar (T1 backfill)."""
+    pool = await get_pool()
+    async with pool.acquire() as conn:
+        await conn.execute(
+            """INSERT INTO style_exemplars
+                   (decision_number, source, practice_area, outcome, section,
+                    paragraph_text, word_count, embedding)
+               VALUES ($1, $2, $3, $4, $5, $6, $7, $8)""",
+            decision_number, source, practice_area, outcome, section,
+            paragraph_text, word_count, str(embedding),
+        )
+
+
+async def delete_style_exemplars(decision_number: str, source: str) -> int:
+    """Idempotent backfill: clear a decision's exemplars before re-inserting."""
+    pool = await get_pool()
+    async with pool.acquire() as conn:
+        res = await conn.execute(
+            "DELETE FROM style_exemplars WHERE decision_number = $1 AND source = $2",
+            decision_number, source,
+        )
+    try:
+        return int(res.split()[-1])
+    except (ValueError, IndexError):
+        return 0
+
+
+async def search_style_exemplars(
+    query_embedding: list[float],
+    section: str | None = None,
+    outcome: str | None = None,
+    practice_area: str | None = None,
+    limit: int = 6,
+) -> list[dict]:
+    """Retrieve Dafna's own block-level paragraphs as STYLE exemplars (T2).
+    Filters by section (block) + optionally outcome/practice_area for the closest
+    match to the block being written. Soft filters: outcome/practice_area narrow but
+    never zero-out — section is the hard filter."""
+    pool = await get_pool()
+    conditions, params, idx = [], [query_embedding, limit], 3
+    if section:
+        conditions.append(f"section = ${idx}"); params.append(section); idx += 1
+    if outcome:
+        conditions.append(f"(outcome = ${idx} OR outcome = '')"); params.append(outcome); idx += 1
+    if practice_area:
+        conditions.append(f"(practice_area = ${idx} OR practice_area = '')"); params.append(practice_area); idx += 1
+    where = f"WHERE {' AND '.join(conditions)}" if conditions else ""
+    sql = f"""
+        SELECT decision_number, source, section, outcome, practice_area,
+               paragraph_text, word_count,
+               1 - (embedding <=> $1) AS score
+        FROM style_exemplars
+        {where}
+        ORDER BY embedding <=> $1
+        LIMIT $2
+    """
+    async with pool.acquire() as conn:
+        rows = await conn.fetch(sql, *params)
+    return [dict(r) for r in rows]
+
+
+async def count_style_exemplars() -> dict:
+    """Coverage check for the backfill."""
+    pool = await get_pool()
+    async with pool.acquire() as conn:
+        total = await conn.fetchval("SELECT count(*) FROM style_exemplars")
+        by_section = await conn.fetch(
+            "SELECT section, count(*) AS n FROM style_exemplars GROUP BY section ORDER BY n DESC"
+        )
+        decisions = await conn.fetchval(
+            "SELECT count(DISTINCT decision_number) FROM style_exemplars"
+        )
+    return {"total": total, "decisions": decisions, "by_section": [dict(r) for r in by_section]}
+
+
 async def upsert_style_pattern(
    pattern_type: str,
    pattern_text: str,