feat(style-acq T1-T3): קורפוס-דוגמאות של דפנה לכותב (style_exemplars)
ממלא את ערוץ-הדוגמאות (B) של מערכת רכישת-הסגנון: הכותב מאחזר פסקאות-בלוק אמיתיות של דפנה בזמן כתיבה, ממוקדות section+outcome+practice_area. T1 — תשתית + backfill: - SCHEMA_V27: טבלת style_exemplars (purpose-built — בלי תיקים מזויפים בשרשרת decision_paragraphs). decision_number/source/section/outcome/practice_area+embedding. - db: insert/delete/search_style_exemplars + count_style_exemplars. - scripts/backfill_style_exemplars.py: מפצל קורפוס דפנה (style_corpus + internal_committee) לסעיפים→פסקאות, embed, שמירה. אידמפוטנטי, dry-run/apply. T2 — אחזור ממוקד: - search_style_exemplars(section, outcome, practice_area) — section=hard filter, outcome/practice_area=soft. block_writer._build_precedents_context ממפה block→section ומאחזר (ראשי), לצד הנתיב הישן (משלים). T3 — contrastive/adapt: - הדוגמאות מתויגות "מבנה/קול בלבד — התאם, אל תעתיק תוכן"; פסקה מלאה (1100 תווים). INV-LRN5 (טוהר — סגנון בלבד). G11. הרצת backfill --apply בנפרד. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -725,19 +725,43 @@ async def _build_precedents_context(
|
|||||||
style_parts: list[str] = []
|
style_parts: list[str] = []
|
||||||
caselaw_parts: list[str] = []
|
caselaw_parts: list[str] = []
|
||||||
case_law_ids: list[str] = []
|
case_law_ids: list[str] = []
|
||||||
|
# block → golden-ratio section, for targeted exemplar retrieval (T2)
|
||||||
|
_BLOCK_SECTION = {
|
||||||
|
"block-vav": "background", "block-zayin": "claims",
|
||||||
|
"block-yod": "discussion", "block-yod-alef": "summary",
|
||||||
|
}
|
||||||
try:
|
try:
|
||||||
case = await db.get_case(case_id)
|
case = await db.get_case(case_id)
|
||||||
case_number = case.get("case_number", "") if case else ""
|
case_number = case.get("case_number", "") if case else ""
|
||||||
subject = case.get("subject", "") if case else ""
|
subject = case.get("subject", "") if case else ""
|
||||||
|
practice_area = case.get("practice_area", "") if case else ""
|
||||||
|
decision = await db.get_decision_by_case(case_id)
|
||||||
|
outcome = (decision or {}).get("outcome", "")
|
||||||
query = f"דיון משפטי בנושא {subject}" if subject else "דיון משפטי ועדת ערר"
|
query = f"דיון משפטי בנושא {subject}" if subject else "דיון משפטי ועדת ערר"
|
||||||
query_emb = await embeddings.embed_query(query)
|
query_emb = await embeddings.embed_query(query)
|
||||||
|
section = _BLOCK_SECTION.get(block_id)
|
||||||
|
|
||||||
# Stream 1: paragraph_embeddings — Dafna's own prose (STYLE exemplars, not content)
|
# Stream 1a (PRIMARY): Dafna's own block-level prose from her corpus
|
||||||
|
# (style_exemplars) — matched by section + outcome + practice_area (T2/T3).
|
||||||
|
if section:
|
||||||
|
exemplars = await db.search_style_exemplars(
|
||||||
|
query_embedding=query_emb, section=section,
|
||||||
|
outcome=outcome or None, practice_area=practice_area or None, limit=6,
|
||||||
|
)
|
||||||
|
exemplars = [e for e in exemplars if e.get("decision_number", "") != case_number]
|
||||||
|
for e in exemplars[:4]:
|
||||||
|
style_parts.append(
|
||||||
|
f"[דוגמת-סגנון (מבנה/קול בלבד — התאם, אל תעתיק תוכן) — "
|
||||||
|
f"{e.get('decision_number', '?')}, {section}, "
|
||||||
|
f"outcome={e.get('outcome') or '—'}]\n{e['paragraph_text'][:1100]}"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Stream 1b: paragraphs from pipeline cases (legacy path; may be empty)
|
||||||
para_results = await db.search_similar_paragraphs(
|
para_results = await db.search_similar_paragraphs(
|
||||||
query_embedding=query_emb, limit=10, block_type="block-yod",
|
query_embedding=query_emb, limit=10, block_type="block-yod",
|
||||||
)
|
)
|
||||||
para_results = [r for r in para_results if r.get("case_number", "") != case_number]
|
para_results = [r for r in para_results if r.get("case_number", "") != case_number]
|
||||||
for r in para_results[:4]:
|
for r in para_results[:2]:
|
||||||
style_parts.append(
|
style_parts.append(
|
||||||
f"[דוגמת-סגנון — החלטת {r.get('case_number', '?')} "
|
f"[דוגמת-סגנון — החלטת {r.get('case_number', '?')} "
|
||||||
f"{r.get('case_title', '')}, בלוק {r.get('block_type', '')}]\n{r['content'][:500]}"
|
f"{r.get('case_title', '')}, בלוק {r.get('block_type', '')}]\n{r['content'][:500]}"
|
||||||
|
|||||||
@@ -1204,6 +1204,28 @@ CREATE INDEX IF NOT EXISTS idx_draft_final_pairs_case ON draft_final_pairs(case_
|
|||||||
CREATE INDEX IF NOT EXISTS idx_draft_final_pairs_status ON draft_final_pairs(status);
|
CREATE INDEX IF NOT EXISTS idx_draft_final_pairs_status ON draft_final_pairs(status);
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
SCHEMA_V27_SQL = """
|
||||||
|
-- style_exemplars (T1-T3): block-level paragraphs from Dafna's OWN decisions
|
||||||
|
-- (style_corpus + internal_committee finals), embedded for retrieval as
|
||||||
|
-- style exemplars at write-time. Purpose-built so we DON'T fabricate synthetic
|
||||||
|
-- cases just to reuse decision_paragraphs. INV-LRN5: style material only — the
|
||||||
|
-- writer is told to adapt structure/voice, copy only boilerplate, never substance.
|
||||||
|
CREATE TABLE IF NOT EXISTS style_exemplars (
|
||||||
|
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
|
||||||
|
decision_number TEXT DEFAULT '',
|
||||||
|
source TEXT DEFAULT '', -- style_corpus | internal_committee
|
||||||
|
practice_area TEXT DEFAULT '',
|
||||||
|
outcome TEXT DEFAULT '', -- rejection | partial_acceptance | full_acceptance | ''
|
||||||
|
section TEXT DEFAULT 'other', -- background | claims | discussion | summary | other
|
||||||
|
paragraph_text TEXT NOT NULL,
|
||||||
|
word_count INTEGER DEFAULT 0,
|
||||||
|
embedding vector(1024),
|
||||||
|
created_at TIMESTAMPTZ DEFAULT now()
|
||||||
|
);
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_style_exemplars_section ON style_exemplars(section);
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_style_exemplars_decision ON style_exemplars(decision_number, source);
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
async def _run_schema_migrations(pool: asyncpg.Pool) -> None:
|
async def _run_schema_migrations(pool: asyncpg.Pool) -> None:
|
||||||
async with pool.acquire() as conn:
|
async with pool.acquire() as conn:
|
||||||
@@ -1234,7 +1256,8 @@ async def _run_schema_migrations(pool: asyncpg.Pool) -> None:
|
|||||||
await conn.execute(SCHEMA_V24_SQL)
|
await conn.execute(SCHEMA_V24_SQL)
|
||||||
await conn.execute(SCHEMA_V25_SQL)
|
await conn.execute(SCHEMA_V25_SQL)
|
||||||
await conn.execute(SCHEMA_V26_SQL)
|
await conn.execute(SCHEMA_V26_SQL)
|
||||||
logger.info("Database schema initialized (v1-v26)")
|
await conn.execute(SCHEMA_V27_SQL)
|
||||||
|
logger.info("Database schema initialized (v1-v27)")
|
||||||
|
|
||||||
|
|
||||||
async def init_schema() -> None:
|
async def init_schema() -> None:
|
||||||
@@ -2329,6 +2352,85 @@ async def list_draft_final_pairs(status: str | None = None, limit: int = 200) ->
|
|||||||
return [dict(r) for r in rows]
|
return [dict(r) for r in rows]
|
||||||
|
|
||||||
|
|
||||||
|
async def insert_style_exemplar(
    decision_number: str, source: str, practice_area: str, outcome: str,
    section: str, paragraph_text: str, word_count: int, embedding: list[float],
) -> None:
    """Persist a single paragraph-level style exemplar row (T1 backfill).

    All arguments map one-to-one onto the ``style_exemplars`` columns of the
    same name. ``embedding`` is bound as its ``str()`` rendering, not as a
    Python list.
    """
    statement = """INSERT INTO style_exemplars
        (decision_number, source, practice_area, outcome, section,
         paragraph_text, word_count, embedding)
        VALUES ($1, $2, $3, $4, $5, $6, $7, $8)"""
    values = (
        decision_number,
        source,
        practice_area,
        outcome,
        section,
        paragraph_text,
        word_count,
        str(embedding),
    )
    pool = await get_pool()
    async with pool.acquire() as connection:
        await connection.execute(statement, *values)
|
||||||
|
|
||||||
|
|
||||||
|
async def delete_style_exemplars(decision_number: str, source: str) -> int:
    """Remove every exemplar stored for one (decision_number, source) pair.

    Used by the backfill to clear a decision before re-inserting it.

    Returns:
        The row count parsed from the last token of the command status string
        returned by ``execute``, or 0 when that token is missing or is not an
        integer.
    """
    pool = await get_pool()
    async with pool.acquire() as connection:
        status = await connection.execute(
            "DELETE FROM style_exemplars WHERE decision_number = $1 AND source = $2",
            decision_number,
            source,
        )
    tokens = status.split()
    if not tokens:
        return 0
    try:
        return int(tokens[-1])
    except ValueError:
        return 0
|
||||||
|
|
||||||
|
|
||||||
|
async def search_style_exemplars(
    query_embedding: list[float],
    section: str | None = None,
    outcome: str | None = None,
    practice_area: str | None = None,
    limit: int = 6,
) -> list[dict]:
    """Retrieve Dafna's own block-level paragraphs as STYLE exemplars (T2).

    Rows are ordered by ``embedding <=> query_embedding`` (closest first) and
    each carries ``score = 1 - distance``.

    Filtering:
        * ``section`` is the hard filter: when given, only rows of that section
          are ever returned.
        * ``outcome`` / ``practice_area`` are soft filters: when given, a row
          passes if its value matches or is the empty string (untagged). If
          that narrowed query returns no rows, the query is retried with the
          section filter only, so the soft filters never zero-out the result.

    Args:
        query_embedding: Embedding of the query text.
        section: Section to restrict to; falsy means all sections.
        outcome: Preferred outcome; falsy disables the filter.
        practice_area: Preferred practice area; falsy disables the filter.
        limit: Maximum number of rows to return.

    Returns:
        A list of dicts with keys ``decision_number``, ``source``, ``section``,
        ``outcome``, ``practice_area``, ``paragraph_text``, ``word_count`` and
        ``score``.
    """

    def _build_query(with_soft_filters: bool) -> tuple[str, list]:
        # $1 = query embedding, $2 = limit; filter values are numbered from $3
        # in the order they are appended.
        params: list = [query_embedding, limit]
        conditions: list[str] = []
        if section:
            params.append(section)
            conditions.append(f"section = ${len(params)}")
        if with_soft_filters and outcome:
            params.append(outcome)
            conditions.append(f"(outcome = ${len(params)} OR outcome = '')")
        if with_soft_filters and practice_area:
            params.append(practice_area)
            conditions.append(
                f"(practice_area = ${len(params)} OR practice_area = '')"
            )
        where = f"WHERE {' AND '.join(conditions)}" if conditions else ""
        sql = f"""
            SELECT decision_number, source, section, outcome, practice_area,
                   paragraph_text, word_count,
                   1 - (embedding <=> $1) AS score
            FROM style_exemplars
            {where}
            ORDER BY embedding <=> $1
            LIMIT $2
        """
        return sql, params

    # NOTE(review): the embedding is bound as a raw list here, while
    # insert_style_exemplar binds str(embedding). Which form is accepted
    # depends on the pool's codec setup — confirm the two agree.
    pool = await get_pool()
    async with pool.acquire() as conn:
        sql, params = _build_query(with_soft_filters=True)
        rows = await conn.fetch(sql, *params)
        if not rows and (outcome or practice_area):
            # The soft filters excluded everything: fall back to the hard
            # (section) filter alone instead of returning nothing.
            sql, params = _build_query(with_soft_filters=False)
            rows = await conn.fetch(sql, *params)
    return [dict(r) for r in rows]
|
||||||
|
|
||||||
|
|
||||||
|
async def count_style_exemplars() -> dict:
    """Summarise ``style_exemplars`` coverage after a backfill.

    Returns:
        A dict with ``total`` (row count), ``decisions`` (number of distinct
        ``decision_number`` values) and ``by_section`` (a list of
        ``{"section", "n"}`` dicts ordered by ``n`` descending).
    """
    pool = await get_pool()
    async with pool.acquire() as connection:
        row_total = await connection.fetchval("SELECT count(*) FROM style_exemplars")
        section_rows = await connection.fetch(
            "SELECT section, count(*) AS n FROM style_exemplars GROUP BY section ORDER BY n DESC"
        )
        decision_total = await connection.fetchval(
            "SELECT count(DISTINCT decision_number) FROM style_exemplars"
        )
    summary = {
        "total": row_total,
        "decisions": decision_total,
        "by_section": [dict(row) for row in section_rows],
    }
    return summary
|
||||||
|
|
||||||
|
|
||||||
async def upsert_style_pattern(
|
async def upsert_style_pattern(
|
||||||
pattern_type: str,
|
pattern_type: str,
|
||||||
pattern_text: str,
|
pattern_text: str,
|
||||||
|
|||||||
@@ -45,6 +45,7 @@
|
|||||||
| `backfill_multimodal_precedents.py` | python | Backfill voyage-multimodal-3 page embeddings על רשומות `case_law` (external_upload + internal_committee) שחסרות `precedent_image_embeddings`. בונה אינדקס קבצים מ-`data/precedent-library/` ו-`data/internal-decisions/`, מנסה התאמה לפי tokens של מספרי תיק (כולל parts-match לפורמטים שונים של Nevo doc-id). מדלג על רשומות בלי קובץ-מקור או עם MD בלבד (PyMuPDF לא מרנדר MD). תומך `--dry-run` (default) / `--apply` / `--only external_upload\|internal_committee` / `--limit N`. רץ בקונטיינר (יש `/data` + Voyage env). **הופעל 2026-05-26**: 70 חסרים → 26 backfilled (503 pages, ~$0.21 voyage tokens), 44 אין-קובץ-מקור. ניתן להריץ שוב אחרי שיועלו עוד PDF/DOCX לספרייה | ידני |
|
| `backfill_multimodal_precedents.py` | python | Backfill voyage-multimodal-3 page embeddings על רשומות `case_law` (external_upload + internal_committee) שחסרות `precedent_image_embeddings`. בונה אינדקס קבצים מ-`data/precedent-library/` ו-`data/internal-decisions/`, מנסה התאמה לפי tokens של מספרי תיק (כולל parts-match לפורמטים שונים של Nevo doc-id). מדלג על רשומות בלי קובץ-מקור או עם MD בלבד (PyMuPDF לא מרנדר MD). תומך `--dry-run` (default) / `--apply` / `--only external_upload\|internal_committee` / `--limit N`. רץ בקונטיינר (יש `/data` + Voyage env). **הופעל 2026-05-26**: 70 חסרים → 26 backfilled (503 pages, ~$0.21 voyage tokens), 44 אין-קובץ-מקור. ניתן להריץ שוב אחרי שיועלו עוד PDF/DOCX לספרייה | ידני |
|
||||||
| `monitor_halacha_quality.py` | python | מנטר איכות חילוץ הלכות. בודק drift של `avg(confidence)` בין baseline היסטורי לחלון אחרון. מחזיר JSON מטריקות + alert ב-stderr אם drift > threshold (ברירת מחדל 5%). 2 סדרות: trusted (approved+published) ו-all_extracted. תומך `--window N` / `--threshold X` / `--min-sample N` / `--silent` / `--exit-on-alert`. רץ ב-container או מקומית עם `mcp-server/.venv` (אין תלות ב-LLM, רק SQL). **תזמון מומלץ**: `0 8 * * 1` (יום שני 08:00, שבועי; להרצה ביום ראשון יש להשתמש ב-`0 8 * * 0`) | `0 8 * * 1` (לתזמן) |
|
| `monitor_halacha_quality.py` | python | מנטר איכות חילוץ הלכות. בודק drift של `avg(confidence)` בין baseline היסטורי לחלון אחרון. מחזיר JSON מטריקות + alert ב-stderr אם drift > threshold (ברירת מחדל 5%). 2 סדרות: trusted (approved+published) ו-all_extracted. תומך `--window N` / `--threshold X` / `--min-sample N` / `--silent` / `--exit-on-alert`. רץ ב-container או מקומית עם `mcp-server/.venv` (אין תלות ב-LLM, רק SQL). **תזמון מומלץ**: `0 8 * * 1` (יום שני 08:00, שבועי; להרצה ביום ראשון יש להשתמש ב-`0 8 * * 0`) | `0 8 * * 1` (לתזמן) |
|
||||||
| `audit_training_corpus.py` | python | audit של `style_corpus` — לכל החלטה: שדות מטא-דאטה מאוכלסים (`summary`/`outcome`/`key_principles`/`appeal_subtype`/`subject_categories`), קישור ל-`documents` (FK + chunks + embeddings). מפיק `data/audit/corpus-YYYY-MM-DD.json` + summary בקונסול. דרוש `POSTGRES_URL` או POSTGRES_*. אין תלויות חיצוניות מלבד asyncpg. **רץ מהמכונה המקומית** (לא קונטיינר) — חיבור ישיר ל-Postgres :5433 | ידני / קדם-עבודה לפני enrichment של מטא-דאטה |
|
| `audit_training_corpus.py` | python | audit של `style_corpus` — לכל החלטה: שדות מטא-דאטה מאוכלסים (`summary`/`outcome`/`key_principles`/`appeal_subtype`/`subject_categories`), קישור ל-`documents` (FK + chunks + embeddings). מפיק `data/audit/corpus-YYYY-MM-DD.json` + summary בקונסול. דרוש `POSTGRES_URL` או POSTGRES_*. אין תלויות חיצוניות מלבד asyncpg. **רץ מהמכונה המקומית** (לא קונטיינר) — חיבור ישיר ל-Postgres :5433 | ידני / קדם-עבודה לפני enrichment של מטא-דאטה |
|
||||||
|
| `backfill_style_exemplars.py` | python | **T1 (style-acquisition)** — מאכלס `style_exemplars` מקורפוס דפנה (`style_corpus` + `internal_committee` chair=דפנה): מפצל לסעיפים (`chunker._split_into_sections`) → פסקאות (25-450 מילים) → embed (Voyage) → שמירה עם `section`/`outcome`/`practice_area`. מאפשר לכותב לאחזר פסקאות-בלוק אמיתיות של דפנה (T2/T3). מקור-סגנון בלבד (INV-LRN5). אידמפוטנטי (מנקה per-decision). `--dry-run` (default) / `--apply`. דורש POSTGRES_URL + Voyage. **רץ מקומית** (venv). | ידני (`python scripts/backfill_style_exemplars.py --apply`) |
|
||||||
|
|
||||||
## תיקיית `.archive/` — סקריפטים שהושלמו
|
## תיקיית `.archive/` — סקריפטים שהושלמו
|
||||||
|
|
||||||
|
|||||||
131
scripts/backfill_style_exemplars.py
Normal file
131
scripts/backfill_style_exemplars.py
Normal file
@@ -0,0 +1,131 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""T1 — אכלוס style_exemplars מקורפוס דפנה (style_corpus + internal_committee).
|
||||||
|
|
||||||
|
מפצל כל החלטה של דפנה לסעיפים (chunker._split_into_sections), ומכל סעיף לפסקאות,
|
||||||
|
מטמיע (Voyage) ושומר ב-style_exemplars עם section/outcome/practice_area — כדי
|
||||||
|
שהכותב יוכל לאחזר פסקאות-בלוק אמיתיות של דפנה בזמן כתיבה (T2/T3).
|
||||||
|
|
||||||
|
מקור-סגנון בלבד (INV-LRN5) — לא מהות. אידמפוטנטי: מנקה לכל decision לפני הכנסה.
|
||||||
|
|
||||||
|
שימוש:
|
||||||
|
python3 scripts/backfill_style_exemplars.py # dry-run (סופר בלבד)
|
||||||
|
python3 scripts/backfill_style_exemplars.py --apply # מטמיע ושומר
|
||||||
|
|
||||||
|
דורש POSTGRES_URL + מפתח Voyage בסביבה (כמו שאר ה-MCP).
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import asyncio
|
||||||
|
import logging
|
||||||
|
|
||||||
|
from legal_mcp.services import db, embeddings
|
||||||
|
from legal_mcp.services.chunker import _split_into_sections
|
||||||
|
|
||||||
|
logging.basicConfig(level=logging.INFO, format="%(message)s")
|
||||||
|
log = logging.getLogger("backfill_exemplars")
|
||||||
|
|
||||||
|
# chunker section_type → style_exemplars.section
# Several chunker types collapse onto one exemplar section (both claims types
# → "claims"; conclusion and ruling → "summary"). main() looks types up with
# .get(section_type, "other"), so any type missing here is stored as "other".
_SECTION_MAP = {
    "facts": "background",
    "appellant_claims": "claims",
    "respondent_claims": "claims",
    "legal_analysis": "discussion",
    "conclusion": "summary",
    "ruling": "summary",
    "intro": "other",
    "other": "other",
}
|
||||||
|
|
||||||
|
MIN_WORDS = 25  # skip tiny fragments
MAX_WORDS = 450  # skip over-long blobs (likely un-split)
MAX_PER_SECTION = 15


def _paragraphs(section_text: str) -> list[str]:
    """Break one section into paragraph-sized units.

    The text is split on blank lines; when that yields at most one piece it is
    split on single newlines instead. Each piece is stripped and kept only if
    its whitespace-delimited word count lies in [MIN_WORDS, MAX_WORDS]. At most
    MAX_PER_SECTION pieces are returned, in document order.
    """
    pieces = section_text.split("\n\n")
    if len(pieces) <= 1:
        # No blank-line structure: treat every line as a candidate paragraph.
        pieces = section_text.split("\n")
    kept = [
        chunk
        for chunk in (piece.strip() for piece in pieces)
        if MIN_WORDS <= len(chunk.split()) <= MAX_WORDS
    ]
    return kept[:MAX_PER_SECTION]
|
||||||
|
|
||||||
|
|
||||||
|
async def _gather_sources() -> list[dict]:
    """Collect the decisions to backfill: style_corpus + internal_committee (chair דפנה).

    Reads ``style_corpus`` rows with a non-empty ``full_text`` and ``case_law``
    rows whose ``source_kind`` is ``internal_committee``, whose ``chair_name``
    contains "דפנה" and whose ``full_text`` is non-empty.

    Returns:
        One dict per decision with keys ``decision_number``, ``source``,
        ``full_text``, ``outcome`` and ``practice_area``; style_corpus rows
        come first. NULL number/outcome/practice_area become ``""``, and
        ``outcome`` is always ``""`` for internal_committee rows.
    """
    pool = await db.get_pool()
    async with pool.acquire() as conn:
        corpus_rows = await conn.fetch(
            "SELECT decision_number, full_text, outcome, practice_area "
            "FROM style_corpus WHERE full_text <> ''"
        )
        committee_rows = await conn.fetch(
            "SELECT case_number, full_text, practice_area FROM case_law "
            "WHERE source_kind = 'internal_committee' AND coalesce(chair_name,'') LIKE '%דפנה%' "
            "AND coalesce(full_text,'') <> ''"
        )

    gathered: list[dict] = [
        {
            "decision_number": record["decision_number"] or "",
            "source": "style_corpus",
            "full_text": record["full_text"],
            "outcome": record["outcome"] or "",
            "practice_area": record["practice_area"] or "",
        }
        for record in corpus_rows
    ]
    gathered.extend(
        {
            "decision_number": record["case_number"] or "",
            "source": "internal_committee",
            "full_text": record["full_text"],
            "outcome": "",
            "practice_area": record["practice_area"] or "",
        }
        for record in committee_rows
    )
    return gathered
|
||||||
|
|
||||||
|
|
||||||
|
async def main(apply: bool) -> None:
    """Split every gathered decision into paragraphs and, optionally, store them.

    Each decision's text is split into sections, each section into paragraph
    units, and each unit is tagged with its mapped exemplar section.

    Args:
        apply: When False (dry-run) only counts are logged. When True the
            paragraphs are embedded, the decision's previous exemplars are
            deleted, and the new rows are inserted.

    Raises:
        RuntimeError: If the embedder returns a different number of vectors
            than paragraphs submitted. This is checked before anything is
            deleted.
    """
    sources = await _gather_sources()
    log.info("מקורות: %d החלטות של דפנה (style_corpus + internal_committee)", len(sources))

    total_paras = 0
    # (decision_number, source) keys already cleared during THIS run. Several
    # decisions can share a key (e.g. a missing number normalised to ""), and
    # clearing again would wipe rows inserted moments earlier in the same run.
    cleared: set[tuple[str, str]] = set()
    for src in sources:
        units = [
            (_SECTION_MAP.get(section_type, "other"), para)
            for section_type, section_text in _split_into_sections(src["full_text"])
            for para in _paragraphs(section_text)
        ]
        if not units:
            continue
        total_paras += len(units)
        log.info(" %-14s %-16s → %d פסקאות", src["source"], src["decision_number"], len(units))
        if not apply:
            continue

        # Embed BEFORE deleting, so a failed embedding call leaves the
        # decision's existing exemplars untouched.
        vecs = await embeddings.embed_texts(
            [para for _, para in units], input_type="document"
        )
        if len(vecs) != len(units):
            # zip() below would otherwise silently drop the unmatched tail.
            raise RuntimeError(
                f"embedding count mismatch for {src['source']} "
                f"{src['decision_number']!r}: got {len(vecs)} vectors "
                f"for {len(units)} paragraphs"
            )

        key = (src["decision_number"], src["source"])
        if key not in cleared:
            await db.delete_style_exemplars(*key)
            cleared.add(key)

        for (section, para), vec in zip(units, vecs):
            await db.insert_style_exemplar(
                decision_number=src["decision_number"], source=src["source"],
                practice_area=src["practice_area"], outcome=src["outcome"],
                section=section, paragraph_text=para, word_count=len(para.split()),
                embedding=vec,
            )

    if apply:
        cov = await db.count_style_exemplars()
        log.info("הושלם. style_exemplars: %s", cov)
    else:
        log.info("dry-run: %d פסקאות יוטמעו. הרץ --apply לביצוע.", total_paras)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # CLI entry point: dry-run unless --apply is passed.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--apply",
        action="store_true",
        help="embed + insert (default: dry-run)",
    )
    cli_args = parser.parse_args()
    asyncio.run(main(cli_args.apply))
|
||||||
Reference in New Issue
Block a user