ops: switch embeddings to voyage-3 + plan for context-3 + multimodal-3.5

Phase A — voyage-3 migration (executed): - VOYAGE_MODEL=voyage-3 set in Coolify (legal-ai app) and ~/.env - scripts/reembed_voyage.py: re-embeds document_chunks (6157), case_law_embeddings (9), precedent_chunks (385), and halachot (400) using the new model. paragraph_embeddings was empty. 6951 rows re-embedded in 93s, ~75 rows/sec. - Same 1024 dim → no schema change needed. Why voyage-3 over voyage-law-2: benchmark on 3 Hebrew legal queries with real passages from the corpus gave voyage-3 perfect ordering on 3/3 tests AND the largest separation (+0.483 vs voyage-law-2's +0.238). voyage-4 family had bigger separation but missed top-1 on the hardest test. Phase B (voyage-context-3) and Phase C (voyage-multimodal-3.5 for scanned + appraiser docs) are designed in docs/voyage-upgrades-plan.md but deferred — to be picked up in a fresh conversation. The plan includes: - Phase B: contextualized embeddings refactor (~49% recall lift on legal docs per Anthropic's research). Same dim, but ingestion pipeline must pass full doc context per chunk. - Phase C: page-level image embeddings via voyage-multimodal-3.5, stored in a parallel *_image_embeddings table. Hybrid text+image search. Targets appraiser report tables and scanned PDFs where current OCR loses layout. After this commit: MCP server needs a /mcp reconnect to pick up the new VOYAGE_MODEL env, and the legal-ai container will pick it up on its next redeploy. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-03 16:43:48 +00:00
parent 72c4593e74
commit cb0b4b6a8b
2 changed files with 508 additions and 0 deletions
--- a/scripts/reembed_voyage.py
+++ b/scripts/reembed_voyage.py
@@ -0,0 +1,170 @@
+"""Re-embed all Voyage-stored vectors with the model in env VOYAGE_MODEL.
+
+Use after changing VOYAGE_MODEL in env (e.g. voyage-law-2 → voyage-3).
+The script reads each table that stores embeddings, batches the source
+text through the new model (Voyage allows 128 inputs / call), and
+UPDATEs the rows in place.
+
+Tables touched:
+  - document_chunks       (content)
+  - paragraph_embeddings  (joined with decision_paragraphs.content)
+  - case_law_embeddings   (chunk_text)
+  - precedent_chunks      (content)
+  - halachot              (rule_statement + reasoning_summary)
+
+Run from the legal-ai venv with VOYAGE_API_KEY + VOYAGE_MODEL +
+POSTGRES_* set in env (or ~/.env). Idempotent — safe to re-run.
+
+Usage:
+    /home/chaim/legal-ai/mcp-server/.venv/bin/python \\
+        /home/chaim/legal-ai/scripts/reembed_voyage.py
+"""
+from __future__ import annotations
+
+import asyncio
+import os
+import sys
+import time
+
+# Load ~/.env if present
+ENV_PATH = os.path.expanduser("~/.env")
+if os.path.isfile(ENV_PATH):
+    with open(ENV_PATH) as f:
+        for line in f:
+            line = line.strip()
+            if line and not line.startswith("#") and "=" in line:
+                k, v = line.split("=", 1)
+                os.environ.setdefault(k, v)
+
+import asyncpg  # noqa: E402
+import voyageai  # noqa: E402
+
+
+VOYAGE_MODEL = os.environ.get("VOYAGE_MODEL", "voyage-3")
+BATCH = 100  # Voyage allows 128, leave headroom for token limits
+
+# (table, primary key, source-text SQL, update SQL with $1=embedding $2=id)
+TABLES = [
+    (
+        "document_chunks",
+        "SELECT id, content FROM document_chunks WHERE content IS NOT NULL AND content <> ''",
+        "UPDATE document_chunks SET embedding = $1 WHERE id = $2",
+    ),
+    (
+        "paragraph_embeddings",
+        # paragraph_embeddings stores embedding only — text is in decision_paragraphs
+        "SELECT pe.id, dp.content "
+        "FROM paragraph_embeddings pe "
+        "JOIN decision_paragraphs dp ON dp.id = pe.paragraph_id "
+        "WHERE dp.content IS NOT NULL AND dp.content <> ''",
+        "UPDATE paragraph_embeddings SET embedding = $1 WHERE id = $2",
+    ),
+    (
+        "case_law_embeddings",
+        "SELECT id, chunk_text FROM case_law_embeddings "
+        "WHERE chunk_text IS NOT NULL AND chunk_text <> ''",
+        "UPDATE case_law_embeddings SET embedding = $1 WHERE id = $2",
+    ),
+    (
+        "precedent_chunks",
+        "SELECT id, content FROM precedent_chunks WHERE content IS NOT NULL AND content <> ''",
+        "UPDATE precedent_chunks SET embedding = $1 WHERE id = $2",
+    ),
+    (
+        "halachot",
+        # Embed rule_statement + reasoning_summary, matching the original
+        # storage in halacha_extractor.extract().
+        "SELECT id, "
+        "  TRIM(BOTH ' —' FROM rule_statement || ' — ' || COALESCE(reasoning_summary, '')) "
+        "  AS embed_text "
+        "FROM halachot WHERE rule_statement IS NOT NULL AND rule_statement <> ''",
+        "UPDATE halachot SET embedding = $1 WHERE id = $2",
+    ),
+]
+
+
+async def embed_batch(client, texts: list[str]) -> list[list[float]]:
+    """Voyage embed_texts with explicit input_type='document' for storage."""
+    return client.embed(texts, model=VOYAGE_MODEL, input_type="document").embeddings
+
+
+async def reembed_table(
+    pool: asyncpg.Pool, voyage, label: str, select_sql: str, update_sql: str,
+) -> dict:
+    rows = await pool.fetch(select_sql)
+    n = len(rows)
+    print(f"\n[{label}] {n} rows")
+    if n == 0:
+        return {"table": label, "rows": 0, "elapsed": 0.0}
+    start = time.time()
+    done = 0
+    for i in range(0, n, BATCH):
+        batch_rows = rows[i:i + BATCH]
+        texts = [r[1] for r in batch_rows]
+        ids = [r[0] for r in batch_rows]
+        try:
+            embeddings = await embed_batch(voyage, texts)
+        except Exception as e:
+            print(f"  [{label}] batch {i // BATCH} failed: {e}", file=sys.stderr)
+            continue
+        # Update each row
+        async with pool.acquire() as conn:
+            async with conn.transaction():
+                for emb, rid in zip(embeddings, ids):
+                    # asyncpg accepts list[float] for vector via asyncpg-pgvector;
+                    # but pgvector type is inferred via str cast on the wire
+                    await conn.execute(update_sql, str(emb), rid)
+        done += len(batch_rows)
+        elapsed = time.time() - start
+        print(f"  [{label}] {done}/{n} ({done/n*100:.1f}%)  "
+              f"elapsed={elapsed:.0f}s  rate={done/max(elapsed,0.1):.1f}/s")
+    elapsed = time.time() - start
+    return {"table": label, "rows": n, "elapsed": elapsed}
+
+
+async def main():
+    api_key = os.environ.get("VOYAGE_API_KEY")
+    if not api_key:
+        sys.exit("VOYAGE_API_KEY not set (export it or add to ~/.env)")
+
+    pg_host = os.environ.get("POSTGRES_HOST", "127.0.0.1")
+    pg_port = int(os.environ.get("POSTGRES_PORT", "5433"))
+    pg_user = os.environ.get("POSTGRES_USER", "legal_ai")
+    pg_pw = os.environ.get("POSTGRES_PASSWORD", "")
+    pg_db = os.environ.get("POSTGRES_DB", "legal_ai")
+    if not pg_pw:
+        sys.exit("POSTGRES_PASSWORD not set")
+
+    print(f"Re-embed all tables with model: {VOYAGE_MODEL}")
+    print(f"DB: {pg_user}@{pg_host}:{pg_port}/{pg_db}")
+
+    voyage = voyageai.Client(api_key=api_key)
+    pool = await asyncpg.create_pool(
+        host=pg_host, port=pg_port, user=pg_user,
+        password=pg_pw, database=pg_db,
+        min_size=1, max_size=4,
+    )
+
+    # pgvector needs explicit codec setup so we can pass list[float]
+    async def _init(conn: asyncpg.Connection) -> None:
+        await conn.execute("SET search_path = public")
+    await pool.__aenter__()  # noqa — enter context to ensure init
+
+    summary = []
+    try:
+        for label, select_sql, update_sql in TABLES:
+            r = await reembed_table(pool, voyage, label, select_sql, update_sql)
+            summary.append(r)
+    finally:
+        await pool.close()
+
+    total_rows = sum(r["rows"] for r in summary)
+    total_time = sum(r["elapsed"] for r in summary)
+    print(f"\n{'=' * 60}\nDONE — {total_rows} rows in {total_time:.0f}s")
+    for r in summary:
+        print(f"  {r['table']:30s} {r['rows']:>6} rows   {r['elapsed']:>5.0f}s")
+    print(f"\nModel: {VOYAGE_MODEL}")
+
+
+if __name__ == "__main__":
+    asyncio.run(main())