# legal-ai/scripts/voyage_context3_poc_long.py

"""POC #2: voyage-3 vs voyage-context-3 on a LONG case (אהרון ברק 403/17).

Case is 178K chars / 219 chunks / ~60K tokens — too big for a single
contextualized_embed call (32K token limit per inner list). We split the
chunks into overlapping sliding windows (~80 chunks each, ~22K tokens)
and merge: each chunk gets the embedding from the window where it sits
*most centrally* (max symmetric context on both sides).

The hypothesis: voyage-context-3 should shine here because the case is
full of internal references ("ראה לעיל סעיף 13", "להבדיל מעניין X",
"תוצאת הבחינה ב-בר"מ 1975/24 שנידונה לעיל"). voyage-3 embeds chunks
in isolation; context-3 sees ~80 surrounding chunks per embedding.

No DB writes. Output: side-by-side ranking comparison + summary.

Usage:
    /home/chaim/legal-ai/mcp-server/.venv/bin/python \\
        /home/chaim/legal-ai/scripts/voyage_context3_poc_long.py
"""
from __future__ import annotations

import asyncio
import math
import os
import sys
import time

ENV_PATH = os.path.expanduser("~/.env")


def load_env_file(path: str) -> None:
    """Populate ``os.environ`` from a simple ``KEY=VALUE`` file.

    Blank lines, lines starting with ``#`` and lines without ``=`` are
    ignored. Whitespace around the key and the value is dropped, and one
    pair of matching single or double quotes around the value is removed,
    so ``KEY = "v"`` exports ``KEY`` -> ``v``. Only the first ``=`` splits
    the line, so values may themselves contain ``=``.

    Variables that already exist in the environment are left untouched
    (``setdefault``). A missing file is not an error.

    Args:
        path: Location of the env file to read.
    """
    if not os.path.isfile(path):
        return
    with open(path, encoding="utf-8") as env_file:
        for raw_line in env_file:
            entry = raw_line.strip()
            if not entry or entry.startswith("#") or "=" not in entry:
                continue
            key, _, value = entry.partition("=")
            key = key.strip()
            value = value.strip()
            # Unwrap KEY="value" / KEY='value' so the quotes are not exported.
            if len(value) >= 2 and value[0] == value[-1] and value[0] in "\"'":
                value = value[1:-1]
            # A line such as "=value" has no usable variable name.
            if key:
                os.environ.setdefault(key, value)


# Runs at import time, before the third-party imports below.
load_env_file(ENV_PATH)

import asyncpg  # noqa: E402
import voyageai  # noqa: E402


# Precedent under test: case 403/17 (Aharon Barak), about 178K characters.
CASE_ID = "e151fc25-cf12-4563-b638-a86323f8413b"

# Embedding models being compared.
BASELINE_MODEL = "voyage-3"
CONTEXT_MODEL = "voyage-context-3"

# Sliding-window geometry. The case has 219 chunks and ~60K tokens in total
# (~275 tokens per chunk on average), so 3 windows of 80 chunks come to
# roughly 22K tokens per call, comfortably below the 32K limit.
WINDOW_SIZE = 80
WINDOW_STRIDE = 70  # neighbouring windows share WINDOW_SIZE - WINDOW_STRIDE = 10 chunks

# (a) Generic queries (also tested in POC #1).
_GENERIC_QUERIES = [
    "תכנית רחביה הוראות בנייה",
    "פיצויים לפי סעיף 197 ירידת ערך",
    "השפעת תכנית על שווי מקרקעין",
    "סמכות ועדת ערר לדון בפיצויים",
    "תוספת זכויות בנייה כפיצוי",
]

# (b) Queries that require *internal* document context; these are the ones
# expected to benefit from context-3.
_INTERNAL_CONTEXT_QUERIES = [
    "ההבחנה בין השבחה לפיצויים",
    'מה נקבע לגבי תמ"א 38 בפסק הדין',
    "ההלכה שנקבעה בעניין רובע 3",
    "כלל הנטרול של זכויות תכנוניות",
    "הסכמת השופט אלרון לחוות הדעת",
]

# Evaluation order: generic queries first, then the internal-context ones.
QUERIES = _GENERIC_QUERIES + _INTERNAL_CONTEXT_QUERIES


def cosine(a: list[float], b: list[float]) -> float:
    """Return the cosine similarity of two equal-length vectors.

    Args:
        a: First vector.
        b: Second vector; must have the same length as ``a``.

    Returns:
        ``dot(a, b) / (|a| * |b|)``, or ``0.0`` when either vector has
        zero norm (this includes empty vectors).

    Raises:
        ValueError: If the vectors differ in length. Without this check
            ``zip`` would silently truncate the dot product to the shorter
            vector while the norms still used every component.
    """
    if len(a) != len(b):
        raise ValueError(
            f"cosine() needs equal-length vectors, got {len(a)} and {len(b)}"
        )
    dot = sum(x * y for x, y in zip(a, b))
    na = math.sqrt(sum(x * x for x in a))
    nb = math.sqrt(sum(y * y for y in b))
    return dot / (na * nb) if na and nb else 0.0


def parse_pgvector(s: str) -> list[float]:
    """Parse a bracketed, comma-separated vector such as ``"[0.1,0.2]"``.

    Surrounding whitespace and the enclosing square brackets are removed
    before splitting on commas. A string with nothing between the brackets
    yields an empty list instead of failing on ``float("")``.

    Args:
        s: Text form of the vector.

    Returns:
        The components as floats, in order.

    Raises:
        ValueError: If a component is not a valid float literal.
    """
    body = s.strip().strip("[]").strip()
    if not body:
        return []
    return [float(part) for part in body.split(",")]


def build_windows(n: int, size: int, stride: int) -> list[tuple[int, int]]:
    """Return (start, end) ranges (end exclusive) that together cover 0..n.

    Windows start every ``stride`` items and span at most ``size`` items,
    so neighbouring windows overlap by ``size - stride``. The last window
    is clipped to end exactly at ``n`` and may therefore be shorter than
    ``size``.

    Args:
        n: Number of items to cover; ``n <= 0`` yields an empty list.
        size: Maximum window length, at least 1.
        stride: Distance between consecutive window starts; must satisfy
            ``1 <= stride <= size``.

    Raises:
        ValueError: If ``size`` or ``stride`` is out of range. A
            non-positive stride would never terminate, and a stride larger
            than ``size`` would leave items uncovered between windows.
    """
    if size < 1:
        raise ValueError(f"size must be >= 1, got {size}")
    if not 1 <= stride <= size:
        raise ValueError(
            f"stride must be between 1 and size ({size}), got {stride}"
        )
    windows = []
    for start in range(0, n, stride):
        end = min(start + size, n)
        windows.append((start, end))
        if end == n:
            # This window already reaches the end; any further window would
            # only repeat a suffix of it.
            break
    return windows


def assign_chunk_to_window(
    chunk_idx: int, windows: list[tuple[int, int]],
) -> int:
    """Pick the window in which ``chunk_idx`` sits most centrally.

    Centrality is the distance from the chunk to the nearer edge of the
    window, i.e. ``min(chunk_idx - start, (end - 1) - chunk_idx)``. When
    several windows give the same distance, the larger window wins; if
    those are the same size too, the earliest one in ``windows`` is kept.

    Args:
        chunk_idx: Position of the chunk in the full chunk sequence.
        windows: ``(start, end)`` ranges with exclusive ``end``.

    Returns:
        Index into ``windows`` of the chosen window, or ``-1`` when no
        window contains ``chunk_idx``. Callers must check for ``-1``
        before using the result as a list index.
    """
    best = -1
    # (edge distance, window length); any containing window beats this seed.
    best_key = (-1, -1)
    for w_idx, (s, e) in enumerate(windows):
        if not (s <= chunk_idx < e):
            continue
        dist = min(chunk_idx - s, (e - 1) - chunk_idx)
        key = (dist, e - s)
        # Strict comparison keeps the earliest window on a full tie.
        if key > best_key:
            best_key = key
            best = w_idx
    return best


async def main():
    """Compare stored baseline embeddings with windowed context-3 embeddings.

    Steps:
      1. Read the case's chunks and their stored embeddings from Postgres
         (read-only) and close the connection pool right away. The stored
         vectors are treated as ``BASELINE_MODEL`` output — presumably
         that is how the table was populated; verify if results look off.
      2. Embed the chunks with ``CONTEXT_MODEL`` one sliding window at a
         time and keep, for every chunk, the vector from the window
         selected by ``assign_chunk_to_window``.
      3. Embed each query with both models, rank all chunks by cosine
         similarity, and print the two rankings side by side followed by
         a top-10 overlap summary.

    The Voyage client calls are made without ``await`` and therefore block
    the event loop; nothing else runs concurrently in this script.

    Raises:
        KeyError: If ``VOYAGE_API_KEY`` or ``POSTGRES_PASSWORD`` is unset.
        RuntimeError: If a window call returns a different number of
            embeddings than chunks sent, or a chunk is covered by no
            window.
    """
    api_key = os.environ["VOYAGE_API_KEY"]
    pg_pw = os.environ["POSTGRES_PASSWORD"]

    voyage = voyageai.Client(api_key=api_key)

    chunk_sql = """
        SELECT chunk_index, content, embedding::text AS emb_text
        FROM precedent_chunks
        WHERE case_law_id = $1
        ORDER BY chunk_index
    """
    pool = await asyncpg.create_pool(
        host="127.0.0.1", port=5433, user="legal_ai",
        password=pg_pw, database="legal_ai",
        min_size=1, max_size=2,
    )
    try:
        rows = await pool.fetch(chunk_sql, CASE_ID)
    finally:
        # The pool is needed for this single read only; closing it here
        # also releases it when the query itself fails.
        await pool.close()

    n = len(rows)
    print(f"[load] {n} chunks from אהרון ברק 403/17")
    if n == 0:
        print("[load] nothing to compare: no chunks stored for this case")
        return

    chunks = [r["content"] for r in rows]
    indices = [r["chunk_index"] for r in rows]
    baseline_embs = [parse_pgvector(r["emb_text"]) for r in rows]

    # Build windows
    windows = build_windows(n, WINDOW_SIZE, WINDOW_STRIDE)
    print(f"[windows] {len(windows)} windows: "
          f"{', '.join(f'[{s}:{e})' for s, e in windows)}")

    # Embed each window with context-3
    window_embs: list[list[list[float]]] = []  # [window][chunk_in_window][dim]
    total_call_tokens = 0
    total_start = time.perf_counter()
    for w_idx, (s, e) in enumerate(windows):
        sub_chunks = chunks[s:e]
        sub_chars = sum(len(c) for c in sub_chunks)
        call_start = time.perf_counter()
        result = voyage.contextualized_embed(
            inputs=[sub_chunks],
            model=CONTEXT_MODEL,
            input_type="document",
        )
        elapsed = time.perf_counter() - call_start
        # Falls back to 0 when the response object has no such attribute.
        toks = getattr(result, "total_tokens", 0)
        total_call_tokens += toks
        print(f"  [window {w_idx}] [{s}:{e}) — {len(sub_chunks)} chunks, "
              f"{sub_chars:,} chars, {toks} tokens — {elapsed:.1f}s")
        embs = result.results[0].embeddings
        # The merge step indexes this list by position inside the window,
        # so a short response would mis-assign or drop vectors.
        if len(embs) != len(sub_chunks):
            raise RuntimeError(
                f"window {w_idx}: sent {len(sub_chunks)} chunks but got "
                f"{len(embs)} embeddings"
            )
        window_embs.append(embs)
    total_elapsed = time.perf_counter() - total_start
    print(f"[context] all windows done in {total_elapsed:.1f}s, "
          f"{total_call_tokens} total tokens")

    # Merge: for each chunk, pick the embedding from its most-central window
    context_embs: list[list[float]] = []
    chunk_window_choice = []
    for i in range(n):
        w_idx = assign_chunk_to_window(i, windows)
        if w_idx < 0:
            # -1 would otherwise silently index the last window.
            raise RuntimeError(f"chunk {i} is not covered by any window")
        chunk_window_choice.append(w_idx)
        s, _ = windows[w_idx]
        context_embs.append(window_embs[w_idx][i - s])
    print(f"[merge] window distribution: "
          f"{[chunk_window_choice.count(j) for j in range(len(windows))]}")

    # Run queries
    print("\n" + "=" * 100)
    print(f"{'Q':<3} {'baseline (voyage-3)':<48} {'context-3 (windowed)':<48}")
    print("=" * 100)

    # Never ask for more rows than there are chunks.
    top_avg = min(3, n)
    shown = min(5, n)

    rank_overlaps = []
    for q_idx, query in enumerate(QUERIES, 1):
        q_baseline = voyage.embed(
            [query], model=BASELINE_MODEL, input_type="query"
        ).embeddings[0]
        q_context = voyage.contextualized_embed(
            inputs=[[query]],
            model=CONTEXT_MODEL,
            input_type="query",
        ).results[0].embeddings[0]

        # (score, position) pairs, best first; position indexes `chunks`.
        scores_b = sorted(
            [(cosine(q_baseline, e), i) for i, e in enumerate(baseline_embs)],
            reverse=True,
        )
        scores_c = sorted(
            [(cosine(q_context, e), i) for i, e in enumerate(context_embs)],
            reverse=True,
        )

        top10_b = [i for _, i in scores_b[:10]]
        top10_c = [i for _, i in scores_c[:10]]
        overlap = len(set(top10_b) & set(top10_c))
        rank_overlaps.append(overlap)

        avg_b = sum(score for score, _ in scores_b[:top_avg]) / top_avg
        avg_c = sum(score for score, _ in scores_c[:top_avg]) / top_avg

        print(f"\n[Q{q_idx}] {query}")
        print(f"  overlap top-10: {overlap}/10  |  "
              f"avg score top-3: baseline="
              f"{avg_b:.3f}  "
              f"context-3={avg_c:.3f}")
        for rank in range(shown):
            sb, ib = scores_b[rank]
            sc, ic = scores_c[rank]
            cb = chunks[ib].replace("\n", " ").strip()[:50]
            cc = chunks[ic].replace("\n", " ").strip()[:50]
            print(f"  #{rank+1}  [{indices[ib]:3d}] {sb:.3f} {cb:<55}  "
                  f"|  [{indices[ic]:3d}] {sc:.3f} {cc}")

    print("\n" + "=" * 100)
    print("SUMMARY")
    print("=" * 100)
    avg = sum(rank_overlaps) / len(rank_overlaps) if rank_overlaps else 0.0
    print(f"Avg overlap top-10:  {avg:.1f}/10")
    print(f"Per-query overlap:   {rank_overlaps}")
    print(f"Total context-3 tokens used: {total_call_tokens:,}  "
          f"(in {len(windows)} calls)")
    print("\nNote: cosine across models not directly comparable. The")
    print("meaningful test is *which chunks bubble to the top* — read")
    print("the actual text above to judge relevance.")


# Script entry point: run the async comparison only when this file is
# executed directly, not when it is imported as a module.
if __name__ == "__main__":
    asyncio.run(main())