legal-ai/scripts/rechunk_legacy_precedents.py

#!/usr/bin/env python3
"""#57 — re-chunk + re-embed legacy precedents that were embedded before the
chunker fix (#55).

Selects every case_law row that still has at least one tiny chunk
(length(trim(content)) < 50) — the fingerprint of the pre-fix chunker — and
runs ``ingest.reindex_case_law`` on it. That helper re-chunks + re-embeds from
the STORED full_text only (no re-OCR / no LLM — feedback_no_reocr_retrofit) and
is idempotent (store_precedent_chunks is DELETE-then-INSERT).

Idempotent at the batch level too: it re-queries the affected set each run, so
already-fixed rows drop out automatically. Safe to re-run.

Run with the MCP server venv (config loads ~/.env / Infisical for VOYAGE +
POSTGRES, same as the live MCP tools):

    cd ~/legal-ai/mcp-server
    .venv/bin/python ../scripts/rechunk_legacy_precedents.py            # all affected
    .venv/bin/python ../scripts/rechunk_legacy_precedents.py --limit 5  # first N (smoke)
"""
import argparse
import asyncio
import sys

from legal_mcp.services import db, ingest


async def affected_ids(conn) -> list:
    """Return one summary row per precedent that still has a tiny chunk.

    Despite the name, the result is the full row set produced by the query,
    not a bare list of ids. Each row exposes ``case_law_id``, ``case_number``,
    ``tiny`` (number of chunks whose trimmed content is shorter than 50
    characters) and ``total`` (number of chunks overall). Rows are ordered by
    ``total`` ascending, so the smallest precedents come first.

    Args:
        conn: Database connection exposing an awaitable ``fetch(query)``
            (presumably an asyncpg connection — confirm against
            ``db.get_pool``).

    Returns:
        Whatever ``conn.fetch`` yields for the query, unmodified.
    """
    # Group the chunks per precedent and keep only the groups in which at
    # least one chunk falls under the 50-character threshold.
    tiny_chunk_sql = """
        SELECT pc.case_law_id,
               cl.case_number,
               count(*) FILTER (WHERE length(trim(pc.content)) < 50) AS tiny,
               count(*) AS total
        FROM precedent_chunks pc
        JOIN case_law cl ON cl.id = pc.case_law_id
        GROUP BY pc.case_law_id, cl.case_number
        HAVING count(*) FILTER (WHERE length(trim(pc.content)) < 50) > 0
        ORDER BY total ASC
        """
    return await conn.fetch(tiny_chunk_sql)


async def main(limit: int | None) -> int:
    """Reindex every affected precedent sequentially and report the outcome.

    The affected set is queried once up front, optionally capped, and then
    each row is passed to ``ingest.reindex_case_law``. A failure on one row
    is recorded and printed, and processing continues with the next row.

    Args:
        limit: Maximum number of rows to process, taken from the front of
            the list returned by ``affected_ids``. ``None`` means no cap;
            ``0`` processes nothing.

    Returns:
        ``0`` when every selected row was reindexed, ``1`` when at least one
        row failed (the caller uses this as the process exit status).

    Raises:
        ValueError: If ``limit`` is negative.
    """
    if limit is not None and limit < 0:
        # A negative slice bound would drop rows from the END of the list
        # rather than cap it, so reject it before touching the database.
        raise ValueError(f"limit must be >= 0, got {limit}")

    pool = await db.get_pool()
    # The connection is only needed for the selection query; it is released
    # before the per-row loop starts.
    async with pool.acquire() as conn:
        rows = await affected_ids(conn)

    # Compare against None instead of relying on truthiness: ``--limit 0``
    # must mean "process nothing", not "process everything".
    if limit is not None:
        rows = rows[:limit]

    n = len(rows)
    print(f"affected precedents to re-chunk: {n}", flush=True)
    ok = 0
    failed = []
    for i, r in enumerate(rows, 1):
        cid = r["case_law_id"]
        cn = r["case_number"]
        try:
            res = await ingest.reindex_case_law(cid)
            print(
                f"[{i}/{n}] OK  {cn}: {r['total']} chunks ({r['tiny']} tiny) "
                f"-> {res['chunks']} chunks",
                flush=True,
            )
            # Count the success only after the result has been read, so a
            # malformed result is tallied as a failure and nothing else.
            ok += 1
        except Exception as e:  # noqa: BLE001 — report per-doc, keep going
            # Include the exception type: some exceptions stringify to ""
            # and would otherwise leave the failure line without a reason.
            reason = f"{type(e).__name__}: {e}"
            failed.append((cn, reason))
            print(f"[{i}/{n}] FAIL {cn}: {reason}", flush=True)

    print(f"\nDONE — {ok}/{n} reindexed, {len(failed)} failed", flush=True)
    for cn, reason in failed:
        print(f"  FAILED {cn}: {reason}", flush=True)
    return 0 if not failed else 1


if __name__ == "__main__":
    # CLI entry point: read the optional row cap, drive the async workflow to
    # completion, and hand its return value to the shell as the exit status.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--limit",
        type=int,
        default=None,
        help="process only first N",
    )
    cli_args = parser.parse_args()
    exit_status = asyncio.run(main(cli_args.limit))
    sys.exit(exit_status)