#!/usr/bin/env python3
"""#57 — re-chunk + re-embed legacy precedents that were embedded before the
chunker fix (#55).

Selects every case_law row that still has at least one tiny chunk
(length(trim(content)) < 50) — the fingerprint of the pre-fix chunker — and
runs ``ingest.reindex_case_law`` on it. That helper re-chunks + re-embeds from
the STORED full_text only (no re-OCR / no LLM — feedback_no_reocr_retrofit) and
is idempotent (store_precedent_chunks is DELETE-then-INSERT).

Idempotent at the batch level too: it re-queries the affected set each run, so
already-fixed rows drop out automatically. Safe to re-run.

Run with the MCP server venv (config loads ~/.env / Infisical for VOYAGE +
POSTGRES, same as the live MCP tools):

    cd ~/legal-ai/mcp-server
    .venv/bin/python ../scripts/rechunk_legacy_precedents.py            # all affected
    .venv/bin/python ../scripts/rechunk_legacy_precedents.py --limit 5  # first N (smoke)

``--limit 0`` processes nothing (useful as a dry connectivity check); a
negative ``--limit`` is rejected.
"""
import argparse
import asyncio
import sys

from legal_mcp.services import db, ingest


async def affected_ids(conn) -> list:
    """Return one row per precedent that still has at least one tiny chunk.

    Despite the name, this returns full rows rather than bare ids. Each row
    exposes ``case_law_id``, ``case_number``, ``tiny`` (number of chunks whose
    trimmed content is shorter than 50 characters) and ``total`` (number of
    chunks for that precedent). Rows are ordered by ``total`` ascending, so the
    smallest precedents come first.

    Args:
        conn: an acquired database connection exposing an awaitable
            ``fetch(sql)`` (presumably an asyncpg connection — confirm against
            ``db.get_pool``).
    """
    rows = await conn.fetch(
        """
        SELECT pc.case_law_id,
               cl.case_number,
               count(*) FILTER (WHERE length(trim(pc.content)) < 50) AS tiny,
               count(*) AS total
        FROM precedent_chunks pc
        JOIN case_law cl ON cl.id = pc.case_law_id
        GROUP BY pc.case_law_id, cl.case_number
        HAVING count(*) FILTER (WHERE length(trim(pc.content)) < 50) > 0
        ORDER BY total ASC
        """
    )
    return rows


async def main(limit: int | None) -> int:
    """Reindex every affected precedent and report per-document results.

    Args:
        limit: process only the first ``limit`` affected precedents; ``None``
            means no cap. ``0`` processes nothing.

    Returns:
        Process exit code: ``0`` when every selected precedent was reindexed,
        ``1`` when at least one failed.

    Raises:
        ValueError: if ``limit`` is negative.
    """
    if limit is not None and limit < 0:
        # A negative slice bound would silently drop rows from the *end* of the
        # list instead of capping it, which is never what the caller meant.
        raise ValueError("limit must be >= 0")

    pool = await db.get_pool()
    # Hold the connection only for the selection query; reindexing below goes
    # through ``ingest`` and must not pin a pooled connection for the whole run.
    async with pool.acquire() as conn:
        rows = await affected_ids(conn)

    # ``is not None`` rather than truthiness: ``--limit 0`` must mean "none",
    # not "all" (a bare ``if limit:`` would reindex the entire affected set).
    if limit is not None:
        rows = rows[:limit]

    n = len(rows)
    print(f"affected precedents to re-chunk: {n}", flush=True)

    ok = 0
    failed = []
    for i, r in enumerate(rows, 1):
        cid = r["case_law_id"]
        cn = r["case_number"]
        try:
            res = await ingest.reindex_case_law(cid)
            ok += 1
            print(
                f"[{i}/{n}] OK {cn}: {r['total']} chunks ({r['tiny']} tiny) "
                f"-> {res['chunks']} chunks",
                flush=True,
            )
        except Exception as e:  # noqa: BLE001 — report per-doc, keep going
            failed.append((cn, str(e)))
            print(f"[{i}/{n}] FAIL {cn}: {e}", flush=True)

    print(f"\nDONE — {ok}/{n} reindexed, {len(failed)} failed", flush=True)
    # Repeat the failures at the end so they are not lost in a long log.
    for cn, e in failed:
        print(f" FAILED {cn}: {e}", flush=True)
    return 0 if not failed else 1


if __name__ == "__main__":
    ap = argparse.ArgumentParser()
    ap.add_argument("--limit", type=int, default=None, help="process only first N")
    args = ap.parse_args()
    if args.limit is not None and args.limit < 0:
        # Fail with a usage message instead of a traceback from main().
        ap.error("--limit must be >= 0")
    sys.exit(asyncio.run(main(args.limit)))