Adds scripts/rechunk_legacy_precedents.py: selects every case_law with a tiny
chunk (content<50 — the pre-fix chunker fingerprint) and runs
ingest.reindex_case_law (re-chunk+re-embed from stored full_text only, no
re-OCR/LLM, idempotent). Batch-idempotent (re-queries the affected set).
Run result (2026-06-03): 73 precedents reindexed, 0 failed. Tiny chunks
483 -> 4 (99.2%); total precedent_chunks 5019 -> 3115 (fragments merged).
Search verified healthy (substantial coherent passages, no errors).
The 4 residual tiny chunks are isolated section headings ('דיון',
'טענות המשיבים', ...) emitted by the CURRENT (fixed) chunker — not legacy
fragments — and are already filtered at query time (>=50, #55). Minor
chunker edge case, candidate #55 follow-up.
The DB chunk migration is already applied to prod; this commit is the script
+ SCRIPTS.md entry only (no app code change, no deploy needed).
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
83 lines
2.8 KiB
Python
83 lines
2.8 KiB
Python
#!/usr/bin/env python3
"""#57 — re-chunk + re-embed legacy precedents that were embedded before the
chunker fix (#55).

Selects every case_law row that still has at least one tiny chunk
(length(trim(content)) < 50) — the fingerprint of the pre-fix chunker — and
runs ``ingest.reindex_case_law`` on it. That helper re-chunks + re-embeds from
the STORED full_text only (no re-OCR / no LLM — feedback_no_reocr_retrofit) and
is idempotent (store_precedent_chunks is DELETE-then-INSERT).

Idempotent at the batch level too: it re-queries the affected set each run, so
already-fixed rows drop out automatically. Safe to re-run.

Run with the MCP server venv (config loads ~/.env / Infisical for VOYAGE +
POSTGRES, same as the live MCP tools):

    cd ~/legal-ai/mcp-server
    .venv/bin/python ../scripts/rechunk_legacy_precedents.py             # all affected
    .venv/bin/python ../scripts/rechunk_legacy_precedents.py --limit 5   # first N (smoke)
"""
|
|
import argparse
|
|
import asyncio
|
|
import sys
|
|
|
|
from legal_mcp.services import db, ingest
|
|
|
|
|
|
async def affected_ids(conn) -> list:
    """Return one row per precedent that still has at least one tiny chunk.

    A chunk is "tiny" when ``length(trim(content)) < 50``.

    Args:
        conn: Connection object exposing an awaitable ``fetch(sql)`` method
            (presumably an asyncpg connection — confirm against
            ``db.get_pool``).

    Returns:
        The result of ``conn.fetch``: rows carrying ``case_law_id``,
        ``case_number``, ``tiny`` (count of tiny chunks) and ``total`` (count
        of all chunks), ordered by ``total`` ascending.
    """
    # case_law_id is a tie-breaker so that `--limit N` selects the same rows
    # on every run when several precedents have the same chunk count.
    query = """
        SELECT pc.case_law_id,
               cl.case_number,
               count(*) FILTER (WHERE length(trim(pc.content)) < 50) AS tiny,
               count(*) AS total
        FROM precedent_chunks pc
        JOIN case_law cl ON cl.id = pc.case_law_id
        GROUP BY pc.case_law_id, cl.case_number
        HAVING count(*) FILTER (WHERE length(trim(pc.content)) < 50) > 0
        ORDER BY total ASC, pc.case_law_id ASC
    """
    return await conn.fetch(query)
|
|
|
|
|
|
async def main(limit: int | None) -> int:
|
|
pool = await db.get_pool()
|
|
async with pool.acquire() as conn:
|
|
rows = await affected_ids(conn)
|
|
|
|
if limit:
|
|
rows = rows[:limit]
|
|
|
|
n = len(rows)
|
|
print(f"affected precedents to re-chunk: {n}", flush=True)
|
|
ok = 0
|
|
failed = []
|
|
for i, r in enumerate(rows, 1):
|
|
cid = r["case_law_id"]
|
|
cn = r["case_number"]
|
|
try:
|
|
res = await ingest.reindex_case_law(cid)
|
|
ok += 1
|
|
print(
|
|
f"[{i}/{n}] OK {cn}: {r['total']} chunks ({r['tiny']} tiny) "
|
|
f"-> {res['chunks']} chunks",
|
|
flush=True,
|
|
)
|
|
except Exception as e: # noqa: BLE001 — report per-doc, keep going
|
|
failed.append((cn, str(e)))
|
|
print(f"[{i}/{n}] FAIL {cn}: {e}", flush=True)
|
|
|
|
print(f"\nDONE — {ok}/{n} reindexed, {len(failed)} failed", flush=True)
|
|
for cn, e in failed:
|
|
print(f" FAILED {cn}: {e}", flush=True)
|
|
return 0 if not failed else 1
|
|
|
|
|
|
# Script entry point: parse --limit, run the async batch, and exit with its
# status code (0 = all reindexed, non-zero = at least one failure).
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--limit", type=int, default=None, help="process only first N")
    cli_args = parser.parse_args()
    # A negative N would otherwise reach a list slice and mean "all but the
    # last N"; reject it up front with a usage error (exit status 2).
    if cli_args.limit is not None and cli_args.limit < 0:
        parser.error("--limit must be a non-negative integer")
    sys.exit(asyncio.run(main(cli_args.limit)))
|