chore(#57): re-chunk+re-embed legacy precedents (pre-#55 chunker remediation)
Adds scripts/rechunk_legacy_precedents.py: selects every case_law with a tiny
chunk (content<50 — the pre-fix chunker fingerprint) and runs
ingest.reindex_case_law (re-chunk+re-embed from stored full_text only, no
re-OCR/LLM, idempotent). Batch-idempotent (re-queries the affected set).
Run result (2026-06-03): 73 precedents reindexed, 0 failed. Tiny chunks
483 -> 4 (99.2%); total precedent_chunks 5019 -> 3115 (fragments merged).
Search verified healthy (substantial coherent passages, no errors).
The 4 residual tiny chunks are isolated section headings ('דיון',
'טענות המשיבים', ...) emitted by the CURRENT (fixed) chunker — not legacy
fragments — and are already filtered at query time (>=50, #55). Minor
chunker edge case, candidate #55 follow-up.
The DB chunk migration is already applied to prod; this commit is the script
+ SCRIPTS.md entry only (no app code change, no deploy needed).
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
82
scripts/rechunk_legacy_precedents.py
Normal file
82
scripts/rechunk_legacy_precedents.py
Normal file
@@ -0,0 +1,82 @@
|
||||
#!/usr/bin/env python3
|
||||
"""#57 — re-chunk + re-embed legacy precedents that were embedded before the
chunker fix (#55).

Selects every case_law row that still has at least one tiny chunk
(length(trim(content)) < 50) — the fingerprint of the pre-fix chunker — and
runs ``ingest.reindex_case_law`` on it. That helper re-chunks + re-embeds from
the STORED full_text only (no re-OCR / no LLM — feedback_no_reocr_retrofit) and
is idempotent (store_precedent_chunks is DELETE-then-INSERT).

Idempotent at the batch level too: it re-queries the affected set each run, so
already-fixed rows drop out automatically. Safe to re-run.

Run with the MCP server venv (config loads ~/.env / Infisical for VOYAGE +
POSTGRES, same as the live MCP tools):

    cd ~/legal-ai/mcp-server
    .venv/bin/python ../scripts/rechunk_legacy_precedents.py # all affected
    .venv/bin/python ../scripts/rechunk_legacy_precedents.py --limit 5 # first N (smoke)
"""
|
||||
import argparse
|
||||
import asyncio
|
||||
import sys
|
||||
|
||||
from legal_mcp.services import db, ingest
|
||||
|
||||
|
||||
async def affected_ids(conn) -> list:
    """Return the precedents that still own at least one tiny chunk.

    Issues a single aggregate query over ``precedent_chunks`` joined to
    ``case_law`` and hands back whatever ``conn.fetch`` returns, untouched.

    Each returned row exposes:
      * ``case_law_id``  — id of the precedent,
      * ``case_number``  — its case number (used for progress output),
      * ``tiny``         — number of chunks whose trimmed content is shorter
                           than 50 characters,
      * ``total``        — number of all chunks of that precedent.

    Only precedents with ``tiny > 0`` are included; rows are ordered by
    ``total`` ascending.

    Args:
        conn: a connection object offering an awaitable ``fetch(query)``
            (presumably an asyncpg connection — confirm against ``db``).
    """
    query = """
        SELECT pc.case_law_id,
               cl.case_number,
               count(*) FILTER (WHERE length(trim(pc.content)) < 50) AS tiny,
               count(*) AS total
        FROM precedent_chunks pc
        JOIN case_law cl ON cl.id = pc.case_law_id
        GROUP BY pc.case_law_id, cl.case_number
        HAVING count(*) FILTER (WHERE length(trim(pc.content)) < 50) > 0
        ORDER BY total ASC
        """
    return await conn.fetch(query)
|
||||
|
||||
|
||||
async def main(limit: int | None) -> int:
    """Re-chunk + re-embed every precedent that still has tiny chunks.

    Queries the affected set via ``affected_ids``, optionally truncates it to
    the first ``limit`` rows, then calls ``ingest.reindex_case_law`` once per
    precedent. A failure on one precedent is reported and recorded, and the
    batch continues with the next one.

    Args:
        limit: maximum number of affected precedents to process. ``None``
            means "all"; ``0`` means "none" (only the affected count is
            printed).

    Returns:
        ``0`` when every processed precedent was reindexed, ``1`` when at
        least one failed (suitable as a process exit code).

    Raises:
        ValueError: if ``limit`` is negative. A negative slice bound would
            silently drop rows from the END of the set instead of limiting it.
    """
    # Validate before touching the database so a bad argument is a no-op.
    if limit is not None and limit < 0:
        raise ValueError(f"limit must be >= 0, got {limit}")

    pool = await db.get_pool()
    async with pool.acquire() as conn:
        rows = await affected_ids(conn)

    # Explicit None check rather than truthiness: `--limit 0` must select
    # nothing, not fall through and reindex the whole affected set.
    if limit is not None:
        rows = rows[:limit]

    n = len(rows)
    print(f"affected precedents to re-chunk: {n}", flush=True)
    ok = 0
    failed = []
    for i, r in enumerate(rows, 1):
        cid = r["case_law_id"]
        cn = r["case_number"]
        try:
            res = await ingest.reindex_case_law(cid)
            # Read the result inside the try so an unexpected result shape is
            # reported as a failure for this precedent...
            new_total = res["chunks"]
        except Exception as e:  # noqa: BLE001 — report per-doc, keep going
            failed.append((cn, str(e)))
            print(f"[{i}/{n}] FAIL {cn}: {e}", flush=True)
        else:
            # ...and count success only here, so a precedent is never tallied
            # as both OK and failed.
            ok += 1
            print(
                f"[{i}/{n}] OK {cn}: {r['total']} chunks ({r['tiny']} tiny) "
                f"-> {new_total} chunks",
                flush=True,
            )

    print(f"\nDONE — {ok}/{n} reindexed, {len(failed)} failed", flush=True)
    for cn, e in failed:
        print(f" FAILED {cn}: {e}", flush=True)
    return 0 if not failed else 1
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # CLI entry point: parse the optional --limit flag, run the async batch
    # to completion, and exit with the status code that main() returns.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--limit",
        type=int,
        default=None,
        help="process only first N",
    )
    cli_args = parser.parse_args()
    exit_code = asyncio.run(main(cli_args.limit))
    sys.exit(exit_code)
|
||||
Reference in New Issue
Block a user