"""Backfill page_number on existing document_chunks. Why this exists: the legacy chunker did not track which page each chunk came from. After the page-tracking fix, new uploads carry page_number correctly, but existing chunks have ``page_number=NULL`` in the DB. That blocks the multimodal hybrid retriever's text+image boost (it joins (chunk, image) on (document_id, page_number)). What it does (per case): 1. List every document in the case 2. For each document with NULL page_number chunks: a. Re-extract via extractor.extract_text (re-runs OCR if needed — ~$0.0015/page on Google Vision; idempotent on the DB side) b. Compute page_offsets from the re-extracted text c. For every chunk row (sorted by chunk_index), search its content in the re-extracted text → look up page → UPDATE 3. Skip documents whose chunks already have non-null page_number Idempotent: a second run with no --force is a no-op. Designed to run from inside the FastAPI/MCP container (where /data is mounted and Google Vision creds are present). Locally it requires GOOGLE_CLOUD_VISION_API_KEY in ~/.env. 
Usage: docker exec -it python /tmp/backfill_chunk_pages.py 8174-24 8137-24 """ from __future__ import annotations import argparse import asyncio import logging import os import sys import time from pathlib import Path from uuid import UUID def _setup_paths(): here = Path(__file__).resolve().parent mcp_src = here.parent / "mcp-server" / "src" if mcp_src.is_dir() and str(mcp_src) not in sys.path: sys.path.insert(0, str(mcp_src)) _setup_paths() from legal_mcp.services import db, extractor # noqa: E402 logging.basicConfig( level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s", ) logger = logging.getLogger("backfill_chunk_pages") def _resolve_local_path(db_path: str) -> Path: p = Path(db_path) if p.is_file(): return p if str(p).startswith("/data/"): local = Path("/home/chaim/legal-ai") / Path(*p.parts[1:]) if local.is_file(): return local return p async def _backfill_document( document_id: UUID, title: str, db_file_path: str, force: bool, ) -> dict: pool = await db.get_pool() # Fetch chunks for this document chunks = await pool.fetch( "SELECT id, chunk_index, content, page_number FROM document_chunks " "WHERE document_id = $1 ORDER BY chunk_index", document_id, ) if not chunks: return {"status": "no_chunks"} n_null = sum(1 for c in chunks if c["page_number"] is None) if not force and n_null == 0: logger.info(" skip (all %d chunks already tagged): %s", len(chunks), title) return {"status": "skipped", "chunks": len(chunks)} pdf_path = _resolve_local_path(db_file_path) if not pdf_path.is_file(): logger.warning(" file missing: %s (%s)", pdf_path, title) return {"status": "missing"} if pdf_path.suffix.lower() != ".pdf": return {"status": "not_pdf"} logger.info(" re-extracting %s (%d chunks, %d need page)", title, len(chunks), n_null) t0 = time.time() text, page_count, page_offsets = await extractor.extract_text(str(pdf_path)) elapsed = time.time() - t0 if not page_offsets: return {"status": "no_offsets"} # Walk chunks, find each in the re-extracted text, 
assign page pos = 0 updated = 0 not_found = 0 for c in chunks: content = c["content"] if not content: continue idx = text.find(content, pos) if idx < 0: idx = text.find(content) # global fallback if idx < 0: not_found += 1 continue page = extractor.page_at_offset(idx, page_offsets) await pool.execute( "UPDATE document_chunks SET page_number = $1 WHERE id = $2", page, c["id"], ) updated += 1 # advance roughly past midpoint — chunks have overlap pos = idx + max(1, len(content) // 2) logger.info( " done in %.1fs: extracted %d pages, updated %d/%d chunks, " "%d not found", elapsed, page_count, updated, len(chunks), not_found, ) return { "status": "ok", "elapsed_sec": round(elapsed, 1), "pages": page_count, "chunks_total": len(chunks), "chunks_updated": updated, "chunks_not_found": not_found, } async def backfill_cases(case_numbers: list[str], force: bool) -> dict: pool = await db.get_pool() summary: dict = {} for cn in case_numbers: logger.info("=" * 60) logger.info("Case %s", cn) case = await db.get_case_by_number(cn) if not case: logger.warning("Case not found: %s", cn) summary[cn] = {"status": "case_not_found"} continue case_id = UUID(str(case["id"])) docs = await pool.fetch( "SELECT id, title, file_path FROM documents WHERE case_id = $1 ORDER BY title", case_id, ) logger.info(" %d documents", len(docs)) per_doc: list[dict] = [] for d in docs: r = await _backfill_document( UUID(str(d["id"])), d["title"], d["file_path"], force, ) per_doc.append({"document_id": str(d["id"]), "title": d["title"], **r}) summary[cn] = { "documents_total": len(docs), "ok": sum(1 for r in per_doc if r["status"] == "ok"), "skipped": sum(1 for r in per_doc if r["status"] == "skipped"), "missing": sum(1 for r in per_doc if r["status"] == "missing"), "no_chunks": sum(1 for r in per_doc if r["status"] == "no_chunks"), "no_offsets": sum(1 for r in per_doc if r["status"] == "no_offsets"), "chunks_updated": sum(r.get("chunks_updated", 0) for r in per_doc), "documents": per_doc, } return summary 
def main():
    """Command-line entry point.

    Parses the case numbers and the optional ``--force`` flag, runs the
    backfill for those cases, and prints one summary line per case.
    """
    cli = argparse.ArgumentParser(
        description="Backfill page_number on existing chunks"
    )
    cli.add_argument(
        "cases", nargs="+", help="Case numbers (e.g. 8174-24 8137-24)"
    )
    cli.add_argument(
        "--force",
        action="store_true",
        help="Re-extract even if all chunks already have page_number (default: skip)",
    )
    opts = cli.parse_args()

    results = asyncio.run(backfill_cases(opts.cases, force=opts.force))

    # Blank line, then the banner framed by two rules.
    rule = "=" * 60
    print("", rule, "SUMMARY", rule, sep="\n")

    for case_number, stats in results.items():
        if stats.get("status") != "case_not_found":
            print(
                f" {case_number}: {stats['documents_total']} docs — "
                f"ok {stats['ok']}, skipped {stats['skipped']}, missing {stats['missing']}, "
                f"chunks_updated {stats['chunks_updated']}"
            )
        else:
            print(f" {case_number}: NOT FOUND")


if __name__ == "__main__":
    main()