# legal-ai/scripts/backfill_chunk_pages.py

"""Backfill page_number on existing document_chunks.

Why this exists: the legacy chunker did not track which page each chunk
came from. After the page-tracking fix, new uploads carry page_number
correctly, but existing chunks have ``page_number=NULL`` in the DB.
That blocks the multimodal hybrid retriever's text+image boost (it
joins (chunk, image) on (document_id, page_number)).

What it does (per case):
    1. List every document in the case
    2. For each document with NULL page_number chunks:
       a. Re-extract via extractor.extract_text (re-runs OCR if needed —
          ~$0.0015/page on Google Vision; idempotent on the DB side)
       b. Compute page_offsets from the re-extracted text
       c. For every chunk row (sorted by chunk_index), search its
          content in the re-extracted text → look up page → UPDATE
    3. Skip documents whose chunks already have non-null page_number

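Idempotent: a second run without --force is a no-op for every document
whose chunks were all matched. Chunks that could not be located in the
re-extracted text keep page_number=NULL, so their document is
re-extracted (and re-OCR'd if needed) on each subsequent run.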

Designed to run from inside the FastAPI/MCP container (where /data
is mounted and Google Vision creds are present). Locally it requires
GOOGLE_CLOUD_VISION_API_KEY in ~/.env.

Usage:
    docker exec -it <legal-ai-container> python /tmp/backfill_chunk_pages.py 8174-24 8137-24
"""
from __future__ import annotations

import argparse
import asyncio
import logging
import os
import sys
import time
from pathlib import Path
from uuid import UUID


def _setup_paths():
    """Put ``mcp-server/src`` at the front of ``sys.path`` when it exists.

    The directory is looked up next to this script's own directory (i.e.
    under the script directory's parent). Nothing happens when the
    directory is absent or is already listed in ``sys.path``.
    """
    candidate = Path(__file__).resolve().parent.parent / "mcp-server" / "src"
    if not candidate.is_dir():
        return
    entry = str(candidate)
    if entry in sys.path:
        return
    sys.path.insert(0, entry)


# Must run before the import below: it may prepend mcp-server/src to
# sys.path (presumably where legal_mcp lives in a source checkout — confirm).
_setup_paths()
from legal_mcp.services import db, extractor  # noqa: E402

# Root logging at INFO with a timestamp and level prefix on every line.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
)
# Logger used by every function in this script.
logger = logging.getLogger("backfill_chunk_pages")


def _resolve_local_path(db_path: str) -> Path:
    """Map a file path stored in the DB to a path usable on this machine.

    The stored path is returned as-is when it points at an existing file.
    Otherwise, a path under ``/data/`` is retried beneath
    ``/home/chaim/legal-ai`` (keeping the ``data/...`` components), and
    that location is returned if the file exists there. When neither
    location has the file, the stored path is returned unchanged so the
    caller can detect and report it as missing.
    """
    stored = Path(db_path)
    if not stored.is_file() and str(stored).startswith("/data/"):
        # parts[0] is the root "/"; dropping it lets the remainder be
        # joined under the local checkout directory.
        candidate = Path("/home/chaim/legal-ai").joinpath(*stored.parts[1:])
        if candidate.is_file():
            return candidate
    return stored


async def _backfill_document(
    document_id: UUID,
    title: str,
    db_file_path: str,
    force: bool,
) -> dict:
    """Assign ``page_number`` to the chunks of one document.

    The document's PDF is re-extracted, each chunk's content is searched
    for in the extracted text, and the page containing the match offset is
    written to the chunk row. Every chunk that is found is rewritten, not
    only those whose ``page_number`` is currently NULL.

    Args:
        document_id: Key of the document whose chunks are processed.
        title: Document title; used only in log messages.
        db_file_path: File path as stored in the ``documents`` table. A
            missing value (``None`` or empty) is reported as ``"missing"``.
        force: Re-extract and rewrite page numbers even when no chunk has
            a NULL ``page_number``.

    Returns:
        A dict whose ``"status"`` is one of ``"no_chunks"``, ``"skipped"``,
        ``"missing"``, ``"not_pdf"``, ``"no_offsets"`` or ``"ok"``.
        ``"skipped"`` also carries ``"chunks"``; ``"ok"`` carries
        ``"elapsed_sec"``, ``"pages"``, ``"chunks_total"``,
        ``"chunks_updated"`` and ``"chunks_not_found"``.
    """
    pool = await db.get_pool()

    # Fetch chunks for this document, in chunk order
    chunks = await pool.fetch(
        "SELECT id, chunk_index, content, page_number FROM document_chunks "
        "WHERE document_id = $1 ORDER BY chunk_index",
        document_id,
    )
    if not chunks:
        return {"status": "no_chunks"}

    n_null = sum(1 for c in chunks if c["page_number"] is None)
    if not force and n_null == 0:
        logger.info("  skip (all %d chunks already tagged): %s", len(chunks), title)
        return {"status": "skipped", "chunks": len(chunks)}

    # A NULL/empty file_path cannot be resolved; report it like a missing
    # file instead of letting Path(None) raise TypeError and abort the run.
    if not db_file_path:
        logger.warning("  no file path recorded: %s", title)
        return {"status": "missing"}

    pdf_path = _resolve_local_path(db_file_path)
    if not pdf_path.is_file():
        logger.warning("  file missing: %s (%s)", pdf_path, title)
        return {"status": "missing"}
    if pdf_path.suffix.lower() != ".pdf":
        logger.info("  skip (not a PDF): %s", title)
        return {"status": "not_pdf"}

    logger.info("  re-extracting %s (%d chunks, %d need page)",
                title, len(chunks), n_null)
    # perf_counter is monotonic, so the measured duration cannot be skewed
    # by system clock adjustments. Only the extraction step is timed.
    t0 = time.perf_counter()
    text, page_count, page_offsets = await extractor.extract_text(str(pdf_path))
    elapsed = time.perf_counter() - t0
    if not page_offsets:
        logger.warning("  no page offsets extracted: %s", title)
        return {"status": "no_offsets"}

    # Walk chunks, find each in the re-extracted text, assign page
    pos = 0
    updated = 0
    not_found = 0
    for c in chunks:
        content = c["content"]
        if not content:
            # Nothing to search for; counted as neither updated nor not found.
            continue
        # Prefer a match at or after the previous chunk's position so that
        # repeated passages resolve to the occurrence in reading order.
        idx = text.find(content, pos)
        if idx < 0:
            idx = text.find(content)  # global fallback
        if idx < 0:
            not_found += 1
            continue
        page = extractor.page_at_offset(idx, page_offsets)
        await pool.execute(
            "UPDATE document_chunks SET page_number = $1 WHERE id = $2",
            page, c["id"],
        )
        updated += 1
        # advance roughly past midpoint — chunks have overlap
        pos = idx + max(1, len(content) // 2)

    logger.info(
        "  done in %.1fs: extracted %d pages, updated %d/%d chunks, "
        "%d not found", elapsed, page_count, updated, len(chunks), not_found,
    )
    return {
        "status": "ok",
        "elapsed_sec": round(elapsed, 1),
        "pages": page_count,
        "chunks_total": len(chunks),
        "chunks_updated": updated,
        "chunks_not_found": not_found,
    }


async def backfill_cases(case_numbers: list[str], force: bool) -> dict:
    """Backfill chunk page numbers for every document of the given cases.

    Args:
        case_numbers: Case numbers to process, in order.
        force: Passed through to the per-document backfill; re-extracts
            documents even when all their chunks already have a page.

    Returns:
        A dict keyed by case number. A case that cannot be found maps to
        ``{"status": "case_not_found"}``. Every other case maps to a dict
        with ``"documents_total"``, one count per document status
        (``"ok"``, ``"skipped"``, ``"missing"``, ``"no_chunks"``,
        ``"no_offsets"``, ``"not_pdf"``), the totals ``"chunks_updated"``
        and ``"chunks_not_found"``, and ``"documents"`` holding the
        per-document results.
    """
    # Every status the per-document step can return, so the per-status
    # counts always add up to documents_total.
    tallied_statuses = (
        "ok", "skipped", "missing", "no_chunks", "no_offsets", "not_pdf",
    )

    pool = await db.get_pool()
    summary: dict = {}
    for cn in case_numbers:
        logger.info("=" * 60)
        logger.info("Case %s", cn)
        case = await db.get_case_by_number(cn)
        if not case:
            logger.warning("Case not found: %s", cn)
            summary[cn] = {"status": "case_not_found"}
            continue
        case_id = UUID(str(case["id"]))
        docs = await pool.fetch(
            "SELECT id, title, file_path FROM documents WHERE case_id = $1 ORDER BY title",
            case_id,
        )
        logger.info("  %d documents", len(docs))
        per_doc: list[dict] = []
        for d in docs:
            r = await _backfill_document(
                UUID(str(d["id"])), d["title"], d["file_path"], force,
            )
            per_doc.append({"document_id": str(d["id"]), "title": d["title"], **r})

        counts = dict.fromkeys(tallied_statuses, 0)
        for r in per_doc:
            if r["status"] in counts:
                counts[r["status"]] += 1

        summary[cn] = {
            "documents_total": len(docs),
            **counts,
            "chunks_updated": sum(r.get("chunks_updated", 0) for r in per_doc),
            "chunks_not_found": sum(r.get("chunks_not_found", 0) for r in per_doc),
            "documents": per_doc,
        }
    return summary


def main():
    """Parse the command line, run the backfill, and print a per-case summary."""
    cli = argparse.ArgumentParser(description="Backfill page_number on existing chunks")
    cli.add_argument("cases", nargs="+", help="Case numbers (e.g. 8174-24 8137-24)")
    cli.add_argument(
        "--force",
        action="store_true",
        help="Re-extract even if all chunks already have page_number (default: skip)",
    )
    opts = cli.parse_args()

    results = asyncio.run(backfill_cases(opts.cases, force=opts.force))

    banner = "=" * 60
    print()
    print(banner)
    print("SUMMARY")
    print(banner)
    for case_number, stats in results.items():
        if stats.get("status") == "case_not_found":
            line = f"  {case_number}: NOT FOUND"
        else:
            line = (
                f"  {case_number}: {stats['documents_total']} docs — "
                f"ok {stats['ok']}, skipped {stats['skipped']}, missing {stats['missing']}, "
                f"chunks_updated {stats['chunks_updated']}"
            )
        print(line)


# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()