feat(retrieval): track page_number on text chunks for multimodal hybrid boost

The legacy chunker did not track which PDF page each chunk came from. Stored chunks had page_number=NULL, which blocked the multimodal hybrid retriever's text+image boost — it joins (chunk, image) on (document_id, page_number), so the join could never fire.

This change:

- extractor.extract_text now returns (text, page_count, page_offsets); page_offsets[i] is the start char offset of page (i+1) in the joined text. It is None for non-PDFs.
- chunker.chunk_document accepts an optional page_offsets and tags each chunk with the page that contains its first character (uses the existing chunker logic; pages are assigned post-hoc by content search to keep the diff minimal).
- processor.process_document and precedent_library.ingest_precedent forward page_offsets to the chunker. New uploads now carry an accurate page_number on every chunk.
- Other extract_text callers (tools/documents, tools/workflow, web/app.py) are updated to unpack the third element (ignored).
- scripts/backfill_chunk_pages.py: per-case retrofit. Re-extracts each PDF (re-OCRs via Google Vision if needed, ~$0.0015/page), computes page_offsets, and updates page_number on every chunk by content search. Idempotent; --force re-runs on already-tagged docs.

A forward-only change would leave the 419 image embeddings already backfilled on cases 8174-24 + 8137-24 unable to boost their corresponding text chunks. The retrofit script closes that gap (cost ~$0.60).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-03 19:49:41 +00:00
parent 5724ed8e5b
commit 81ccf3a888
9 changed files with 301 additions and 18 deletions
--- a/mcp-server/src/legal_mcp/services/extractor.py
+++ b/mcp-server/src/legal_mcp/services/extractor.py
@@ -120,12 +120,22 @@ def _fix_hebrew_quotes(text: str) -> str:
 # ── Extraction ───────────────────────────────────────────────────


-async def extract_text(file_path: str) -> tuple[str, int]:
+# Separator used when joining per-page text. Constant so chunker /
+# retrofit can reproduce the join when computing page offsets.
+PAGE_SEPARATOR = "\n\n"
+
+
+async def extract_text(file_path: str) -> tuple[str, int, list[int] | None]:
    """Extract text from a document file.

    Returns:
-        Tuple of (extracted_text, page_count).
-        page_count is 0 for non-PDF files.
+        ``(text, page_count, page_offsets)`` where:
+        - ``text``: concatenated extracted text
+        - ``page_count``: number of pages (0 for non-PDF)
+        - ``page_offsets``: ``page_offsets[i]`` = char start offset of
+          page (i+1) inside ``text``. ``None`` for non-PDFs (where the
+          notion of pages doesn't apply). Used by the chunker to assign
+          a ``page_number`` to each chunk.
    """
    path = Path(file_path)
    suffix = path.suffix.lower()
@@ -133,18 +143,34 @@ async def extract_text(file_path: str) -> tuple[str, int]:
    if suffix == ".pdf":
        return await _extract_pdf(path)
    elif suffix == ".docx":
-        return _extract_docx(path), 0
+        return _extract_docx(path), 0, None
    elif suffix == ".doc":
-        return _extract_doc(path), 0
+        return _extract_doc(path), 0, None
    elif suffix == ".rtf":
-        return _extract_rtf(path), 0
+        return _extract_rtf(path), 0, None
    elif suffix in (".txt", ".md"):
-        return path.read_text(encoding="utf-8"), 0
+        return path.read_text(encoding="utf-8"), 0, None
    else:
        raise ValueError(f"Unsupported file type: {suffix}")


-async def _extract_pdf(path: Path) -> tuple[str, int]:
+def _join_pages(pages_text: list[str]) -> tuple[str, list[int]]:
+    """Join per-page text with PAGE_SEPARATOR while recording the start
+    offset of each page in the joined output."""
+    offsets: list[int] = []
+    parts: list[str] = []
+    cursor = 0
+    for i, pg in enumerate(pages_text):
+        offsets.append(cursor)
+        parts.append(pg)
+        cursor += len(pg)
+        if i < len(pages_text) - 1:
+            parts.append(PAGE_SEPARATOR)
+            cursor += len(PAGE_SEPARATOR)
+    return "".join(parts), offsets
+
+
+async def _extract_pdf(path: Path) -> tuple[str, int, list[int]]:
    """Extract text from PDF.

    Try direct text first, fall back to Google Cloud Vision for scanned
@@ -172,7 +198,27 @@ async def _extract_pdf(path: Path) -> tuple[str, int]:
            pages_text.append(ocr_text)

    doc.close()
-    return "\n\n".join(pages_text), page_count
+    joined, offsets = _join_pages(pages_text)
+    return joined, page_count, offsets
+
+
+def page_at_offset(offset: int, page_offsets: list[int]) -> int:
+    """Look up the page number containing a given char offset.
+
+    page_offsets[i] is the start of page (i+1) in the joined text;
+    a chunk starting at ``offset`` belongs to the highest-indexed page
+    whose start is ``<= offset``. Returns 1-based page number.
+    """
+    if not page_offsets:
+        return 1
+    # Linear scan is fine — page_offsets is short (≤ ~200 for our PDFs).
+    page = 1
+    for i, start in enumerate(page_offsets):
+        if start <= offset:
+            page = i + 1
+        else:
+            break
+    return page


 def _ocr_with_google_vision(image_bytes: bytes, page_num: int) -> str: