feat(retrieval): track page_number on text chunks for multimodal hybrid boost
All checks were successful
Build & Deploy / build-and-deploy (push) Successful in 6m33s
All checks were successful
Build & Deploy / build-and-deploy (push) Successful in 6m33s
The legacy chunker did not track which PDF page each chunk came from. Stored chunks had page_number=NULL, which blocked the multimodal hybrid retriever's text+image boost — it joins (chunk, image) on (document_id, page_number), so the join could never fire.

This change:

- extractor.extract_text now returns (text, page_count, page_offsets); page_offsets[i] is the start char offset of page (i+1) in the joined text. page_offsets is None for non-PDFs.
- chunker.chunk_document accepts an optional page_offsets and tags each chunk with the page that contains its first character (uses the existing chunker logic; pages are assigned post-hoc by content search to keep the diff minimal).
- processor.process_document and precedent_library.ingest_precedent forward page_offsets through the chunker. New uploads now carry an accurate page_number on every chunk.
- Other extract_text callers (tools/documents, tools/workflow, web/app.py) are updated to unpack the third element (ignored).
- scripts/backfill_chunk_pages.py: per-case retrofit. Re-extracts each PDF (re-OCRs via Google Vision if needed, ~$0.0015/page), computes page_offsets, and updates page_number on every chunk by content search. Idempotent; --force re-runs on already-tagged docs.

A forward-only change would leave the 419 image embeddings backfilled on cases 8174-24 + 8137-24 unable to boost their corresponding text chunks. The retrofit script closes that gap (cost ~$0.60).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -33,8 +33,15 @@ def chunk_document(
|
||||
text: str,
|
||||
chunk_size: int = config.CHUNK_SIZE_TOKENS,
|
||||
overlap: int = config.CHUNK_OVERLAP_TOKENS,
|
||||
page_offsets: list[int] | None = None,
|
||||
) -> list[Chunk]:
|
||||
"""Split a legal document into chunks, respecting section boundaries."""
|
||||
"""Split a legal document into chunks, respecting section boundaries.
|
||||
|
||||
When ``page_offsets`` is supplied (from a PDF extraction), each chunk
|
||||
is tagged with the page number of its first character — used by the
|
||||
multimodal hybrid retriever to join (text chunk, image at same page)
|
||||
and surface text+image matches.
|
||||
"""
|
||||
if not text.strip():
|
||||
return []
|
||||
|
||||
@@ -52,9 +59,34 @@ def chunk_document(
|
||||
))
|
||||
idx += 1
|
||||
|
||||
if page_offsets:
|
||||
_assign_pages(chunks, text, page_offsets)
|
||||
return chunks
|
||||
|
||||
|
||||
def _assign_pages(chunks: list[Chunk], text: str, page_offsets: list[int]) -> None:
    """Tag each chunk with the page on which its first character falls.

    Every chunk's ``content`` is searched for verbatim inside ``text``; the
    offset of the match is translated to a 1-based page number through
    ``page_at_offset`` and stored on ``chunk.page_number``. The chunks are
    mutated in place and nothing is returned.

    Args:
        chunks: Chunks produced from ``text``, in document order.
        text: The full joined document text the chunks were cut from.
        page_offsets: ``page_offsets[i]`` is the start offset of page
            ``i + 1`` inside ``text``.

    The search for each chunk starts from a cursor placed halfway through
    the previously located chunk, so that overlapping chunks are matched in
    order. If that forward search finds nothing, the whole text is searched
    from the beginning instead. A chunk whose content cannot be found at
    all is skipped and its ``page_number`` is left untouched.
    """
    from legal_mcp.services.extractor import page_at_offset

    cursor = 0
    for chunk in chunks:
        body = chunk.content

        start = text.find(body, cursor)
        if start == -1:
            # Forward scan missed: retry over the entire text.
            start = text.find(body)
        if start == -1:
            # Content is not present verbatim; nothing to tag.
            continue

        chunk.page_number = page_at_offset(start, page_offsets)

        # Move the cursor to the middle of this chunk (at least one char
        # forward). Per the original author's note this relies on chunk
        # overlap being under 50%, so the next chunk starts after it.
        cursor = start + max(1, len(body) // 2)
|
||||
|
||||
|
||||
def _split_into_sections(text: str) -> list[tuple[str, str]]:
|
||||
"""Split text into (section_type, text) pairs based on Hebrew headers."""
|
||||
# Find all section headers and their positions
|
||||
|
||||
@@ -120,12 +120,22 @@ def _fix_hebrew_quotes(text: str) -> str:
|
||||
# ── Extraction ───────────────────────────────────────────────────
|
||||
|
||||
|
||||
async def extract_text(file_path: str) -> tuple[str, int]:
|
||||
# Separator used when joining per-page text. Constant so chunker /
|
||||
# retrofit can reproduce the join when computing page offsets.
|
||||
PAGE_SEPARATOR = "\n\n"
|
||||
|
||||
|
||||
async def extract_text(file_path: str) -> tuple[str, int, list[int] | None]:
|
||||
"""Extract text from a document file.
|
||||
|
||||
Returns:
|
||||
Tuple of (extracted_text, page_count).
|
||||
page_count is 0 for non-PDF files.
|
||||
``(text, page_count, page_offsets)`` where:
|
||||
- ``text``: concatenated extracted text
|
||||
- ``page_count``: number of pages (0 for non-PDF)
|
||||
- ``page_offsets``: ``page_offsets[i]`` = char start offset of
|
||||
page (i+1) inside ``text``. ``None`` for non-PDFs (where the
|
||||
notion of pages doesn't apply). Used by the chunker to assign
|
||||
a ``page_number`` to each chunk.
|
||||
"""
|
||||
path = Path(file_path)
|
||||
suffix = path.suffix.lower()
|
||||
@@ -133,18 +143,34 @@ async def extract_text(file_path: str) -> tuple[str, int]:
|
||||
if suffix == ".pdf":
|
||||
return await _extract_pdf(path)
|
||||
elif suffix == ".docx":
|
||||
return _extract_docx(path), 0
|
||||
return _extract_docx(path), 0, None
|
||||
elif suffix == ".doc":
|
||||
return _extract_doc(path), 0
|
||||
return _extract_doc(path), 0, None
|
||||
elif suffix == ".rtf":
|
||||
return _extract_rtf(path), 0
|
||||
return _extract_rtf(path), 0, None
|
||||
elif suffix in (".txt", ".md"):
|
||||
return path.read_text(encoding="utf-8"), 0
|
||||
return path.read_text(encoding="utf-8"), 0, None
|
||||
else:
|
||||
raise ValueError(f"Unsupported file type: {suffix}")
|
||||
|
||||
|
||||
async def _extract_pdf(path: Path) -> tuple[str, int]:
|
||||
def _join_pages(pages_text: list[str]) -> tuple[str, list[int]]:
    """Concatenate per-page text and report where each page begins.

    Pages are joined with ``PAGE_SEPARATOR`` between consecutive pages
    (none after the last one).

    Args:
        pages_text: Extracted text of each page, in page order.

    Returns:
        ``(joined, offsets)`` where ``joined`` is the concatenated text and
        ``offsets[i]`` is the character index in ``joined`` at which
        ``pages_text[i]`` starts. ``offsets`` is empty for an empty input.
    """
    separator_len = len(PAGE_SEPARATOR)

    offsets: list[int] = []
    next_start = 0
    for page in pages_text:
        offsets.append(next_start)
        # The following page begins after this page's text plus one separator.
        next_start += len(page) + separator_len

    return PAGE_SEPARATOR.join(pages_text), offsets
|
||||
|
||||
|
||||
async def _extract_pdf(path: Path) -> tuple[str, int, list[int]]:
|
||||
"""Extract text from PDF.
|
||||
|
||||
Try direct text first, fall back to Google Cloud Vision for scanned
|
||||
@@ -172,7 +198,27 @@ async def _extract_pdf(path: Path) -> tuple[str, int]:
|
||||
pages_text.append(ocr_text)
|
||||
|
||||
doc.close()
|
||||
return "\n\n".join(pages_text), page_count
|
||||
joined, offsets = _join_pages(pages_text)
|
||||
return joined, page_count, offsets
|
||||
|
||||
|
||||
def page_at_offset(offset: int, page_offsets: list[int]) -> int:
    """Return the 1-based page number that contains character ``offset``.

    Args:
        offset: Character index into the joined document text.
        page_offsets: ``page_offsets[i]`` is the start offset of page
            ``i + 1`` in the joined text.

    Returns:
        The number of the last page, scanning from the front, whose start
        is ``<= offset``; the scan stops at the first page that starts
        beyond ``offset``. Returns 1 when ``page_offsets`` is empty or when
        ``offset`` lies before the first recorded start.
    """
    if not page_offsets:
        return 1

    # A plain front-to-back scan; stops as soon as a page starts past offset.
    found = 1
    for page_no, page_start in enumerate(page_offsets, start=1):
        if page_start > offset:
            break
        found = page_no
    return found
|
||||
|
||||
|
||||
def _ocr_with_google_vision(image_bytes: bytes, page_num: int) -> str:
|
||||
|
||||
@@ -127,7 +127,7 @@ async def ingest_precedent(
|
||||
|
||||
await progress("extracting", 15, "מחלץ טקסט מהקובץ")
|
||||
try:
|
||||
text, page_count = await extractor.extract_text(str(staged))
|
||||
text, page_count, page_offsets = await extractor.extract_text(str(staged))
|
||||
except Exception as e:
|
||||
await progress("failed", 100, f"כשל בחילוץ טקסט: {e}")
|
||||
raise
|
||||
@@ -161,7 +161,7 @@ async def ingest_precedent(
|
||||
|
||||
try:
|
||||
await progress("chunking", 40, f"מחלק את הטקסט ל-chunks ({page_count} עמ')")
|
||||
chunks = chunker.chunk_document(text)
|
||||
chunks = chunker.chunk_document(text, page_offsets=page_offsets)
|
||||
if not chunks:
|
||||
await db.set_case_law_extraction_status(case_law_id, "completed")
|
||||
await db.set_case_law_halacha_status(case_law_id, "completed")
|
||||
|
||||
@@ -32,7 +32,7 @@ async def process_document(document_id: UUID, case_id: UUID) -> dict:
|
||||
try:
|
||||
# Step 1: Extract text
|
||||
logger.info("Extracting text from %s", doc["file_path"])
|
||||
text, page_count = await extractor.extract_text(doc["file_path"])
|
||||
text, page_count, page_offsets = await extractor.extract_text(doc["file_path"])
|
||||
|
||||
await db.update_document(
|
||||
document_id,
|
||||
@@ -70,9 +70,9 @@ async def process_document(document_id: UUID, case_id: UUID) -> dict:
|
||||
except Exception as e:
|
||||
logger.warning("Classification failed (non-fatal): %s", e)
|
||||
|
||||
# Step 2: Chunk
|
||||
# Step 2: Chunk (page_offsets propagates page_number into chunks)
|
||||
logger.info("Chunking document (%d chars)", len(text))
|
||||
chunks = chunker.chunk_document(text)
|
||||
chunks = chunker.chunk_document(text, page_offsets=page_offsets)
|
||||
|
||||
if not chunks:
|
||||
await db.update_document(document_id, extraction_status="completed")
|
||||
|
||||
@@ -144,7 +144,7 @@ async def document_upload_training(
|
||||
shutil.copy2(str(source), str(dest))
|
||||
|
||||
# Extract text and strip Nevo preamble
|
||||
text, page_count = await extractor.extract_text(str(dest))
|
||||
text, page_count, _ = await extractor.extract_text(str(dest))
|
||||
text = extractor.strip_nevo_preamble(text)
|
||||
|
||||
# Parse date
|
||||
|
||||
@@ -308,7 +308,7 @@ async def ingest_final_version(
|
||||
# Extract text from file if provided
|
||||
if file_path and not final_text:
|
||||
from legal_mcp.services import extractor
|
||||
final_text, _ = await extractor.extract_text(file_path)
|
||||
final_text, _, _ = await extractor.extract_text(file_path)
|
||||
|
||||
if not final_text:
|
||||
return "לא סופק טקסט — יש לספק file_path או final_text."
|
||||
|
||||
Reference in New Issue
Block a user