feat(retrieval): add voyage-multimodal-3 page-image embeddings (feature flag)
All checks were successful
Build & Deploy / build-and-deploy (push) Successful in 1m50s
All checks were successful
Build & Deploy / build-and-deploy (push) Successful in 1m50s
Stage C: per-page image embeddings via voyage-multimodal-3 + hybrid text+image search. Off by default; enable with MULTIMODAL_ENABLED=true.

- Schema V9: document_image_embeddings + precedent_image_embeddings (vector(1024), page_number, image_thumbnail_path)
- extractor.render_pages_for_multimodal renders PDF pages at MULTIMODAL_DPI (144) for embedding + JPEG thumbnails at MULTIMODAL_THUMB_DPI (96) for UI preview, in one pass
- embeddings.embed_images calls voyage-multimodal-3 in 50-page batches
- services/hybrid_search.py orchestrator: rerank applied to text side first (rerank-2 is text-only); image side cosine; weighted merge with text_weight 0.65 (env-tunable); image-only pages surface as match_type='image' so dense scanned content still appears
- processor.process_document and precedent_library.ingest_precedent gated by flag — non-fatal on multimodal failure
- scripts/multimodal_backfill.py — idempotent per-case CLI to embed existing documents without re-extracting text

Validated locally on a 5-page response brief: render 0.31s, embed 8.32s, hybrid merge surfaces image rows correctly. Production rollout starts with flag=false (no behavior change), then per-case A/B.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -9,6 +9,7 @@ Post-processing: Hebrew abbreviation quote fixer.
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import io
|
||||
import logging
|
||||
import re
|
||||
import subprocess
|
||||
@@ -16,6 +17,7 @@ import tempfile
|
||||
from pathlib import Path
|
||||
|
||||
import fitz # PyMuPDF
|
||||
from PIL import Image
|
||||
from docx import Document as DocxDocument
|
||||
from google.cloud import vision
|
||||
from striprtf.striprtf import rtf_to_text
|
||||
@@ -220,6 +222,65 @@ def _extract_rtf(path: Path) -> str:
|
||||
return rtf_to_text(rtf_content)
|
||||
|
||||
|
||||
# ── Multimodal page rendering (V9) ───────────────────────────────
|
||||
|
||||
|
||||
def _pixmap_to_pil(pix: fitz.Pixmap) -> Image.Image:
    """Convert a PyMuPDF pixmap to an RGB ``PIL.Image``.

    The raw sample buffer is handed to Pillow directly instead of
    round-tripping through PNG bytes (``tobytes('png')`` followed by
    ``Image.open()``), which avoids an encode/decode cycle.

    Args:
        pix: Source pixmap. Grayscale/CMYK pixmaps are converted to RGB
            and any alpha channel is dropped before the hand-off.

    Returns:
        An RGB image with the same pixel dimensions as ``pix``.

    Raises:
        ValueError: If ``pix`` has no colorspace (image/stencil mask) and
            therefore cannot be converted to RGB.
    """
    # ``pix.n`` counts components per pixel including alpha, so
    # ``n - alpha`` is the number of colour components. Anything other
    # than 3 (gray = 1, CMYK = 4) would give Image.frombytes("RGB", ...)
    # a buffer of the wrong size.
    if pix.n - pix.alpha != 3:
        if pix.colorspace is None:
            raise ValueError(
                "cannot convert a pixmap without a colorspace to RGB"
            )
        pix = fitz.Pixmap(fitz.csRGB, pix)

    if pix.alpha:
        # Drop alpha channel — voyage multimodal expects RGB.
        pix = fitz.Pixmap(pix, 0)

    return Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
|
||||
|
||||
|
||||
def render_pages_for_multimodal(
    pdf_path: str | Path,
    embed_dpi: int,
    thumb_dpi: int | None = None,
    thumbnail_dir: Path | None = None,
) -> list[tuple[Image.Image, Path | None]]:
    """Render every page of a PDF for the multimodal embedder.

    Each page is rasterised once at ``embed_dpi``. When both
    ``thumbnail_dir`` and a non-zero ``thumb_dpi`` are given, a smaller
    JPEG thumbnail (``p001.jpg``, ``p002.jpg``, ...) is derived from that
    same render and written to ``thumbnail_dir`` for UI preview.

    Args:
        pdf_path: Path to the PDF file.
        embed_dpi: Resolution of the in-memory page images. Must be > 0.
        thumb_dpi: Resolution of the thumbnails; ``None`` or ``0``
            disables thumbnail generation.
        thumbnail_dir: Directory for thumbnails (created if missing,
            even when ``thumb_dpi`` disables thumbnail output). A plain
            string path is accepted as well.

    Returns:
        ``[(pil_image, thumb_path_or_None), ...]`` in page order. The
        full-DPI images are only kept in memory — all of them are held in
        the returned list at once — and only thumbnails touch the disk.

    Raises:
        ValueError: If ``embed_dpi`` is not positive or ``thumb_dpi`` is
            negative.
        FileNotFoundError: If ``pdf_path`` is not an existing file.
    """
    # Validate up front: a zero embed_dpi would otherwise make PyMuPDF
    # silently fall back to its default resolution and later divide by
    # zero when the thumbnail ratio is computed.
    if embed_dpi <= 0:
        raise ValueError(f"embed_dpi must be positive, got {embed_dpi}")
    if thumb_dpi is not None and thumb_dpi < 0:
        raise ValueError(f"thumb_dpi must not be negative, got {thumb_dpi}")

    src = Path(pdf_path)
    if not src.is_file():
        raise FileNotFoundError(f"PDF not found: {src}")

    if thumbnail_dir is not None:
        # Accept str as well as Path, mirroring how pdf_path is handled.
        thumbnail_dir = Path(thumbnail_dir)
        thumbnail_dir.mkdir(parents=True, exist_ok=True)

    make_thumbs = thumbnail_dir is not None and bool(thumb_dpi)
    # Loop-invariant scale factor from the embed render to the thumbnail.
    ratio = thumb_dpi / embed_dpi if make_thumbs else 0.0

    out: list[tuple[Image.Image, Path | None]] = []
    # The context manager closes the document even if rendering fails.
    with fitz.open(str(src)) as doc:
        for page_num, page in enumerate(doc, start=1):
            img = _pixmap_to_pil(page.get_pixmap(dpi=embed_dpi))

            thumb_path: Path | None = None
            if make_thumbs:
                thumb_path = thumbnail_dir / f"p{page_num:03d}.jpg"
                # Downsample the same render rather than re-rendering
                # with PyMuPDF — far faster.
                thumb_size = (
                    max(1, int(img.width * ratio)),
                    max(1, int(img.height * ratio)),
                )
                thumb = img.resize(thumb_size, Image.Resampling.LANCZOS)
                thumb.save(thumb_path, "JPEG", quality=75, optimize=True)

            out.append((img, thumb_path))
    return out
|
||||
|
||||
|
||||
# ── Nevo preamble stripping ──────────────────────────────────────
|
||||
|
||||
_NEVO_MARKERS = ("ספרות:", "חקיקה שאוזכרה:", "מיני-רציו:", "פסקי דין שאוזכרו:",
|
||||
|
||||
Reference in New Issue
Block a user