feat(retrieval): add voyage-multimodal-3 page-image embeddings (feature flag)

Stage C: per-page image embeddings via voyage-multimodal-3 + hybrid text+image search. Off by default; enable with MULTIMODAL_ENABLED=true.

- Schema V9: document_image_embeddings + precedent_image_embeddings (vector(1024), page_number, image_thumbnail_path)
- extractor.render_pages_for_multimodal renders PDF pages at MULTIMODAL_DPI (144) for embedding + JPEG thumbnails at MULTIMODAL_THUMB_DPI (96) for UI preview, in one pass
- embeddings.embed_images calls voyage-multimodal-3 in 50-page batches
- services/hybrid_search.py orchestrator: rerank applied to text side first (rerank-2 is text-only); image side cosine; weighted merge with text_weight 0.65 (env-tunable); image-only pages surface as match_type='image' so dense scanned content still appears
- processor.process_document and precedent_library.ingest_precedent gated by flag — non-fatal on multimodal failure
- scripts/multimodal_backfill.py — idempotent per-case CLI to embed existing documents without re-extracting text

Validated locally on a 5-page response brief: render 0.31s, embed 8.32s, hybrid merge surfaces image rows correctly. Production rollout starts with flag=false (no behavior change), then per-case A/B.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-03 19:24:52 +00:00
parent b9cdcf980d
commit 242f668319
10 changed files with 1038 additions and 40 deletions
--- a/mcp-server/src/legal_mcp/services/processor.py
+++ b/mcp-server/src/legal_mcp/services/processor.py
@@ -2,10 +2,12 @@

 from __future__ import annotations

+import asyncio
 import logging
 from pathlib import Path
 from uuid import UUID

+from legal_mcp import config
 from legal_mcp.services import chunker, db, embeddings, extractor, references_extractor

 logger = logging.getLogger(__name__)
@@ -95,6 +97,21 @@ async def process_document(document_id: UUID, case_id: UUID) -> dict:

        stored = await db.store_chunks(document_id, case_id, chunk_dicts)

+        # Step 4.5: Multimodal page-image embeddings (V9). Gated by
+        # MULTIMODAL_ENABLED. Renders each PDF page → embeds via
+        # voyage-multimodal-3 → stores per-page row with thumbnail.
+        # Non-fatal on failure (text path already succeeded).
+        multimodal_result = {"pages_embedded": 0}
+        if config.MULTIMODAL_ENABLED and page_count > 0:
+            try:
+                pdf_path = Path(doc["file_path"])
+                if pdf_path.suffix.lower() == ".pdf":
+                    multimodal_result = await _embed_document_pages(
+                        document_id, case_id, pdf_path, page_count,
+                    )
+            except Exception as e:
+                logger.warning("Multimodal embedding failed (non-fatal): %s", e)
+
        # Step 5: Extract references (plans, case law, legislation) — non-fatal
        refs_result = {"plans": 0, "case_law": 0, "case_law_linked": 0, "legislation": 0}
        try:
@@ -124,9 +141,63 @@ async def process_document(document_id: UUID, case_id: UUID) -> dict:
                "case_law": refs_result["case_law"],
                "legislation": refs_result["legislation"],
            },
+            "multimodal": multimodal_result,
        }

    except Exception as e:
        logger.exception("Document processing failed: %s", e)
        await db.update_document(document_id, extraction_status="failed")
        return {"status": "failed", "error": str(e)}
+
+
+async def _embed_document_pages(
+    document_id: UUID,
+    case_id: UUID,
+    pdf_path: Path,
+    page_count: int,
+) -> dict:
+    """Render PDF pages → embed via voyage-multimodal → store per-page rows.
+
+    Thumbnails are saved under
+    ``data/cases/{case_number}/thumbnails/{document_id}/p{N:03d}.jpg``
+    so the UI can show small previews next to image-side search hits.
+
+    Args:
+        document_id: Document whose pages are embedded; also names the
+            thumbnail sub-directory.
+        case_id: Owning case, passed through to the storage call.
+        pdf_path: Path to the original PDF. The case directory is taken
+            to be three levels above it (see layout comment below).
+        page_count: Page count reported by the caller; used for logging
+            only — the pages actually embedded are whatever the renderer
+            returns.
+
+    Returns:
+        ``{"pages_embedded": <count returned by the DB store call>,
+        "model": config.MULTIMODAL_MODEL}``.
+
+    Raises:
+        RuntimeError: If the embedding call returns a different number of
+            vectors than pages rendered. Nothing is stored in that case,
+            because page numbers are assigned by position and a mismatch
+            would drop pages or attach vectors to the wrong page.
+    """
+    # Layout: data/cases/{case_number}/documents/originals/{file}.pdf
+    # → case_dir = pdf_path.parent.parent.parent
+    case_dir = pdf_path.parent.parent.parent
+    thumb_dir = case_dir / "thumbnails" / str(document_id)
+
+    logger.info("Multimodal: rendering %d pages @ %ddpi", page_count, config.MULTIMODAL_DPI)
+    # Rendering is synchronous, so run it in a worker thread to keep the
+    # event loop responsive.
+    rendered = await asyncio.to_thread(
+        extractor.render_pages_for_multimodal,
+        pdf_path,
+        config.MULTIMODAL_DPI,
+        config.MULTIMODAL_THUMB_DPI,
+        thumb_dir,
+    )
+    # Each rendered item is unpacked as (image, thumbnail path or None).
+    images = [pil for pil, _ in rendered]
+    thumb_paths = [thumb for _, thumb in rendered]
+
+    logger.info("Multimodal: embedding %d pages via %s", len(images), config.MULTIMODAL_MODEL)
+    img_embs = await embeddings.embed_images(images)
+
+    # zip() would silently truncate on a length mismatch; fail loudly
+    # instead so partial/misaligned rows are never written.
+    if len(img_embs) != len(thumb_paths):
+        raise RuntimeError(
+            f"Multimodal: got {len(img_embs)} embeddings for "
+            f"{len(thumb_paths)} rendered pages"
+        )
+
+    page_records = []
+    # Page numbers are 1-based and follow render order
+    # (assumes the renderer yields pages in document order — TODO confirm).
+    for page_number, (emb, thumb) in enumerate(zip(img_embs, thumb_paths), start=1):
+        rel_thumb = None
+        if thumb is not None:
+            try:
+                # Store the path relative to DATA_DIR when possible.
+                rel_thumb = str(thumb.relative_to(config.DATA_DIR))
+            except ValueError:
+                # Thumbnail lives outside DATA_DIR: keep the path as given.
+                rel_thumb = str(thumb)
+        page_records.append({
+            "page_number": page_number,
+            "embedding": emb,
+            "image_thumbnail_path": rel_thumb,
+        })
+
+    stored = await db.store_document_image_embeddings(
+        document_id, case_id, page_records,
+        model_name=config.MULTIMODAL_MODEL,
+    )
+    logger.info("Multimodal: stored %d page-image embeddings", stored)
+    return {"pages_embedded": stored, "model": config.MULTIMODAL_MODEL}