feat(retrieval): add voyage-multimodal-3 page-image embeddings (feature flag)

Stage C: per-page image embeddings via voyage-multimodal-3 + hybrid text+image search. Off by default; enable with MULTIMODAL_ENABLED=true.

- Schema V9: document_image_embeddings + precedent_image_embeddings (vector(1024), page_number, image_thumbnail_path)
- extractor.render_pages_for_multimodal renders PDF pages at MULTIMODAL_DPI (144) for embedding + JPEG thumbnails at MULTIMODAL_THUMB_DPI (96) for UI preview, in one pass
- embeddings.embed_images calls voyage-multimodal-3 in 50-page batches
- services/hybrid_search.py orchestrator: rerank applied to text side first (rerank-2 is text-only); image side cosine; weighted merge with text_weight 0.65 (env-tunable); image-only pages surface as match_type='image' so dense scanned content still appears
- processor.process_document and precedent_library.ingest_precedent gated by flag — non-fatal on multimodal failure
- scripts/multimodal_backfill.py — idempotent per-case CLI to embed existing documents without re-extracting text

Validated locally on a 5-page response brief: render 0.31s, embed 8.32s, hybrid merge surfaces image rows correctly. Production rollout starts with flag=false (no behavior change), then per-case A/B.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-03 19:24:52 +00:00
parent b9cdcf980d
commit 242f668319
10 changed files with 1038 additions and 40 deletions
--- a/mcp-server/src/legal_mcp/services/precedent_library.py
+++ b/mcp-server/src/legal_mcp/services/precedent_library.py
@@ -13,6 +13,7 @@ SSE plumbing without this module knowing about Redis.

 from __future__ import annotations

+import asyncio
 import logging
 import re
 import shutil
@@ -22,7 +23,7 @@ from typing import Awaitable, Callable
 from uuid import UUID, uuid4

 from legal_mcp import config
-from legal_mcp.services import chunker, db, embeddings, extractor, rerank
+from legal_mcp.services import chunker, db, embeddings, extractor, hybrid_search, rerank  # noqa: F401

 # Note: halacha_extractor and precedent_metadata_extractor are NOT imported
 # at module load. They are imported lazily inside the dedicated re-extract
@@ -188,6 +189,18 @@ async def ingest_precedent(
        ]
        stored_chunks = await db.store_precedent_chunks(case_law_id, chunk_dicts)

+        # Multimodal page-image embeddings (V9). Gated by feature flag.
+        # Non-fatal: text path already succeeded. Only PDFs.
+        if config.MULTIMODAL_ENABLED and page_count > 0 and staged.suffix.lower() == ".pdf":
+            try:
+                await progress(
+                    "embedding_images", 70,
+                    f"מטמיע {page_count} עמודי תמונה (multimodal)",
+                )
+                await _embed_precedent_pages(case_law_id, staged, page_count)
+            except Exception as e:
+                logger.warning("Precedent multimodal embedding failed (non-fatal): %s", e)
+
        # Pipeline split: the container does the non-LLM half (extract +
        # chunk + embed + store). LLM-driven extraction (metadata, halachot)
        # runs separately via the MCP tool `precedent_process_pending` from
@@ -413,19 +426,60 @@ async def search_library(
        return []
    query_vec = await embeddings.embed_query(query)

-    async def _base(limit: int) -> list[dict]:
-        return await db.search_precedent_library_semantic(
-            query_embedding=query_vec,
-            practice_area=practice_area,
-            court=court,
-            precedent_level=precedent_level,
-            appeal_subtype=appeal_subtype,
-            is_binding=is_binding,
-            subject_tag=subject_tag,
-            limit=limit,
-            include_halachot=include_halachot,
-        )
-
-    return await rerank.maybe_rerank(
-        query=query, base_search=_base, limit=limit,
+    return await hybrid_search.search_precedent_library_hybrid(
+        query=query,
+        query_text_embedding=query_vec,
+        limit=limit,
+        practice_area=practice_area,
+        court=court,
+        precedent_level=precedent_level,
+        appeal_subtype=appeal_subtype,
+        is_binding=is_binding,
+        subject_tag=subject_tag,
+        include_halachot=include_halachot,
    )
+
+
+async def _embed_precedent_pages(
+    case_law_id: UUID,
+    pdf_path: Path,
+    page_count: int,
+) -> dict:
+    """Render precedent PDF pages, embed them as images, and store the vectors.
+
+    Thumbnails go to
+    ``data/precedent-library/thumbnails/{case_law_id}/p{N:03d}.jpg``.
+
+    Args:
+        case_law_id: Identifier of the precedent the pages belong to; also
+            names the thumbnail sub-directory.
+        pdf_path: Path of the PDF handed to the page renderer.
+        page_count: Page count the caller expects. Used only as a sanity
+            check against the number of pages actually rendered.
+
+    Returns:
+        ``{"pages_embedded": n}`` where ``n`` is the value returned by
+        ``db.store_precedent_image_embeddings``.
+
+    Raises:
+        ValueError: If the number of embeddings returned differs from the
+            number of rendered pages. Page numbers are assigned by position,
+            so a count mismatch means they can no longer be trusted and
+            nothing is stored.
+    """
+    thumb_dir = PRECEDENT_LIBRARY_DIR / "thumbnails" / str(case_law_id)
+    # Run the synchronous renderer in a worker thread so the event loop is
+    # not blocked while pages are rasterised.
+    rendered = await asyncio.to_thread(
+        extractor.render_pages_for_multimodal,
+        pdf_path,
+        config.MULTIMODAL_DPI,
+        config.MULTIMODAL_THUMB_DPI,
+        thumb_dir,
+    )
+    images = [pil for pil, _ in rendered]
+    thumbs = [t for _, t in rendered]
+
+    if len(images) != page_count:
+        logger.warning(
+            "Multimodal: rendered %d pages but expected %d for case_law %s",
+            len(images), page_count, case_law_id,
+        )
+
+    img_embs = await embeddings.embed_images(images)
+
+    # A plain zip() would silently truncate here and could attach vectors to
+    # the wrong page numbers; fail loudly instead of storing misaligned rows.
+    if len(img_embs) != len(images):
+        raise ValueError(
+            f"embed_images returned {len(img_embs)} embeddings "
+            f"for {len(images)} rendered pages"
+        )
+
+    page_records = []
+    for page_number, (emb, thumb) in enumerate(zip(img_embs, thumbs), start=1):
+        rel_thumb = None
+        if thumb is not None:
+            try:
+                # Prefer a path relative to the data directory.
+                rel_thumb = str(thumb.relative_to(config.DATA_DIR))
+            except ValueError:
+                # Thumbnail is outside DATA_DIR: keep the path as given.
+                rel_thumb = str(thumb)
+        page_records.append({
+            "page_number": page_number,
+            "embedding": emb,
+            "image_thumbnail_path": rel_thumb,
+        })
+    stored = await db.store_precedent_image_embeddings(
+        case_law_id, page_records, model_name=config.MULTIMODAL_MODEL,
+    )
+    logger.info(
+        "Multimodal: stored %d page-image embeddings for case_law %s",
+        stored, case_law_id,
+    )
+    return {"pages_embedded": stored}