feat(retrieval): add voyage-multimodal-3 page-image embeddings (feature flag)
All checks were successful
Build & Deploy / build-and-deploy (push) Successful in 1m50s

Stage C: per-page image embeddings via voyage-multimodal-3 + hybrid
text+image search. Off by default; enable with MULTIMODAL_ENABLED=true.

- Schema V9: document_image_embeddings + precedent_image_embeddings
  (vector(1024), page_number, image_thumbnail_path)
- extractor.render_pages_for_multimodal renders PDF pages at
  MULTIMODAL_DPI (144) for embedding + JPEG thumbnails at
  MULTIMODAL_THUMB_DPI (96) for UI preview, in one pass
- embeddings.embed_images calls voyage-multimodal-3 in 50-page batches
- services/hybrid_search.py orchestrator: rerank applied to text side
  first (rerank-2 is text-only); image side cosine; weighted merge
  with text_weight 0.65 (env-tunable); image-only pages surface as
  match_type='image' so dense scanned content still appears
- processor.process_document and precedent_library.ingest_precedent
  gated by flag — non-fatal on multimodal failure
- scripts/multimodal_backfill.py — idempotent per-case CLI to embed
  existing documents without re-extracting text

Validated locally on a 5-page response brief: render 0.31s, embed 8.32s,
hybrid merge surfaces image rows correctly. Production rollout starts
with flag=false (no behavior change), then per-case A/B.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-03 19:24:52 +00:00
parent b9cdcf980d
commit 242f668319
10 changed files with 1038 additions and 40 deletions

View File

@@ -58,6 +58,29 @@ VOYAGE_RERANK_ENABLED = (
# 50 was the depth used in the POC; balances recall vs rerank cost.
VOYAGE_RERANK_FETCH_K = int(os.environ.get("VOYAGE_RERANK_FETCH_K", "50"))
# Multimodal — page-image embeddings via voyage-multimodal-3. Off by
# default; flip with env to enable per-page image embedding during
# ingestion + hybrid (text+image) ranking at search time. POC #3
# validated on a 89-page appraisal PDF (38s, 312K tokens, recovered
# table structure + image-only scanned pages that text-OCR misses).
MULTIMODAL_ENABLED = (
os.environ.get("MULTIMODAL_ENABLED", "false").lower() == "true"
)
MULTIMODAL_MODEL = os.environ.get("MULTIMODAL_MODEL", "voyage-multimodal-3")
# Render DPI for the image fed to the embedder. POC used 144 — sweet
# spot between embedding quality and tokens/page (144 ≈ 3.5K tok/page).
MULTIMODAL_DPI = int(os.environ.get("MULTIMODAL_DPI", "144"))
# Separate, lower DPI for the JPEG thumbnail saved to disk for UI
# preview. ~96dpi → ~20KB/page; ingestion-time, no re-render at view.
MULTIMODAL_THUMB_DPI = int(os.environ.get("MULTIMODAL_THUMB_DPI", "96"))
# Hybrid merge weight for the *text* side. The image side gets
# (1 - this). POC found text dominates most queries; image wins only
# on table/visual queries — slight text bias starting point, tunable
# per env without redeploy.
MULTIMODAL_TEXT_WEIGHT = float(
os.environ.get("MULTIMODAL_TEXT_WEIGHT", "0.65")
)
# Halacha extraction — auto-approve threshold. Halachot with extractor
# confidence >= this value are inserted with review_status='approved'
# instead of 'pending_review' (so they immediately appear in

View File

@@ -623,6 +623,54 @@ CREATE INDEX IF NOT EXISTS idx_case_law_halacha_requested
"""
# ── V9: Multimodal page-image embeddings ─────────────────────────
# voyage-multimodal-3 (1024-dim) embeds the whole page as an image:
# captures table layout, scanned content, signatures, plans — content
# that text-OCR loses. Ingestion is gated by config.MULTIMODAL_ENABLED;
# search_*_hybrid() merge text-cosine + image-cosine when present.
# image_thumbnail_path is a relative path under DATA_DIR/cases/{case}/
# thumbnails/ or DATA_DIR/precedent-library/thumbnails/ — a small JPEG
# rendered at config.MULTIMODAL_THUMB_DPI for UI preview, distinct from
# the higher-DPI render fed to the embedder (which is not persisted).
SCHEMA_V9_SQL = """
CREATE TABLE IF NOT EXISTS document_image_embeddings (
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
document_id UUID REFERENCES documents(id) ON DELETE CASCADE,
case_id UUID REFERENCES cases(id) ON DELETE CASCADE,
page_number INTEGER NOT NULL,
image_thumbnail_path TEXT,
embedding vector(1024),
model_name TEXT DEFAULT 'voyage-multimodal-3',
created_at TIMESTAMPTZ DEFAULT now(),
UNIQUE(document_id, page_number)
);
CREATE INDEX IF NOT EXISTS idx_doc_img_emb_vec
ON document_image_embeddings USING ivfflat (embedding vector_cosine_ops)
WITH (lists = 50);
CREATE INDEX IF NOT EXISTS idx_doc_img_emb_doc
ON document_image_embeddings(document_id);
CREATE INDEX IF NOT EXISTS idx_doc_img_emb_case
ON document_image_embeddings(case_id);
CREATE TABLE IF NOT EXISTS precedent_image_embeddings (
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
case_law_id UUID REFERENCES case_law(id) ON DELETE CASCADE,
page_number INTEGER NOT NULL,
image_thumbnail_path TEXT,
embedding vector(1024),
model_name TEXT DEFAULT 'voyage-multimodal-3',
created_at TIMESTAMPTZ DEFAULT now(),
UNIQUE(case_law_id, page_number)
);
CREATE INDEX IF NOT EXISTS idx_prec_img_emb_vec
ON precedent_image_embeddings USING ivfflat (embedding vector_cosine_ops)
WITH (lists = 50);
CREATE INDEX IF NOT EXISTS idx_prec_img_emb_case_law
ON precedent_image_embeddings(case_law_id);
"""
async def init_schema() -> None:
pool = await get_pool()
async with pool.acquire() as conn:
@@ -635,7 +683,8 @@ async def init_schema() -> None:
await conn.execute(SCHEMA_V6_SQL)
await conn.execute(SCHEMA_V7_SQL)
await conn.execute(SCHEMA_V8_SQL)
logger.info("Database schema initialized (v1-v8)")
await conn.execute(SCHEMA_V9_SQL)
logger.info("Database schema initialized (v1-v9)")
# ── Case CRUD ───────────────────────────────────────────────────────
@@ -2350,3 +2399,300 @@ async def clear_extraction_request(
f"UPDATE case_law SET {col} = NULL WHERE id = $1",
case_law_id,
)
# ── V9: Multimodal page image embeddings ─────────────────────────
async def store_document_image_embeddings(
document_id: UUID,
case_id: UUID | None,
page_records: list[dict],
model_name: str = "voyage-multimodal-3",
) -> int:
"""Replace per-page image embeddings for a document.
Each ``page_records`` entry: ``{page_number, embedding, image_thumbnail_path}``.
Embeddings should already be 1024-dim lists (or None for skipped pages).
"""
pool = await get_pool()
async with pool.acquire() as conn:
await conn.execute(
"DELETE FROM document_image_embeddings WHERE document_id = $1",
document_id,
)
for r in page_records:
await conn.execute(
"""INSERT INTO document_image_embeddings
(document_id, case_id, page_number, embedding,
image_thumbnail_path, model_name)
VALUES ($1, $2, $3, $4, $5, $6)""",
document_id, case_id,
r["page_number"],
r.get("embedding"),
r.get("image_thumbnail_path"),
model_name,
)
return len(page_records)
async def store_precedent_image_embeddings(
case_law_id: UUID,
page_records: list[dict],
model_name: str = "voyage-multimodal-3",
) -> int:
"""Same pattern as store_document_image_embeddings but for precedents."""
pool = await get_pool()
async with pool.acquire() as conn:
await conn.execute(
"DELETE FROM precedent_image_embeddings WHERE case_law_id = $1",
case_law_id,
)
for r in page_records:
await conn.execute(
"""INSERT INTO precedent_image_embeddings
(case_law_id, page_number, embedding,
image_thumbnail_path, model_name)
VALUES ($1, $2, $3, $4, $5)""",
case_law_id,
r["page_number"],
r.get("embedding"),
r.get("image_thumbnail_path"),
model_name,
)
return len(page_records)
async def search_document_images_similar(
query_embedding: list[float],
limit: int = 10,
case_id: UUID | None = None,
practice_area: str | None = None,
appeal_subtype: str | None = None,
) -> list[dict]:
"""Cosine search over per-page image embeddings of case documents."""
pool = await get_pool()
conditions: list[str] = []
params: list = [query_embedding, limit]
idx = 3
if case_id:
conditions.append(f"die.case_id = ${idx}")
params.append(case_id); idx += 1
if practice_area:
conditions.append(f"c.practice_area = ${idx}")
params.append(practice_area); idx += 1
if appeal_subtype:
conditions.append(f"c.appeal_subtype = ${idx}")
params.append(appeal_subtype); idx += 1
where = f"WHERE {' AND '.join(conditions)}" if conditions else ""
sql = f"""
SELECT die.document_id, die.case_id, die.page_number,
die.image_thumbnail_path,
d.title AS document_title,
c.case_number,
1 - (die.embedding <=> $1) AS score
FROM document_image_embeddings die
JOIN documents d ON d.id = die.document_id
JOIN cases c ON c.id = die.case_id
{where}
ORDER BY die.embedding <=> $1
LIMIT $2
"""
async with pool.acquire() as conn:
rows = await conn.fetch(sql, *params)
return [dict(r) for r in rows]
async def search_precedent_images_similar(
query_embedding: list[float],
limit: int = 10,
practice_area: str = "",
court: str = "",
precedent_level: str = "",
appeal_subtype: str = "",
is_binding: bool | None = None,
) -> list[dict]:
"""Cosine search over per-page image embeddings of precedent rulings."""
pool = await get_pool()
conditions: list[str] = ["cl.source_kind = 'external_upload'"]
params: list = [query_embedding, limit]
idx = 3
if practice_area:
conditions.append(f"cl.practice_area = ${idx}")
params.append(practice_area); idx += 1
if court:
conditions.append(f"cl.court ILIKE ${idx}")
params.append(f"%{court}%"); idx += 1
if precedent_level:
conditions.append(f"cl.precedent_level = ${idx}")
params.append(precedent_level); idx += 1
if appeal_subtype:
conditions.append(f"cl.appeal_subtype = ${idx}")
params.append(appeal_subtype); idx += 1
if is_binding is not None:
conditions.append(f"cl.is_binding = ${idx}")
params.append(is_binding); idx += 1
where = " AND ".join(conditions)
sql = f"""
SELECT pie.case_law_id, pie.page_number, pie.image_thumbnail_path,
cl.case_number, cl.case_name, cl.court, cl.date AS decision_date,
cl.precedent_level, cl.practice_area,
1 - (pie.embedding <=> $1) AS score
FROM precedent_image_embeddings pie
JOIN case_law cl ON cl.id = pie.case_law_id
WHERE {where}
ORDER BY pie.embedding <=> $1
LIMIT $2
"""
async with pool.acquire() as conn:
rows = await conn.fetch(sql, *params)
out = []
for r in rows:
d = dict(r)
if d.get("decision_date") is not None:
d["decision_date"] = d["decision_date"].isoformat()
out.append(d)
return out
async def search_similar_hybrid(
query_text_embedding: list[float],
query_image_embedding: list[float],
limit: int = 10,
fetch_k: int = 30,
text_weight: float = 0.65,
case_id: UUID | None = None,
section_type: str | None = None,
practice_area: str | None = None,
appeal_subtype: str | None = None,
) -> list[dict]:
"""Weighted merge of text-chunk and per-page image search.
Same (document_id, page_number) → boost text chunk by image score
on that page. Image-only pages with no overlapping text chunk are
surfaced as ``match_type='image'`` so dense scanned content still
appears in results.
"""
img_weight = 1.0 - text_weight
text_rows = await search_similar(
query_text_embedding, limit=fetch_k, case_id=case_id,
section_type=section_type, practice_area=practice_area,
appeal_subtype=appeal_subtype,
)
img_rows = await search_document_images_similar(
query_image_embedding, limit=fetch_k, case_id=case_id,
practice_area=practice_area, appeal_subtype=appeal_subtype,
)
img_by_page: dict[tuple, dict] = {
(str(r["document_id"]), r["page_number"]): r for r in img_rows
}
seen: set = set()
merged: list[dict] = []
for r in text_rows:
page = r.get("page_number")
key = (str(r["document_id"]), page) if page is not None else None
img_hit = img_by_page.get(key) if key else None
text_score = float(r["score"])
image_score = float(img_hit["score"]) if img_hit else 0.0
d = dict(r)
d["text_score"] = text_score
d["image_score"] = image_score
d["score"] = text_score * text_weight + image_score * img_weight
d["match_type"] = "text+image" if img_hit else "text"
if img_hit:
d["image_thumbnail_path"] = img_hit.get("image_thumbnail_path")
merged.append(d)
if key:
seen.add(key)
for r in img_rows:
key = (str(r["document_id"]), r["page_number"])
if key in seen:
continue
d = dict(r)
d["text_score"] = 0.0
d["image_score"] = float(r["score"])
d["score"] = float(r["score"]) * img_weight
d["match_type"] = "image"
d["content"] = ""
d["section_type"] = "image"
merged.append(d)
merged.sort(key=lambda x: -x["score"])
return merged[:limit]
async def search_precedent_library_hybrid(
query_text_embedding: list[float],
query_image_embedding: list[float],
limit: int = 10,
fetch_k: int = 30,
text_weight: float = 0.65,
practice_area: str = "",
court: str = "",
precedent_level: str = "",
appeal_subtype: str = "",
is_binding: bool | None = None,
subject_tag: str = "",
include_halachot: bool = True,
) -> list[dict]:
"""Hybrid variant of search_precedent_library_semantic.
Halachot have no ``page_number`` — they're boosted by the max
image score from any page in the same case_law row.
"""
img_weight = 1.0 - text_weight
text_results = await search_precedent_library_semantic(
query_text_embedding,
practice_area=practice_area, court=court,
precedent_level=precedent_level, appeal_subtype=appeal_subtype,
is_binding=is_binding, subject_tag=subject_tag,
limit=fetch_k, include_halachot=include_halachot,
)
img_results = await search_precedent_images_similar(
query_image_embedding, limit=fetch_k,
practice_area=practice_area, court=court,
precedent_level=precedent_level, appeal_subtype=appeal_subtype,
is_binding=is_binding,
)
img_by_page: dict[tuple, dict] = {}
img_by_case: dict[str, float] = {}
for r in img_results:
cid = str(r["case_law_id"])
img_by_page[(cid, r["page_number"])] = r
img_by_case[cid] = max(img_by_case.get(cid, 0.0), float(r["score"]))
seen: set = set()
merged: list[dict] = []
for r in text_results:
cid = str(r["case_law_id"])
page = r.get("page_number")
key = (cid, page) if page is not None else None
img_hit = img_by_page.get(key) if key else None
if img_hit:
image_score = float(img_hit["score"])
elif r.get("type") == "halacha":
image_score = img_by_case.get(cid, 0.0)
else:
image_score = 0.0
text_score = float(r["score"])
d = dict(r)
d["text_score"] = text_score
d["image_score"] = image_score
d["score"] = text_score * text_weight + image_score * img_weight
if img_hit:
d["image_thumbnail_path"] = img_hit.get("image_thumbnail_path")
if key:
seen.add(key)
merged.append(d)
for r in img_results:
key = (str(r["case_law_id"]), r["page_number"])
if key in seen:
continue
d = dict(r)
d["text_score"] = 0.0
d["image_score"] = float(r["score"])
d["score"] = float(r["score"]) * img_weight
d["type"] = "image_page"
d["content"] = ""
d["section_type"] = "image"
merged.append(d)
merged.sort(key=lambda x: -x["score"])
return merged[:limit]

View File

@@ -3,15 +3,24 @@
from __future__ import annotations
import logging
from typing import TYPE_CHECKING
import voyageai
from legal_mcp import config
if TYPE_CHECKING:
from PIL import Image as PILImage
logger = logging.getLogger(__name__)
_client: voyageai.Client | None = None
# Per-call cap for multimodal_embed. POC ran 89 pages (~312K tokens)
# in a single call comfortably; 50 leaves safe headroom for densely-
# OCR'd legal pages where tokens/page can exceed 4K.
_MULTIMODAL_BATCH_SIZE = 50
def _get_client() -> voyageai.Client:
global _client
@@ -55,6 +64,45 @@ async def embed_query(query: str) -> list[float]:
return results[0]
async def embed_images(
images: "list[PILImage.Image]",
input_type: str = "document",
) -> list[list[float]]:
"""Embed page images via voyage-multimodal-3.
Each input is a single PIL.Image (one page = one embedding).
Returns a list of 1024-dim vectors, one per input image, in order.
Batches at ``_MULTIMODAL_BATCH_SIZE`` to stay within Voyage's
per-request limits on dense legal pages.
"""
if not images:
return []
client = _get_client()
out: list[list[float]] = []
for i in range(0, len(images), _MULTIMODAL_BATCH_SIZE):
batch = images[i : i + _MULTIMODAL_BATCH_SIZE]
result = client.multimodal_embed(
inputs=[[img] for img in batch],
model=config.MULTIMODAL_MODEL,
input_type=input_type,
truncation=True,
)
out.extend(result.embeddings)
return out
async def embed_query_for_multimodal(query: str) -> list[float]:
"""Embed a text query in the multimodal vector space, so it can be
cosine-compared against page-image embeddings."""
client = _get_client()
result = client.multimodal_embed(
inputs=[[query]],
model=config.MULTIMODAL_MODEL,
input_type="query",
)
return result.embeddings[0]
async def voyage_rerank(
query: str, documents: list[str], top_k: int | None = None,
) -> list[tuple[int, float]]:

View File

@@ -9,6 +9,7 @@ Post-processing: Hebrew abbreviation quote fixer.
from __future__ import annotations
import asyncio
import io
import logging
import re
import subprocess
@@ -16,6 +17,7 @@ import tempfile
from pathlib import Path
import fitz # PyMuPDF
from PIL import Image
from docx import Document as DocxDocument
from google.cloud import vision
from striprtf.striprtf import rtf_to_text
@@ -220,6 +222,65 @@ def _extract_rtf(path: Path) -> str:
return rtf_to_text(rtf_content)
# ── Multimodal page rendering (V9) ───────────────────────────────
def _pixmap_to_pil(pix: fitz.Pixmap) -> Image.Image:
"""Convert a PyMuPDF pixmap to PIL.Image (RGB) without going through
PNG bytes. Faster than tobytes('png') → Image.open()."""
if pix.alpha:
# Drop alpha channel — voyage multimodal expects RGB.
pix = fitz.Pixmap(pix, 0)
return Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
def render_pages_for_multimodal(
pdf_path: str | Path,
embed_dpi: int,
thumb_dpi: int | None = None,
thumbnail_dir: Path | None = None,
) -> list[tuple[Image.Image, Path | None]]:
"""Render each PDF page as PIL.Image at ``embed_dpi`` for the
multimodal embedder, and optionally save a smaller JPEG thumbnail
at ``thumb_dpi`` to ``thumbnail_dir`` for UI preview.
Returns ``[(pil_image, thumb_path_or_None), ...]`` in page order.
The full-DPI image stays in memory only — only the thumbnail is
persisted to disk.
"""
src = Path(pdf_path)
if not src.is_file():
raise FileNotFoundError(f"PDF not found: {src}")
if thumbnail_dir is not None:
thumbnail_dir.mkdir(parents=True, exist_ok=True)
out: list[tuple[Image.Image, Path | None]] = []
doc = fitz.open(str(src))
try:
for page_idx, page in enumerate(doc):
page_num = page_idx + 1
pix = page.get_pixmap(dpi=embed_dpi)
img = _pixmap_to_pil(pix)
thumb_path: Path | None = None
if thumbnail_dir is not None and thumb_dpi:
thumb_path = thumbnail_dir / f"p{page_num:03d}.jpg"
# Downsample the same render rather than re-rendering
# with PyMuPDF — far faster.
ratio = thumb_dpi / embed_dpi
thumb_size = (
max(1, int(img.width * ratio)),
max(1, int(img.height * ratio)),
)
thumb = img.resize(thumb_size, Image.Resampling.LANCZOS)
thumb.save(thumb_path, "JPEG", quality=75, optimize=True)
out.append((img, thumb_path))
finally:
doc.close()
return out
# ── Nevo preamble stripping ──────────────────────────────────────
_NEVO_MARKERS = ("ספרות:", "חקיקה שאוזכרה:", "מיני-רציו:", "פסקי דין שאוזכרו:",

View File

@@ -0,0 +1,202 @@
"""Hybrid (text + image) search wrappers.
Layered on top of ``rerank.maybe_rerank``. When ``MULTIMODAL_ENABLED`` is
true the result comes from a weighted merge of:
• text side: cosine on chunks → optional rerank-2 cross-encoder
• image side: cosine on per-page voyage-multimodal-3 embeddings
rerank-2 is a *text* cross-encoder, so image-side rows are NOT passed
through it; they keep their cosine score and merge alongside the
(possibly reranked) text rows. Image-only pages with no overlapping
text chunk are surfaced as ``match_type='image'`` so scanned-only or
visual-heavy content still appears in results.
When ``MULTIMODAL_ENABLED`` is false this module degenerates to plain
``rerank.maybe_rerank`` — callers can wrap unconditionally and let env
control behaviour.
"""
from __future__ import annotations
import logging
from typing import Any
from uuid import UUID
from legal_mcp import config
from legal_mcp.services import db, embeddings, rerank
logger = logging.getLogger(__name__)
async def search_documents_hybrid(
query: str,
query_text_embedding: list[float],
*,
limit: int,
case_id: UUID | None = None,
section_type: str | None = None,
practice_area: str | None = None,
appeal_subtype: str | None = None,
) -> list[dict]:
"""Hybrid wrapper for document-chunk search (search_decisions /
search_case_documents / find_similar_cases)."""
fetch_k = max(limit, config.VOYAGE_RERANK_FETCH_K) if config.MULTIMODAL_ENABLED else limit
text_results = await rerank.maybe_rerank(
query=query,
base_search=lambda **kw: db.search_similar(
query_embedding=query_text_embedding, **kw,
),
limit=fetch_k,
case_id=case_id,
section_type=section_type,
practice_area=practice_area,
appeal_subtype=appeal_subtype,
)
if not config.MULTIMODAL_ENABLED:
return text_results[:limit]
try:
query_img_emb = await embeddings.embed_query_for_multimodal(query)
img_rows = await db.search_document_images_similar(
query_img_emb,
limit=fetch_k,
case_id=case_id,
practice_area=practice_area,
appeal_subtype=appeal_subtype,
)
except Exception as e:
logger.warning("Hybrid: image side failed, returning text only: %s", e)
return text_results[:limit]
merged = _merge(
text_results, img_rows,
id_field="document_id",
text_weight=config.MULTIMODAL_TEXT_WEIGHT,
)
return merged[:limit]
async def search_precedent_library_hybrid(
query: str,
query_text_embedding: list[float],
*,
limit: int,
practice_area: str = "",
court: str = "",
precedent_level: str = "",
appeal_subtype: str = "",
is_binding: bool | None = None,
subject_tag: str = "",
include_halachot: bool = True,
) -> list[dict]:
"""Hybrid wrapper for precedent-library search."""
fetch_k = max(limit, config.VOYAGE_RERANK_FETCH_K) if config.MULTIMODAL_ENABLED else limit
async def _base(limit_inner: int) -> list[dict]:
return await db.search_precedent_library_semantic(
query_embedding=query_text_embedding,
practice_area=practice_area,
court=court,
precedent_level=precedent_level,
appeal_subtype=appeal_subtype,
is_binding=is_binding,
subject_tag=subject_tag,
limit=limit_inner,
include_halachot=include_halachot,
)
text_results = await rerank.maybe_rerank(
query=query, base_search=_base, limit=fetch_k,
)
if not config.MULTIMODAL_ENABLED:
return text_results[:limit]
try:
query_img_emb = await embeddings.embed_query_for_multimodal(query)
img_rows = await db.search_precedent_images_similar(
query_img_emb,
limit=fetch_k,
practice_area=practice_area,
court=court,
precedent_level=precedent_level,
appeal_subtype=appeal_subtype,
is_binding=is_binding,
)
except Exception as e:
logger.warning("Hybrid: image side failed, returning text only: %s", e)
return text_results[:limit]
merged = _merge(
text_results, img_rows,
id_field="case_law_id",
text_weight=config.MULTIMODAL_TEXT_WEIGHT,
)
return merged[:limit]
def _merge(
text_rows: list[dict],
img_rows: list[dict],
id_field: str,
text_weight: float,
) -> list[dict]:
"""Weighted merge of text + image rows.
Joins on ``(id_field, page_number)``. Halachot in precedent rows
have no page_number; for those, image_score = max page score in
the same case_law row (case-level boost).
Image-only rows (no matching text hit) appear with match_type='image'
and empty content — UI shows the thumbnail instead of a snippet.
"""
img_weight = 1.0 - text_weight
img_by_key: dict[tuple, dict] = {}
img_max_by_id: dict[str, float] = {}
for r in img_rows:
rid = str(r[id_field])
page = r.get("page_number")
img_by_key[(rid, page)] = r
score = float(r.get("score", 0.0))
img_max_by_id[rid] = max(img_max_by_id.get(rid, 0.0), score)
seen: set = set()
merged: list[dict] = []
for r in text_rows:
rid = str(r[id_field])
page = r.get("page_number")
key = (rid, page) if page is not None else None
img_hit = img_by_key.get(key) if key else None
text_score = float(r.get("score", 0.0))
if img_hit:
image_score = float(img_hit["score"])
elif r.get("type") == "halacha":
image_score = img_max_by_id.get(rid, 0.0)
else:
image_score = 0.0
d = dict(r)
d["text_score"] = text_score
d["image_score"] = image_score
d["score"] = text_score * text_weight + image_score * img_weight
d["match_type"] = "text+image" if img_hit else "text"
if img_hit:
d["image_thumbnail_path"] = img_hit.get("image_thumbnail_path")
if key:
seen.add(key)
merged.append(d)
for r in img_rows:
rid = str(r[id_field])
key = (rid, r.get("page_number"))
if key in seen:
continue
d = dict(r)
d["text_score"] = 0.0
d["image_score"] = float(r.get("score", 0.0))
d["score"] = float(r.get("score", 0.0)) * img_weight
d["match_type"] = "image"
d["content"] = ""
d["section_type"] = "image"
merged.append(d)
merged.sort(key=lambda x: -float(x["score"]))
return merged

View File

@@ -13,6 +13,7 @@ SSE plumbing without this module knowing about Redis.
from __future__ import annotations
import asyncio
import logging
import re
import shutil
@@ -22,7 +23,7 @@ from typing import Awaitable, Callable
from uuid import UUID, uuid4
from legal_mcp import config
from legal_mcp.services import chunker, db, embeddings, extractor, rerank
from legal_mcp.services import chunker, db, embeddings, extractor, hybrid_search, rerank # noqa: F401
# Note: halacha_extractor and precedent_metadata_extractor are NOT imported
# at module load. They are imported lazily inside the dedicated re-extract
@@ -188,6 +189,18 @@ async def ingest_precedent(
]
stored_chunks = await db.store_precedent_chunks(case_law_id, chunk_dicts)
# Multimodal page-image embeddings (V9). Gated by feature flag.
# Non-fatal: text path already succeeded. Only PDFs.
if config.MULTIMODAL_ENABLED and page_count > 0 and staged.suffix.lower() == ".pdf":
try:
await progress(
"embedding_images", 70,
f"מטמיע {page_count} עמודי תמונה (multimodal)",
)
await _embed_precedent_pages(case_law_id, staged, page_count)
except Exception as e:
logger.warning("Precedent multimodal embedding failed (non-fatal): %s", e)
# Pipeline split: the container does the non-LLM half (extract +
# chunk + embed + store). LLM-driven extraction (metadata, halachot)
# runs separately via the MCP tool `precedent_process_pending` from
@@ -413,19 +426,60 @@ async def search_library(
return []
query_vec = await embeddings.embed_query(query)
async def _base(limit: int) -> list[dict]:
return await db.search_precedent_library_semantic(
query_embedding=query_vec,
practice_area=practice_area,
court=court,
precedent_level=precedent_level,
appeal_subtype=appeal_subtype,
is_binding=is_binding,
subject_tag=subject_tag,
limit=limit,
include_halachot=include_halachot,
)
return await rerank.maybe_rerank(
query=query, base_search=_base, limit=limit,
return await hybrid_search.search_precedent_library_hybrid(
query=query,
query_text_embedding=query_vec,
limit=limit,
practice_area=practice_area,
court=court,
precedent_level=precedent_level,
appeal_subtype=appeal_subtype,
is_binding=is_binding,
subject_tag=subject_tag,
include_halachot=include_halachot,
)
async def _embed_precedent_pages(
case_law_id: UUID,
pdf_path: Path,
page_count: int,
) -> dict:
"""Render precedent PDF pages → embed via voyage-multimodal → store.
Thumbnails go to
``data/precedent-library/thumbnails/{case_law_id}/p{N:03d}.jpg``.
"""
thumb_dir = PRECEDENT_LIBRARY_DIR / "thumbnails" / str(case_law_id)
rendered = await asyncio.to_thread(
extractor.render_pages_for_multimodal,
pdf_path,
config.MULTIMODAL_DPI,
config.MULTIMODAL_THUMB_DPI,
thumb_dir,
)
images = [pil for pil, _ in rendered]
thumbs = [t for _, t in rendered]
img_embs = await embeddings.embed_images(images)
page_records = []
for i, (emb, thumb) in enumerate(zip(img_embs, thumbs)):
rel_thumb = None
if thumb is not None:
try:
rel_thumb = str(thumb.relative_to(config.DATA_DIR))
except ValueError:
rel_thumb = str(thumb)
page_records.append({
"page_number": i + 1,
"embedding": emb,
"image_thumbnail_path": rel_thumb,
})
stored = await db.store_precedent_image_embeddings(
case_law_id, page_records, model_name=config.MULTIMODAL_MODEL,
)
logger.info(
"Multimodal: stored %d page-image embeddings for case_law %s",
stored, case_law_id,
)
return {"pages_embedded": stored}

View File

@@ -2,10 +2,12 @@
from __future__ import annotations
import asyncio
import logging
from pathlib import Path
from uuid import UUID
from legal_mcp import config
from legal_mcp.services import chunker, db, embeddings, extractor, references_extractor
logger = logging.getLogger(__name__)
@@ -95,6 +97,21 @@ async def process_document(document_id: UUID, case_id: UUID) -> dict:
stored = await db.store_chunks(document_id, case_id, chunk_dicts)
# Step 4.5: Multimodal page-image embeddings (V9). Gated by
# MULTIMODAL_ENABLED. Renders each PDF page → embeds via
# voyage-multimodal-3 → stores per-page row with thumbnail.
# Non-fatal on failure (text path already succeeded).
multimodal_result = {"pages_embedded": 0}
if config.MULTIMODAL_ENABLED and page_count > 0:
try:
pdf_path = Path(doc["file_path"])
if pdf_path.suffix.lower() == ".pdf":
multimodal_result = await _embed_document_pages(
document_id, case_id, pdf_path, page_count,
)
except Exception as e:
logger.warning("Multimodal embedding failed (non-fatal): %s", e)
# Step 5: Extract references (plans, case law, legislation) — non-fatal
refs_result = {"plans": 0, "case_law": 0, "case_law_linked": 0, "legislation": 0}
try:
@@ -124,9 +141,63 @@ async def process_document(document_id: UUID, case_id: UUID) -> dict:
"case_law": refs_result["case_law"],
"legislation": refs_result["legislation"],
},
"multimodal": multimodal_result,
}
except Exception as e:
logger.exception("Document processing failed: %s", e)
await db.update_document(document_id, extraction_status="failed")
return {"status": "failed", "error": str(e)}
async def _embed_document_pages(
document_id: UUID,
case_id: UUID,
pdf_path: Path,
page_count: int,
) -> dict:
"""Render PDF pages → embed via voyage-multimodal → store per-page rows.
Thumbnails are saved under
``data/cases/{case_number}/thumbnails/{document_id}/p{N:03d}.jpg``
so the UI can show small previews next to image-side search hits.
"""
# Layout: data/cases/{case_number}/documents/originals/{file}.pdf
# → case_dir = pdf_path.parent.parent.parent
case_dir = pdf_path.parent.parent.parent
thumb_dir = case_dir / "thumbnails" / str(document_id)
logger.info("Multimodal: rendering %d pages @ %ddpi", page_count, config.MULTIMODAL_DPI)
rendered = await asyncio.to_thread(
extractor.render_pages_for_multimodal,
pdf_path,
config.MULTIMODAL_DPI,
config.MULTIMODAL_THUMB_DPI,
thumb_dir,
)
images = [pil for pil, _ in rendered]
thumb_paths = [thumb for _, thumb in rendered]
logger.info("Multimodal: embedding %d pages via %s", len(images), config.MULTIMODAL_MODEL)
img_embs = await embeddings.embed_images(images)
page_records = []
for i, (emb, thumb) in enumerate(zip(img_embs, thumb_paths)):
rel_thumb = None
if thumb is not None:
try:
rel_thumb = str(thumb.relative_to(config.DATA_DIR))
except ValueError:
rel_thumb = str(thumb)
page_records.append({
"page_number": i + 1,
"embedding": emb,
"image_thumbnail_path": rel_thumb,
})
stored = await db.store_document_image_embeddings(
document_id, case_id, page_records,
model_name=config.MULTIMODAL_MODEL,
)
logger.info("Multimodal: stored %d page-image embeddings", stored)
return {"pages_embedded": stored, "model": config.MULTIMODAL_MODEL}

View File

@@ -6,7 +6,7 @@ import json
import logging
from uuid import UUID
from legal_mcp.services import db, embeddings, rerank
from legal_mcp.services import db, embeddings, hybrid_search
logger = logging.getLogger(__name__)
@@ -43,9 +43,9 @@ async def search_decisions(
)
query_emb = await embeddings.embed_query(query)
results = await rerank.maybe_rerank(
results = await hybrid_search.search_documents_hybrid(
query=query,
base_search=lambda **kw: db.search_similar(query_embedding=query_emb, **kw),
query_text_embedding=query_emb,
limit=limit,
section_type=section_type or None,
practice_area=practice_area or None,
@@ -59,11 +59,13 @@ async def search_decisions(
for r in results:
formatted.append({
"score": round(float(r["score"]), 4),
"case_number": r["case_number"],
"document": r["document_title"],
"section": r["section_type"],
"page": r["page_number"],
"content": r["content"],
"case_number": r.get("case_number"),
"document": r.get("document_title"),
"section": r.get("section_type"),
"page": r.get("page_number"),
"content": r.get("content", ""),
"match_type": r.get("match_type", "text"),
"image_thumbnail": r.get("image_thumbnail_path"),
})
return json.dumps(formatted, ensure_ascii=False, indent=2)
@@ -87,9 +89,9 @@ async def search_case_documents(
query_emb = await embeddings.embed_query(query)
# Restricted to case_id — practice_area filter would be redundant.
results = await rerank.maybe_rerank(
results = await hybrid_search.search_documents_hybrid(
query=query,
base_search=lambda **kw: db.search_similar(query_embedding=query_emb, **kw),
query_text_embedding=query_emb,
limit=limit,
case_id=UUID(case["id"]),
)
@@ -101,10 +103,12 @@ async def search_case_documents(
for r in results:
formatted.append({
"score": round(float(r["score"]), 4),
"document": r["document_title"],
"section": r["section_type"],
"page": r["page_number"],
"content": r["content"],
"document": r.get("document_title"),
"section": r.get("section_type"),
"page": r.get("page_number"),
"content": r.get("content", ""),
"match_type": r.get("match_type", "text"),
"image_thumbnail": r.get("image_thumbnail_path"),
})
return json.dumps(formatted, ensure_ascii=False, indent=2)
@@ -139,12 +143,11 @@ async def find_similar_cases(
)
query_emb = await embeddings.embed_query(description)
# Use description as the query text for rerank too.
# Note: even with rerank we ask for ``limit*3`` so the dedup-by-case
# Even with rerank we ask for ``limit*3`` so the dedup-by-case
# step downstream still has enough rows to pick the best per case.
results = await rerank.maybe_rerank(
results = await hybrid_search.search_documents_hybrid(
query=description,
base_search=lambda **kw: db.search_similar(query_embedding=query_emb, **kw),
query_text_embedding=query_emb,
limit=limit * 3,
practice_area=practice_area or None,
appeal_subtype=appeal_subtype or None,
@@ -153,14 +156,16 @@ async def find_similar_cases(
if not results:
return "לא נמצאו תיקים דומים."
# Deduplicate by case_number, keep best score per case
# Deduplicate by case_number, keep best score per case.
# image-only rows still carry case_number from the join.
seen_cases = {}
for r in results:
cn = r["case_number"]
cn = r.get("case_number")
if not cn:
continue
if cn not in seen_cases or r["score"] > seen_cases[cn]["score"]:
seen_cases[cn] = r
# Sort by score and limit
top_cases = sorted(seen_cases.values(), key=lambda x: x["score"], reverse=True)[:limit]
formatted = []
@@ -168,8 +173,9 @@ async def find_similar_cases(
formatted.append({
"score": round(float(r["score"]), 4),
"case_number": r["case_number"],
"document": r["document_title"],
"relevant_section": r["content"][:500],
"document": r.get("document_title"),
"relevant_section": (r.get("content") or "")[:500],
"match_type": r.get("match_type", "text"),
})
return json.dumps(formatted, ensure_ascii=False, indent=2)