feat(retrieval): add voyage-multimodal-3 page-image embeddings (feature flag)
All checks were successful
Build & Deploy / build-and-deploy (push) Successful in 1m50s
All checks were successful
Build & Deploy / build-and-deploy (push) Successful in 1m50s
Stage C: per-page image embeddings via voyage-multimodal-3 + hybrid text+image search. Off by default; enable with MULTIMODAL_ENABLED=true. - Schema V9: document_image_embeddings + precedent_image_embeddings (vector(1024), page_number, image_thumbnail_path) - extractor.render_pages_for_multimodal renders PDF pages at MULTIMODAL_DPI (144) for embedding + JPEG thumbnails at MULTIMODAL_THUMB_DPI (96) for UI preview, in one pass - embeddings.embed_images calls voyage-multimodal-3 in 50-page batches - services/hybrid_search.py orchestrator: rerank applied to text side first (rerank-2 is text-only); image side cosine; weighted merge with text_weight 0.65 (env-tunable); image-only pages surface as match_type='image' so dense scanned content still appears - processor.process_document and precedent_library.ingest_precedent gated by flag — non-fatal on multimodal failure - scripts/multimodal_backfill.py — idempotent per-case CLI to embed existing documents without re-extracting text Validated locally on a 5-page response brief: render 0.31s, embed 8.32s, hybrid merge surfaces image rows correctly. Production rollout starts with flag=false (no behavior change), then per-case A/B. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -623,6 +623,54 @@ CREATE INDEX IF NOT EXISTS idx_case_law_halacha_requested
|
||||
"""
|
||||
|
||||
|
||||
# ── V9: Multimodal page-image embeddings ─────────────────────────
|
||||
# voyage-multimodal-3 (1024-dim) embeds the whole page as an image:
|
||||
# captures table layout, scanned content, signatures, plans — content
|
||||
# that text-OCR loses. Ingestion is gated by config.MULTIMODAL_ENABLED;
|
||||
# search_*_hybrid() merge text-cosine + image-cosine when present.
|
||||
# image_thumbnail_path is a relative path under DATA_DIR/cases/{case}/
|
||||
# thumbnails/ or DATA_DIR/precedent-library/thumbnails/ — a small JPEG
|
||||
# rendered at config.MULTIMODAL_THUMB_DPI for UI preview, distinct from
|
||||
# the higher-DPI render fed to the embedder (which is not persisted).
|
||||
|
||||
SCHEMA_V9_SQL = """
|
||||
CREATE TABLE IF NOT EXISTS document_image_embeddings (
|
||||
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
|
||||
document_id UUID REFERENCES documents(id) ON DELETE CASCADE,
|
||||
case_id UUID REFERENCES cases(id) ON DELETE CASCADE,
|
||||
page_number INTEGER NOT NULL,
|
||||
image_thumbnail_path TEXT,
|
||||
embedding vector(1024),
|
||||
model_name TEXT DEFAULT 'voyage-multimodal-3',
|
||||
created_at TIMESTAMPTZ DEFAULT now(),
|
||||
UNIQUE(document_id, page_number)
|
||||
);
|
||||
CREATE INDEX IF NOT EXISTS idx_doc_img_emb_vec
|
||||
ON document_image_embeddings USING ivfflat (embedding vector_cosine_ops)
|
||||
WITH (lists = 50);
|
||||
CREATE INDEX IF NOT EXISTS idx_doc_img_emb_doc
|
||||
ON document_image_embeddings(document_id);
|
||||
CREATE INDEX IF NOT EXISTS idx_doc_img_emb_case
|
||||
ON document_image_embeddings(case_id);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS precedent_image_embeddings (
|
||||
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
|
||||
case_law_id UUID REFERENCES case_law(id) ON DELETE CASCADE,
|
||||
page_number INTEGER NOT NULL,
|
||||
image_thumbnail_path TEXT,
|
||||
embedding vector(1024),
|
||||
model_name TEXT DEFAULT 'voyage-multimodal-3',
|
||||
created_at TIMESTAMPTZ DEFAULT now(),
|
||||
UNIQUE(case_law_id, page_number)
|
||||
);
|
||||
CREATE INDEX IF NOT EXISTS idx_prec_img_emb_vec
|
||||
ON precedent_image_embeddings USING ivfflat (embedding vector_cosine_ops)
|
||||
WITH (lists = 50);
|
||||
CREATE INDEX IF NOT EXISTS idx_prec_img_emb_case_law
|
||||
ON precedent_image_embeddings(case_law_id);
|
||||
"""
|
||||
|
||||
|
||||
async def init_schema() -> None:
|
||||
pool = await get_pool()
|
||||
async with pool.acquire() as conn:
|
||||
@@ -635,7 +683,8 @@ async def init_schema() -> None:
|
||||
await conn.execute(SCHEMA_V6_SQL)
|
||||
await conn.execute(SCHEMA_V7_SQL)
|
||||
await conn.execute(SCHEMA_V8_SQL)
|
||||
logger.info("Database schema initialized (v1-v8)")
|
||||
await conn.execute(SCHEMA_V9_SQL)
|
||||
logger.info("Database schema initialized (v1-v9)")
|
||||
|
||||
|
||||
# ── Case CRUD ───────────────────────────────────────────────────────
|
||||
@@ -2350,3 +2399,300 @@ async def clear_extraction_request(
|
||||
f"UPDATE case_law SET {col} = NULL WHERE id = $1",
|
||||
case_law_id,
|
||||
)
|
||||
|
||||
|
||||
# ── V9: Multimodal page image embeddings ─────────────────────────
|
||||
|
||||
|
||||
async def store_document_image_embeddings(
    document_id: UUID,
    case_id: UUID | None,
    page_records: list[dict],
    model_name: str = "voyage-multimodal-3",
) -> int:
    """Atomically replace the per-page image embeddings of one document.

    Existing rows for ``document_id`` are deleted and one row per entry of
    ``page_records`` is inserted inside a single transaction, so a failed
    insert rolls the delete back instead of leaving the document with
    missing or partial embeddings.

    Args:
        document_id: Document whose rows are replaced.
        case_id: Owning case, stored on every row; may be None.
        page_records: One dict per page with keys ``page_number``
            (required), ``embedding`` and ``image_thumbnail_path`` (both
            optional; a missing key is stored as NULL). Embeddings should
            already be 1024-dim lists (or None for skipped pages).
        model_name: Value written to the ``model_name`` column.

    Returns:
        The number of page records written (``len(page_records)``).

    Raises:
        KeyError: If a record has no ``page_number``. Raised while the
            rows are being prepared, before the database is touched.
    """
    # Build every parameter tuple up front so malformed input fails
    # before the existing rows are deleted.
    rows = [
        (
            document_id,
            case_id,
            record["page_number"],
            record.get("embedding"),
            record.get("image_thumbnail_path"),
            model_name,
        )
        for record in page_records
    ]

    pool = await get_pool()
    async with pool.acquire() as conn:
        # DELETE + INSERTs must commit or roll back together.
        async with conn.transaction():
            await conn.execute(
                "DELETE FROM document_image_embeddings WHERE document_id = $1",
                document_id,
            )
            if rows:
                await conn.executemany(
                    """INSERT INTO document_image_embeddings
                           (document_id, case_id, page_number, embedding,
                            image_thumbnail_path, model_name)
                       VALUES ($1, $2, $3, $4, $5, $6)""",
                    rows,
                )
    return len(rows)
|
||||
|
||||
|
||||
async def store_precedent_image_embeddings(
    case_law_id: UUID,
    page_records: list[dict],
    model_name: str = "voyage-multimodal-3",
) -> int:
    """Atomically replace the per-page image embeddings of one precedent.

    Existing rows for ``case_law_id`` are deleted and one row per entry of
    ``page_records`` is inserted inside a single transaction, so a failed
    insert rolls the delete back instead of leaving the precedent with
    missing or partial embeddings.

    Args:
        case_law_id: ``case_law`` row whose image embeddings are replaced.
        page_records: One dict per page with keys ``page_number``
            (required), ``embedding`` and ``image_thumbnail_path`` (both
            optional; a missing key is stored as NULL).
        model_name: Value written to the ``model_name`` column.

    Returns:
        The number of page records written (``len(page_records)``).

    Raises:
        KeyError: If a record has no ``page_number``. Raised while the
            rows are being prepared, before the database is touched.
    """
    # Build every parameter tuple up front so malformed input fails
    # before the existing rows are deleted.
    rows = [
        (
            case_law_id,
            record["page_number"],
            record.get("embedding"),
            record.get("image_thumbnail_path"),
            model_name,
        )
        for record in page_records
    ]

    pool = await get_pool()
    async with pool.acquire() as conn:
        # DELETE + INSERTs must commit or roll back together.
        async with conn.transaction():
            await conn.execute(
                "DELETE FROM precedent_image_embeddings WHERE case_law_id = $1",
                case_law_id,
            )
            if rows:
                await conn.executemany(
                    """INSERT INTO precedent_image_embeddings
                           (case_law_id, page_number, embedding,
                            image_thumbnail_path, model_name)
                       VALUES ($1, $2, $3, $4, $5)""",
                    rows,
                )
    return len(rows)
|
||||
|
||||
|
||||
async def search_document_images_similar(
    query_embedding: list[float],
    limit: int = 10,
    case_id: UUID | None = None,
    practice_area: str | None = None,
    appeal_subtype: str | None = None,
) -> list[dict]:
    """Cosine search over per-page image embeddings of case documents.

    Args:
        query_embedding: Query vector compared against ``embedding``.
        limit: Maximum number of rows returned.
        case_id: If truthy, restrict to pages stored under this case.
        practice_area: If truthy, restrict to cases with this practice area.
        appeal_subtype: If truthy, restrict to cases with this appeal subtype.

    Returns:
        Dicts ordered by ascending cosine distance with keys
        ``document_id``, ``case_id``, ``page_number``,
        ``image_thumbnail_path``, ``document_title``, ``case_number`` and
        ``score`` (``1 - distance``).

    Note:
        ``cases`` is inner-joined on ``die.case_id``, so pages stored
        without a case never match.
    """
    # Pages may be stored with a NULL embedding (skipped pages). Their
    # distance is NULL, which sorts last and would surface as rows with
    # score = None whenever fewer than `limit` real matches exist.
    conditions: list[str] = ["die.embedding IS NOT NULL"]
    params: list = [query_embedding, limit]

    optional_filters = (
        ("die.case_id", case_id),
        ("c.practice_area", practice_area),
        ("c.appeal_subtype", appeal_subtype),
    )
    for column, value in optional_filters:
        if value:
            params.append(value)
            # Placeholder number == 1-based position of the value just added.
            conditions.append(f"{column} = ${len(params)}")

    where = " AND ".join(conditions)
    sql = f"""
        SELECT die.document_id, die.case_id, die.page_number,
               die.image_thumbnail_path,
               d.title AS document_title,
               c.case_number,
               1 - (die.embedding <=> $1) AS score
        FROM document_image_embeddings die
        JOIN documents d ON d.id = die.document_id
        JOIN cases c ON c.id = die.case_id
        WHERE {where}
        ORDER BY die.embedding <=> $1
        LIMIT $2
    """

    pool = await get_pool()
    async with pool.acquire() as conn:
        rows = await conn.fetch(sql, *params)
    return [dict(row) for row in rows]
|
||||
|
||||
|
||||
async def search_precedent_images_similar(
    query_embedding: list[float],
    limit: int = 10,
    practice_area: str = "",
    court: str = "",
    precedent_level: str = "",
    appeal_subtype: str = "",
    is_binding: bool | None = None,
) -> list[dict]:
    """Cosine search over per-page image embeddings of precedent rulings.

    Only ``case_law`` rows with ``source_kind = 'external_upload'`` are
    searched.

    Args:
        query_embedding: Query vector compared against ``embedding``.
        limit: Maximum number of rows returned.
        practice_area: If non-empty, exact match on ``cl.practice_area``.
        court: If non-empty, case-insensitive substring match on ``cl.court``.
        precedent_level: If non-empty, exact match on ``cl.precedent_level``.
        appeal_subtype: If non-empty, exact match on ``cl.appeal_subtype``.
        is_binding: If not None, exact match on ``cl.is_binding``.

    Returns:
        Dicts ordered by ascending cosine distance with keys
        ``case_law_id``, ``page_number``, ``image_thumbnail_path``,
        ``case_number``, ``case_name``, ``court``, ``decision_date``
        (converted with ``.isoformat()`` when present),
        ``precedent_level``, ``practice_area`` and ``score``
        (``1 - distance``).
    """
    # Pages may be stored with a NULL embedding (skipped pages). Their
    # distance is NULL, which sorts last and would surface as rows with
    # score = None whenever fewer than `limit` real matches exist.
    conditions: list[str] = [
        "cl.source_kind = 'external_upload'",
        "pie.embedding IS NOT NULL",
    ]
    params: list = [query_embedding, limit]

    # (SQL left-hand side + operator, bound value, whether the filter is active)
    optional_filters = (
        ("cl.practice_area =", practice_area, bool(practice_area)),
        ("cl.court ILIKE", f"%{court}%", bool(court)),
        ("cl.precedent_level =", precedent_level, bool(precedent_level)),
        ("cl.appeal_subtype =", appeal_subtype, bool(appeal_subtype)),
        # False is a meaningful filter value here, so test against None.
        ("cl.is_binding =", is_binding, is_binding is not None),
    )
    for lhs, value, active in optional_filters:
        if active:
            params.append(value)
            # Placeholder number == 1-based position of the value just added.
            conditions.append(f"{lhs} ${len(params)}")

    where = " AND ".join(conditions)
    sql = f"""
        SELECT pie.case_law_id, pie.page_number, pie.image_thumbnail_path,
               cl.case_number, cl.case_name, cl.court, cl.date AS decision_date,
               cl.precedent_level, cl.practice_area,
               1 - (pie.embedding <=> $1) AS score
        FROM precedent_image_embeddings pie
        JOIN case_law cl ON cl.id = pie.case_law_id
        WHERE {where}
        ORDER BY pie.embedding <=> $1
        LIMIT $2
    """

    pool = await get_pool()
    async with pool.acquire() as conn:
        rows = await conn.fetch(sql, *params)

    results: list[dict] = []
    for row in rows:
        item = dict(row)
        decided = item.get("decision_date")
        if decided is not None:
            item["decision_date"] = decided.isoformat()
        results.append(item)
    return results
|
||||
|
||||
|
||||
async def search_similar_hybrid(
    query_text_embedding: list[float],
    query_image_embedding: list[float],
    limit: int = 10,
    fetch_k: int = 30,
    text_weight: float = 0.65,
    case_id: UUID | None = None,
    section_type: str | None = None,
    practice_area: str | None = None,
    appeal_subtype: str | None = None,
) -> list[dict]:
    """Weighted merge of text-chunk and per-page image search.

    Fetches up to ``fetch_k`` candidates from each side, then combines
    them as ``text_score * text_weight + image_score * (1 - text_weight)``.

    * A text chunk whose ``(document_id, page_number)`` also appears in
      the image results is boosted by that page's image score and tagged
      ``match_type='text+image'``; otherwise it is tagged ``'text'`` with
      an image score of 0.
    * Image pages with no text chunk on the same page are surfaced as
      ``match_type='image'`` (empty ``content``, ``section_type='image'``)
      so dense scanned content still appears in results.

    Args:
        query_text_embedding: Query vector for the text-chunk search.
        query_image_embedding: Query vector for the page-image search.
        limit: Maximum number of merged rows returned.
        fetch_k: Candidates requested from each side before merging.
        text_weight: Weight of the text score; the image side gets
            ``1 - text_weight``.
        case_id, practice_area, appeal_subtype: Filters forwarded to both
            searches.
        section_type: Filter forwarded to the text search only.

    Returns:
        Merged rows sorted by descending ``score``, truncated to ``limit``.
        Every row carries ``text_score``, ``image_score``, ``score`` and
        ``match_type``.
    """
    img_weight = 1.0 - text_weight

    text_rows = await search_similar(
        query_text_embedding,
        limit=fetch_k,
        case_id=case_id,
        section_type=section_type,
        practice_area=practice_area,
        appeal_subtype=appeal_subtype,
    )
    img_rows = await search_document_images_similar(
        query_image_embedding,
        limit=fetch_k,
        case_id=case_id,
        practice_area=practice_area,
        appeal_subtype=appeal_subtype,
    )
    # A page stored without an embedding has no distance, so its score can
    # come back as None; such rows cannot be ranked and would make
    # float() raise, so drop them before merging.
    img_rows = [row for row in img_rows if row.get("score") is not None]

    img_by_page: dict[tuple, dict] = {
        (str(row["document_id"]), row["page_number"]): row for row in img_rows
    }

    seen: set = set()
    merged: list[dict] = []

    for row in text_rows:
        page = row.get("page_number")
        # Chunks without a page number can never pair with an image row.
        key = (str(row["document_id"]), page) if page is not None else None
        img_hit = img_by_page.get(key) if key is not None else None

        text_score = float(row["score"])
        image_score = float(img_hit["score"]) if img_hit else 0.0

        item = dict(row)
        item["text_score"] = text_score
        item["image_score"] = image_score
        item["score"] = text_score * text_weight + image_score * img_weight
        item["match_type"] = "text+image" if img_hit else "text"
        if img_hit:
            item["image_thumbnail_path"] = img_hit.get("image_thumbnail_path")
        merged.append(item)

        if key is not None:
            # Any page that already has a text chunk must not be emitted
            # again as an image-only row below.
            seen.add(key)

    for row in img_rows:
        key = (str(row["document_id"]), row["page_number"])
        if key in seen:
            continue
        image_score = float(row["score"])
        item = dict(row)
        item["text_score"] = 0.0
        item["image_score"] = image_score
        item["score"] = image_score * img_weight
        item["match_type"] = "image"
        item["content"] = ""
        item["section_type"] = "image"
        merged.append(item)

    # Stable sort: ties keep text rows ahead of image-only rows.
    merged.sort(key=lambda item: item["score"], reverse=True)
    return merged[:limit]
|
||||
|
||||
|
||||
async def search_precedent_library_hybrid(
    query_text_embedding: list[float],
    query_image_embedding: list[float],
    limit: int = 10,
    fetch_k: int = 30,
    text_weight: float = 0.65,
    practice_area: str = "",
    court: str = "",
    precedent_level: str = "",
    appeal_subtype: str = "",
    is_binding: bool | None = None,
    subject_tag: str = "",
    include_halachot: bool = True,
) -> list[dict]:
    """Hybrid variant of search_precedent_library_semantic.

    Fetches up to ``fetch_k`` candidates from the text search and from the
    page-image search, then combines them as
    ``text_score * text_weight + image_score * (1 - text_weight)``.

    * A text row whose ``(case_law_id, page_number)`` also appears in the
      image results is boosted by that page's image score.
    * Halachot have no ``page_number`` - they're boosted by the max image
      score from any page in the same case_law row.
    * Image pages with no text row on the same page are surfaced as
      ``type='image_page'`` with empty ``content`` and
      ``section_type='image'``.

    Every row is tagged with ``match_type`` (``'text'``, ``'text+image'``
    or ``'image'``), mirroring ``search_similar_hybrid``.

    Args:
        query_text_embedding: Query vector for the text search.
        query_image_embedding: Query vector for the page-image search.
        limit: Maximum number of merged rows returned.
        fetch_k: Candidates requested from each side before merging.
        text_weight: Weight of the text score; the image side gets
            ``1 - text_weight``.
        practice_area, court, precedent_level, appeal_subtype, is_binding:
            Filters forwarded to both searches.
        subject_tag, include_halachot: Forwarded to the text search only.

    Returns:
        Merged rows sorted by descending ``score``, truncated to ``limit``.
    """
    img_weight = 1.0 - text_weight

    text_results = await search_precedent_library_semantic(
        query_text_embedding,
        practice_area=practice_area,
        court=court,
        precedent_level=precedent_level,
        appeal_subtype=appeal_subtype,
        is_binding=is_binding,
        subject_tag=subject_tag,
        limit=fetch_k,
        include_halachot=include_halachot,
    )
    img_results = await search_precedent_images_similar(
        query_image_embedding,
        limit=fetch_k,
        practice_area=practice_area,
        court=court,
        precedent_level=precedent_level,
        appeal_subtype=appeal_subtype,
        is_binding=is_binding,
    )
    # A page stored without an embedding has no distance, so its score can
    # come back as None; such rows cannot be ranked and would make
    # float() raise, so drop them before merging.
    img_results = [row for row in img_results if row.get("score") is not None]

    img_by_page: dict[tuple, dict] = {}
    img_by_case: dict[str, float] = {}  # best image score per case_law row
    for row in img_results:
        cid = str(row["case_law_id"])
        img_by_page[(cid, row["page_number"])] = row
        img_by_case[cid] = max(img_by_case.get(cid, 0.0), float(row["score"]))

    seen: set = set()
    merged: list[dict] = []

    for row in text_results:
        cid = str(row["case_law_id"])
        page = row.get("page_number")
        key = (cid, page) if page is not None else None
        img_hit = img_by_page.get(key) if key is not None else None

        if img_hit:
            image_score = float(img_hit["score"])
            image_backed = True
        elif row.get("type") == "halacha":
            # Page-less halachot inherit the best page score of their ruling.
            image_score = img_by_case.get(cid, 0.0)
            image_backed = cid in img_by_case
        else:
            image_score = 0.0
            image_backed = False

        text_score = float(row["score"])
        item = dict(row)
        item["text_score"] = text_score
        item["image_score"] = image_score
        item["score"] = text_score * text_weight + image_score * img_weight
        item["match_type"] = "text+image" if image_backed else "text"
        if img_hit:
            item["image_thumbnail_path"] = img_hit.get("image_thumbnail_path")
        if key is not None:
            # Any page that already has a text row must not be emitted
            # again as an image-only row below.
            seen.add(key)
        merged.append(item)

    for row in img_results:
        key = (str(row["case_law_id"]), row["page_number"])
        if key in seen:
            continue
        image_score = float(row["score"])
        item = dict(row)
        item["text_score"] = 0.0
        item["image_score"] = image_score
        item["score"] = image_score * img_weight
        item["type"] = "image_page"
        item["match_type"] = "image"
        item["content"] = ""
        item["section_type"] = "image"
        merged.append(item)

    # Stable sort: ties keep text rows ahead of image-only rows.
    merged.sort(key=lambda item: item["score"], reverse=True)
    return merged[:limit]
|
||||
|
||||
Reference in New Issue
Block a user