Ezer Mishpati is an AI legal decision drafting system comprising: an MCP server (FastMCP) with a document-processing pipeline; a web upload interface (FastAPI) for file upload and classification; pgvector-based semantic search; and Hebrew legal document chunking and embedding.
56 lines
1.3 KiB
Python
56 lines
1.3 KiB
Python
"""Embedding service using Voyage AI API."""
|
|
|
|
from __future__ import annotations
|
|
|
|
import logging
|
|
|
|
import voyageai
|
|
|
|
from legal_mcp import config
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
# Module-level cache for the Voyage AI client. Starts as None and is
# populated by _get_client() the first time a client is requested.
_client: voyageai.Client | None = None
|
|
|
|
|
|
def _get_client() -> voyageai.Client:
    """Return the module-wide Voyage AI client, building it on first use.

    The client is constructed with ``config.VOYAGE_API_KEY`` and stored in
    the module-level ``_client`` variable so later calls reuse the same
    instance.
    """
    global _client

    # Fast path: a client was already created by an earlier call.
    if _client is not None:
        return _client

    _client = voyageai.Client(api_key=config.VOYAGE_API_KEY)
    return _client
|
|
|
|
|
|
async def embed_texts(texts: list[str], input_type: str = "document") -> list[list[float]]:
    """Embed a batch of texts using Voyage AI.

    The Voyage client's ``embed`` call is synchronous. Each request is
    therefore dispatched to the event loop's default executor so that other
    coroutines keep running while the network round-trip is in flight.

    Args:
        texts: Texts to embed. Any length is accepted; the list is sent to
            the API in consecutive slices of at most 128 items.
        input_type: "document" for indexing, "query" for search queries.

    Returns:
        One embedding vector per input text, in input order. An empty
        input yields an empty list without contacting the API. Vector
        dimensionality is determined by ``config.VOYAGE_MODEL``.
    """
    if not texts:
        return []

    # Function-scope imports keep this block self-contained.
    import asyncio
    import functools

    max_batch_size = 128  # Voyage AI supports up to 128 texts per batch
    client = _get_client()
    loop = asyncio.get_running_loop()
    all_embeddings: list[list[float]] = []

    for start in range(0, len(texts), max_batch_size):
        batch = texts[start : start + max_batch_size]
        # run_in_executor only forwards positional args, so bind the
        # keyword arguments with functools.partial.
        embed_call = functools.partial(
            client.embed,
            batch,
            model=config.VOYAGE_MODEL,
            input_type=input_type,
        )
        # Batches are awaited one at a time to preserve input order.
        result = await loop.run_in_executor(None, embed_call)
        all_embeddings.extend(result.embeddings)

    return all_embeddings
|
|
|
|
|
|
async def embed_query(query: str) -> list[float]:
    """Embed a single search query.

    Wraps ``query`` in a one-element list, embeds it via ``embed_texts``
    with ``input_type="query"``, and returns the first resulting vector.
    """
    return (await embed_texts([query], input_type="query"))[0]
|