Some checks are pending
Build & Deploy / build-and-deploy (push) Waiting to run
Cosine scores in voyage-3 (~0.4-0.5) and voyage-multimodal-3
(~0.2-0.25) live on different scales. The previous weighted-sum
merge let text always dominate — verified empirically: 0 image-only
hits across 7 queries on case 8174-24, image side contributed nothing.
RRF combines by *rank* in each list rather than raw score, robust
to scale differences. Per-item score:
    rrf_score = text_weight / (k + text_rank)
              + image_weight / (k + image_rank)
A row that appears in both lists (joined on (id_field, page_number))
gets both terms — surfaced as match_type='text+image'.
After fix on 8174-24 (146 image rows): 2 image-only hits land in
top-5 across all 7 test queries, surfacing actual table/diagram/
signature pages (p12, p13 of שומת המשיבה for 'טבלת השוואת ערכי שומה',
p25 of שומת השגה for 'תרשים גוש וחלקה', etc).
On 8137-24 (273 image rows): 'חישוב היוון של דמי החכירה' goes from
0 baseline results → 5 hybrid results (3 text + 2 image), opening
recall on scanned content the OCR layer misses.
Default MULTIMODAL_TEXT_WEIGHT 0.65 → 0.5 (vanilla RRF) since the
prior 0.65 was tuned for raw cosine scales that no longer apply.
New env knob MULTIMODAL_RRF_K (default 60, standard literature).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
195 lines
7.8 KiB
Python
195 lines
7.8 KiB
Python
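"""Configuration loaded from Infisical or central .env file.

Effective precedence (highest first):
process environment → .env file → Infisical.

Both the .env loader and the Infisical loader only fill in variables that
are not already set, so a value present earlier in this chain is never
overridden by a later source.
"""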
|
|
|
|
import os
|
|
from pathlib import Path
|
|
|
|
from dotenv import load_dotenv
|
|
|
|
# Resolve which .env file to read: an explicit DOTENV_PATH wins, otherwise
# fall back to the central ~/.env, then load it into the process environment.
dotenv_path = os.getenv("DOTENV_PATH", os.fspath(Path.home() / ".env"))
load_dotenv(dotenv_path)
|
|
|
|
# Optionally pull secrets from Infisical when a token is configured.
#
# Secrets are applied with os.environ.setdefault, so a variable that is
# already present in the process environment is NOT overridden by Infisical.
#
# This step is best-effort: any failure falls back to the values already in
# the environment, but is logged so a misconfigured integration is visible
# instead of silently ignored.
INFISICAL_TOKEN = os.environ.get("INFISICAL_TOKEN", "")
if INFISICAL_TOKEN:
    import logging

    _infisical_log = logging.getLogger(__name__)
    try:
        from infisical_sdk import InfisicalSDKClient

        # NOTE(review): confirm that this constructor signature,
        # get_all_secrets(...) and the secret_key / secret_value attributes
        # match the installed infisical_sdk version — a mismatch would only
        # surface as the generic warning below.
        _client = InfisicalSDKClient(token=INFISICAL_TOKEN)
        _secrets = _client.get_all_secrets(
            environment=os.environ.get("INFISICAL_ENV", "production"),
            project_id=os.environ.get("INFISICAL_PROJECT_ID", ""),
        )
        for _secret in _secrets:
            os.environ.setdefault(_secret.secret_key, _secret.secret_value)
    except ImportError:
        # Infisical SDK not installed — use .env
        _infisical_log.warning(
            "INFISICAL_TOKEN is set but infisical_sdk is not installed; "
            "falling back to environment/.env values"
        )
    except Exception as exc:
        # Infisical unreachable or the call failed — fall back to .env.
        # Only the exception type is logged at warning level so that no
        # request details or secret material end up in the logs.
        _infisical_log.warning(
            "Could not load secrets from Infisical (%s); "
            "falling back to environment/.env values",
            type(exc).__name__,
        )
        _infisical_log.debug("Infisical failure details", exc_info=True)
|
|
|
|
# PostgreSQL connection string. A full POSTGRES_URL takes precedence;
# otherwise the URL is assembled from the individual POSTGRES_* variables.
# The pieces are interpolated verbatim (no URL-escaping is applied here).
POSTGRES_URL = os.getenv(
    "POSTGRES_URL",
    f"postgres://{os.environ.get('POSTGRES_USER', 'legal_ai')}:"
    f"{os.environ.get('POSTGRES_PASSWORD', '')}@"
    f"{os.environ.get('POSTGRES_HOST', '127.0.0.1')}:"
    f"{os.environ.get('POSTGRES_PORT', '5433')}/"
    f"{os.environ.get('POSTGRES_DB', 'legal_ai')}",
)

# Redis connection string.
REDIS_URL = os.getenv("REDIS_URL", "redis://127.0.0.1:6380/0")
|
|
|
|
# Voyage AI embeddings.
VOYAGE_API_KEY = os.getenv("VOYAGE_API_KEY", "")
VOYAGE_MODEL = os.getenv("VOYAGE_MODEL", "voyage-law-2")
VOYAGE_DIMENSIONS = 1024

# Rerank: cross-encoder second stage. Disabled unless
# VOYAGE_RERANK_ENABLED=true; when enabled it applies to every semantic
# search tool (search_decisions, search_case_documents,
# find_similar_cases, search_precedent_library).
VOYAGE_RERANK_MODEL = os.getenv("VOYAGE_RERANK_MODEL", "rerank-2")
VOYAGE_RERANK_ENABLED = (
    os.getenv("VOYAGE_RERANK_ENABLED", "false").lower() == "true"
)

# Number of bi-encoder candidates fetched before the reranker runs.
# 50 is the depth used in the POC: a balance of recall vs. rerank cost.
VOYAGE_RERANK_FETCH_K = int(os.getenv("VOYAGE_RERANK_FETCH_K", "50"))
|
|
|
|
# Multimodal: page-image embeddings via voyage-multimodal-3. Disabled
# unless MULTIMODAL_ENABLED=true; when enabled, ingestion embeds each page
# image and search uses hybrid (text+image) ranking. POC #3 validated this
# on an 89-page appraisal PDF (38s, 312K tokens), recovering table
# structure and image-only scanned pages that text OCR misses.
MULTIMODAL_ENABLED = os.getenv("MULTIMODAL_ENABLED", "false").lower() == "true"
MULTIMODAL_MODEL = os.getenv("MULTIMODAL_MODEL", "voyage-multimodal-3")

# DPI used to render the page image handed to the embedder. The POC used
# 144 as the sweet spot between embedding quality and token cost
# (144 ≈ 3.5K tok/page).
MULTIMODAL_DPI = int(os.getenv("MULTIMODAL_DPI", "144"))

# Separate, lower DPI for the JPEG thumbnail written to disk for the UI
# preview (~96dpi → ~20KB/page). Rendered at ingestion time so viewing
# never triggers a re-render.
MULTIMODAL_THUMB_DPI = int(os.getenv("MULTIMODAL_THUMB_DPI", "96"))

# Hybrid merge uses Reciprocal Rank Fusion (RRF); this is the bias for the
# *text* side. voyage-3 cosine scores (~0.4-0.5) and voyage-multimodal-3
# scores (~0.20-0.25) sit on different scales, so a direct weighted sum
# lets text always dominate, whereas rank-based RRF is robust to that.
# 0.5 = balanced (vanilla RRF), >0.5 favours text, <0.5 favours image.
# Tunable per environment without a redeploy.
MULTIMODAL_TEXT_WEIGHT = float(os.getenv("MULTIMODAL_TEXT_WEIGHT", "0.5"))

# RRF damping constant. 60 is the standard literature value: lower values
# concentrate weight at the top ranks, higher values flatten the curve.
MULTIMODAL_RRF_K = int(os.getenv("MULTIMODAL_RRF_K", "60"))
|
|
|
|
# Halacha extraction: auto-approve threshold. A halacha whose extractor
# confidence is >= this value is inserted with review_status='approved'
# rather than 'pending_review', so it shows up immediately in
# search_precedent_library. Use a value > 1.0 to turn auto-approval off.
# Baseline 0.80: 89% of historical extractions land here, and a manual
# spot-check of 10 random samples confirmed quality. Adjust via env if
# drift is observed (e.g. raise to 0.90 if false-positives appear).
HALACHA_AUTO_APPROVE_THRESHOLD = float(
    os.getenv("HALACHA_AUTO_APPROVE_THRESHOLD", "0.80")
)

# Google Cloud Vision API key (OCR for scanned PDFs).
GOOGLE_CLOUD_VISION_API_KEY = os.getenv("GOOGLE_CLOUD_VISION_API_KEY", "")
|
|
|
|
# Root data directory: DATA_DIR from the environment, otherwise
# ~/legal-ai/data.
DATA_DIR = Path(
    os.getenv("DATA_DIR", os.fspath(Path.home().joinpath("legal-ai", "data")))
)
TRAINING_DIR = DATA_DIR.joinpath("training")
EXPORTS_DIR = DATA_DIR.joinpath("exports")  # legacy exports only

# Cases live in a flat layout: data/cases/{case_number}/
CASES_DIR = DATA_DIR.joinpath("cases")
|
|
|
|
|
|
def find_case_dir(case_number: str) -> Path:
    """Build the directory path for *case_number* under ``CASES_DIR``.

    The path is only computed; nothing is created or checked on disk.
    """
    case_dir = CASES_DIR.joinpath(case_number)
    return case_dir
|
|
|
|
# Chunking parameters (in tokens).
CHUNK_SIZE_TOKENS = 600
CHUNK_OVERLAP_TOKENS = 100

# External service allowlist: case materials may ONLY be sent to these
# domains.
ALLOWED_EXTERNAL_SERVICES = {
    "api.voyageai.com",  # Voyage AI (embeddings)
    "vision.googleapis.com",  # Google Cloud Vision (OCR)
}

# Audit switch: on unless AUDIT_ENABLED is set to something other than
# "true" (compared case-insensitively).
AUDIT_ENABLED = os.getenv("AUDIT_ENABLED", "true").lower() == "true"
|
|
|
|
|
|
# ── Utility ───────────────────────────────────────────────────────
|
|
|
|
def parse_llm_json(raw: str):
    """Parse JSON from an LLM response, tolerating wrapping and truncation.

    Strategies, tried in order (the first one that succeeds wins):

    1. Strip a Markdown code fence (a leading one at the very start of the
       text, a trailing one at the very end) and parse the remainder
       directly.
    2. Extract a JSON object or array embedded in surrounding text. When
       both an object and an array can be parsed and the array encloses
       the object (e.g. ``Here: [{"a": 1}]``), the array is returned;
       otherwise the object is preferred.
    3. Recover truncated JSON (missing closing brackets): cut after the
       last ``}`` and append the missing closer(s).

    Args:
        raw: Raw text returned by the model.

    Returns:
        The decoded JSON value, or ``None`` when nothing parseable is found
        (including when *raw* is ``None``, not a string, or blank).
    """
    import json
    import re

    # A missing/empty model response is a parse failure, not a crash.
    if not isinstance(raw, str):
        return None
    text = raw.strip()
    if not text:
        return None

    # Strip markdown code blocks
    text = re.sub(r"^```(?:json)?\s*\n?", "", text)
    text = re.sub(r"\n?\s*```\s*$", "", text)

    # 1. Direct parse.
    try:
        return json.loads(text)
    except json.JSONDecodeError:
        pass

    # 2. Embedded object/array. Each pattern is greedy: first opener to the
    #    last matching closer. Collect every candidate that parses, together
    #    with its span, so the outermost container can be chosen.
    parsed = {}
    for kind, pattern in (("object", r"\{.*\}"), ("array", r"\[.*\]")):
        match = re.search(pattern, text, re.DOTALL)
        if match is None:
            continue
        try:
            parsed[kind] = (match.span(), json.loads(match.group()))
        except json.JSONDecodeError:
            continue

    if "object" in parsed and "array" in parsed:
        (obj_start, obj_end), obj_value = parsed["object"]
        (arr_start, arr_end), arr_value = parsed["array"]
        # The object is merely an element of the array: returning it would
        # silently turn a one-item list into a dict.
        if arr_start <= obj_start and obj_end <= arr_end:
            return arr_value
        return obj_value
    if "object" in parsed:
        return parsed["object"][1]
    if "array" in parsed:
        return parsed["array"][1]

    # 3. Truncated JSON recovery: start at the first opener, keep
    #    everything up to the last "}", then close what is still open.
    for opener in ("[", "{"):
        start = text.find(opener)
        if start < 0:
            continue
        fragment = text[start:]
        last_brace = fragment.rfind("}")
        if last_brace <= 0:
            continue
        head = fragment[: last_brace + 1]
        if opener == "[":
            # The last "}" is taken as the end of the last complete item.
            candidate = head + "]"
        else:
            # Close however many objects are still open (a non-positive
            # count appends nothing).
            candidate = head + "}" * (head.count("{") - head.count("}"))
        try:
            return json.loads(candidate)
        except json.JSONDecodeError:
            continue

    return None
|