The single choke-point for all binary file I/O (originals, derived artifacts, exports), replacing the scattered open()/shutil/Path.write_bytes calls across ~8 services.

Backend chosen by STORAGE_BACKEND:
- filesystem (default): disk under DATA_DIR — byte-for-byte legacy behaviour
- dual: write disk + S3, read S3→disk fallback (migration window)
- s3: MinIO via aioboto3 (lazy import; absent in the filesystem path)

Keys are DATA_DIR-relative POSIX paths; the FS backend ignores the logical bucket and keeps the existing single tree, so the default backend is zero behaviour change. S3 maps a governance bucket (documents/immutable/derived) → MinIO bucket; presigned URLs are minted against the public endpoint (browser-reachable) and carry the Hebrew filename via RFC-5987 Content-Disposition.

- config: STORAGE_BACKEND + MINIO_* (endpoint, public-endpoint, creds, region, 3 bucket names, presign TTL)
- mcp_env_catalog: new "storage" category + 10 specs (X10/INV-ENV1)
- pyproject: aioboto3>=13 (consumed here, deployed with first use)
- tests: 18 unit tests (FS round-trip, key normalization/traversal guard, bucket resolution, backend selection, dual write-both + S3-down fallback)

No call-sites are rewired yet — that is Phase 2 (106.3). STORAGE_BACKEND stays filesystem in prod, so behaviour is unchanged.

Invariants: keeps G2 (one storage path replaces scattered I/O); establishes INV-STG1 (single layer), INV-STG2 (atomic keys, Hebrew name in metadata), INV-STG3 (governance buckets), INV-STG6 (presigned serving).

Spec: docs/spec/X14-storage-minio.md.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
351 lines
18 KiB
Python
351 lines
18 KiB
Python
"""Configuration loaded from Infisical or central .env file.
|
|
|
|
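Effective precedence (first value set wins): environment variables → .env file → Infisical.
load_dotenv() does not override variables that are already set, and Infisical
secrets are applied with os.environ.setdefault(), so they only fill in gaps.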
|
|
"""
|
|
|
|
import os
|
|
from pathlib import Path
|
|
|
|
from dotenv import load_dotenv
|
|
|
|
# Load from central .env or override path
|
|
dotenv_path = os.environ.get("DOTENV_PATH", str(Path.home() / ".env"))
|
|
load_dotenv(dotenv_path)
|
|
|
|
# Try loading from Infisical if configured.
#
# Best-effort: any failure leaves the process running on whatever the
# environment / .env file already provided. Failures are logged rather than
# silently swallowed so a misconfigured secret store is visible in the logs.
INFISICAL_TOKEN = os.environ.get("INFISICAL_TOKEN", "")
if INFISICAL_TOKEN:
    import logging

    _infisical_log = logging.getLogger(__name__)
    try:
        from infisical_sdk import InfisicalSDKClient

        _client = InfisicalSDKClient(token=INFISICAL_TOKEN)
        _secrets = _client.get_all_secrets(
            environment=os.environ.get("INFISICAL_ENV", "production"),
            project_id=os.environ.get("INFISICAL_PROJECT_ID", ""),
        )
        for s in _secrets:
            # setdefault: a variable that is already set (process environment
            # or loaded from .env) is NOT overwritten by the Infisical value.
            os.environ.setdefault(s.secret_key, s.secret_value)
    except ImportError:
        # Infisical SDK not installed — use .env
        _infisical_log.warning(
            "INFISICAL_TOKEN is set but infisical_sdk is not installed; "
            "using environment/.env values only"
        )
    except Exception as exc:
        # Infisical unreachable — fall back to .env. Only the exception type is
        # logged: the message may echo request details such as the token.
        _infisical_log.warning(
            "Could not load secrets from Infisical (%s); "
            "using environment/.env values only",
            type(exc).__name__,
        )
|
|
|
|
# PostgreSQL
|
|
POSTGRES_URL = os.environ.get(
|
|
"POSTGRES_URL",
|
|
f"postgres://{os.environ.get('POSTGRES_USER', 'legal_ai')}:"
|
|
f"{os.environ.get('POSTGRES_PASSWORD', '')}@"
|
|
f"{os.environ.get('POSTGRES_HOST', '127.0.0.1')}:"
|
|
f"{os.environ.get('POSTGRES_PORT', '5433')}/"
|
|
f"{os.environ.get('POSTGRES_DB', 'legal_ai')}",
|
|
)
|
|
|
|
# Redis
|
|
REDIS_URL = os.environ.get("REDIS_URL", "redis://127.0.0.1:6380/0")
|
|
|
|
# Claude CLI — model + effort for halacha extraction.
|
|
# All LLM calls go through the local `claude -p` CLI (claude_session.py).
|
|
# By default the CLI uses the developer's session default model with no
|
|
# explicit effort. For halacha extraction we pin Opus 4.8 @ xhigh: the
|
|
# 2026-05-31 A/B (scripts/ab_halacha_opus48.py) showed it cuts over-extraction
|
|
# (~124→51 on שטיין) at 100% quote-verification with honest confidence
|
|
# calibration. Env-overridable so the model/effort can be tuned without a
|
|
# code change (set to "" to fall back to the CLI default). Other extractors
|
|
# (claims, metadata, block-writing, QA) keep the CLI default unless similarly
|
|
# pinned.
|
|
HALACHA_EXTRACT_MODEL = os.environ.get("HALACHA_EXTRACT_MODEL", "claude-opus-4-8")
|
|
HALACHA_EXTRACT_EFFORT = os.environ.get("HALACHA_EXTRACT_EFFORT", "xhigh")
|
|
# Digest (X12) metadata extraction is a simpler, high-volume task (concept tag,
|
|
# headline, underlying citation, tags from a one-page summary) — Sonnet is the
|
|
# speed/cost sweet-spot here, unlike halacha extraction which pins Opus. Tune via env.
|
|
DIGEST_EXTRACT_MODEL = os.environ.get("DIGEST_EXTRACT_MODEL", "claude-sonnet-4-6")
|
|
# Effort for BULK queue-drain extraction (process_pending over many precedents).
|
|
# xhigh is the quality sweet-spot for a single precedent but very slow at scale
|
|
# (a 64-chunk case ≈ 20 min). Bulk drains use a lighter effort to cut wall-clock;
|
|
# interactive single re-extraction keeps HALACHA_EXTRACT_EFFORT (xhigh). Tune via
|
|
# env (set to 'xhigh' to make bulk match single, or 'medium' for max speed).
|
|
HALACHA_BULK_EXTRACT_EFFORT = os.environ.get("HALACHA_BULK_EXTRACT_EFFORT", "high")
|
|
# Concurrent chunks WITHIN a single extraction. Each `claude -p` @ xhigh holds
|
|
# ~300MB RSS + heavy CPU; cross-process overlap (agent retries) on top of this
|
|
# froze the box on 2026-05-31 (hard reboot). A global advisory lock now caps
|
|
# the system to ONE extraction at a time; this caps the chunks within it.
|
|
HALACHA_CHUNK_CONCURRENCY = int(os.environ.get("HALACHA_CHUNK_CONCURRENCY", "3"))
|
|
HALACHA_CORROBORATION_MATCH_FLOOR = float(os.environ.get("HALACHA_CORROBORATION_MATCH_FLOOR", "0.50"))
|
|
HALACHA_CORROBORATION_MIN_CITES = int(os.environ.get("HALACHA_CORROBORATION_MIN_CITES", "2"))
|
|
# X11 Phase 2: gate corroboration → approval. Default ON (Dafna validated the
|
|
# Phase 1 signal, 2026-06-01). Set to "false" to disable the auto-approve/demote
|
|
# wiring while keeping the Phase 1 signal intact.
|
|
HALACHA_CORROBORATION_AUTO_APPROVE = os.environ.get(
|
|
"HALACHA_CORROBORATION_AUTO_APPROVE", "true"
|
|
).strip().lower() in ("1", "true", "yes", "on")
|
|
|
|
# Voyage AI
|
|
VOYAGE_API_KEY = os.environ.get("VOYAGE_API_KEY", "")
|
|
VOYAGE_MODEL = os.environ.get("VOYAGE_MODEL", "voyage-law-2")
|
|
VOYAGE_DIMENSIONS = 1024
|
|
|
|
# Rerank — cross-encoder second-stage. Off by default; flip with env to
|
|
# enable across all semantic search tools (search_decisions,
|
|
# search_case_documents, find_similar_cases, search_precedent_library).
|
|
VOYAGE_RERANK_MODEL = os.environ.get("VOYAGE_RERANK_MODEL", "rerank-2")
|
|
# Parsed like HALACHA_CORROBORATION_AUTO_APPROVE: "1"/"true"/"yes"/"on"
# (case-insensitive, surrounding whitespace ignored) enable the flag; any
# other value disables it.
VOYAGE_RERANK_ENABLED = os.environ.get(
    "VOYAGE_RERANK_ENABLED", "false"
).strip().lower() in ("1", "true", "yes", "on")
|
|
# How many candidates to fetch from bi-encoder before reranking.
|
|
# 50 was the depth used in the POC; balances recall vs rerank cost.
|
|
VOYAGE_RERANK_FETCH_K = int(os.environ.get("VOYAGE_RERANK_FETCH_K", "50"))
|
|
|
|
# Multimodal — page-image embeddings via voyage-multimodal-3. Off by
|
|
# default; flip with env to enable per-page image embedding during
|
|
# ingestion + hybrid (text+image) ranking at search time. POC #3
|
|
# validated on a 89-page appraisal PDF (38s, 312K tokens, recovered
|
|
# table structure + image-only scanned pages that text-OCR misses).
|
|
# Parsed like HALACHA_CORROBORATION_AUTO_APPROVE: "1"/"true"/"yes"/"on"
# (case-insensitive, surrounding whitespace ignored) enable the flag; any
# other value disables it.
MULTIMODAL_ENABLED = os.environ.get(
    "MULTIMODAL_ENABLED", "false"
).strip().lower() in ("1", "true", "yes", "on")
|
|
MULTIMODAL_MODEL = os.environ.get("MULTIMODAL_MODEL", "voyage-multimodal-3")
|
|
# Render DPI for the image fed to the embedder. POC used 144 — sweet
|
|
# spot between embedding quality and tokens/page (144 ≈ 3.5K tok/page).
|
|
MULTIMODAL_DPI = int(os.environ.get("MULTIMODAL_DPI", "144"))
|
|
# Separate, lower DPI for the JPEG thumbnail saved to disk for UI
|
|
# preview. ~96dpi → ~20KB/page; ingestion-time, no re-render at view.
|
|
MULTIMODAL_THUMB_DPI = int(os.environ.get("MULTIMODAL_THUMB_DPI", "96"))
|
|
# Hybrid merge: Reciprocal Rank Fusion (RRF) bias for the *text* side.
|
|
# voyage-3 cosine scores (~0.4-0.5) and voyage-multimodal-3 scores
|
|
# (~0.20-0.25) live on different scales; a direct weighted sum lets
|
|
# text always dominate. RRF is rank-based and robust to that. The
|
|
# weight here biases the contribution of each side: 0.5 = balanced
|
|
# (vanilla RRF), >0.5 favours text, <0.5 favours image. Tunable per
|
|
# env without redeploy.
|
|
MULTIMODAL_TEXT_WEIGHT = float(
|
|
os.environ.get("MULTIMODAL_TEXT_WEIGHT", "0.5")
|
|
)
|
|
# RRF damping constant. Standard literature value is 60: lower values
|
|
# concentrate weight at top ranks; higher values flatten the curve.
|
|
MULTIMODAL_RRF_K = int(os.environ.get("MULTIMODAL_RRF_K", "60"))
|
|
|
|
# BM25/lexical hybrid — fuse ``ts_rank_cd`` over ``content_tsv``/
|
|
# ``rule_tsv`` (DB schema V12) with the semantic cosine layer via RRF.
|
|
# Recovers recall on exact-string queries that voyage embeddings blur
|
|
# (e.g. case-number citations like "1461/20", "317/10"; rare planning
|
|
# vocabulary). Hebrew uses the ``simple`` text-search config — no
|
|
# stemmer needed, and numeric/punctuation tokens stay intact. When
|
|
# disabled, hybrid search falls back to semantic-only (the previous
|
|
# behaviour). On by default — the lexical leg is cheap (GIN index) and
|
|
# only ever *adds* candidates to RRF, it can't down-rank a strong
|
|
# semantic hit.
|
|
# Parsed like HALACHA_CORROBORATION_AUTO_APPROVE: "1"/"true"/"yes"/"on"
# (case-insensitive, surrounding whitespace ignored) keep the flag on; any
# other value turns it off.
BM25_HYBRID_ENABLED = os.environ.get(
    "BM25_HYBRID_ENABLED", "true"
).strip().lower() in ("1", "true", "yes", "on")
|
|
|
|
# Halacha extraction — auto-approve threshold. Halachot with extractor
|
|
# confidence >= this value are inserted with review_status='approved'
|
|
# instead of 'pending_review' (so they immediately appear in
|
|
# search_precedent_library). Set to a value > 1.0 to disable auto-approval.
|
|
# 0.80 baseline: 89% of historical extractions land here, manual spot-check
|
|
# of 10 random samples confirmed quality. Tunable via env if drift is
|
|
# observed (e.g. raise to 0.90 if false-positives appear).
|
|
HALACHA_AUTO_APPROVE_THRESHOLD = float(
|
|
os.environ.get("HALACHA_AUTO_APPROVE_THRESHOLD", "0.80")
|
|
)
|
|
|
|
# Halacha dedup-on-insert — within-precedent semantic cosine ceiling. Before
|
|
# storing a halacha, store_halachot_for_chunk skips it if its rule-embedding has
|
|
# cosine >= this value against an already-stored halacha of the SAME precedent
|
|
# (exact normalized supporting_quote is always skipped regardless). 0.93 is the
|
|
# conservative auto-skip floor: the 2026-06-03 cleanup showed the 0.90-0.95 band
|
|
# is "almost entirely" same-rule-reworded, but auto-skip is unreviewed so we sit
|
|
# just above the manual-cleanup 0.90 to avoid dropping a genuinely distinct
|
|
# principle. Set > 1.0 to disable semantic dedup (exact-quote dedup still runs).
|
|
HALACHA_DEDUP_COSINE = float(os.environ.get("HALACHA_DEDUP_COSINE", "0.93"))
|
|
|
|
# Halacha dedup TAIL band (#82.3) — the [BAND_COSINE, DEDUP_COSINE) range is too
|
|
# low to auto-skip but suspicious. A halacha whose nearest same-precedent
|
|
# neighbor sits in this band AND has high LEXICAL overlap (Jaccard/Levenshtein
|
|
# on rule_statement) is flagged 'near_duplicate' (blocks auto-approve → review),
|
|
# not skipped — catching paraphrases the cosine threshold misses without
|
|
# dropping a possibly-distinct principle unreviewed. 0.83 from the same cleanup.
|
|
HALACHA_DEDUP_BAND_COSINE = float(os.environ.get("HALACHA_DEDUP_BAND_COSINE", "0.83"))
|
|
|
|
# Halacha review-queue clustering (#84.2) — when the review queue is requested
|
|
# with cluster=true, halachot of the SAME precedent whose rule-embeddings are
|
|
# within this cosine are grouped into ONE review card (canonical + variants), so
|
|
# the chair judges near-identical principles once instead of repeatedly. Display
|
|
# only — never merges/deletes. 0.90 = "same principle, reworded".
|
|
HALACHA_CLUSTER_COSINE = float(os.environ.get("HALACHA_CLUSTER_COSINE", "0.90"))
|
|
|
|
# Halacha NLI entailment validator (#81.3) — after extraction, a claude_session
|
|
# judge checks each halacha's rule_statement is entailed by its supporting_quote.
|
|
# Non-entailed (neutral/contradiction) → quality flag 'nli_unsupported' that
|
|
# blocks auto-approve. Runs through the local CLI (zero cost); fails OPEN if the
|
|
# CLI is unavailable (e.g. container). 'low' effort — entailment is a simple call.
|
|
# Parsed like HALACHA_CORROBORATION_AUTO_APPROVE: "1"/"true"/"yes"/"on"
# (case-insensitive, surrounding whitespace ignored) keep the flag on; any
# other value turns it off.
HALACHA_NLI_ENABLED = os.environ.get(
    "HALACHA_NLI_ENABLED", "true"
).strip().lower() in ("1", "true", "yes", "on")
|
|
HALACHA_NLI_MODEL = os.environ.get("HALACHA_NLI_MODEL", HALACHA_EXTRACT_MODEL)
|
|
HALACHA_NLI_EFFORT = os.environ.get("HALACHA_NLI_EFFORT", "low")
|
|
|
|
# Halacha over-extraction consolidation (#81.5) — after a precedent finishes
|
|
# extracting, a claude_session pass folds facets of the SAME legal question
|
|
# (below the #82 dedup cosine) into one canonical; the rest are marked rejected
|
|
# (reversible). Cross-chunk safety net for over-splitting. Runs through the local
|
|
# CLI (zero cost); fails OPEN. 'high' effort — folding needs careful judgment.
|
|
# Parsed like HALACHA_CORROBORATION_AUTO_APPROVE: "1"/"true"/"yes"/"on"
# (case-insensitive, surrounding whitespace ignored) keep the flag on; any
# other value turns it off.
HALACHA_CONSOLIDATE_ENABLED = os.environ.get(
    "HALACHA_CONSOLIDATE_ENABLED", "true"
).strip().lower() in ("1", "true", "yes", "on")
|
|
HALACHA_CONSOLIDATE_MODEL = os.environ.get("HALACHA_CONSOLIDATE_MODEL", HALACHA_EXTRACT_MODEL)
|
|
HALACHA_CONSOLIDATE_EFFORT = os.environ.get("HALACHA_CONSOLIDATE_EFFORT", "high")
|
|
|
|
# Google Cloud Vision (OCR for scanned PDFs)
|
|
GOOGLE_CLOUD_VISION_API_KEY = os.environ.get("GOOGLE_CLOUD_VISION_API_KEY", "")
|
|
|
|
# Data directory
|
|
DATA_DIR = Path(os.environ.get("DATA_DIR", str(Path.home() / "legal-ai" / "data")))
|
|
TRAINING_DIR = DATA_DIR / "training"
|
|
EXPORTS_DIR = DATA_DIR / "exports" # legacy exports only
|
|
|
|
# Cases directory — flat structure: data/cases/{case_number}/
|
|
CASES_DIR = DATA_DIR / "cases"
|
|
|
|
# ── Object storage (X14 / MinIO) ───────────────────────────────────
|
|
# Single storage layer (services/storage.py) replaces the scattered file
|
|
# I/O across ~8 services (INV-STG1 / G2). Backend selector:
|
|
# "filesystem" (default) — disk under DATA_DIR; current behaviour, no change.
|
|
# "dual" — write disk + S3, read S3→disk fallback (migration).
|
|
# "s3" — MinIO only.
|
|
# See docs/spec/X14-storage-minio.md.
|
|
STORAGE_BACKEND = os.environ.get("STORAGE_BACKEND", "filesystem").strip().lower()
|
|
# Endpoint reached server-side (internal Docker network: http://minio:9000).
|
|
MINIO_ENDPOINT = os.environ.get("MINIO_ENDPOINT", "http://minio:9000")
|
|
# Public endpoint used when MINTING presigned URLs for the browser (INV-STG6) —
|
|
# the browser cannot resolve the internal hostname. Falls back to the internal
|
|
# endpoint when unset (e.g. local dev).
|
|
MINIO_PUBLIC_ENDPOINT = os.environ.get("MINIO_PUBLIC_ENDPOINT", MINIO_ENDPOINT)
|
|
MINIO_ACCESS_KEY = os.environ.get("MINIO_ACCESS_KEY", "")
|
|
MINIO_SECRET_KEY = os.environ.get("MINIO_SECRET_KEY", "")
|
|
MINIO_REGION = os.environ.get("MINIO_REGION", "us-east-1")
|
|
# Logical bucket → name. Governance boundaries (INV-STG3): documents
|
|
# (versioned), immutable (versioned + Object-Lock COMPLIANCE for final
|
|
# decisions, INV-STG4), derived (thumbnails/extracted text — regenerable).
|
|
MINIO_BUCKET_DOCUMENTS = os.environ.get("MINIO_BUCKET_DOCUMENTS", "legal-documents")
|
|
MINIO_BUCKET_IMMUTABLE = os.environ.get("MINIO_BUCKET_IMMUTABLE", "legal-immutable")
|
|
MINIO_BUCKET_DERIVED = os.environ.get("MINIO_BUCKET_DERIVED", "legal-derived")
|
|
# Default presigned-URL TTL (seconds). SigV4 hard max is 7 days; keep short.
# The configured value is clamped to 1..604800 (that 7-day maximum) so an
# out-of-range setting cannot yield a zero, negative, or over-long TTL.
MINIO_PRESIGN_TTL = max(
    1,
    min(int(os.environ.get("MINIO_PRESIGN_TTL", "900")), 7 * 24 * 60 * 60),
)
|
|
|
|
|
|
def find_case_dir(case_number: str) -> Path:
    """Build the path of the directory that holds one case's files.

    Cases use a flat layout directly under ``CASES_DIR``. The path is
    returned as-is; no check is made that the directory exists.
    """
    case_dir = CASES_DIR.joinpath(case_number)
    return case_dir
|
|
|
|
# Chunking parameters
|
|
CHUNK_SIZE_TOKENS = 600
|
|
CHUNK_OVERLAP_TOKENS = 100
|
|
|
|
# Parent-doc retrieval (TaskMaster #48) — hierarchical chunking + lookup.
|
|
# When enabled:
|
|
# - The ingest pipeline emits two tiers of precedent_chunks: small
|
|
# "child" chunks (~300 tokens) for high-recall semantic/lexical
|
|
# matching, and larger "parent" chunks (~1500 tokens) that contain
|
|
# ~5 children each. Children are embedded and indexed; parents
|
|
# carry the broader text the LLM gets back.
|
|
# - Search runs against children, then swaps each hit for its parent
|
|
# row before returning — so the writer sees a coherent passage
|
|
# instead of a 300-token sliver.
|
|
#
|
|
# Off by default: the schema (V17) is safe to apply even when the flag
|
|
# is false (the chunker still emits single-tier chunks and search just
|
|
# returns them unchanged). Flip to true ONLY after the corpus has been
|
|
# re-ingested with the hierarchical chunker — see precedent_library
|
|
# ingest pipeline + the backfill plan in TaskMaster #48.
|
|
# Parsed like HALACHA_CORROBORATION_AUTO_APPROVE: "1"/"true"/"yes"/"on"
# (case-insensitive, surrounding whitespace ignored) enable the flag; any
# other value disables it.
PARENT_DOC_RETRIEVAL_ENABLED = os.environ.get(
    "PARENT_DOC_RETRIEVAL_ENABLED", "false"
).strip().lower() in ("1", "true", "yes", "on")
|
|
# Child chunks are what get embedded + matched. Smaller = higher recall,
|
|
# more rows. 300 tokens (~600 chars Hebrew) is the empirical sweet spot
|
|
# referenced in the original parent-doc literature (Anthropic, LlamaIndex).
|
|
PARENT_DOC_CHILD_SIZE_TOKENS = int(
|
|
os.environ.get("PARENT_DOC_CHILD_SIZE_TOKENS", "300")
|
|
)
|
|
# Parent chunks are what get returned to the LLM. Large enough to hold
|
|
# a full rule statement plus the surrounding paragraph and any cited
|
|
# authority. 1500 tokens = ~5 children at 300 each.
|
|
PARENT_DOC_PARENT_SIZE_TOKENS = int(
|
|
os.environ.get("PARENT_DOC_PARENT_SIZE_TOKENS", "1500")
|
|
)
|
|
# Child overlap — keeps neighbouring children sharing ~50 tokens so a
|
|
# sentence on a chunk boundary still matches the natural phrasing.
|
|
PARENT_DOC_CHILD_OVERLAP_TOKENS = int(
|
|
os.environ.get("PARENT_DOC_CHILD_OVERLAP_TOKENS", "50")
|
|
)
|
|
|
|
# External service allowlist — case materials may ONLY be sent to these domains
|
|
ALLOWED_EXTERNAL_SERVICES = {
|
|
"api.voyageai.com", # Voyage AI (embeddings)
|
|
"vision.googleapis.com", # Google Cloud Vision (OCR)
|
|
}
|
|
|
|
# Audit
|
|
# Parsed like HALACHA_CORROBORATION_AUTO_APPROVE: "1"/"true"/"yes"/"on"
# (case-insensitive, surrounding whitespace ignored) keep auditing on; any
# other value turns it off.
AUDIT_ENABLED = os.environ.get(
    "AUDIT_ENABLED", "true"
).strip().lower() in ("1", "true", "yes", "on")
|
|
|
|
|
|
# ── Utility ───────────────────────────────────────────────────────
|
|
|
|
def parse_llm_json(raw: str):
    """Parse JSON from an LLM response, tolerating markdown fences and truncation.

    Strategies are tried in order; the first one that decodes wins:

    1. Strip a leading/trailing markdown code fence (```json ... ```) and
       parse the remainder directly.
    2. Extract the widest ``{...}`` span, then the widest ``[...]`` span,
       to skip extra text before/after the JSON.
    3. Truncation recovery: starting at the first ``[`` (then the first
       ``{``), cut the text at its last ``}`` and re-close the structure.

    Args:
        raw: Text returned by the model.

    Returns:
        The decoded JSON value, or ``None`` when nothing parseable is found
        or when ``raw`` is not a string.

    Note:
        Recovery tries arrays before objects, so a truncated object that
        contains an array may yield just the recovered inner array. Brace
        counting is purely textual and does not skip braces inside strings.
    """
    import json
    import re

    # Callers may hand over a missing response; treat it as "nothing to parse".
    if not isinstance(raw, str):
        return None

    text = raw.strip()
    # Strip markdown code blocks
    text = re.sub(r"^```(?:json)?\s*\n?", "", text)
    text = re.sub(r"\n?\s*```\s*$", "", text)

    # Try direct parse first
    try:
        return json.loads(text)
    except json.JSONDecodeError:
        pass

    # Try to find a JSON object or array embedded in surrounding text
    for pattern in (r"\{.*\}", r"\[.*\]"):
        match = re.search(pattern, text, re.DOTALL)
        if match is None:
            continue
        try:
            return json.loads(match.group())
        except json.JSONDecodeError:
            continue

    # Truncated-JSON recovery: keep everything up to the last complete "}"
    # and append whatever closers the chosen opener still needs.
    for opener in ("[", "{"):
        start = text.find(opener)
        if start < 0:
            continue
        fragment = text[start:]
        last_brace = fragment.rfind("}")
        if last_brace <= 0:
            continue
        candidate = fragment[: last_brace + 1]
        if opener == "[":
            # Array of objects: drop the partial trailing item, close the list.
            candidate += "]"
        else:
            # Object: close every brace still open (a non-positive count
            # appends nothing).
            candidate += "}" * (candidate.count("{") - candidate.count("}"))
        try:
            return json.loads(candidate)
        except json.JSONDecodeError:
            continue

    return None
|