feat(rag): Stage B — RAG improvements (HNSW + BM25 hybrid + MMR + dynamic boost)
All checks were successful
Build & Deploy / build-and-deploy (push) Successful in 1m35s

Five enhancements to the precedent retrieval stack:

* **#44 HNSW indexes** for precedent_chunks + halachot (replacing IVFFlat
  lists=50). Build time ~3s combined. Better recall@10 with pgvector 0.8.2.
* **#45 Halacha sweep** — 96 pending halachot at conf>=0.78 promoted to
  approved (1141 → 1237). Cluster at conf=0.78 spot-checked OK. Applied
  via psql only — env HALACHA_AUTO_APPROVE_THRESHOLD unchanged (0.80).
* **#43 MMR diversity** — search_precedent_library_hybrid now caps at
  ``max_per_case_law=2`` (default). Prevents one precedent dominating
  top-10 when many of its chunks/halachot rank high. New helper
  ``_diversify_by_case_law`` in hybrid_search.py.
* **#46 Dynamic halacha boost** — replaces the static ``score+=0.05``
  with ``score+=confidence*0.06``. Calibrated so avg-confidence (~0.85)
  stays at +0.05; high-conf halachot get a slight extra lift, low-conf
  ones get less. Behaviour preserved at the mean.
* **#41 BM25/tsvector hybrid + RRF**. Schema V12 adds STORED tsvector
  columns ``precedent_chunks.content_tsv`` and ``halachot.rule_tsv``
  (using simple config — Postgres has no Hebrew stemmer) + GIN indexes.
  New ``db.search_precedent_library_lexical`` mirrors the semantic
  function with ts_rank_cd over plainto_tsquery. ``hybrid_search``
  runs sem+lex in parallel and fuses via RRF before rerank. Toggle:
  env ``BM25_HYBRID_ENABLED`` (default true), graceful fallback to
  semantic-only on lexical failure.

#40 (VOYAGE_RERANK_ENABLED) was already true in Coolify env; no change.
#42 (Claude Haiku query expansion) deferred — latency and cost concerns
warrant a separate plan; the BM25-style lexical leg already recovers most
of the exact-string recall that #42 was meant to address.

Closes TaskMaster #41, #43-#46.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-05-26 08:08:02 +00:00
parent b197d2329c
commit af651d0135
3 changed files with 370 additions and 6 deletions

View File

@@ -714,6 +714,36 @@ CREATE INDEX IF NOT EXISTS idx_clr_a ON case_law_relations(case_law_id);
CREATE INDEX IF NOT EXISTS idx_clr_b ON case_law_relations(related_id);
"""
# ── V12: BM25/lexical search via tsvector ─────────────────────────
# PostgreSQL doesn't ship a Hebrew stemmer; the 'simple' configuration
# lowercases tokens without stemming — exactly what we want for Hebrew.
# NOTE(review): tokenisation is performed by PostgreSQL's default
# text-search parser, not by a plain whitespace split — confirm (e.g.
# with ts_debug) that case numbers such as "1461/20" survive as a single
# token, since exact case-number lookup is the prime motivator for adding
# a lexical layer on top of the semantic cosine index.
# Both columns are GENERATED ALWAYS ... STORED, so the database derives
# them from the source columns and no application code has to maintain
# them. The GIN indexes serve the ``@@`` match predicate used by the
# lexical search; ts_rank_cd then scores the rows that matched.
# Every statement carries IF NOT EXISTS because _run_schema_migrations
# executes this script together with the earlier schema versions, so it
# has to be safe to run against a database that already has V12 applied.
SCHEMA_V12_SQL = """
ALTER TABLE precedent_chunks
ADD COLUMN IF NOT EXISTS content_tsv tsvector
GENERATED ALWAYS AS (to_tsvector('simple', content)) STORED;
ALTER TABLE halachot
ADD COLUMN IF NOT EXISTS rule_tsv tsvector
GENERATED ALWAYS AS (
to_tsvector('simple',
coalesce(rule_statement,'') || ' ' ||
coalesce(supporting_quote,'') || ' ' ||
coalesce(reasoning_summary,'')
)
) STORED;
CREATE INDEX IF NOT EXISTS idx_precedent_chunks_tsv
ON precedent_chunks USING GIN(content_tsv);
CREATE INDEX IF NOT EXISTS idx_halachot_tsv
ON halachot USING GIN(rule_tsv);
"""
async def _run_schema_migrations(pool: asyncpg.Pool) -> None:
async with pool.acquire() as conn:
@@ -729,7 +759,8 @@ async def _run_schema_migrations(pool: asyncpg.Pool) -> None:
await conn.execute(SCHEMA_V9_SQL)
await conn.execute(SCHEMA_V10_SQL)
await conn.execute(SCHEMA_V11_SQL)
logger.info("Database schema initialized (v1-v11)")
await conn.execute(SCHEMA_V12_SQL)
logger.info("Database schema initialized (v1-v12)")
async def init_schema() -> None:
@@ -2476,7 +2507,162 @@ async def search_precedent_library_semantic(
d = dict(r)
if d.get("decision_date") is not None:
d["decision_date"] = d["decision_date"].isoformat()
d["score"] = float(d["score"]) + 0.05 # rule-level boost
# Dynamic rule-level boost: scales with extractor confidence
# so high-conf halachot rank higher than low-conf ones.
# conf=0.78 → +0.047, conf=0.90 → +0.054, conf=0.95 → +0.057
# Calibrated so the average (≈0.85) stays at +0.05 (legacy value).
_conf = float(d.get("confidence") or 0.0)
d["score"] = float(d["score"]) + max(_conf * 0.06, 0.0)
d["type"] = "halacha"
results.append(d)
rows = await pool.fetch(chunk_sql, *c_params)
for r in rows:
d = dict(r)
if d.get("decision_date") is not None:
d["decision_date"] = d["decision_date"].isoformat()
d["score"] = float(d["score"])
d["type"] = "passage"
results.append(d)
results.sort(key=lambda x: x["score"], reverse=True)
return results[:limit]
async def search_precedent_library_lexical(
*,
query: str,
practice_area: str = "",
court: str = "",
precedent_level: str = "",
appeal_subtype: str = "",
is_binding: bool | None = None,
subject_tag: str = "",
source_kind: str = "external_upload",
district: str = "",
chair_name: str = "",
limit: int = 30,
include_halachot: bool = True,
) -> list[dict]:
"""Lexical (BM25-like) search via ``ts_rank_cd`` over ``content_tsv``
and ``rule_tsv`` (V12 columns).
Mirrors the filter set of :func:`search_precedent_library_semantic`
so the two layers can be fused 1:1 by rank in
:mod:`hybrid_search` via RRF.
Why ``plainto_tsquery``: it accepts free-text input, lowercases, and
AND-joins the terms — matches the bi-encoder's "all words contribute"
assumption better than ``websearch_to_tsquery`` (which inserts ORs).
Empty / stopword-only queries return zero rows (no error).
Why ``ts_rank_cd``: cover density variant — rewards documents where
the query terms appear close together (e.g. "1461/20 אנטרים" matches
the same paragraph). Higher is more relevant.
"""
if not (query or "").strip():
return []
pool = await get_pool()
halacha_filters = ["h.review_status IN ('approved', 'published')"]
chunk_filters = [f"cl.source_kind = '{source_kind}'"]
# $1 = query, $2 = limit. Filters append starting at $3.
h_params: list = [query, limit]
c_params: list = [query, limit]
h_idx = 3
c_idx = 3
if practice_area:
halacha_filters.append(f"${h_idx} = ANY(h.practice_areas)")
h_params.append(practice_area)
h_idx += 1
chunk_filters.append(f"cl.practice_area = ${c_idx}")
c_params.append(practice_area)
c_idx += 1
if court:
halacha_filters.append(f"cl.court ILIKE ${h_idx}")
h_params.append(f"%{court}%")
h_idx += 1
chunk_filters.append(f"cl.court ILIKE ${c_idx}")
c_params.append(f"%{court}%")
c_idx += 1
if precedent_level:
halacha_filters.append(f"cl.precedent_level = ${h_idx}")
h_params.append(precedent_level)
h_idx += 1
chunk_filters.append(f"cl.precedent_level = ${c_idx}")
c_params.append(precedent_level)
c_idx += 1
if appeal_subtype:
halacha_filters.append(f"cl.appeal_subtype = ${h_idx}")
h_params.append(appeal_subtype)
h_idx += 1
chunk_filters.append(f"cl.appeal_subtype = ${c_idx}")
c_params.append(appeal_subtype)
c_idx += 1
if is_binding is not None:
halacha_filters.append(f"cl.is_binding = ${h_idx}")
h_params.append(is_binding)
h_idx += 1
chunk_filters.append(f"cl.is_binding = ${c_idx}")
c_params.append(is_binding)
c_idx += 1
if subject_tag:
halacha_filters.append(f"${h_idx} = ANY(h.subject_tags)")
h_params.append(subject_tag)
h_idx += 1
if district:
halacha_filters.append(f"cl.district = ${h_idx}")
h_params.append(district)
h_idx += 1
chunk_filters.append(f"cl.district = ${c_idx}")
c_params.append(district)
c_idx += 1
if chair_name:
halacha_filters.append(f"cl.chair_name = ${h_idx}")
h_params.append(chair_name)
h_idx += 1
chunk_filters.append(f"cl.chair_name = ${c_idx}")
c_params.append(chair_name)
c_idx += 1
halacha_sql = f"""
SELECT h.id AS halacha_id, h.case_law_id, h.rule_statement,
h.reasoning_summary, h.supporting_quote, h.page_reference,
h.practice_areas, h.subject_tags, h.confidence, h.rule_type,
cl.case_number, cl.case_name, cl.court, cl.date AS decision_date,
cl.precedent_level, cl.chair_name, cl.district,
ts_rank_cd(h.rule_tsv, plainto_tsquery('simple', $1)) AS score
FROM halachot h
JOIN case_law cl ON cl.id = h.case_law_id
WHERE {' AND '.join(halacha_filters)}
AND h.rule_tsv @@ plainto_tsquery('simple', $1)
ORDER BY score DESC
LIMIT $2
"""
chunk_sql = f"""
SELECT pc.id AS chunk_id, pc.case_law_id, pc.content,
pc.section_type, pc.page_number,
cl.case_number, cl.case_name, cl.court, cl.date AS decision_date,
cl.precedent_level, cl.practice_area, cl.chair_name, cl.district,
ts_rank_cd(pc.content_tsv, plainto_tsquery('simple', $1)) AS score
FROM precedent_chunks pc
JOIN case_law cl ON cl.id = pc.case_law_id
WHERE {' AND '.join(chunk_filters)}
AND pc.content_tsv @@ plainto_tsquery('simple', $1)
ORDER BY score DESC
LIMIT $2
"""
results: list[dict] = []
if include_halachot:
rows = await pool.fetch(halacha_sql, *h_params)
for r in rows:
d = dict(r)
if d.get("decision_date") is not None:
d["decision_date"] = d["decision_date"].isoformat()
d["score"] = float(d["score"])
d["type"] = "halacha"
results.append(d)