feat(rag): Stage B — RAG improvements (HNSW + BM25 hybrid + MMR + dynamic boost)

Five enhancements to the precedent retrieval stack:

* **#44 HNSW indexes** for precedent_chunks + halachot (replacing IVFFlat lists=50). Build time ~3s combined. Better recall@10 with pgvector 0.8.2.
* **#45 Halacha sweep** — 96 pending halachot at conf>=0.78 promoted to approved (1141 → 1237). Cluster at conf=0.78 spot-checked OK. Applied via psql only — env HALACHA_AUTO_APPROVE_THRESHOLD unchanged (0.80).
* **#43 MMR diversity** — search_precedent_library_hybrid now caps at ``max_per_case_law=2`` (default). Prevents one precedent dominating top-10 when many of its chunks/halachot rank high. New helper ``_diversify_by_case_law`` in hybrid_search.py.
* **#46 Dynamic halacha boost** — replaces the static ``score+=0.05`` with ``score+=confidence*0.06``. Calibrated so avg-confidence (~0.85) stays at ≈+0.05; high-conf halachot get a slight extra lift, low-conf ones get less. Behaviour preserved at the mean.
* **#41 BM25/tsvector hybrid + RRF**. Schema V12 adds STORED tsvector columns ``precedent_chunks.content_tsv`` and ``halachot.rule_tsv`` (using simple config — Postgres has no Hebrew stemmer) + GIN indexes. New ``db.search_precedent_library_lexical`` mirrors the semantic function with ts_rank_cd over plainto_tsquery. ``hybrid_search`` runs sem+lex in parallel and fuses via RRF before rerank. Toggle: env ``BM25_HYBRID_ENABLED`` (default true), graceful fallback to semantic-only on lexical failure.

#40 (VOYAGE_RERANK_ENABLED) was already true in Coolify env; no change.

#42 (Claude Haiku query expansion) deferred — latency + cost concerns warrant a separate plan; the BM25 lexical leg already recovers most of the exact-string recall #42 was meant to address.

Closes TaskMaster #41, #43-#46.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-26 08:08:02 +00:00
parent b197d2329c
commit af651d0135
3 changed files with 370 additions and 6 deletions
--- a/mcp-server/src/legal_mcp/services/db.py
+++ b/mcp-server/src/legal_mcp/services/db.py
@@ -714,6 +714,36 @@ CREATE INDEX IF NOT EXISTS idx_clr_a ON case_law_relations(case_law_id);
 CREATE INDEX IF NOT EXISTS idx_clr_b ON case_law_relations(related_id);
 """

+# ── V12: BM25/lexical search via tsvector ─────────────────────────
+# PostgreSQL doesn't ship a Hebrew stemmer; the 'simple' configuration
+# lowercases the tokens produced by the default text-search parser and
+# applies no stemming — exactly what we want for Hebrew. It is also
+# expected to keep alphanumeric tokens like "1461/20" (case numbers)
+# searchable (NOTE(review): confirm the exact tokenisation with ts_debug),
+# which is the prime motivator for adding a lexical layer on top of the
+# semantic cosine index.
+# Both columns are GENERATED STORED so they stay in sync with the
+# source rows for free, and GIN-indexed so the ``@@`` match that feeds
+# ts_rank_cd is index-backed.
+SCHEMA_V12_SQL = """
+ALTER TABLE precedent_chunks
+    ADD COLUMN IF NOT EXISTS content_tsv tsvector
+    GENERATED ALWAYS AS (to_tsvector('simple', content)) STORED;
+
+ALTER TABLE halachot
+    ADD COLUMN IF NOT EXISTS rule_tsv tsvector
+    GENERATED ALWAYS AS (
+        to_tsvector('simple',
+            coalesce(rule_statement,'') || ' ' ||
+            coalesce(supporting_quote,'') || ' ' ||
+            coalesce(reasoning_summary,'')
+        )
+    ) STORED;
+
+CREATE INDEX IF NOT EXISTS idx_precedent_chunks_tsv
+    ON precedent_chunks USING GIN(content_tsv);
+
+CREATE INDEX IF NOT EXISTS idx_halachot_tsv
+    ON halachot USING GIN(rule_tsv);
+"""
+

 async def _run_schema_migrations(pool: asyncpg.Pool) -> None:
    async with pool.acquire() as conn:
@@ -729,7 +759,8 @@ async def _run_schema_migrations(pool: asyncpg.Pool) -> None:
        await conn.execute(SCHEMA_V9_SQL)
        await conn.execute(SCHEMA_V10_SQL)
        await conn.execute(SCHEMA_V11_SQL)
-    logger.info("Database schema initialized (v1-v11)")
+        await conn.execute(SCHEMA_V12_SQL)
+    logger.info("Database schema initialized (v1-v12)")


 async def init_schema() -> None:
@@ -2476,7 +2507,162 @@ async def search_precedent_library_semantic(
            d = dict(r)
            if d.get("decision_date") is not None:
                d["decision_date"] = d["decision_date"].isoformat()
-            d["score"] = float(d["score"]) + 0.05  # rule-level boost
+            # Dynamic rule-level boost: scales with extractor confidence
+            # so high-conf halachot rank higher than low-conf ones.
+            # conf=0.78 → +0.047, conf=0.90 → +0.054, conf=0.95 → +0.057
+            # Calibrated so the average (≈0.85) stays at +0.05 (legacy value).
+            _conf = float(d.get("confidence") or 0.0)
+            d["score"] = float(d["score"]) + max(_conf * 0.06, 0.0)
+            d["type"] = "halacha"
+            results.append(d)
+
+    rows = await pool.fetch(chunk_sql, *c_params)
+    for r in rows:
+        d = dict(r)
+        if d.get("decision_date") is not None:
+            d["decision_date"] = d["decision_date"].isoformat()
+        d["score"] = float(d["score"])
+        d["type"] = "passage"
+        results.append(d)
+
+    results.sort(key=lambda x: x["score"], reverse=True)
+    return results[:limit]
+
+
+async def search_precedent_library_lexical(
+    *,
+    query: str,
+    practice_area: str = "",
+    court: str = "",
+    precedent_level: str = "",
+    appeal_subtype: str = "",
+    is_binding: bool | None = None,
+    subject_tag: str = "",
+    source_kind: str = "external_upload",
+    district: str = "",
+    chair_name: str = "",
+    limit: int = 30,
+    include_halachot: bool = True,
+) -> list[dict]:
+    """Lexical (BM25-like) search via ``ts_rank_cd`` over ``content_tsv``
+    and ``rule_tsv`` (V12 columns).
+
+    Mirrors the filter set of :func:`search_precedent_library_semantic`
+    so the two layers can be fused 1:1 by rank in
+    :mod:`hybrid_search` via RRF.
+
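+    Why ``plainto_tsquery``: it accepts free-text input, lowercases, and
+    AND-joins the terms — matches the bi-encoder's "all words contribute"
+    assumption. ``websearch_to_tsquery`` also AND-joins by default, but it
+    interprets ``or``, a leading ``-`` and double quotes in the user's text
+    as query operators, which we do not want for raw search strings.
+    Empty / stopword-only queries return zero rows (no error).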
+
+    Why ``ts_rank_cd``: cover density variant — rewards documents where
+    the query terms appear close together (e.g. "1461/20 אנטרים" matches
+    the same paragraph). Higher is more relevant.
+    """
+    if not (query or "").strip():
+        return []
+
+    pool = await get_pool()
+    halacha_filters = ["h.review_status IN ('approved', 'published')"]
+    chunk_filters = [f"cl.source_kind = '{source_kind}'"]
+    # $1 = query, $2 = limit. Filters append starting at $3.
+    h_params: list = [query, limit]
+    c_params: list = [query, limit]
+    h_idx = 3
+    c_idx = 3
+
+    if practice_area:
+        halacha_filters.append(f"${h_idx} = ANY(h.practice_areas)")
+        h_params.append(practice_area)
+        h_idx += 1
+        chunk_filters.append(f"cl.practice_area = ${c_idx}")
+        c_params.append(practice_area)
+        c_idx += 1
+    if court:
+        halacha_filters.append(f"cl.court ILIKE ${h_idx}")
+        h_params.append(f"%{court}%")
+        h_idx += 1
+        chunk_filters.append(f"cl.court ILIKE ${c_idx}")
+        c_params.append(f"%{court}%")
+        c_idx += 1
+    if precedent_level:
+        halacha_filters.append(f"cl.precedent_level = ${h_idx}")
+        h_params.append(precedent_level)
+        h_idx += 1
+        chunk_filters.append(f"cl.precedent_level = ${c_idx}")
+        c_params.append(precedent_level)
+        c_idx += 1
+    if appeal_subtype:
+        halacha_filters.append(f"cl.appeal_subtype = ${h_idx}")
+        h_params.append(appeal_subtype)
+        h_idx += 1
+        chunk_filters.append(f"cl.appeal_subtype = ${c_idx}")
+        c_params.append(appeal_subtype)
+        c_idx += 1
+    if is_binding is not None:
+        halacha_filters.append(f"cl.is_binding = ${h_idx}")
+        h_params.append(is_binding)
+        h_idx += 1
+        chunk_filters.append(f"cl.is_binding = ${c_idx}")
+        c_params.append(is_binding)
+        c_idx += 1
+    if subject_tag:
+        halacha_filters.append(f"${h_idx} = ANY(h.subject_tags)")
+        h_params.append(subject_tag)
+        h_idx += 1
+    if district:
+        halacha_filters.append(f"cl.district = ${h_idx}")
+        h_params.append(district)
+        h_idx += 1
+        chunk_filters.append(f"cl.district = ${c_idx}")
+        c_params.append(district)
+        c_idx += 1
+    if chair_name:
+        halacha_filters.append(f"cl.chair_name = ${h_idx}")
+        h_params.append(chair_name)
+        h_idx += 1
+        chunk_filters.append(f"cl.chair_name = ${c_idx}")
+        c_params.append(chair_name)
+        c_idx += 1
+
+    halacha_sql = f"""
+        SELECT h.id AS halacha_id, h.case_law_id, h.rule_statement,
+               h.reasoning_summary, h.supporting_quote, h.page_reference,
+               h.practice_areas, h.subject_tags, h.confidence, h.rule_type,
+               cl.case_number, cl.case_name, cl.court, cl.date AS decision_date,
+               cl.precedent_level, cl.chair_name, cl.district,
+               ts_rank_cd(h.rule_tsv, plainto_tsquery('simple', $1)) AS score
+        FROM halachot h
+        JOIN case_law cl ON cl.id = h.case_law_id
+        WHERE {' AND '.join(halacha_filters)}
+          AND h.rule_tsv @@ plainto_tsquery('simple', $1)
+        ORDER BY score DESC
+        LIMIT $2
+    """
+
+    chunk_sql = f"""
+        SELECT pc.id AS chunk_id, pc.case_law_id, pc.content,
+               pc.section_type, pc.page_number,
+               cl.case_number, cl.case_name, cl.court, cl.date AS decision_date,
+               cl.precedent_level, cl.practice_area, cl.chair_name, cl.district,
+               ts_rank_cd(pc.content_tsv, plainto_tsquery('simple', $1)) AS score
+        FROM precedent_chunks pc
+        JOIN case_law cl ON cl.id = pc.case_law_id
+        WHERE {' AND '.join(chunk_filters)}
+          AND pc.content_tsv @@ plainto_tsquery('simple', $1)
+        ORDER BY score DESC
+        LIMIT $2
+    """
+
+    results: list[dict] = []
+    if include_halachot:
+        rows = await pool.fetch(halacha_sql, *h_params)
+        for r in rows:
+            d = dict(r)
+            if d.get("decision_date") is not None:
+                d["decision_date"] = d["decision_date"].isoformat()
+            d["score"] = float(d["score"])
            d["type"] = "halacha"
            results.append(d)