#!/usr/bin/env python3
"""FU-5 (GAP-11) — bootstrap a retrieval gold-set into data/eval/gold-set.jsonl.

The gold-set is the labeled (query → relevant case_law_ids) set the eval harness
(scripts/eval_retrieval.py) measures precision/recall against. This script SEEDS it
automatically; the chair then reviews/augments (rows with source='chair' are never
clobbered).

Two seed sources:
  --source citations  : the chosen hybrid signal — "cited == relevant". Reads
                        search_relevance_feedback (populated by
                        telemetry.infer_relevance_from_citations once decisions cite
                        precedents) ⨝ search_logs, groups by query. Yields nothing
                        until decisions accumulate citations + searches are logged
                        with case context.
  --source known_item : known-item retrieval (Manning et al. 2008, ch. 8) — query =
                        a precedent's case_name, relevant = that precedent (and any
                        same-named sibling in the same corpus). A real, citation-free
                        precision/recall signal available TODAY; this is what #52
                        (test_retrieval_by_name) checked by hand. Use this to get a
                        baseline before the citation signal exists.
  --source both (default): emit both. Sources are tagged (bootstrap_known_item /
                        bootstrap_citation) so the chair can tell them apart.

Idempotent: regenerates the bootstrap_* rows each run; preserves source='chair' rows.
Merge key = (corpus, normalized query). Bootstrap rows that share a merge key (e.g. a
logged query that equals a case name) are collapsed into one row.

Usage (mcp-server venv; needs POSTGRES):
  PY=/home/chaim/legal-ai/mcp-server/.venv/bin/python
  POSTGRES_PASSWORD=… POSTGRES_HOST=127.0.0.1 POSTGRES_PORT=5433 \
    $PY scripts/eval_gold_bootstrap.py --source both
"""
from __future__ import annotations

import argparse
import asyncio
import hashlib
import json
import os
import sys
from collections.abc import Mapping
from pathlib import Path
from urllib.parse import quote

REPO_ROOT = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(REPO_ROOT / "mcp-server" / "src"))


def _default_postgres_url(env: Mapping[str, str] | None = None) -> str:
    """Build a postgres:// URL from the POSTGRES_* variables in *env*.

    *env* defaults to ``os.environ``. User and password are percent-encoded so
    that credentials containing URL-reserved characters (``@``, ``/``, ``:``,
    ``#`` …) do not corrupt the URL; credentials made only of unreserved
    characters produce exactly the same URL as plain interpolation.
    """
    env = os.environ if env is None else env
    user = quote(env.get("POSTGRES_USER", "legal_ai"), safe="")
    password = quote(env.get("POSTGRES_PASSWORD", ""), safe="")
    return (
        f"postgres://{user}:{password}@"
        f"{env.get('POSTGRES_HOST', '127.0.0.1')}:"
        f"{env.get('POSTGRES_PORT', '5433')}/"
        f"{env.get('POSTGRES_DB', 'legal_ai')}"
    )


# An explicit POSTGRES_URL always wins; otherwise derive one from POSTGRES_*.
if "POSTGRES_URL" not in os.environ:
    os.environ["POSTGRES_URL"] = _default_postgres_url()

GOLD_PATH = REPO_ROOT / "data" / "eval" / "gold-set.jsonl"

# search_type (telemetry) → eval corpus name
_TYPE_TO_CORPUS = {
    "precedent_library": "precedent_library",
    "internal_decisions": "internal_decisions",
}
# case_law.source_kind → eval corpus (which retrieval tool searches it)
_KIND_TO_CORPUS = {
    "external_upload": "precedent_library",
    "internal_committee": "internal_decisions",
}


def _norm_query(q: str) -> str:
    """Collapse every whitespace run in *q* to one space and trim the ends.

    ``None`` / empty input normalizes to ``""``.
    """
    # split() with no argument already drops leading/trailing whitespace.
    return " ".join((q or "").split())


def _entry_id(corpus: str, query: str) -> str:
    """Return a stable id ``g-<10 hex chars>`` derived from (corpus, normalized query)."""
    digest = hashlib.sha1(f"{corpus}|{_norm_query(query)}".encode("utf-8")).hexdigest()
    return f"g-{digest[:10]}"


def _merge_key(entry: dict) -> tuple[str, str]:
    """Return the (corpus, normalized query) key used to match gold-set rows."""
    return entry["corpus"], _norm_query(entry["query"])


async def _known_item_rows(conn, sample: int | None) -> list[dict]:
    """query = case_name, relevant = all same-named precedents in the same corpus.

    *conn* must provide an awaitable ``fetch(sql)`` returning rows indexable by
    column name. Rows are returned sorted by (corpus, query); a positive
    *sample* keeps only the first *sample* rows, ``None`` or a non-positive
    value keeps them all.
    """
    rows = await conn.fetch(
        "SELECT id, coalesce(case_name,'') AS case_name, coalesce(practice_area,'') AS pa, "
        "source_kind FROM case_law "
        "WHERE source_kind IN ('external_upload','internal_committee') "
        "AND coalesce(searchable, true) AND length(trim(coalesce(case_name,''))) >= 2 "
        "ORDER BY source_kind, case_name")
    # group by (corpus, normalized case_name) → relevant ids
    groups: dict[tuple[str, str], dict] = {}
    for r in rows:
        corpus = _KIND_TO_CORPUS[r["source_kind"]]
        key = (corpus, _norm_query(r["case_name"]))
        g = groups.setdefault(key, {"pa": r["pa"], "ids": []})
        g["ids"].append(str(r["id"]))
        # Same rule as the citation source: if the first row of a group had no
        # practice area, take it from a later same-named row that does.
        if not g["pa"]:
            g["pa"] = r["pa"]
    out = [
        {
            "id": _entry_id(corpus, name),
            "query": name,
            "practice_area": g["pa"],
            "corpus": corpus,
            "relevant_case_law_ids": g["ids"],
            "source": "bootstrap_known_item",
            "note": f"known-item: search by case_name → expect the case itself ({len(g['ids'])} same-named)",
        }
        for (corpus, name), g in groups.items()
    ]
    out.sort(key=lambda e: (e["corpus"], e["query"]))
    if sample is not None and sample > 0:
        out = out[:sample]
    return out


async def _citation_rows(conn) -> list[dict]:
    """query → relevant case_law_ids, from the cited==relevant signal in
    search_relevance_feedback ⨝ search_logs (score >= 2).

    *conn* must provide an awaitable ``fetch(sql)`` returning rows indexable by
    column name. Rows are grouped by (corpus, normalized query) and returned
    sorted by that key; each row's ids are de-duplicated and sorted.
    """
    rows = await conn.fetch(
        "SELECT sl.query, sl.search_type, coalesce(sl.practice_area,'') AS pa, "
        " rf.case_law_id "
        "FROM search_relevance_feedback rf "
        "JOIN search_logs sl ON sl.id = rf.search_log_id "
        "WHERE rf.relevance_score >= 2 AND sl.search_type IN ('precedent_library','internal_decisions')")
    groups: dict[tuple[str, str], dict] = {}
    for r in rows:
        corpus = _TYPE_TO_CORPUS[r["search_type"]]
        key = (corpus, _norm_query(r["query"]))
        g = groups.setdefault(key, {"pa": r["pa"], "ids": set()})
        g["ids"].add(str(r["case_law_id"]))
        # Keep the first non-empty practice area seen for this query.
        if not g["pa"]:
            g["pa"] = r["pa"]
    out = [
        {
            "id": _entry_id(corpus, query),
            "query": query,
            "practice_area": g["pa"],
            "corpus": corpus,
            "relevant_case_law_ids": sorted(g["ids"]),
            "source": "bootstrap_citation",
            "note": "cited == relevant (auto-inferred from finalized decisions)",
        }
        for (corpus, query), g in groups.items()
    ]
    out.sort(key=lambda e: (e["corpus"], e["query"]))
    return out


def _load_existing(path: Path | None = None) -> list[dict]:
    """Read the gold-set JSONL at *path* (default: ``GOLD_PATH``).

    Returns ``[]`` when the file does not exist; blank lines are skipped.

    Raises:
        ValueError: a non-blank line is not valid JSON. The message names the
            file and line so a hand-edited row can be located and fixed.
    """
    path = GOLD_PATH if path is None else path
    if not path.exists():
        return []
    entries: list[dict] = []
    for lineno, raw in enumerate(path.read_text(encoding="utf-8").splitlines(), start=1):
        line = raw.strip()
        if not line:
            continue
        try:
            entries.append(json.loads(line))
        except json.JSONDecodeError as err:
            raise ValueError(f"{path}:{lineno}: invalid JSON in gold-set ({err})") from err
    return entries


def _merge(existing: list[dict], fresh: list[dict]) -> tuple[list[dict], dict]:
    """Keep all source='chair' rows; replace bootstrap_* rows with fresh ones.

    Merge key = (corpus, normalized query). Chair rows win on key conflict.
    Existing rows whose source is not 'chair' are discarded (they are
    regenerated from *fresh*).

    Fresh rows that share a merge key — which also means they share an ``id`` —
    are collapsed into a single row: the first row's fields are kept, an empty
    practice_area is backfilled from the later row, and relevant_case_law_ids
    becomes the order-preserving union. Input rows are not mutated.

    Returns:
        ``(merged, stats)`` where *merged* is sorted by corpus, chair rows
        first, then query, and *stats* holds ``chair_rows_preserved``,
        ``bootstrap_rows``, ``bootstrap_duplicates_merged`` and ``total``.
    """
    chair = [e for e in existing if e.get("source") == "chair"]
    chair_keys = {_merge_key(e) for e in chair}

    by_key: dict[tuple[str, str], dict] = {}
    duplicates = 0
    for row in fresh:
        key = _merge_key(row)
        if key in chair_keys:
            continue
        prior = by_key.get(key)
        if prior is None:
            by_key[key] = row
            continue
        duplicates += 1
        # dict.fromkeys gives an order-preserving, de-duplicated union.
        ids = dict.fromkeys(prior.get("relevant_case_law_ids", []))
        ids.update(dict.fromkeys(row.get("relevant_case_law_ids", [])))
        by_key[key] = {
            **prior,
            "practice_area": prior.get("practice_area") or row.get("practice_area") or "",
            "relevant_case_law_ids": list(ids),
        }

    kept_fresh = list(by_key.values())
    merged = chair + kept_fresh
    merged.sort(key=lambda e: (e["corpus"], e["source"] != "chair", e["query"]))
    stats = {
        "chair_rows_preserved": len(chair),
        "bootstrap_rows": len(kept_fresh),
        "bootstrap_duplicates_merged": duplicates,
        "total": len(merged),
    }
    return merged, stats


def _write_gold(rows: list[dict], path: Path | None = None) -> None:
    """Write *rows* as JSONL to *path* (default: ``GOLD_PATH``), replacing it in one step.

    The rows go to a sibling temp file that is then swapped in with
    ``os.replace``, so a failure while serializing or writing leaves the
    previous gold-set (including chair rows) untouched.
    """
    path = GOLD_PATH if path is None else path
    path.parent.mkdir(parents=True, exist_ok=True)
    tmp = path.with_name(path.name + ".tmp")
    try:
        with tmp.open("w", encoding="utf-8") as f:
            f.writelines(json.dumps(row, ensure_ascii=False) + "\n" for row in rows)
        os.replace(tmp, path)
    finally:
        # Only present if something failed before the swap.
        if tmp.exists():
            tmp.unlink()


async def main() -> int:
    """CLI entry point: collect bootstrap rows, merge with the existing gold-set, write it.

    Returns the process exit code (always 0 on normal completion).
    """
    ap = argparse.ArgumentParser(description="FU-5 gold-set bootstrap")
    ap.add_argument("--source", choices=["citations", "known_item", "both"], default="both")
    ap.add_argument("--sample", type=int, default=None,
                    help="cap known-item queries (default: all named)")
    args = ap.parse_args()

    # Imported here so POSTGRES_URL and sys.path are set up before the project
    # package is loaded.
    from legal_mcp.services import db

    pool = await db.get_pool()

    fresh: list[dict] = []
    async with pool.acquire() as conn:
        if args.source in ("citations", "both"):
            cit = await _citation_rows(conn)
            fresh += cit
            print(f"citation source: {len(cit)} queries")
        if args.source in ("known_item", "both"):
            ki = await _known_item_rows(conn, args.sample)
            fresh += ki
            print(f"known-item source: {len(ki)} queries")

    # Load (and validate) the current file before anything is written.
    existing = _load_existing()
    merged, stats = _merge(existing, fresh)
    _write_gold(merged)

    print(f"wrote {GOLD_PATH}")
    print(f" chair rows preserved: {stats['chair_rows_preserved']}")
    print(f" bootstrap rows: {stats['bootstrap_rows']}")
    if stats["bootstrap_duplicates_merged"]:
        print(f" duplicate bootstrap rows merged: {stats['bootstrap_duplicates_merged']}")
    print(f" total gold queries: {stats['total']}")
    if stats["total"] == 0:
        print(" NOTE: gold-set empty — no citation signal yet and no named precedents found.")
    return 0


if __name__ == "__main__":
    sys.exit(asyncio.run(main()))