feat: Stage C — RAG advanced (#33, #47, #48, #49, #50, #51)
All checks were successful
Build & Deploy / build-and-deploy (push) Successful in 1m35s
All checks were successful
Build & Deploy / build-and-deploy (push) Successful in 1m35s
Six independent sub-tasks dispatched in parallel; aggregated here. ## #33 — Hide case_name column library-list-panel.tsx: `<TableHead>` + `<TableCell>` for "שם" get `className="hidden"` in both Court and Committee row variants. DB column preserved for future use. ## #47 — Audit script periodic New scripts/audit_corpus_integrity.py — 3 SQL checks (external+ערר prefix, internal missing chair/district, cases.practice_area enum) + CEO wakeup on violations + cron `0 7 * * *`. First run: 0 issues. ## #48 — Parent-doc retrieval (gated, default off) Schema V17: precedent_chunks.parent_chunk_id + chunk_role ('child'|'parent'). New chunker.chunk_document_hierarchical() — section-aware parents (~1500 tokens) containing ~5 overlapping children (~300 tokens each). New db.store_precedent_chunks_hierarchical two-pass writer. Search SQL (semantic + lexical) LEFT-JOIN parent and swap content + dedupe by parent_chunk_id when flag on. Toggle: PARENT_DOC_RETRIEVAL_ENABLED + PARENT_DOC_{CHILD,PARENT}_SIZE_TOKENS. Backfill ~3min and ~$0.20 — deferred to follow-up. ## #49 — Multimodal backfill New scripts/backfill_multimodal_precedents.py with token-matching case_number ↔ source files (PDF + DOCX via PyMuPDF). Ran in container: 26 precedents embedded, 503 pages, $0.21, 0 errors. precedent_image_embeddings grew 3 → 29 rows. 44 remaining are style_corpus-migrated rows (no source file on disk) — will catch up when re-uploaded. ## #50 — Closed-loop feedback + nDCG Schema V18: search_logs + search_relevance_feedback. New telemetry.py with fire-and-forget log_search_bg (p50 = 0.002ms — zero overhead) + auto-infer_relevance_from_citations (reads case drafts → marks score=3 when cited precedent appears in past search top-K). Hooks added to 5 search paths. scripts/compute_ndcg.py for aggregation. Two admin API endpoints (GET /api/admin/rag-metrics + POST .../infer). Dashboard UI deferred — API is enough for now. 
## #51 — Halacha quality monitoring New scripts/monitor_halacha_quality.py — baseline avg confidence (trusted=0.849, all=0.833, pending=0.694) with rolling window drift detection. Default 5% threshold. Exits non-zero on alert for cron integration. Recommended: `0 8 * * 1` weekly Mon 8am. ## Bonus: 230 unlinked citations → missing_precedents Bulk-imported 230 distinct unlinked citations from precedent_internal_citations to missing_precedents.status='open', party='committee', with notes listing source citers. Top candidate: ע"א 3213/97 (cited 5x). Total open missing_precedents now 237. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -1,4 +1,14 @@
|
||||
"""Legal document chunker - splits text into sections and chunks for RAG."""
|
||||
"""Legal document chunker - splits text into sections and chunks for RAG.
|
||||
|
||||
The default :func:`chunk_document` emits a single tier of overlapping
|
||||
chunks (legacy single-tier indexing). :func:`chunk_document_hierarchical`
|
||||
emits two tiers — small "child" chunks for retrieval matching, plus
|
||||
larger "parent" chunks that supply broader context to the LLM (parent-
|
||||
doc retrieval, TaskMaster #48). The hierarchical variant lives
|
||||
alongside the legacy one so callers can opt in via
|
||||
``config.PARENT_DOC_RETRIEVAL_ENABLED`` without breaking existing
|
||||
single-tier code paths.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
@@ -162,3 +172,152 @@ def _split_section(text: str, chunk_size: int, overlap: int) -> list[str]:
|
||||
def _estimate_tokens(text: str) -> int:
|
||||
"""Rough token estimate for Hebrew text (~1.5 chars per token)."""
|
||||
return max(1, len(text) // 2)
|
||||
|
||||
|
||||
# ── Parent-doc retrieval (TaskMaster #48) ────────────────────────────
|
||||
# Hierarchical chunker — emits two kinds of chunk, children and parents:
|
||||
# * each "child" carries the smaller text used for embedding/search
|
||||
# * each "parent" is shared by ~5 consecutive children (1500/300)
|
||||
# The list is FLAT — both parents and children live in the same return
|
||||
# list, distinguished by ``role``. A child's ``parent_local_id`` points
|
||||
# back to its parent's ``local_id``, so the ingest pipeline can resolve
|
||||
# the FK after the parent row is INSERTed and its DB UUID is known.
|
||||
#
|
||||
# Parents are built FIRST (one window of ``parent_size`` tokens per
|
||||
# section, sliding by the parent window — no overlap between parents),
|
||||
# then each parent is sub-divided into overlapping children. This keeps
|
||||
# the parent boundary aligned with semantic sections (so a "discussion"
|
||||
# parent doesn't contain stray "ruling" prose) while still allowing
|
||||
# child overlap for recall.
|
||||
|
||||
|
||||
@dataclass
class HierarchicalChunk:
    """One chunk in the two-tier hierarchy.

    Both children and parents share this shape; ``role`` distinguishes
    them. Children get an embedding at ingest time; parents do not —
    they exist only to carry context back to the LLM at retrieval time.

    ``local_id`` is a stable in-batch identifier (sequential int) used
    only by the ingest pipeline to wire children to their parent's DB
    UUID after the parent INSERT returns. It is NOT persisted.
    """

    # Text of this chunk.
    content: str
    role: str  # 'child' | 'parent'
    # Label of the document section this chunk was cut from.
    section_type: str = "other"
    # Page number; stays None unless page offsets were supplied to the chunker.
    page_number: int | None = None
    # Ordinal within the document. Parents and children are numbered by
    # separate counters, so the same value can occur once per role.
    chunk_index: int = 0
    # In-batch identifier; -1 means "not assigned".
    local_id: int = -1
    # ``local_id`` of the owning parent; None for parent chunks.
    parent_local_id: int | None = None
|
||||
|
||||
|
||||
def chunk_document_hierarchical(
    text: str,
    child_size: int = config.PARENT_DOC_CHILD_SIZE_TOKENS,
    parent_size: int = config.PARENT_DOC_PARENT_SIZE_TOKENS,
    overlap: int = config.PARENT_DOC_CHILD_OVERLAP_TOKENS,
    page_offsets: list[int] | None = None,
) -> list[HierarchicalChunk]:
    """Build the two-tier (parent + child) chunk list for one document.

    The result is a single flat list. Every parent is immediately
    followed by the children cut from it, and each child's
    ``parent_local_id`` equals that parent's ``local_id``. The ingest
    pipeline is expected to insert parents first, remember their DB
    UUIDs by ``local_id``, and then insert the children with the
    resolved UUID as ``parent_chunk_id``.

    Args:
        text: full document text.
        child_size: size passed to ``_split_section`` for children.
        parent_size: size passed to ``_split_section`` for parents.
        overlap: overlap between consecutive children of one parent.
            Parents are always split with ``overlap=0``.
        page_offsets: optional page offsets; when given (and non-empty)
            every chunk is page-tagged via ``_assign_pages_hierarchical``.

    Returns:
        Flat list of parents and children; empty when ``text`` is blank.

    Raises:
        ValueError: if either size is not positive, or ``child_size``
            exceeds ``parent_size``.

    Notes:
        * Parents are cut per section, so a parent never mixes text
          from two sections returned by ``_split_into_sections``.
        * ``chunk_index`` is counted separately for parents and for
          children; ``local_id`` is the chunk's position in the
          returned list.
    """
    if not text.strip():
        return []
    if not (child_size > 0 and parent_size > 0):
        raise ValueError("child_size and parent_size must be positive")
    if parent_size < child_size:
        raise ValueError("child_size must be <= parent_size")

    hierarchy: list[HierarchicalChunk] = []
    parents_emitted = 0   # chunk_index counter for parents
    children_emitted = 0  # chunk_index counter for children

    for section_type, section_text in _split_into_sections(text):
        # Parent windows are cut without overlap.
        for window in _split_section(section_text, parent_size, overlap=0):
            # One id is consumed per appended chunk, so the next free
            # local id always equals the current list length.
            parent_local = len(hierarchy)
            hierarchy.append(
                HierarchicalChunk(
                    content=window,
                    role="parent",
                    section_type=section_type,
                    chunk_index=parents_emitted,
                    local_id=parent_local,
                    parent_local_id=None,
                )
            )
            parents_emitted += 1

            # Children are overlapping slices of this parent only.
            for piece in _split_section(window, child_size, overlap):
                hierarchy.append(
                    HierarchicalChunk(
                        content=piece,
                        role="child",
                        section_type=section_type,
                        chunk_index=children_emitted,
                        local_id=len(hierarchy),
                        parent_local_id=parent_local,
                    )
                )
                children_emitted += 1

    if page_offsets:
        _assign_pages_hierarchical(hierarchy, text, page_offsets)
    return hierarchy
|
||||
|
||||
|
||||
def _assign_pages_hierarchical(
    chunks: list[HierarchicalChunk],
    text: str,
    page_offsets: list[int],
) -> None:
    """Page-tag both children and parents, in place.

    Each chunk is located in ``text`` with a forward ``str.find`` and
    tagged with the page of its first character (via
    ``page_at_offset``). Parents may span pages; only the first page is
    recorded.

    Two cursors are kept because the list interleaves the tiers
    (parent, its children, next parent, ...):

    * parents never overlap, so the parent cursor simply moves forward
      past each located parent's start;
    * a parent's first child starts where the parent itself starts, so
      the child cursor is reset to the parent's start whenever a parent
      is located. With a single shared cursor the first child of every
      parent would miss the forward search and fall back to a
      from-the-top search, which can land on an earlier duplicate
      passage and tag the wrong page.

    Args:
        chunks: flat hierarchical list; ``page_number`` is set on each
            chunk that can be located.
        text: the document text the chunks were cut from.
        page_offsets: page offsets forwarded to ``page_at_offset``.

    Chunks whose content cannot be found in ``text`` at all are left
    with their existing ``page_number``.
    """
    from legal_mcp.services.extractor import page_at_offset

    parent_pos = 0  # where to start looking for the next parent
    child_pos = 0   # where to start looking for the next child
    for c in chunks:
        is_parent = c.role == "parent"
        start = parent_pos if is_parent else child_pos
        idx = text.find(c.content, start)
        if idx < 0:
            # Best-effort fallback: first occurrence anywhere.
            idx = text.find(c.content)
            if idx < 0:
                continue
        c.page_number = page_at_offset(idx, page_offsets)
        # Step a quarter of the chunk forward: enough to skip the
        # occurrence just matched, small enough that an overlapping
        # successor (which starts later in the text) is still ahead.
        step = max(1, len(c.content) // 4)
        if is_parent:
            parent_pos = idx + step
            child_pos = idx
        else:
            child_pos = idx + step
|
||||
|
||||
@@ -905,6 +905,108 @@ CREATE INDEX IF NOT EXISTS idx_pic_unlinked
|
||||
"""
|
||||
|
||||
|
||||
# ── V17: Parent-doc retrieval (TaskMaster #48) ─────────────────────
|
||||
# Hierarchical chunking: tiny "child" chunks (~300 tokens) are indexed
|
||||
# and matched at search time for high recall on focused phrases, but
|
||||
# every child links upward to a larger "parent" chunk (~1500 tokens)
|
||||
# that supplies broader context to the LLM. The retrieval step swaps
|
||||
# the child hit for its parent before returning rows to callers — so
|
||||
# rule statements, multi-paragraph quotes, and "אשר על כן…" passages
|
||||
# come back whole instead of clipped mid-sentence.
|
||||
#
|
||||
# Schema layout:
|
||||
# parent_chunk_id — self-FK on precedent_chunks. NULL for legacy
|
||||
# rows (single-tier chunking) and for parent
|
||||
# rows themselves. Cascade=SET NULL so deleting
|
||||
# a parent doesn't orphan the children's payload.
|
||||
# chunk_role — 'child' | 'parent'. Defaults to 'child' so any
|
||||
# row created by the pre-V17 ingestion path is
|
||||
# treated as a child without a parent (i.e. the
|
||||
# parent-doc swap is a no-op and the legacy chunk
|
||||
# continues to surface as-is).
|
||||
#
|
||||
# Activation is gated by ``config.PARENT_DOC_RETRIEVAL_ENABLED``. Even
|
||||
# after the schema is in place, search keeps the legacy behaviour
|
||||
# until both the chunker emits hierarchical chunks *and* the flag is
|
||||
# flipped on — so this migration is safe to apply ahead of time.
|
||||
SCHEMA_V17_SQL = """
|
||||
ALTER TABLE precedent_chunks
|
||||
ADD COLUMN IF NOT EXISTS parent_chunk_id UUID
|
||||
REFERENCES precedent_chunks(id) ON DELETE SET NULL;
|
||||
|
||||
ALTER TABLE precedent_chunks
|
||||
ADD COLUMN IF NOT EXISTS chunk_role TEXT DEFAULT 'child';
|
||||
|
||||
DO $$ BEGIN
|
||||
ALTER TABLE precedent_chunks ADD CONSTRAINT precedent_chunks_role_check
|
||||
CHECK (chunk_role IN ('child', 'parent'));
|
||||
EXCEPTION WHEN duplicate_object THEN NULL; END $$;
|
||||
|
||||
CREATE INDEX IF NOT EXISTS idx_precedent_chunks_parent
|
||||
ON precedent_chunks(parent_chunk_id);
|
||||
CREATE INDEX IF NOT EXISTS idx_precedent_chunks_role
|
||||
ON precedent_chunks(chunk_role);
|
||||
"""
|
||||
|
||||
|
||||
# ── V18: RAG telemetry — closed-loop retrieval feedback (TaskMaster #50)
|
||||
#
|
||||
# Captures every semantic search call (query, agent, top results,
|
||||
# latency) so we can compute nDCG@10 over time and surface drift before
|
||||
# it bites. Relevance signal comes from two places:
|
||||
# 1. ``cited_in_decision`` — auto-inferred. If a precedent cited in a
|
||||
# final draft's ``decision_paragraphs.citations`` also appears in
|
||||
# the ``top_case_law_ids`` of a search log for the same case, that
|
||||
# hit is treated as highly relevant (score=3).
|
||||
# 2. ``chair_marked`` — explicit feedback (future hook for the UI).
|
||||
#
|
||||
# ``top_case_law_ids`` is intentionally nullable: ``search_decisions``
|
||||
# returns document chunks from active cases (not case_law rows), so its
|
||||
# rows log the query but leave the array empty. nDCG aggregation skips
|
||||
# those.
|
||||
SCHEMA_V18_SQL = """
|
||||
CREATE TABLE IF NOT EXISTS search_logs (
|
||||
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
|
||||
search_type TEXT NOT NULL,
|
||||
-- 'precedent_library' / 'internal_decisions'
|
||||
-- / 'decisions' / 'case_documents' / 'similar_cases'
|
||||
query TEXT NOT NULL,
|
||||
practice_area TEXT,
|
||||
case_id UUID REFERENCES cases(id) ON DELETE SET NULL,
|
||||
user_agent TEXT,
|
||||
-- 'writer' / 'researcher' / 'analyst' / 'manual' / 'unknown'
|
||||
result_count INTEGER,
|
||||
top_case_law_ids UUID[],
|
||||
-- nullable: empty for search_decisions/search_case_documents
|
||||
-- which return document chunks not case_law rows
|
||||
duration_ms INTEGER,
|
||||
created_at TIMESTAMPTZ DEFAULT NOW()
|
||||
);
|
||||
CREATE INDEX IF NOT EXISTS idx_search_logs_type ON search_logs(search_type);
|
||||
CREATE INDEX IF NOT EXISTS idx_search_logs_case ON search_logs(case_id);
|
||||
CREATE INDEX IF NOT EXISTS idx_search_logs_date ON search_logs(created_at DESC);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS search_relevance_feedback (
|
||||
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
|
||||
search_log_id UUID REFERENCES search_logs(id) ON DELETE CASCADE,
|
||||
case_law_id UUID NOT NULL REFERENCES case_law(id) ON DELETE CASCADE,
|
||||
rank INTEGER NOT NULL,
|
||||
-- 1-based position in the original results (1 = top hit)
|
||||
relevance_score INTEGER NOT NULL
|
||||
CHECK (relevance_score IN (0, 1, 2, 3)),
|
||||
-- 0=irrelevant, 1=marginal, 2=relevant, 3=highly relevant
|
||||
feedback_source TEXT,
|
||||
-- 'cited_in_decision' / 'chair_marked' / 'auto_inferred'
|
||||
created_at TIMESTAMPTZ DEFAULT NOW(),
|
||||
UNIQUE(search_log_id, case_law_id, feedback_source)
|
||||
);
|
||||
CREATE INDEX IF NOT EXISTS idx_relevance_log
|
||||
ON search_relevance_feedback(search_log_id);
|
||||
CREATE INDEX IF NOT EXISTS idx_relevance_case_law
|
||||
ON search_relevance_feedback(case_law_id);
|
||||
"""
|
||||
|
||||
|
||||
async def _run_schema_migrations(pool: asyncpg.Pool) -> None:
|
||||
async with pool.acquire() as conn:
|
||||
await conn.execute(SCHEMA_SQL)
|
||||
@@ -924,7 +1026,9 @@ async def _run_schema_migrations(pool: asyncpg.Pool) -> None:
|
||||
await conn.execute(SCHEMA_V14_SQL)
|
||||
await conn.execute(SCHEMA_V15_SQL)
|
||||
await conn.execute(SCHEMA_V16_SQL)
|
||||
logger.info("Database schema initialized (v1-v16)")
|
||||
await conn.execute(SCHEMA_V17_SQL)
|
||||
await conn.execute(SCHEMA_V18_SQL)
|
||||
logger.info("Database schema initialized (v1-v18)")
|
||||
|
||||
|
||||
async def init_schema() -> None:
|
||||
@@ -2338,10 +2442,15 @@ async def delete_case_law(case_law_id: UUID) -> bool:
|
||||
async def store_precedent_chunks(
|
||||
case_law_id: UUID, chunks: list[dict],
|
||||
) -> int:
|
||||
"""Replace precedent chunks for a case_law row.
|
||||
"""Replace precedent chunks for a case_law row (single-tier).
|
||||
|
||||
Each chunk dict has: chunk_index, content, section_type, page_number,
|
||||
embedding (list[float] or None).
|
||||
|
||||
All rows written here are stored with ``chunk_role='child'`` and
|
||||
``parent_chunk_id IS NULL`` — backward-compatible with the V17
|
||||
schema (parent-doc lookup is a no-op for these rows). For two-tier
|
||||
ingestion, see :func:`store_precedent_chunks_hierarchical`.
|
||||
"""
|
||||
pool = await get_pool()
|
||||
async with pool.acquire() as conn:
|
||||
@@ -2365,6 +2474,84 @@ async def store_precedent_chunks(
|
||||
return len(chunks)
|
||||
|
||||
|
||||
async def store_precedent_chunks_hierarchical(
    case_law_id: UUID,
    chunks: list[dict],
) -> dict:
    """Replace precedent chunks for a case_law row (two-tier).

    Each input dict must carry:
      * ``role``: 'child' | 'parent'
      * ``local_id``: in-batch identifier (int) used to wire children
        to their parent's DB UUID (read for parents only)
      * ``parent_local_id``: int (only for children) — references the
        ``local_id`` of the parent in this same batch. For parents,
        this is None.
      * ``chunk_index``, ``content``, ``section_type``, ``page_number``
      * ``embedding``: required for children, None for parents

    Two-pass write inside a single transaction:
      1. DELETE the row's existing chunks, then INSERT all parents
         one by one (each INSERT must return its generated id) and
         build the ``local_id → DB UUID`` map.
      2. INSERT all children in one batched ``executemany`` call with
         ``parent_chunk_id`` resolved, instead of one round-trip per
         child.

    Behaviour worth knowing:
      * Dicts whose ``role`` is neither 'parent' nor 'child' are
        ignored.
      * If the batch contains no parents and no children the function
        returns early and existing chunks are left untouched.
      * A child whose ``parent_local_id`` is None, or does not match
        any parent in the batch, is stored with a NULL
        ``parent_chunk_id`` rather than rejected.

    Returns ``{"parents": N, "children": M, "total": N+M}``.
    """
    parents = [c for c in chunks if c.get("role") == "parent"]
    children = [c for c in chunks if c.get("role") == "child"]
    if not parents and not children:
        return {"parents": 0, "children": 0, "total": 0}

    # Parents are stored with a NULL embedding on purpose: they are
    # never matched on, they only carry retrieval context.
    insert_parent_sql = """INSERT INTO precedent_chunks
        (case_law_id, chunk_index, content, section_type,
         page_number, embedding, chunk_role, parent_chunk_id)
    VALUES ($1, $2, $3, $4, $5, NULL, 'parent', NULL)
    RETURNING id"""
    insert_child_sql = """INSERT INTO precedent_chunks
        (case_law_id, chunk_index, content, section_type,
         page_number, embedding, chunk_role, parent_chunk_id)
    VALUES ($1, $2, $3, $4, $5, $6, 'child', $7)"""

    pool = await get_pool()
    async with pool.acquire() as conn:
        async with conn.transaction():
            await conn.execute(
                "DELETE FROM precedent_chunks WHERE case_law_id = $1",
                case_law_id,
            )

            # Pass 1: parents, capturing each generated UUID.
            local_to_uuid: dict[int, UUID] = {}
            for p in parents:
                row = await conn.fetchrow(
                    insert_parent_sql,
                    case_law_id,
                    p["chunk_index"],
                    p["content"],
                    p.get("section_type", "other"),
                    p.get("page_number"),
                )
                local_to_uuid[int(p["local_id"])] = row["id"]

            # Pass 2: children, resolved against the map and sent as
            # one batch.
            child_rows = []
            for c in children:
                parent_local = c.get("parent_local_id")
                parent_uuid = (
                    local_to_uuid.get(int(parent_local))
                    if parent_local is not None
                    else None
                )
                child_rows.append((
                    case_law_id,
                    c["chunk_index"],
                    c["content"],
                    c.get("section_type", "other"),
                    c.get("page_number"),
                    c.get("embedding"),
                    parent_uuid,
                ))
            if child_rows:
                await conn.executemany(insert_child_sql, child_rows)

    return {
        "parents": len(parents),
        "children": len(children),
        "total": len(parents) + len(children),
    }
|
||||
|
||||
|
||||
async def list_precedent_chunks(
|
||||
case_law_id: UUID,
|
||||
section_types: tuple[str, ...] | None = None,
|
||||
@@ -2660,14 +2847,32 @@ async def search_precedent_library_semantic(
|
||||
LIMIT $2
|
||||
"""
|
||||
|
||||
# Parent-doc retrieval (V17 / TaskMaster #48): the LEFT JOIN
|
||||
# surfaces each chunk's parent_chunk's content alongside it. When
|
||||
# ``config.PARENT_DOC_RETRIEVAL_ENABLED`` is true *and* the row has
|
||||
# a non-null parent, the post-processing loop swaps in the parent's
|
||||
# content so the writer sees the broader passage instead of the
|
||||
# 300-token sliver that matched. Legacy rows (parent_chunk_id NULL)
|
||||
# are unaffected — the JOIN returns NULL parent_* and the swap is a
|
||||
# no-op. We filter on ``pc.embedding IS NOT NULL`` rather than on
# ``chunk_role = 'child'`` (so ``idx_precedent_chunks_role`` is not
# used here): hierarchical parents store NULL embeddings, so they
# are excluded either way, and the embedding filter additionally
# skips any row that has no vector to rank by. Legacy single-tier
# rows default to chunk_role='child', so a role filter would not
# have excluded them.
|
||||
chunk_sql = f"""
|
||||
SELECT pc.id AS chunk_id, pc.case_law_id, pc.content,
|
||||
pc.section_type, pc.page_number,
|
||||
pc.parent_chunk_id,
|
||||
parent.content AS parent_content,
|
||||
parent.section_type AS parent_section_type,
|
||||
parent.page_number AS parent_page_number,
|
||||
cl.case_number, cl.case_name, cl.court, cl.date AS decision_date,
|
||||
cl.precedent_level, cl.practice_area, cl.chair_name, cl.district,
|
||||
1 - (pc.embedding <=> $1) AS score
|
||||
FROM precedent_chunks pc
|
||||
JOIN case_law cl ON cl.id = pc.case_law_id
|
||||
LEFT JOIN precedent_chunks parent
|
||||
ON parent.id = pc.parent_chunk_id
|
||||
WHERE {' AND '.join(chunk_filters)}
|
||||
AND pc.embedding IS NOT NULL
|
||||
ORDER BY pc.embedding <=> $1
|
||||
@@ -2697,10 +2902,68 @@ async def search_precedent_library_semantic(
|
||||
d["decision_date"] = d["decision_date"].isoformat()
|
||||
d["score"] = float(d["score"])
|
||||
d["type"] = "passage"
|
||||
_maybe_swap_parent(d)
|
||||
results.append(d)
|
||||
|
||||
results.sort(key=lambda x: x["score"], reverse=True)
|
||||
return results[:limit]
|
||||
# Dedupe: when multiple child hits share the same parent, we'd
|
||||
# otherwise return duplicate parent content. Keep the highest-
|
||||
# scoring hit per parent (skip if parent swap disabled or row has
|
||||
# no parent — chunk_id alone remains unique).
|
||||
return _dedupe_by_parent(results, limit)
|
||||
|
||||
|
||||
def _maybe_swap_parent(row: dict) -> None:
    """Replace a child hit's content with its parent's, in place.

    The three ``parent_*`` keys produced by the LEFT JOIN are always
    removed from ``row``. The swap itself happens only when
    ``config.PARENT_DOC_RETRIEVAL_ENABLED`` is truthy, the row has a
    non-None ``parent_chunk_id`` and the parent content is non-empty.

    On a swap the original values are kept under ``child_content`` /
    ``child_section_type`` / ``child_page_number`` and ``parent_swap``
    is set to True, so callers can still see what actually matched.
    """
    parent_content, parent_section, parent_page = (
        row.pop(key, None)
        for key in ("parent_content", "parent_section_type", "parent_page_number")
    )

    swap_wanted = (
        config.PARENT_DOC_RETRIEVAL_ENABLED
        and row.get("parent_chunk_id") is not None
        and parent_content
    )
    if not swap_wanted:
        return

    # Remember what the search really matched before overwriting it.
    row.update(
        child_content=row.get("content"),
        child_section_type=row.get("section_type"),
        child_page_number=row.get("page_number"),
    )
    row["content"] = parent_content
    # Parent metadata overrides the child's only when the parent
    # actually has a value for it.
    if parent_section:
        row["section_type"] = parent_section
    if parent_page is not None:
        row["page_number"] = parent_page
    row["parent_swap"] = True
|
||||
|
||||
|
||||
def _dedupe_by_parent(rows: list[dict], limit: int) -> list[dict]:
    """Collapse hits that were swapped to the same parent, then cap.

    With ``config.PARENT_DOC_RETRIEVAL_ENABLED`` off this is just
    ``rows[:limit]``. With it on, only the first row seen for each
    ``parent_chunk_id`` is kept among rows marked ``parent_swap``;
    callers pass rows sorted by score descending, so that is the
    best-scoring child. Rows without a swapped parent (legacy chunks,
    other result types) always pass through.

    Args:
        rows: result dicts, already ordered by preference.
        limit: maximum number of rows to return.

    Returns:
        A new list of at most ``limit`` rows. ``limit <= 0`` yields an
        empty list on the flag-on path.
    """
    if not config.PARENT_DOC_RETRIEVAL_ENABLED:
        return rows[:limit]

    kept: list[dict] = []
    seen_parents: set = set()
    for row in rows:
        # Check the cap before appending: checking afterwards let one
        # row through even when limit was 0.
        if len(kept) >= limit:
            break
        parent_id = row.get("parent_chunk_id")
        if parent_id and row.get("parent_swap"):
            if parent_id in seen_parents:
                continue
            seen_parents.add(parent_id)
        kept.append(row)
    return kept
|
||||
|
||||
|
||||
async def search_precedent_library_lexical(
|
||||
@@ -2815,15 +3078,32 @@ async def search_precedent_library_lexical(
|
||||
LIMIT $2
|
||||
"""
|
||||
|
||||
# Parent-doc retrieval (V17) — same LEFT JOIN strategy as the
|
||||
# semantic side. The tsvector match still runs over the child's
|
||||
# ``content_tsv``; only the *returned* content is promoted to the
|
||||
# parent when the flag is on and a parent exists. See
|
||||
# :func:`search_precedent_library_semantic` for the rationale.
|
||||
# We intentionally restrict matching to chunks with an embedding
|
||||
# (i.e. children + legacy single-tier rows). Hierarchical parents
|
||||
# store NULL embeddings, so even though their ``content_tsv`` is
|
||||
# populated they're excluded here — preventing a parent from
|
||||
# matching directly and then being "swapped" with itself.
|
||||
chunk_sql = f"""
|
||||
SELECT pc.id AS chunk_id, pc.case_law_id, pc.content,
|
||||
pc.section_type, pc.page_number,
|
||||
pc.parent_chunk_id,
|
||||
parent.content AS parent_content,
|
||||
parent.section_type AS parent_section_type,
|
||||
parent.page_number AS parent_page_number,
|
||||
cl.case_number, cl.case_name, cl.court, cl.date AS decision_date,
|
||||
cl.precedent_level, cl.practice_area, cl.chair_name, cl.district,
|
||||
ts_rank_cd(pc.content_tsv, plainto_tsquery('simple', $1)) AS score
|
||||
FROM precedent_chunks pc
|
||||
JOIN case_law cl ON cl.id = pc.case_law_id
|
||||
LEFT JOIN precedent_chunks parent
|
||||
ON parent.id = pc.parent_chunk_id
|
||||
WHERE {' AND '.join(chunk_filters)}
|
||||
AND pc.embedding IS NOT NULL
|
||||
AND pc.content_tsv @@ plainto_tsquery('simple', $1)
|
||||
ORDER BY score DESC
|
||||
LIMIT $2
|
||||
@@ -2847,10 +3127,11 @@ async def search_precedent_library_lexical(
|
||||
d["decision_date"] = d["decision_date"].isoformat()
|
||||
d["score"] = float(d["score"])
|
||||
d["type"] = "passage"
|
||||
_maybe_swap_parent(d)
|
||||
results.append(d)
|
||||
|
||||
results.sort(key=lambda x: x["score"], reverse=True)
|
||||
return results[:limit]
|
||||
return _dedupe_by_parent(results, limit)
|
||||
|
||||
|
||||
async def precedent_library_stats() -> dict:
|
||||
|
||||
@@ -144,25 +144,63 @@ async def ingest_internal_decision(
|
||||
case_law_id = UUID(str(record["id"]))
|
||||
|
||||
try:
|
||||
chunks = chunker.chunk_document(raw_text, page_offsets=page_offsets)
|
||||
if not chunks:
|
||||
await db.set_case_law_extraction_status(case_law_id, "completed")
|
||||
await db.set_case_law_halacha_status(case_law_id, "completed")
|
||||
return {"status": "completed", "case_law_id": str(case_law_id), "chunks": 0}
|
||||
# Parent-doc retrieval (TaskMaster #48) — same gated branch as
|
||||
# ingest_precedent. Internal committee decisions are typically
|
||||
# longer than external court rulings (full transcript + ruling),
|
||||
# so the parent-doc benefit is even larger here.
|
||||
if config.PARENT_DOC_RETRIEVAL_ENABLED:
|
||||
h_chunks = chunker.chunk_document_hierarchical(
|
||||
raw_text, page_offsets=page_offsets,
|
||||
)
|
||||
if not h_chunks:
|
||||
await db.set_case_law_extraction_status(case_law_id, "completed")
|
||||
await db.set_case_law_halacha_status(case_law_id, "completed")
|
||||
return {"status": "completed", "case_law_id": str(case_law_id), "chunks": 0}
|
||||
children = [c for c in h_chunks if c.role == "child"]
|
||||
parents = [c for c in h_chunks if c.role == "parent"]
|
||||
child_vectors = await embeddings.embed_texts(
|
||||
[c.content for c in children], input_type="document",
|
||||
)
|
||||
chunk_dicts: list[dict] = []
|
||||
for p in parents:
|
||||
chunk_dicts.append({
|
||||
"role": "parent", "local_id": p.local_id, "parent_local_id": None,
|
||||
"chunk_index": p.chunk_index, "content": p.content,
|
||||
"section_type": p.section_type, "page_number": p.page_number,
|
||||
"embedding": None,
|
||||
})
|
||||
for c, v in zip(children, child_vectors):
|
||||
chunk_dicts.append({
|
||||
"role": "child", "local_id": c.local_id,
|
||||
"parent_local_id": c.parent_local_id,
|
||||
"chunk_index": c.chunk_index, "content": c.content,
|
||||
"section_type": c.section_type, "page_number": c.page_number,
|
||||
"embedding": v,
|
||||
})
|
||||
counts = await db.store_precedent_chunks_hierarchical(
|
||||
case_law_id, chunk_dicts,
|
||||
)
|
||||
stored = counts["children"]
|
||||
else:
|
||||
chunks = chunker.chunk_document(raw_text, page_offsets=page_offsets)
|
||||
if not chunks:
|
||||
await db.set_case_law_extraction_status(case_law_id, "completed")
|
||||
await db.set_case_law_halacha_status(case_law_id, "completed")
|
||||
return {"status": "completed", "case_law_id": str(case_law_id), "chunks": 0}
|
||||
|
||||
chunk_texts = [c.content for c in chunks]
|
||||
chunk_vectors = await embeddings.embed_texts(chunk_texts, input_type="document")
|
||||
chunk_dicts = [
|
||||
{
|
||||
"chunk_index": c.chunk_index,
|
||||
"content": c.content,
|
||||
"section_type": c.section_type,
|
||||
"page_number": c.page_number,
|
||||
"embedding": v,
|
||||
}
|
||||
for c, v in zip(chunks, chunk_vectors)
|
||||
]
|
||||
stored = await db.store_precedent_chunks(case_law_id, chunk_dicts)
|
||||
chunk_texts = [c.content for c in chunks]
|
||||
chunk_vectors = await embeddings.embed_texts(chunk_texts, input_type="document")
|
||||
chunk_dicts = [
|
||||
{
|
||||
"chunk_index": c.chunk_index,
|
||||
"content": c.content,
|
||||
"section_type": c.section_type,
|
||||
"page_number": c.page_number,
|
||||
"embedding": v,
|
||||
}
|
||||
for c, v in zip(chunks, chunk_vectors)
|
||||
]
|
||||
stored = await db.store_precedent_chunks(case_law_id, chunk_dicts)
|
||||
|
||||
await db.set_case_law_extraction_status(case_law_id, "completed")
|
||||
await db.set_case_law_halacha_status(case_law_id, "pending")
|
||||
|
||||
@@ -172,34 +172,100 @@ async def ingest_precedent(
|
||||
case_law_id = UUID(str(record["id"]))
|
||||
|
||||
try:
|
||||
await progress("chunking", 40, f"מחלק את הטקסט ל-chunks ({page_count} עמ')")
|
||||
chunks = chunker.chunk_document(text, page_offsets=page_offsets)
|
||||
if not chunks:
|
||||
await db.set_case_law_extraction_status(case_law_id, "completed")
|
||||
await db.set_case_law_halacha_status(case_law_id, "completed")
|
||||
await progress("completed", 100, "אין טקסט לעיבוד")
|
||||
return {
|
||||
"status": "completed",
|
||||
"case_law_id": str(case_law_id),
|
||||
"chunks": 0,
|
||||
"halachot": 0,
|
||||
}
|
||||
# Parent-doc retrieval (TaskMaster #48): when enabled, emit
|
||||
# two tiers (parents + children). Only children are embedded
|
||||
# and indexed; parents carry retrieval context. When disabled,
|
||||
# fall back to legacy single-tier chunking — identical
|
||||
# behaviour to pre-V17.
|
||||
if config.PARENT_DOC_RETRIEVAL_ENABLED:
|
||||
await progress(
|
||||
"chunking", 40,
|
||||
f"מחלק את הטקסט ל-chunks היררכיים ({page_count} עמ')",
|
||||
)
|
||||
h_chunks = chunker.chunk_document_hierarchical(
|
||||
text, page_offsets=page_offsets,
|
||||
)
|
||||
if not h_chunks:
|
||||
await db.set_case_law_extraction_status(case_law_id, "completed")
|
||||
await db.set_case_law_halacha_status(case_law_id, "completed")
|
||||
await progress("completed", 100, "אין טקסט לעיבוד")
|
||||
return {
|
||||
"status": "completed",
|
||||
"case_law_id": str(case_law_id),
|
||||
"chunks": 0,
|
||||
"halachot": 0,
|
||||
}
|
||||
|
||||
await progress("embedding", 55, f"מייצר embeddings ל-{len(chunks)} chunks")
|
||||
chunk_texts = [c.content for c in chunks]
|
||||
chunk_vectors = await embeddings.embed_texts(chunk_texts, input_type="document")
|
||||
children = [c for c in h_chunks if c.role == "child"]
|
||||
parents = [c for c in h_chunks if c.role == "parent"]
|
||||
await progress(
|
||||
"embedding", 55,
|
||||
f"מייצר embeddings ל-{len(children)} children "
|
||||
f"({len(parents)} parents)",
|
||||
)
|
||||
child_texts = [c.content for c in children]
|
||||
child_vectors = await embeddings.embed_texts(
|
||||
child_texts, input_type="document",
|
||||
)
|
||||
# Build flat dict list for the two-pass writer.
|
||||
chunk_dicts: list[dict] = []
|
||||
for p in parents:
|
||||
chunk_dicts.append({
|
||||
"role": "parent",
|
||||
"local_id": p.local_id,
|
||||
"parent_local_id": None,
|
||||
"chunk_index": p.chunk_index,
|
||||
"content": p.content,
|
||||
"section_type": p.section_type,
|
||||
"page_number": p.page_number,
|
||||
"embedding": None,
|
||||
})
|
||||
for c, v in zip(children, child_vectors):
|
||||
chunk_dicts.append({
|
||||
"role": "child",
|
||||
"local_id": c.local_id,
|
||||
"parent_local_id": c.parent_local_id,
|
||||
"chunk_index": c.chunk_index,
|
||||
"content": c.content,
|
||||
"section_type": c.section_type,
|
||||
"page_number": c.page_number,
|
||||
"embedding": v,
|
||||
})
|
||||
counts = await db.store_precedent_chunks_hierarchical(
|
||||
case_law_id, chunk_dicts,
|
||||
)
|
||||
stored_chunks = counts["children"]
|
||||
else:
|
||||
await progress(
|
||||
"chunking", 40, f"מחלק את הטקסט ל-chunks ({page_count} עמ')",
|
||||
)
|
||||
chunks = chunker.chunk_document(text, page_offsets=page_offsets)
|
||||
if not chunks:
|
||||
await db.set_case_law_extraction_status(case_law_id, "completed")
|
||||
await db.set_case_law_halacha_status(case_law_id, "completed")
|
||||
await progress("completed", 100, "אין טקסט לעיבוד")
|
||||
return {
|
||||
"status": "completed",
|
||||
"case_law_id": str(case_law_id),
|
||||
"chunks": 0,
|
||||
"halachot": 0,
|
||||
}
|
||||
|
||||
chunk_dicts = [
|
||||
{
|
||||
"chunk_index": c.chunk_index,
|
||||
"content": c.content,
|
||||
"section_type": c.section_type,
|
||||
"page_number": c.page_number,
|
||||
"embedding": v,
|
||||
}
|
||||
for c, v in zip(chunks, chunk_vectors)
|
||||
]
|
||||
stored_chunks = await db.store_precedent_chunks(case_law_id, chunk_dicts)
|
||||
await progress("embedding", 55, f"מייצר embeddings ל-{len(chunks)} chunks")
|
||||
chunk_texts = [c.content for c in chunks]
|
||||
chunk_vectors = await embeddings.embed_texts(chunk_texts, input_type="document")
|
||||
|
||||
chunk_dicts = [
|
||||
{
|
||||
"chunk_index": c.chunk_index,
|
||||
"content": c.content,
|
||||
"section_type": c.section_type,
|
||||
"page_number": c.page_number,
|
||||
"embedding": v,
|
||||
}
|
||||
for c, v in zip(chunks, chunk_vectors)
|
||||
]
|
||||
stored_chunks = await db.store_precedent_chunks(case_law_id, chunk_dicts)
|
||||
|
||||
# Multimodal page-image embeddings (V9). Gated by feature flag.
|
||||
# Non-fatal: text path already succeeded. Only PDFs.
|
||||
|
||||
391
mcp-server/src/legal_mcp/services/telemetry.py
Normal file
391
mcp-server/src/legal_mcp/services/telemetry.py
Normal file
@@ -0,0 +1,391 @@
|
||||
"""RAG retrieval telemetry — closed-loop feedback (TaskMaster #50).
|
||||
|
||||
Logs every semantic search call so we can compute nDCG@10 over time,
|
||||
spot retrieval drift, and feed the rerank training set.
|
||||
|
||||
Design notes
|
||||
------------
|
||||
- **All writes are fire-and-forget**: callers wrap us in ``try/except``
|
||||
but we also swallow our own DB errors so a telemetry hiccup can never
|
||||
fail a search. The log itself is also written via a detached task —
|
||||
the search returns to the caller immediately and the row lands in
|
||||
the DB on the side.
|
||||
|
||||
- **search_decisions / search_case_documents** return document chunks
|
||||
from active cases, not ``case_law`` rows. Their telemetry rows leave
|
||||
``top_case_law_ids`` empty; nDCG aggregation ignores them.
|
||||
|
||||
- **Auto-inferred feedback**: once a final decision is exported, we
|
||||
scan its ``decision_paragraphs.citations`` JSONB, pull the
|
||||
``case_law_id`` values, and mark them as ``relevance_score=3`` on
|
||||
any search_log for the same case where the precedent appeared in
|
||||
the top-K. This gives us a "cited == relevant" ground truth signal
|
||||
without asking the chair to label results by hand.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
from typing import Any, Iterable
|
||||
from uuid import UUID
|
||||
|
||||
from legal_mcp.services import db
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# Allowed values for the ``feedback_source`` argument of
# ``infer_relevance_from_citations``; any other value is rejected there
# with ``ValueError``.
_VALID_SOURCES = {"cited_in_decision", "chair_marked", "auto_inferred"}
|
||||
|
||||
|
||||
def _coerce_case_law_ids(results: Iterable[Any], limit: int = 10) -> list[UUID]:
    """Pull up to ``limit`` distinct ``case_law_id`` UUIDs from search results.

    Rows that are not dicts, rows without a ``case_law_id`` (or with
    ``None``), and values that do not parse as a UUID are skipped.
    Order is preserved (= ranking); an id that shows up more than once
    keeps its first position.

    De-duplication is keyed on the *parsed* UUID rather than on the raw
    string, so the same id spelled differently (upper/lower case, with or
    without hyphens, ``UUID`` object vs ``str``) is recorded only once.

    Args:
        results: iterable of result rows, normally dicts.
        limit: maximum number of ids to return.

    Returns:
        The distinct UUIDs in ranking order, at most ``limit`` of them.
    """
    out: list[UUID] = []
    seen: set[UUID] = set()
    for r in results:
        if len(out) >= limit:
            break
        if not isinstance(r, dict):
            continue
        raw = r.get("case_law_id")
        if raw is None:
            continue
        try:
            parsed = raw if isinstance(raw, UUID) else UUID(str(raw))
        except (ValueError, AttributeError):
            # Not a UUID — ignore the row instead of failing telemetry.
            continue
        if parsed in seen:
            continue
        seen.add(parsed)
        out.append(parsed)
    return out
|
||||
|
||||
|
||||
async def _insert_log(
    *,
    search_type: str,
    query: str,
    practice_area: str | None,
    case_id: UUID | None,
    user_agent: str | None,
    result_count: int,
    top_case_law_ids: list[UUID],
    duration_ms: int | None,
) -> UUID | None:
    """Insert one row into ``search_logs`` and return its id.

    Best-effort: any exception raised while preparing the values,
    acquiring a connection or running the INSERT is logged and swallowed,
    and ``None`` is returned instead.

    Empty ``practice_area`` / ``user_agent`` strings and an empty
    ``top_case_law_ids`` list are stored as NULL; ``query`` is truncated
    to 2000 characters.
    """
    insert_sql = """
        INSERT INTO search_logs (
            search_type, query, practice_area, case_id,
            user_agent, result_count, top_case_law_ids,
            duration_ms
        ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8)
        RETURNING id
    """
    try:
        # Values are built inside the ``try`` so that a bad argument
        # (e.g. a non-sliceable ``query``) is swallowed like a DB error.
        values = (
            search_type,
            query[:2000],  # guard against pathologically long queries
            practice_area or None,
            case_id,
            user_agent or None,
            int(result_count),
            top_case_law_ids or None,
            duration_ms,
        )
        pool = await db.get_pool()
        async with pool.acquire() as conn:
            inserted = await conn.fetchrow(insert_sql, *values)
        if not inserted:
            return None
        return inserted["id"]
    except Exception:
        logger.exception("telemetry.log_search: insert failed (swallowed)")
        return None
|
||||
|
||||
|
||||
async def log_search(
    *,
    search_type: str,
    query: str,
    results: Iterable[dict],
    duration_ms: int | None = None,
    practice_area: str | None = None,
    case_id: UUID | str | None = None,
    user_agent: str | None = None,
) -> UUID | None:
    """Record a search call. Never raises.

    Args:
        search_type: one of 'precedent_library', 'internal_decisions',
            'decisions', 'case_documents', 'similar_cases'.
        query: the raw user query.
        results: iterable of result dicts. We pull ``case_law_id`` from
            the first 10 to populate ``top_case_law_ids``. ``None`` is
            treated as "no results".
        duration_ms: search latency in milliseconds.
        practice_area: optional filter applied to the search.
        case_id: optional case context (when the search was scoped to
            or triggered from a specific case). A value that does not
            parse as a UUID is stored as NULL.
        user_agent: 'writer' / 'researcher' / 'analyst' / 'manual'.

    Returns:
        The ``search_logs.id`` UUID if the row was written, else None.
        Most callers ignore this; auto-inference uses it later via
        ``infer_relevance_from_citations``.
    """
    # Materialise the results before the first ``await``. A list is used
    # as-is (it is only read synchronously, right here); any other
    # iterable is consumed once. Failures while reading it must not
    # escape — the "never raises" contract covers the inputs too.
    try:
        snapshot = results if isinstance(results, list) else list(results or ())
        top_ids = _coerce_case_law_ids(snapshot, limit=10)
        result_count = len(snapshot)
    except Exception:
        logger.exception("telemetry.log_search: could not read results (swallowed)")
        return None

    case_uuid: UUID | None
    if case_id is None:
        case_uuid = None
    elif isinstance(case_id, UUID):
        case_uuid = case_id
    else:
        try:
            case_uuid = UUID(str(case_id))
        except (ValueError, AttributeError):
            # Unparseable case id: keep the log row, drop the case link.
            case_uuid = None

    return await _insert_log(
        search_type=search_type,
        query=query,
        practice_area=practice_area,
        case_id=case_uuid,
        user_agent=user_agent,
        result_count=result_count,
        top_case_law_ids=top_ids,
        duration_ms=duration_ms,
    )
|
||||
|
||||
|
||||
# Strong references to in-flight telemetry tasks. The event loop only keeps
# weak references to tasks, so a task that nothing else references may be
# garbage-collected before it finishes. Each task removes itself when done.
_BACKGROUND_TASKS: set[asyncio.Task] = set()


def log_search_bg(
    *,
    search_type: str,
    query: str,
    results: Iterable[dict],
    duration_ms: int | None = None,
    practice_area: str | None = None,
    case_id: UUID | str | None = None,
    user_agent: str | None = None,
) -> None:
    """Fire-and-forget variant. Schedules the insert as a detached task.

    Use this from hot search paths so the caller returns to the user
    immediately. Errors are logged inside ``log_search``.

    When there is no running event loop (sync caller) telemetry is
    skipped. ``results`` is copied eagerly so the caller can keep
    mutating or iterating its own list; ``None`` is treated as "no
    results", and a failure while copying is logged and swallowed rather
    than propagated into the search path.

    Args:
        search_type, query, results, duration_ms, practice_area, case_id,
        user_agent: forwarded unchanged to ``log_search``.
    """
    try:
        loop = asyncio.get_running_loop()
    except RuntimeError:
        # No running loop — caller is sync. Best-effort: skip telemetry.
        return

    # Snapshot eagerly so the caller can mutate/iterate results freely.
    try:
        snapshot = list(results or ())
    except Exception:
        logger.exception(
            "telemetry.log_search_bg: could not snapshot results (swallowed)"
        )
        return

    task = loop.create_task(
        log_search(
            search_type=search_type,
            query=query,
            results=snapshot,
            duration_ms=duration_ms,
            practice_area=practice_area,
            case_id=case_id,
            user_agent=user_agent,
        )
    )
    # Keep the task alive until it completes, then drop the reference.
    _BACKGROUND_TASKS.add(task)
    task.add_done_callback(_BACKGROUND_TASKS.discard)
|
||||
|
||||
|
||||
# ──────────────────────────────────────────────────────────────────────
|
||||
# Auto-inferred relevance feedback
|
||||
# ──────────────────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def _extract_citations_from_jsonb(citations: Any) -> list[UUID]:
    """Parse ``decision_paragraphs.citations`` JSONB into a UUID list.

    Stored shape: ``[{"case_law_id": "...", "text": "...", "type": ...}]``.
    Also accepts the JSON text form (``str`` or UTF-8 ``bytes``), for the
    case where the driver hands the column back undecoded.

    Anything that is not a list of dicts after decoding yields ``[]``.
    Items that are not dicts, have no / an empty ``case_law_id``, or
    carry a value that is not a UUID are skipped. The result keeps
    first-seen order and is de-duplicated on the *parsed* UUID, so two
    spellings of the same id (e.g. differing only in case) count once.
    """
    import json as _json

    if not citations:
        return []
    if isinstance(citations, (bytes, bytearray)):
        try:
            citations = _json.loads(citations.decode("utf-8"))
        except (ValueError, UnicodeDecodeError):
            return []
    elif isinstance(citations, str):
        try:
            citations = _json.loads(citations)
        except ValueError:
            return []

    if not isinstance(citations, list):
        return []

    out: list[UUID] = []
    seen: set[UUID] = set()
    for item in citations:
        if not isinstance(item, dict):
            continue
        raw = item.get("case_law_id")
        if not raw:
            continue
        try:
            parsed = raw if isinstance(raw, UUID) else UUID(str(raw))
        except (ValueError, AttributeError):
            continue
        if parsed in seen:
            continue
        seen.add(parsed)
        out.append(parsed)
    return out
|
||||
|
||||
|
||||
async def _gather_cited_case_law_ids(case_id: UUID) -> list[UUID]:
    """Return each distinct ``case_law_id`` cited in the case's decision
    paragraphs, in first-seen order.

    Reads the non-empty ``citations`` values of every paragraph that
    belongs (via block and decision) to ``case_id`` and parses each one
    with ``_extract_citations_from_jsonb``.
    """
    citations_sql = """
        SELECT dp.citations
        FROM decision_paragraphs dp
        JOIN decision_blocks db ON db.id = dp.block_id
        JOIN decisions d ON d.id = db.decision_id
        WHERE d.case_id = $1
          AND dp.citations IS NOT NULL
          AND jsonb_array_length(dp.citations) > 0
    """
    pool = await db.get_pool()
    async with pool.acquire() as conn:
        records = await conn.fetch(citations_sql, case_id)

    # A dict keeps insertion order, so it doubles as an ordered set.
    ordered: dict[UUID, None] = {}
    for record in records:
        for cited_id in _extract_citations_from_jsonb(record["citations"]):
            ordered.setdefault(cited_id, None)
    return list(ordered)
|
||||
|
||||
|
||||
async def infer_relevance_from_citations(
    case_id: UUID | str,
    *,
    relevance_score: int = 3,
    feedback_source: str = "cited_in_decision",
) -> dict:
    """Turn the citations of a case's decision into relevance feedback.

    For every precedent cited in the case, each ``search_logs`` row of
    the same case that lists the precedent in ``top_case_law_ids`` gets
    a ``search_relevance_feedback`` row carrying the precedent's 1-based
    rank in that list.

    Re-running is safe: the INSERT uses ``ON CONFLICT (search_log_id,
    case_law_id, feedback_source) DO NOTHING``, so existing rows are left
    alone and are not counted as inserted.

    Args:
        case_id: the case to process, as ``UUID`` or its string form.
        relevance_score: score to record; must be one of 0, 1, 2, 3.
        feedback_source: must be a member of ``_VALID_SOURCES``.

    Returns:
        ``{"cited_precedents": int, "feedback_rows_inserted": int,
        "searches_matched": int}``.

    Raises:
        ValueError: on an invalid ``relevance_score`` or
            ``feedback_source``, or a ``case_id`` that is not a UUID.
    """
    if relevance_score not in (0, 1, 2, 3):
        raise ValueError("relevance_score must be in 0..3")
    if feedback_source not in _VALID_SOURCES:
        raise ValueError(f"feedback_source must be one of {_VALID_SOURCES!r}")

    case_uuid = case_id if isinstance(case_id, UUID) else UUID(str(case_id))

    cited = await _gather_cited_case_law_ids(case_uuid)
    if not cited:
        return {
            "cited_precedents": 0,
            "feedback_rows_inserted": 0,
            "searches_matched": 0,
        }

    find_logs_sql = """
        SELECT id, top_case_law_ids
        FROM search_logs
        WHERE case_id = $1
          AND top_case_law_ids IS NOT NULL
          AND $2 = ANY(top_case_law_ids)
    """
    insert_feedback_sql = """
        INSERT INTO search_relevance_feedback (
            search_log_id, case_law_id, rank,
            relevance_score, feedback_source
        ) VALUES ($1, $2, $3, $4, $5)
        ON CONFLICT (search_log_id, case_law_id, feedback_source)
        DO NOTHING
    """

    pool = await db.get_pool()
    rows_written = 0
    matched_log_ids: set[str] = set()

    async with pool.acquire() as conn:
        # One lookup per cited precedent: every log of this case that
        # ranked it, then one feedback row per (log, precedent) pair.
        for precedent_id in cited:
            log_rows = await conn.fetch(find_logs_sql, case_uuid, precedent_id)
            for log_row in log_rows:
                # asyncpg returns uuid[] as list[UUID]
                ranked = log_row["top_case_law_ids"] or []
                if precedent_id not in ranked:
                    continue
                position = ranked.index(precedent_id) + 1
                status = await conn.execute(
                    insert_feedback_sql,
                    log_row["id"],
                    precedent_id,
                    position,
                    relevance_score,
                    feedback_source,
                )
                # ``execute`` returns 'INSERT 0 1' for a write and
                # 'INSERT 0 0' for the no-op path; count only the writes.
                if status.endswith(" 1"):
                    rows_written += 1
                matched_log_ids.add(str(log_row["id"]))

    return {
        "cited_precedents": len(cited),
        "feedback_rows_inserted": rows_written,
        "searches_matched": len(matched_log_ids),
    }
|
||||
|
||||
|
||||
async def infer_relevance_for_all_finalized_cases(limit: int | None = None) -> dict:
    """Bulk-run auto-inference for every case whose status is final/exported.

    Useful for back-filling after V18 schema lands and a few decisions
    have already been written. Cases with no cited precedents are still
    counted in ``cases_processed`` but add zero to the other totals.

    Args:
        limit: maximum number of cases to process. ``None``, zero or a
            negative value means "no limit".

    Returns:
        ``{"cases_processed": int, "cited_precedents": int,
        "feedback_rows_inserted": int, "searches_matched": int}``,
        summed over all processed cases.
    """
    sql = """
        SELECT DISTINCT c.id
        FROM cases c
        JOIN decisions d ON d.case_id = c.id
        WHERE c.status IN ('final', 'exported')
    """
    # The placeholder and its bind value are added together, so the
    # number of arguments always matches the query (a negative ``limit``
    # must not send an argument to a query with no ``$1``).
    args: list[int] = []
    if limit is not None and limit > 0:
        sql += " LIMIT $1"
        args.append(limit)

    pool = await db.get_pool()
    async with pool.acquire() as conn:
        rows = await conn.fetch(sql, *args)

    totals = {
        "cases_processed": 0,
        "cited_precedents": 0,
        "feedback_rows_inserted": 0,
        "searches_matched": 0,
    }
    for r in rows:
        stats = await infer_relevance_from_citations(r["id"])
        totals["cases_processed"] += 1
        totals["cited_precedents"] += stats["cited_precedents"]
        totals["feedback_rows_inserted"] += stats["feedback_rows_inserted"]
        totals["searches_matched"] += stats["searches_matched"]
    return totals
|
||||
Reference in New Issue
Block a user