fix(retrieval): make decisions findable by name + unhide committee uploads
All checks were successful
Build & Deploy / build-and-deploy (push) Successful in 3m57s

Root cause of "agent can't find the Agasi decision in the corpus" (CMPA-55):
the decision was fully ingested, but the retrieval layer failed on the
realistic agent query — searching by case name.

- RC-A (#52): lexical tsvector covered only chunk content + halacha text,
  so a bare-name query ("אגסי") matched decisions that *cite* the case, not
  the case itself. Add meta_tsv on case_law(case_name, case_number) (SCHEMA
  V20) and OR it into the lexical halacha/chunk SQL with a match boost, so a
  name/number hit surfaces the case's own rows. Agasi: rank 4 → rank 1.
- RC-B (#53): precedent_library_list hard-defaulted source_kind=external_upload
  and never exposed the param, hiding uploaded ערר/בל"מ (internal_committee)
  decisions. Thread source_kind through service → tool → MCP tool (supports
  'internal_committee' / 'all_committees').
- #54: agent instructions (researcher/analyst/writer) — search-by-name
  protocol: add content/case-number, search both corpora, use all_committees
  before declaring "not in corpus".
- #55: chunker produced tiny fragment chunks ("דיון", "החלטה") from header
  keywords matched mid-sentence. Anchor SECTION_PATTERNS to line start +
  merge sub-min sections; exclude <50-char fragments at query time (484
  existing fragments hidden; full re-chunk tracked as #57).

Tests: scripts/test_retrieval_by_name.py (name ranks case above citer +
substantive regressions); chunker unit checks (0 tiny chunks). New findings
filed as tasks #56 (halacha source_kind leak) and #57 (re-chunk migration).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-30 11:26:19 +00:00
parent 165efc62b0
commit 58ab003206
11 changed files with 355 additions and 57 deletions

View File

@@ -201,11 +201,20 @@ async def precedent_library_list(
precedent_level: str = "",
source_type: str = "",
search: str = "",
source_kind: str = "external_upload",
limit: int = 100,
) -> str:
"""רשימת הפסיקה בקורפוס הסמכותי, עם פילטרים."""
"""רשימת הפסיקה בקורפוס, עם פילטרים.
source_kind: 'external_upload' (ברירת מחדל — פס"ד בתי משפט) /
'internal_committee' (החלטות ועדות ערר ערר/בל"מ שהועלו) /
'all_committees' (שתיהן — internal + appeals_committee).
החלטות ערר/בל"מ שמעלים נשמרות כ-internal_committee — כדי לראותן
ברשימה השתמש ב-source_kind='internal_committee' או 'all_committees'.
"""
return await plib.precedent_library_list(
practice_area, court, precedent_level, source_type, search, limit,
practice_area, court, precedent_level, source_type, search,
source_kind, limit,
)

View File

@@ -97,13 +97,32 @@ def _assign_pages(chunks: list[Chunk], text: str, page_offsets: list[int]) -> No
pos = idx + max(1, len(c.content) // 2)
# A section shorter than this (stripped chars) is not a real section — it's
# an artifact of a header keyword matched mid-text. Such a fragment is merged
# into the preceding section rather than emitted as its own chunk. See #55:
# unanchored keywords like "דיון"/"החלטה"/"מסקנה" appearing inside a sentence
# used to carve tiny boundary chunks ("דיון). במסגרת ה") that polluted search.
MIN_SECTION_CHARS = 60
def _split_into_sections(text: str) -> list[tuple[str, str]]:
"""Split text into (section_type, text) pairs based on Hebrew headers."""
"""Split text into (section_type, text) pairs based on Hebrew headers.
Header keywords are matched only at the **start of a line** (after
optional whitespace / list numbering like ``5.`` or ``ג.``). A real
section header in these decisions sits on its own line; anchoring to
the line start prevents common words ("דיון", "החלטה", "מסקנה") that
appear mid-sentence from being treated as section boundaries — which
previously produced tiny fragment chunks (#55).
"""
# Find all section headers and their positions
markers: list[tuple[int, str]] = []
for pattern, section_type in SECTION_PATTERNS:
for match in re.finditer(pattern, text):
# ^ + MULTILINE: line start only. Optional leading spaces/tabs and an
# optional ordinal prefix ("5.", "5)", "ג.") before the keyword.
anchored = rf"^[ \t]*(?:\d+[.)]\s*|[א-ת][.)]\s*)?(?:{pattern})"
for match in re.finditer(anchored, text, re.MULTILINE):
markers.append((match.start(), section_type))
if not markers:
@@ -120,11 +139,18 @@ def _split_into_sections(text: str) -> list[tuple[str, str]]:
if intro_text:
sections.append(("intro", intro_text))
# Each section
# Each section. A section whose text is too short to stand alone is
# merged into the previous section (keeping the previous type) so a
# near-adjacent pair of headers can't produce a fragment chunk.
for i, (pos, section_type) in enumerate(markers):
end = markers[i + 1][0] if i + 1 < len(markers) else len(text)
section_text = text[pos:end].strip()
if section_text:
if not section_text:
continue
if len(section_text) < MIN_SECTION_CHARS and sections:
prev_type, prev_text = sections[-1]
sections[-1] = (prev_type, f"{prev_text}\n{section_text}")
else:
sections.append((section_type, section_text))
return sections

View File

@@ -1070,6 +1070,29 @@ ALTER TABLE case_law ADD COLUMN IF NOT EXISTS citation_formatted TEXT DEFAULT ''
"""
# ── V20: case-name / case-number lexical match ────────────────────
# RC-A fix: the V12 tsvectors cover only chunk *content* + halacha
# text, so a bare case-name query ("אגסי") matched decisions that
# *cite* the case rather than the case itself. case_name and
# case_number live on the parent case_law row, so we add a dedicated
# meta tsvector there and OR it into the lexical search — a name/number
# hit then surfaces all of that case's chunks + halachot. 'simple'
# config (no stemmer) preserves Hebrew names + alphanumeric case
# numbers like "81002-01-21" exactly as V12 does for content.
SCHEMA_V20_SQL = """
ALTER TABLE case_law
ADD COLUMN IF NOT EXISTS meta_tsv tsvector
GENERATED ALWAYS AS (
to_tsvector('simple',
coalesce(case_name,'') || ' ' || coalesce(case_number,'')
)
) STORED;
CREATE INDEX IF NOT EXISTS idx_case_law_meta_tsv
ON case_law USING GIN(meta_tsv);
"""
async def _run_schema_migrations(pool: asyncpg.Pool) -> None:
async with pool.acquire() as conn:
await conn.execute(SCHEMA_SQL)
@@ -1092,7 +1115,8 @@ async def _run_schema_migrations(pool: asyncpg.Pool) -> None:
await conn.execute(SCHEMA_V17_SQL)
await conn.execute(SCHEMA_V18_SQL)
await conn.execute(SCHEMA_V19_SQL)
logger.info("Database schema initialized (v1-v19)")
await conn.execute(SCHEMA_V20_SQL)
logger.info("Database schema initialized (v1-v20)")
async def init_schema() -> None:
@@ -3217,6 +3241,9 @@ async def search_precedent_library_semantic(
ON parent.id = pc.parent_chunk_id
WHERE {' AND '.join(chunk_filters)}
AND pc.embedding IS NOT NULL
-- #55: exclude tiny fragment chunks (artifacts of pre-fix
-- mid-sentence header splits) that carry no retrievable signal.
AND length(trim(pc.content)) >= 50
ORDER BY pc.embedding <=> $1
LIMIT $2
"""
@@ -3411,11 +3438,17 @@ async def search_precedent_library_lexical(
h.practice_areas, h.subject_tags, h.confidence, h.rule_type,
cl.case_number, cl.case_name, cl.court, cl.date AS decision_date,
cl.precedent_level, cl.chair_name, cl.district,
ts_rank_cd(h.rule_tsv, plainto_tsquery('simple', $1)) AS score
GREATEST(
ts_rank_cd(h.rule_tsv, plainto_tsquery('simple', $1)),
ts_rank_cd(cl.meta_tsv, plainto_tsquery('simple', $1))
)
+ CASE WHEN cl.meta_tsv @@ plainto_tsquery('simple', $1)
THEN 1.0 ELSE 0.0 END AS score
FROM halachot h
JOIN case_law cl ON cl.id = h.case_law_id
WHERE {' AND '.join(halacha_filters)}
AND h.rule_tsv @@ plainto_tsquery('simple', $1)
AND (h.rule_tsv @@ plainto_tsquery('simple', $1)
OR cl.meta_tsv @@ plainto_tsquery('simple', $1))
ORDER BY score DESC
LIMIT $2
"""
@@ -3439,14 +3472,22 @@ async def search_precedent_library_lexical(
parent.page_number AS parent_page_number,
cl.case_number, cl.case_name, cl.court, cl.date AS decision_date,
cl.precedent_level, cl.practice_area, cl.chair_name, cl.district,
ts_rank_cd(pc.content_tsv, plainto_tsquery('simple', $1)) AS score
GREATEST(
ts_rank_cd(pc.content_tsv, plainto_tsquery('simple', $1)),
ts_rank_cd(cl.meta_tsv, plainto_tsquery('simple', $1))
)
+ CASE WHEN cl.meta_tsv @@ plainto_tsquery('simple', $1)
THEN 1.0 ELSE 0.0 END AS score
FROM precedent_chunks pc
JOIN case_law cl ON cl.id = pc.case_law_id
LEFT JOIN precedent_chunks parent
ON parent.id = pc.parent_chunk_id
WHERE {' AND '.join(chunk_filters)}
AND pc.embedding IS NOT NULL
AND pc.content_tsv @@ plainto_tsquery('simple', $1)
-- #55: exclude tiny fragment chunks (see semantic query above).
AND length(trim(pc.content)) >= 50
AND (pc.content_tsv @@ plainto_tsquery('simple', $1)
OR cl.meta_tsv @@ plainto_tsquery('simple', $1))
ORDER BY score DESC
LIMIT $2
"""

View File

@@ -533,6 +533,7 @@ async def list_precedents(
precedent_level: str = "",
source_type: str = "",
search: str = "",
source_kind: str = "external_upload",
limit: int = 100,
offset: int = 0,
) -> list[dict]:
@@ -542,6 +543,7 @@ async def list_precedents(
precedent_level=precedent_level,
source_type=source_type,
search=search,
source_kind=source_kind,
limit=limit,
offset=offset,
)

View File

@@ -103,6 +103,7 @@ async def precedent_library_list(
precedent_level: str = "",
source_type: str = "",
search: str = "",
source_kind: str = "external_upload",
limit: int = 100,
) -> str:
"""רשימה של פסיקה בקורפוס הסמכותי, עם פילטרים."""
@@ -112,6 +113,7 @@ async def precedent_library_list(
precedent_level=precedent_level,
source_type=source_type,
search=search,
source_kind=source_kind,
limit=limit,
)
return _ok(rows)