fix(retrieval): make decisions findable by name + unhide committee uploads
All checks were successful
Build & Deploy / build-and-deploy (push) Successful in 3m57s
All checks were successful
Build & Deploy / build-and-deploy (push) Successful in 3m57s
Root cause of "agent can't find the Agasi decision in the corpus" (CMPA-55): the decision was fully ingested, but the retrieval layer failed on the realistic agent query — searching by case name.

- RC-A (#52): lexical tsvector covered only chunk content + halacha text, so a bare-name query ("אגסי") matched decisions that *cite* the case, not the case itself. Add meta_tsv on case_law(case_name, case_number) (SCHEMA V20) and OR it into the lexical halacha/chunk SQL with a match boost, so a name/number hit surfaces the case's own rows. Agasi: rank 4 → rank 1.
- RC-B (#53): precedent_library_list hard-defaulted source_kind=external_upload and never exposed the param, hiding uploaded ערר/בל"מ (internal_committee) decisions. Thread source_kind through service → tool → MCP tool (supports 'internal_committee' / 'all_committees').
- #54: agent instructions (researcher/analyst/writer) — search-by-name protocol: add content/case-number, search both corpora, use all_committees before declaring "not in corpus".
- #55: chunker produced tiny fragment chunks ("דיון", "החלטה") from header keywords matched mid-sentence. Anchor SECTION_PATTERNS to line start + merge sub-min sections (shorter than MIN_SECTION_CHARS = 60 stripped chars) into the preceding section; separately, exclude <50-char fragments at query time (484 existing fragments hidden; full re-chunk tracked as #57).

Tests: scripts/test_retrieval_by_name.py (name ranks case above citer + substantive regressions); chunker unit checks (0 tiny chunks).

New findings filed as tasks #56 (halacha source_kind leak) and #57 (re-chunk migration).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -201,11 +201,20 @@ async def precedent_library_list(
|
||||
precedent_level: str = "",
|
||||
source_type: str = "",
|
||||
search: str = "",
|
||||
source_kind: str = "external_upload",
|
||||
limit: int = 100,
|
||||
) -> str:
|
||||
"""רשימת הפסיקה בקורפוס הסמכותי, עם פילטרים."""
|
||||
"""רשימת הפסיקה בקורפוס, עם פילטרים.
|
||||
|
||||
source_kind: 'external_upload' (ברירת מחדל — פס"ד בתי משפט) /
|
||||
'internal_committee' (החלטות ועדות ערר ערר/בל"מ שהועלו) /
|
||||
'all_committees' (שתיהן — internal + appeals_committee).
|
||||
החלטות ערר/בל"מ שמעלים נשמרות כ-internal_committee — כדי לראותן
|
||||
ברשימה השתמש ב-source_kind='internal_committee' או 'all_committees'.
|
||||
"""
|
||||
return await plib.precedent_library_list(
|
||||
practice_area, court, precedent_level, source_type, search, limit,
|
||||
practice_area, court, precedent_level, source_type, search,
|
||||
source_kind, limit,
|
||||
)
|
||||
|
||||
|
||||
|
||||
@@ -97,13 +97,32 @@ def _assign_pages(chunks: list[Chunk], text: str, page_offsets: list[int]) -> No
|
||||
pos = idx + max(1, len(c.content) // 2)
|
||||
|
||||
|
||||
# A section shorter than this (stripped chars) is not a real section — it's
|
||||
# an artifact of a header keyword matched mid-text. Such a fragment is merged
|
||||
# into the preceding section rather than emitted as its own chunk. See #55:
|
||||
# unanchored keywords like "דיון"/"החלטה"/"מסקנה" appearing inside a sentence
|
||||
# used to carve tiny boundary chunks ("דיון). במסגרת ה") that polluted search.
|
||||
MIN_SECTION_CHARS = 60
|
||||
|
||||
|
||||
def _split_into_sections(text: str) -> list[tuple[str, str]]:
|
||||
"""Split text into (section_type, text) pairs based on Hebrew headers."""
|
||||
"""Split text into (section_type, text) pairs based on Hebrew headers.
|
||||
|
||||
Header keywords are matched only at the **start of a line** (after
|
||||
optional whitespace / list numbering like ``5.`` or ``ג.``). A real
|
||||
section header in these decisions sits on its own line; anchoring to
|
||||
the line start prevents common words ("דיון", "החלטה", "מסקנה") that
|
||||
appear mid-sentence from being treated as section boundaries — which
|
||||
previously produced tiny fragment chunks (#55).
|
||||
"""
|
||||
# Find all section headers and their positions
|
||||
markers: list[tuple[int, str]] = []
|
||||
|
||||
for pattern, section_type in SECTION_PATTERNS:
|
||||
for match in re.finditer(pattern, text):
|
||||
# ^ + MULTILINE: line start only. Optional leading spaces/tabs and an
|
||||
# optional ordinal prefix ("5.", "5)", "ג.") before the keyword.
|
||||
anchored = rf"^[ \t]*(?:\d+[.)]\s*|[א-ת][.)]\s*)?(?:{pattern})"
|
||||
for match in re.finditer(anchored, text, re.MULTILINE):
|
||||
markers.append((match.start(), section_type))
|
||||
|
||||
if not markers:
|
||||
@@ -120,11 +139,18 @@ def _split_into_sections(text: str) -> list[tuple[str, str]]:
|
||||
if intro_text:
|
||||
sections.append(("intro", intro_text))
|
||||
|
||||
# Each section
|
||||
# Each section. A section whose text is too short to stand alone is
|
||||
# merged into the previous section (keeping the previous type) so a
|
||||
# near-adjacent pair of headers can't produce a fragment chunk.
|
||||
for i, (pos, section_type) in enumerate(markers):
|
||||
end = markers[i + 1][0] if i + 1 < len(markers) else len(text)
|
||||
section_text = text[pos:end].strip()
|
||||
if section_text:
|
||||
if not section_text:
|
||||
continue
|
||||
if len(section_text) < MIN_SECTION_CHARS and sections:
|
||||
prev_type, prev_text = sections[-1]
|
||||
sections[-1] = (prev_type, f"{prev_text}\n{section_text}")
|
||||
else:
|
||||
sections.append((section_type, section_text))
|
||||
|
||||
return sections
|
||||
|
||||
@@ -1070,6 +1070,29 @@ ALTER TABLE case_law ADD COLUMN IF NOT EXISTS citation_formatted TEXT DEFAULT ''
|
||||
"""
|
||||
|
||||
|
||||
# ── V20: case-name / case-number lexical match ────────────────────
|
||||
# RC-A fix: the V12 tsvectors cover only chunk *content* + halacha
|
||||
# text, so a bare case-name query ("אגסי") matched decisions that
|
||||
# *cite* the case rather than the case itself. case_name and
|
||||
# case_number live on the parent case_law row, so we add a dedicated
|
||||
# meta tsvector there and OR it into the lexical search — a name/number
|
||||
# hit then surfaces all of that case's chunks + halachot. 'simple'
|
||||
# config (no stemmer) preserves Hebrew names + alphanumeric case
|
||||
# numbers like "81002-01-21" exactly as V12 does for content.
|
||||
SCHEMA_V20_SQL = """
|
||||
ALTER TABLE case_law
|
||||
ADD COLUMN IF NOT EXISTS meta_tsv tsvector
|
||||
GENERATED ALWAYS AS (
|
||||
to_tsvector('simple',
|
||||
coalesce(case_name,'') || ' ' || coalesce(case_number,'')
|
||||
)
|
||||
) STORED;
|
||||
|
||||
CREATE INDEX IF NOT EXISTS idx_case_law_meta_tsv
|
||||
ON case_law USING GIN(meta_tsv);
|
||||
"""
|
||||
|
||||
|
||||
async def _run_schema_migrations(pool: asyncpg.Pool) -> None:
|
||||
async with pool.acquire() as conn:
|
||||
await conn.execute(SCHEMA_SQL)
|
||||
@@ -1092,7 +1115,8 @@ async def _run_schema_migrations(pool: asyncpg.Pool) -> None:
|
||||
await conn.execute(SCHEMA_V17_SQL)
|
||||
await conn.execute(SCHEMA_V18_SQL)
|
||||
await conn.execute(SCHEMA_V19_SQL)
|
||||
logger.info("Database schema initialized (v1-v19)")
|
||||
await conn.execute(SCHEMA_V20_SQL)
|
||||
logger.info("Database schema initialized (v1-v20)")
|
||||
|
||||
|
||||
async def init_schema() -> None:
|
||||
@@ -3217,6 +3241,9 @@ async def search_precedent_library_semantic(
|
||||
ON parent.id = pc.parent_chunk_id
|
||||
WHERE {' AND '.join(chunk_filters)}
|
||||
AND pc.embedding IS NOT NULL
|
||||
-- #55: exclude tiny fragment chunks (artifacts of pre-fix
|
||||
-- mid-sentence header splits) that carry no retrievable signal.
|
||||
AND length(trim(pc.content)) >= 50
|
||||
ORDER BY pc.embedding <=> $1
|
||||
LIMIT $2
|
||||
"""
|
||||
@@ -3411,11 +3438,17 @@ async def search_precedent_library_lexical(
|
||||
h.practice_areas, h.subject_tags, h.confidence, h.rule_type,
|
||||
cl.case_number, cl.case_name, cl.court, cl.date AS decision_date,
|
||||
cl.precedent_level, cl.chair_name, cl.district,
|
||||
ts_rank_cd(h.rule_tsv, plainto_tsquery('simple', $1)) AS score
|
||||
GREATEST(
|
||||
ts_rank_cd(h.rule_tsv, plainto_tsquery('simple', $1)),
|
||||
ts_rank_cd(cl.meta_tsv, plainto_tsquery('simple', $1))
|
||||
)
|
||||
+ CASE WHEN cl.meta_tsv @@ plainto_tsquery('simple', $1)
|
||||
THEN 1.0 ELSE 0.0 END AS score
|
||||
FROM halachot h
|
||||
JOIN case_law cl ON cl.id = h.case_law_id
|
||||
WHERE {' AND '.join(halacha_filters)}
|
||||
AND h.rule_tsv @@ plainto_tsquery('simple', $1)
|
||||
AND (h.rule_tsv @@ plainto_tsquery('simple', $1)
|
||||
OR cl.meta_tsv @@ plainto_tsquery('simple', $1))
|
||||
ORDER BY score DESC
|
||||
LIMIT $2
|
||||
"""
|
||||
@@ -3439,14 +3472,22 @@ async def search_precedent_library_lexical(
|
||||
parent.page_number AS parent_page_number,
|
||||
cl.case_number, cl.case_name, cl.court, cl.date AS decision_date,
|
||||
cl.precedent_level, cl.practice_area, cl.chair_name, cl.district,
|
||||
ts_rank_cd(pc.content_tsv, plainto_tsquery('simple', $1)) AS score
|
||||
GREATEST(
|
||||
ts_rank_cd(pc.content_tsv, plainto_tsquery('simple', $1)),
|
||||
ts_rank_cd(cl.meta_tsv, plainto_tsquery('simple', $1))
|
||||
)
|
||||
+ CASE WHEN cl.meta_tsv @@ plainto_tsquery('simple', $1)
|
||||
THEN 1.0 ELSE 0.0 END AS score
|
||||
FROM precedent_chunks pc
|
||||
JOIN case_law cl ON cl.id = pc.case_law_id
|
||||
LEFT JOIN precedent_chunks parent
|
||||
ON parent.id = pc.parent_chunk_id
|
||||
WHERE {' AND '.join(chunk_filters)}
|
||||
AND pc.embedding IS NOT NULL
|
||||
AND pc.content_tsv @@ plainto_tsquery('simple', $1)
|
||||
-- #55: exclude tiny fragment chunks (see semantic query above).
|
||||
AND length(trim(pc.content)) >= 50
|
||||
AND (pc.content_tsv @@ plainto_tsquery('simple', $1)
|
||||
OR cl.meta_tsv @@ plainto_tsquery('simple', $1))
|
||||
ORDER BY score DESC
|
||||
LIMIT $2
|
||||
"""
|
||||
|
||||
@@ -533,6 +533,7 @@ async def list_precedents(
|
||||
precedent_level: str = "",
|
||||
source_type: str = "",
|
||||
search: str = "",
|
||||
source_kind: str = "external_upload",
|
||||
limit: int = 100,
|
||||
offset: int = 0,
|
||||
) -> list[dict]:
|
||||
@@ -542,6 +543,7 @@ async def list_precedents(
|
||||
precedent_level=precedent_level,
|
||||
source_type=source_type,
|
||||
search=search,
|
||||
source_kind=source_kind,
|
||||
limit=limit,
|
||||
offset=offset,
|
||||
)
|
||||
|
||||
@@ -103,6 +103,7 @@ async def precedent_library_list(
|
||||
precedent_level: str = "",
|
||||
source_type: str = "",
|
||||
search: str = "",
|
||||
source_kind: str = "external_upload",
|
||||
limit: int = 100,
|
||||
) -> str:
|
||||
"""רשימה של פסיקה בקורפוס הסמכותי, עם פילטרים."""
|
||||
@@ -112,6 +113,7 @@ async def precedent_library_list(
|
||||
precedent_level=precedent_level,
|
||||
source_type=source_type,
|
||||
search=search,
|
||||
source_kind=source_kind,
|
||||
limit=limit,
|
||||
)
|
||||
return _ok(rows)
|
||||
|
||||
Reference in New Issue
Block a user