fix(retrieval): make decisions findable by name + unhide committee uploads

Root cause of "agent can't find the Agasi decision in the corpus" (CMPA-55): the decision was fully ingested, but the retrieval layer failed on the realistic agent query — searching by case name.

- RC-A (#52): the lexical tsvector covered only chunk content + halacha text, so a bare-name query ("אגסי") matched decisions that *cite* the case, not the case itself. Add meta_tsv on case_law(case_name, case_number) (SCHEMA V20) and OR it into the lexical halacha/chunk SQL with a match boost, so a name/number hit surfaces the case's own rows. Agasi: rank 4 → rank 1.
- RC-B (#53): precedent_library_list hard-defaulted source_kind=external_upload and never exposed the param, hiding uploaded ערר/בל"מ (internal_committee) decisions. Thread source_kind through service → tool → MCP tool (supports 'internal_committee' / 'all_committees'); the default stays 'external_upload', so existing callers are unaffected.
- #54: agent instructions (researcher/analyst/writer) — search-by-name protocol: add content/case-number, search both corpora, use all_committees before declaring "not in corpus".
- #55: the chunker produced tiny fragment chunks ("דיון", "החלטה") from header keywords matched mid-sentence. Anchor SECTION_PATTERNS to line start and merge sections shorter than MIN_SECTION_CHARS (60 chars) into the preceding section; additionally exclude <50-char chunks at query time (484 existing fragments hidden; full re-chunk tracked as #57).

Tests: scripts/test_retrieval_by_name.py (name ranks case above citer + substantive regressions); chunker unit checks (0 tiny chunks).

New findings filed as tasks #56 (halacha source_kind leak) and #57 (re-chunk migration).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-05-30 11:26:19 +00:00
parent 165efc62b0
commit 58ab003206
11 changed files with 355 additions and 57 deletions
--- a/mcp-server/src/legal_mcp/server.py
+++ b/mcp-server/src/legal_mcp/server.py
@@ -201,11 +201,20 @@ async def precedent_library_list(
    precedent_level: str = "",
    source_type: str = "",
    search: str = "",
+    source_kind: str = "external_upload",
    limit: int = 100,
 ) -> str:
-    """רשימת הפסיקה בקורפוס הסמכותי, עם פילטרים."""
+    """רשימת הפסיקה בקורפוס, עם פילטרים.
+
+    source_kind: 'external_upload' (ברירת מחדל — פס"ד בתי משפט) /
+    'internal_committee' (החלטות ועדות ערר ערר/בל"מ שהועלו) /
+    'all_committees' (שתיהן — internal + appeals_committee).
+    החלטות ערר/בל"מ שמעלים נשמרות כ-internal_committee — כדי לראותן
+    ברשימה השתמש ב-source_kind='internal_committee' או 'all_committees'.
+    """
    return await plib.precedent_library_list(
-        practice_area, court, precedent_level, source_type, search, limit,
+        practice_area, court, precedent_level, source_type, search,
+        source_kind, limit,
    )


--- a/mcp-server/src/legal_mcp/services/chunker.py
+++ b/mcp-server/src/legal_mcp/services/chunker.py
@@ -97,13 +97,32 @@ def _assign_pages(chunks: list[Chunk], text: str, page_offsets: list[int]) -> No
        pos = idx + max(1, len(c.content) // 2)


+# A section shorter than this (stripped chars) is not a real section — it's
+# an artifact of a header keyword matched mid-text. Such a fragment is merged
+# into the preceding section rather than emitted as its own chunk. See #55:
+# unanchored keywords like "דיון"/"החלטה"/"מסקנה" appearing inside a sentence
+# used to carve tiny boundary chunks ("דיון). במסגרת ה") that polluted search.
+MIN_SECTION_CHARS = 60
+
+
 def _split_into_sections(text: str) -> list[tuple[str, str]]:
-    """Split text into (section_type, text) pairs based on Hebrew headers."""
+    """Split text into (section_type, text) pairs based on Hebrew headers.
+
+    Header keywords are matched only at the **start of a line** (after
+    optional whitespace / list numbering like ``5.`` or ``ג.``). A real
+    section header in these decisions sits on its own line; anchoring to
+    the line start prevents common words ("דיון", "החלטה", "מסקנה") that
+    appear mid-sentence from being treated as section boundaries — which
+    previously produced tiny fragment chunks (#55).
+    """
    # Find all section headers and their positions
    markers: list[tuple[int, str]] = []

    for pattern, section_type in SECTION_PATTERNS:
-        for match in re.finditer(pattern, text):
+        # ^ + MULTILINE: line start only. Optional leading spaces/tabs and an
+        # optional ordinal prefix ("5.", "5)", "ג.") before the keyword.
+        anchored = rf"^[ \t]*(?:\d+[.)]\s*|[א-ת][.)]\s*)?(?:{pattern})"
+        for match in re.finditer(anchored, text, re.MULTILINE):
            markers.append((match.start(), section_type))

    if not markers:
@@ -120,11 +139,18 @@ def _split_into_sections(text: str) -> list[tuple[str, str]]:
        if intro_text:
            sections.append(("intro", intro_text))

-    # Each section
+    # Each section. A section whose text is too short to stand alone is
+    # merged into the previous section (keeping the previous type) so a
+    # near-adjacent pair of headers can't produce a fragment chunk.
    for i, (pos, section_type) in enumerate(markers):
        end = markers[i + 1][0] if i + 1 < len(markers) else len(text)
        section_text = text[pos:end].strip()
-        if section_text:
+        if not section_text:
+            continue
+        if len(section_text) < MIN_SECTION_CHARS and sections:
+            prev_type, prev_text = sections[-1]
+            sections[-1] = (prev_type, f"{prev_text}\n{section_text}")
+        else:
            sections.append((section_type, section_text))

    return sections
--- a/mcp-server/src/legal_mcp/services/db.py
+++ b/mcp-server/src/legal_mcp/services/db.py
@@ -1070,6 +1070,29 @@ ALTER TABLE case_law ADD COLUMN IF NOT EXISTS citation_formatted TEXT DEFAULT ''
 """


+# ── V20: case-name / case-number lexical match ────────────────────
+# RC-A fix: the V12 tsvectors cover only chunk *content* + halacha
+# text, so a bare case-name query ("אגסי") matched decisions that
+# *cite* the case rather than the case itself. case_name and
+# case_number live on the parent case_law row, so we add a dedicated
+# meta tsvector there and OR it into the lexical search — a name/number
+# hit then surfaces all of that case's chunks + halachot. 'simple'
+# config (no stemmer) preserves Hebrew names + alphanumeric case
+# numbers like "81002-01-21" exactly as V12 does for content.
+SCHEMA_V20_SQL = """
+ALTER TABLE case_law
+    ADD COLUMN IF NOT EXISTS meta_tsv tsvector
+    GENERATED ALWAYS AS (
+        to_tsvector('simple',
+            coalesce(case_name,'') || ' ' || coalesce(case_number,'')
+        )
+    ) STORED;
+
+CREATE INDEX IF NOT EXISTS idx_case_law_meta_tsv
+    ON case_law USING GIN(meta_tsv);
+"""
+
+
 async def _run_schema_migrations(pool: asyncpg.Pool) -> None:
    async with pool.acquire() as conn:
        await conn.execute(SCHEMA_SQL)
@@ -1092,7 +1115,8 @@ async def _run_schema_migrations(pool: asyncpg.Pool) -> None:
        await conn.execute(SCHEMA_V17_SQL)
        await conn.execute(SCHEMA_V18_SQL)
        await conn.execute(SCHEMA_V19_SQL)
-    logger.info("Database schema initialized (v1-v19)")
+        await conn.execute(SCHEMA_V20_SQL)
+    logger.info("Database schema initialized (v1-v20)")


 async def init_schema() -> None:
@@ -3217,6 +3241,9 @@ async def search_precedent_library_semantic(
            ON parent.id = pc.parent_chunk_id
        WHERE {' AND '.join(chunk_filters)}
          AND pc.embedding IS NOT NULL
+          -- #55: exclude tiny fragment chunks (artifacts of pre-fix
+          -- mid-sentence header splits) that carry no retrievable signal.
+          AND length(trim(pc.content)) >= 50
        ORDER BY pc.embedding <=> $1
        LIMIT $2
    """
@@ -3411,11 +3438,17 @@ async def search_precedent_library_lexical(
               h.practice_areas, h.subject_tags, h.confidence, h.rule_type,
               cl.case_number, cl.case_name, cl.court, cl.date AS decision_date,
               cl.precedent_level, cl.chair_name, cl.district,
-               ts_rank_cd(h.rule_tsv, plainto_tsquery('simple', $1)) AS score
+               GREATEST(
+                   ts_rank_cd(h.rule_tsv, plainto_tsquery('simple', $1)),
+                   ts_rank_cd(cl.meta_tsv, plainto_tsquery('simple', $1))
+               )
+               + CASE WHEN cl.meta_tsv @@ plainto_tsquery('simple', $1)
+                      THEN 1.0 ELSE 0.0 END AS score
        FROM halachot h
        JOIN case_law cl ON cl.id = h.case_law_id
        WHERE {' AND '.join(halacha_filters)}
-          AND h.rule_tsv @@ plainto_tsquery('simple', $1)
+          AND (h.rule_tsv @@ plainto_tsquery('simple', $1)
+               OR cl.meta_tsv @@ plainto_tsquery('simple', $1))
        ORDER BY score DESC
        LIMIT $2
    """
@@ -3439,14 +3472,22 @@ async def search_precedent_library_lexical(
               parent.page_number AS parent_page_number,
               cl.case_number, cl.case_name, cl.court, cl.date AS decision_date,
               cl.precedent_level, cl.practice_area, cl.chair_name, cl.district,
-               ts_rank_cd(pc.content_tsv, plainto_tsquery('simple', $1)) AS score
+               GREATEST(
+                   ts_rank_cd(pc.content_tsv, plainto_tsquery('simple', $1)),
+                   ts_rank_cd(cl.meta_tsv, plainto_tsquery('simple', $1))
+               )
+               + CASE WHEN cl.meta_tsv @@ plainto_tsquery('simple', $1)
+                      THEN 1.0 ELSE 0.0 END AS score
        FROM precedent_chunks pc
        JOIN case_law cl ON cl.id = pc.case_law_id
        LEFT JOIN precedent_chunks parent
            ON parent.id = pc.parent_chunk_id
        WHERE {' AND '.join(chunk_filters)}
          AND pc.embedding IS NOT NULL
-          AND pc.content_tsv @@ plainto_tsquery('simple', $1)
+          -- #55: exclude tiny fragment chunks (see semantic query above).
+          AND length(trim(pc.content)) >= 50
+          AND (pc.content_tsv @@ plainto_tsquery('simple', $1)
+               OR cl.meta_tsv @@ plainto_tsquery('simple', $1))
        ORDER BY score DESC
        LIMIT $2
    """
--- a/mcp-server/src/legal_mcp/services/precedent_library.py
+++ b/mcp-server/src/legal_mcp/services/precedent_library.py
@@ -533,6 +533,7 @@ async def list_precedents(
    precedent_level: str = "",
    source_type: str = "",
    search: str = "",
+    source_kind: str = "external_upload",
    limit: int = 100,
    offset: int = 0,
 ) -> list[dict]:
@@ -542,6 +543,7 @@ async def list_precedents(
        precedent_level=precedent_level,
        source_type=source_type,
        search=search,
+        source_kind=source_kind,
        limit=limit,
        offset=offset,
    )
--- a/mcp-server/src/legal_mcp/tools/precedent_library.py
+++ b/mcp-server/src/legal_mcp/tools/precedent_library.py
@@ -103,6 +103,7 @@ async def precedent_library_list(
    precedent_level: str = "",
    source_type: str = "",
    search: str = "",
+    source_kind: str = "external_upload",
    limit: int = 100,
 ) -> str:
    """רשימה של פסיקה בקורפוס הסמכותי, עם פילטרים."""
@@ -112,6 +113,7 @@ async def precedent_library_list(
        precedent_level=precedent_level,
        source_type=source_type,
        search=search,
+        source_kind=source_kind,
        limit=limit,
    )
    return _ok(rows)