Merge pull request 'fix(#79): chunker never emits sub-50-char fragment chunks (#55 follow-up)' (#45) from fix/79-chunker-no-tiny-fragments into main

2026-06-03 08:10:39 +00:00
parent 0a88bed58b 6fcfdc76db
commit bb42aeeff4
1 changed file with 26 additions and 1 deletion
--- a/mcp-server/src/legal_mcp/services/chunker.py
+++ b/mcp-server/src/legal_mcp/services/chunker.py
@@ -104,6 +104,14 @@ def _assign_pages(chunks: list[Chunk], text: str, page_offsets: list[int]) -> No
 # used to carve tiny boundary chunks ("דיון). במסגרת ה") that polluted search.
 MIN_SECTION_CHARS = 60
 # A split chunk shorter than this (stripped chars) must not stand alone — it
 # rides with adjacent content instead. This is the chunk-level analogue of
 # MIN_SECTION_CHARS and matches the query-time filter that hides <50-char
 # chunks. Without it, a section that opens with a short header line ("דיון",
 # "טענות המשיבים") followed by a paragraph larger than chunk_size flushed the
 # header as its own tiny chunk (#79, follow-up to #55).
 MIN_CHUNK_CHARS = 50
 def _split_into_sections(text: str) -> list[tuple[str, str]]:
    """Split text into (section_type, text) pairs based on Hebrew headers.
@@ -168,11 +176,20 @@ def _split_section(text: str, chunk_size: int, overlap: int) -> list[str]:
    chunks: list[str] = []
    current: list[str] = []
    current_tokens = 0
    current_chars = 0
    for para in paragraphs:
        para_tokens = _estimate_tokens(para)
-        if current_tokens + para_tokens > chunk_size and current:
+        # Don't flush a buffer that is still below MIN_CHUNK_CHARS — let it
        # absorb this paragraph even if that overflows chunk_size. A short
        # header line ("דיון") must ride with the following paragraph rather
        # than be emitted as a tiny fragment chunk (#79).
        if (
            current_tokens + para_tokens > chunk_size
            and current
            and current_chars >= MIN_CHUNK_CHARS
        ):
            chunks.append("\n".join(current))
            # Keep overlap
            overlap_paras: list[str] = []
@@ -185,13 +202,21 @@ def _split_section(text: str, chunk_size: int, overlap: int) -> list[str]:
                overlap_tokens += pt
            current = overlap_paras
            current_tokens = overlap_tokens
            current_chars = sum(len(p) for p in current)
        current.append(para)
        current_tokens += para_tokens
        current_chars += len(para)
    if current:
        chunks.append("\n".join(current))
    # Fold a trailing tiny chunk back into its predecessor — a short trailing
    # line (e.g. a stray quote fragment) shouldn't stand alone either (#79).
    if len(chunks) >= 2 and len(chunks[-1].strip()) < MIN_CHUNK_CHARS:
        tail = chunks.pop()
        chunks[-1] = f"{chunks[-1]}\n{tail}"
    return chunks