diff --git a/mcp-server/src/legal_mcp/services/chunker.py b/mcp-server/src/legal_mcp/services/chunker.py index d67f5cf..af3320a 100644 --- a/mcp-server/src/legal_mcp/services/chunker.py +++ b/mcp-server/src/legal_mcp/services/chunker.py @@ -104,6 +104,14 @@ def _assign_pages(chunks: list[Chunk], text: str, page_offsets: list[int]) -> No # used to carve tiny boundary chunks ("דיון). במסגרת ה") that polluted search. MIN_SECTION_CHARS = 60 +# A split chunk shorter than this (stripped chars) must not stand alone — it +# rides with adjacent content instead. This is the chunk-level analogue of +# MIN_SECTION_CHARS and matches the query-time filter that hides <50-char +# chunks. Without it, a section that opens with a short header line ("דיון", +# "טענות המשיבים") followed by a paragraph larger than chunk_size flushed the +# header as its own tiny chunk (#79, follow-up to #55). +MIN_CHUNK_CHARS = 50 + def _split_into_sections(text: str) -> list[tuple[str, str]]: """Split text into (section_type, text) pairs based on Hebrew headers. @@ -168,11 +176,20 @@ def _split_section(text: str, chunk_size: int, overlap: int) -> list[str]: chunks: list[str] = [] current: list[str] = [] current_tokens = 0 + current_chars = 0 for para in paragraphs: para_tokens = _estimate_tokens(para) - if current_tokens + para_tokens > chunk_size and current: + # Don't flush a buffer that is still below MIN_CHUNK_CHARS — let it + # absorb this paragraph even if that overflows chunk_size. A short + # header line ("דיון") must ride with the following paragraph rather + # than be emitted as a tiny fragment chunk (#79). + if ( + current_tokens + para_tokens > chunk_size + and current + and current_chars >= MIN_CHUNK_CHARS + ): chunks.append("\n".join(current)) # Keep overlap overlap_paras: list[str] = [] @@ -185,13 +202,21 @@ def _split_section(text: str, chunk_size: int, overlap: int) -> list[str]: overlap_tokens += pt current = overlap_paras current_tokens = overlap_tokens + current_chars = sum(len(p) for p in current) current.append(para) current_tokens += para_tokens + current_chars += len(para) if current: chunks.append("\n".join(current)) + # Fold a trailing tiny chunk back into its predecessor — a short trailing + # line (e.g. a stray quote fragment) shouldn't stand alone either (#79). + if len(chunks) >= 2 and len(chunks[-1].strip()) < MIN_CHUNK_CHARS: + tail = chunks.pop() + chunks[-1] = f"{chunks[-1]}\n{tail}" + return chunks