From 6fcfdc76db9e4c87038eac2fcfc510ce9aab15a4 Mon Sep 17 00:00:00 2001
From: Chaim <chaim@marcus-law.co.il>
Date: Wed, 3 Jun 2026 08:10:10 +0000
Subject: [PATCH] fix(#79): chunker never emits sub-50-char fragment chunks
 (#55 follow-up)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When a section opened with a short header line ('דיון', 'טענות המשיבים')
followed by a paragraph larger than chunk_size, the chunker flushed the header
alone as a tiny chunk. #55 added a query-time >=50 filter to hide these; this
change removes them at the source.

_split_section: (1) don't flush a buffer still below MIN_CHUNK_CHARS — let it
absorb the next paragraph even if that overflows chunk_size, so a short header
rides with its following content; (2) fold a trailing tiny chunk back into its
predecessor.

Verified: re-chunked the 4 corpus docs that still had a tiny chunk
(ע"א 5138/04, בר"מ 2340/02, בג"ץ 6525/15, 403-17) — corpus-wide chunks<50
went 4 -> 0; all 4 stay embedded/searchable and rank top in a relevant search
(נווה שלום #1 for the s.19(ג)(1) exemption query). No regression.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 mcp-server/src/legal_mcp/services/chunker.py | 27 +++++++++++++++++++-
 1 file changed, 26 insertions(+), 1 deletion(-)

diff --git a/mcp-server/src/legal_mcp/services/chunker.py b/mcp-server/src/legal_mcp/services/chunker.py
index d67f5cf..af3320a 100644
--- a/mcp-server/src/legal_mcp/services/chunker.py
+++ b/mcp-server/src/legal_mcp/services/chunker.py
@@ -104,6 +104,14 @@ def _assign_pages(chunks: list[Chunk], text: str, page_offsets: list[int]) -> No
 # used to carve tiny boundary chunks ("דיון). במסגרת ה") that polluted search.
 MIN_SECTION_CHARS = 60
 
+# A split chunk shorter than this (stripped chars) must not stand alone — it
+# rides with adjacent content instead. This is the chunk-level analogue of
+# MIN_SECTION_CHARS and matches the query-time filter that hides <50-char
+# chunks. Without it, a section that opened with a short header line ("דיון",
+# "טענות המשיבים") followed by a paragraph larger than chunk_size had the
+# header flushed as its own tiny chunk (#79, follow-up to #55).
+MIN_CHUNK_CHARS = 50
+
 
 def _split_into_sections(text: str) -> list[tuple[str, str]]:
     """Split text into (section_type, text) pairs based on Hebrew headers.
@@ -168,11 +176,20 @@ def _split_section(text: str, chunk_size: int, overlap: int) -> list[str]:
     chunks: list[str] = []
     current: list[str] = []
     current_tokens = 0
+    current_chars = 0
 
     for para in paragraphs:
         para_tokens = _estimate_tokens(para)
 
-        if current_tokens + para_tokens > chunk_size and current:
+        # Don't flush a buffer that is still below MIN_CHUNK_CHARS — let it
+        # absorb this paragraph even if that overflows chunk_size. A short
+        # header line ("דיון") must ride with the following paragraph rather
+        # than be emitted as a tiny fragment chunk (#79).
+        if (
+            current_tokens + para_tokens > chunk_size
+            and current
+            and current_chars >= MIN_CHUNK_CHARS
+        ):
             chunks.append("\n".join(current))
             # Keep overlap
             overlap_paras: list[str] = []
@@ -185,13 +202,21 @@ def _split_section(text: str, chunk_size: int, overlap: int) -> list[str]:
                 overlap_tokens += pt
             current = overlap_paras
             current_tokens = overlap_tokens
+            current_chars = sum(len(p) for p in current)
 
         current.append(para)
         current_tokens += para_tokens
+        current_chars += len(para)
 
     if current:
         chunks.append("\n".join(current))
 
+    # Fold a trailing tiny chunk back into its predecessor — a short trailing
+    # line (e.g. a stray quote fragment) shouldn't stand alone either (#79).
+    if len(chunks) >= 2 and len(chunks[-1].strip()) < MIN_CHUNK_CHARS:
+        tail = chunks.pop()
+        chunks[-1] = f"{chunks[-1]}\n{tail}"
+
     return chunks
 
 
-- 
2.49.1