Merge pull request 'fix(#79): chunker never emits sub-50-char fragment chunks (#55 follow-up)' (#45) from fix/79-chunker-no-tiny-fragments into main
All checks were successful
Build & Deploy / build-and-deploy (push) Successful in 1m38s
All checks were successful
Build & Deploy / build-and-deploy (push) Successful in 1m38s
This commit was merged in pull request #45.
This commit is contained in:
@@ -104,6 +104,14 @@ def _assign_pages(chunks: list[Chunk], text: str, page_offsets: list[int]) -> No
|
|||||||
# used to carve tiny boundary chunks ("דיון). במסגרת ה") that polluted search.
|
# used to carve tiny boundary chunks ("דיון). במסגרת ה") that polluted search.
|
||||||
MIN_SECTION_CHARS = 60
|
MIN_SECTION_CHARS = 60
|
||||||
|
|
||||||
|
# A split chunk shorter than this (stripped chars) must not stand alone — it
|
||||||
|
# rides with adjacent content instead. This is the chunk-level analogue of
|
||||||
|
# MIN_SECTION_CHARS and matches the query-time filter that hides <50-char
|
||||||
|
# chunks. Without it, a section that opens with a short header line ("דיון",
|
||||||
|
# "טענות המשיבים") followed by a paragraph larger than chunk_size flushed the
|
||||||
|
# header as its own tiny chunk (#79, follow-up to #55).
|
||||||
|
MIN_CHUNK_CHARS = 50
|
||||||
|
|
||||||
|
|
||||||
def _split_into_sections(text: str) -> list[tuple[str, str]]:
|
def _split_into_sections(text: str) -> list[tuple[str, str]]:
|
||||||
"""Split text into (section_type, text) pairs based on Hebrew headers.
|
"""Split text into (section_type, text) pairs based on Hebrew headers.
|
||||||
@@ -168,11 +176,20 @@ def _split_section(text: str, chunk_size: int, overlap: int) -> list[str]:
|
|||||||
chunks: list[str] = []
|
chunks: list[str] = []
|
||||||
current: list[str] = []
|
current: list[str] = []
|
||||||
current_tokens = 0
|
current_tokens = 0
|
||||||
|
current_chars = 0
|
||||||
|
|
||||||
for para in paragraphs:
|
for para in paragraphs:
|
||||||
para_tokens = _estimate_tokens(para)
|
para_tokens = _estimate_tokens(para)
|
||||||
|
|
||||||
if current_tokens + para_tokens > chunk_size and current:
|
# Don't flush a buffer that is still below MIN_CHUNK_CHARS — let it
|
||||||
|
# absorb this paragraph even if that overflows chunk_size. A short
|
||||||
|
# header line ("דיון") must ride with the following paragraph rather
|
||||||
|
# than be emitted as a tiny fragment chunk (#79).
|
||||||
|
if (
|
||||||
|
current_tokens + para_tokens > chunk_size
|
||||||
|
and current
|
||||||
|
and current_chars >= MIN_CHUNK_CHARS
|
||||||
|
):
|
||||||
chunks.append("\n".join(current))
|
chunks.append("\n".join(current))
|
||||||
# Keep overlap
|
# Keep overlap
|
||||||
overlap_paras: list[str] = []
|
overlap_paras: list[str] = []
|
||||||
@@ -185,13 +202,21 @@ def _split_section(text: str, chunk_size: int, overlap: int) -> list[str]:
|
|||||||
overlap_tokens += pt
|
overlap_tokens += pt
|
||||||
current = overlap_paras
|
current = overlap_paras
|
||||||
current_tokens = overlap_tokens
|
current_tokens = overlap_tokens
|
||||||
|
current_chars = sum(len(p) for p in current)
|
||||||
|
|
||||||
current.append(para)
|
current.append(para)
|
||||||
current_tokens += para_tokens
|
current_tokens += para_tokens
|
||||||
|
current_chars += len(para)
|
||||||
|
|
||||||
if current:
|
if current:
|
||||||
chunks.append("\n".join(current))
|
chunks.append("\n".join(current))
|
||||||
|
|
||||||
|
# Fold a trailing tiny chunk back into its predecessor — a short trailing
|
||||||
|
# line (e.g. a stray quote fragment) shouldn't stand alone either (#79).
|
||||||
|
if len(chunks) >= 2 and len(chunks[-1].strip()) < MIN_CHUNK_CHARS:
|
||||||
|
tail = chunks.pop()
|
||||||
|
chunks[-1] = f"{chunks[-1]}\n{tail}"
|
||||||
|
|
||||||
return chunks
|
return chunks
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user