Split text into numbered chunks at section boundaries. Each chunk gets line numbers corresponding to the original text. If the text is small enough, returns a single chunk. A preamble (NAME/SYNOPSIS/DESCRIPTION intro) is prepended to each chunk beyond the first so the model has context. When a single section exceeds the chunk size, it is sub-split at paragraph (blank-line) boundaries.
(text: str)
| 194 | |
| 195 | |
| 196 | def chunk_text(text: str) -> list[str]: |
| 197 | """Split text into numbered chunks at section boundaries. |
| 198 | |
| 199 | Each chunk gets line numbers corresponding to the original text. |
| 200 | If the text is small enough, returns a single chunk. |
| 201 | A preamble (NAME/SYNOPSIS/DESCRIPTION intro) is prepended to each |
| 202 | chunk beyond the first so the model has context. |
| 203 | |
| 204 | When a single section exceeds the chunk size, it is sub-split at |
| 205 | paragraph (blank-line) boundaries. |
| 206 | """ |
| 207 | numbered_full, _ = number_lines(text) |
| 208 | if len(numbered_full) <= CHUNK_SIZE_CHARS: |
| 209 | return [numbered_full] |
| 210 | |
| 211 | sections = _split_sections(text) |
| 212 | preamble = _build_preamble(text) |
| 213 | preamble_text = "" |
| 214 | if preamble: |
| 215 | preamble_text = ( |
| 216 | "[Context — this is a continuation of the same man page]\n\n" |
| 217 | + preamble |
| 218 | + "\n\n---\n\n" |
| 219 | ) |
| 220 | budget = CHUNK_SIZE_CHARS - len(preamble_text) |
| 221 | |
| 222 | total_lines = text.count("\n") + 1 |
| 223 | width = len(str(total_lines)) |
| 224 | |
| 225 | def _number_block(start_line: int, block_text: str) -> str: |
| 226 | lines = block_text.split("\n") |
| 227 | numbered = [] |
| 228 | for j, line in enumerate(lines): |
| 229 | lineno = start_line + j |
| 230 | numbered.append(f"{lineno:>{width}}| {line}") |
| 231 | return "\n".join(numbered) |
| 232 | |
| 233 | def _split_by_lines(start_line: int, block_text: str) -> list[tuple[int, str]]: |
| 234 | """Last-resort split: cut at line boundaries to fit budget.""" |
| 235 | lines = block_text.split("\n") |
| 236 | result: list[tuple[int, str]] = [] |
| 237 | cur_lines: list[str] = [] |
| 238 | cur_start = start_line |
| 239 | for line in lines: |
| 240 | candidate = "\n".join(cur_lines + [line]) |
| 241 | if len(_number_block(cur_start, candidate)) > budget and cur_lines: |
| 242 | result.append((cur_start, "\n".join(cur_lines))) |
| 243 | cur_start += len(cur_lines) |
| 244 | cur_lines = [] |
| 245 | cur_lines.append(line) |
| 246 | if cur_lines: |
| 247 | result.append((cur_start, "\n".join(cur_lines))) |
| 248 | return result |
| 249 | |
| 250 | blocks: list[tuple[int, str]] = [] |
| 251 | for start_line, section_text in sections: |
| 252 | numbered = _number_block(start_line, section_text) |
| 253 | if len(numbered) <= budget: |