If body exceeds cap, return (head, overflow). Otherwise (body, None). Two-stage split: 1. Walk `## ` section boundaries (skipping headings inside fenced code blocks); accumulate sections into `head` while we fit. 2. If `head` itself still exceeds cap (e.g. the H1 + intro is already huge), hard-cut at a UTF-8 codepoint boundary (preferring a newline) to guarantee output ≤ cap without dropping multibyte chars.
(body: str, cap_bytes: int)
| 273 | |
| 274 | |
def _split_body_if_oversized(body: str, cap_bytes: int) -> tuple[str, str | None]:
    """Split `body` into `(head, overflow)` when its UTF-8 size exceeds `cap_bytes`.

    When the encoded body already fits, it is returned untouched as `(body, None)`.

    Two-stage split:
      1. Walk `## ` section boundaries (skipping headings inside fenced code blocks);
         accumulate sections into `head` while they fit. The first section that does
         not fit, and every section after it, goes to the overflow, so both halves
         stay in document order.
      2. If `head` itself still exceeds the cap (e.g. the H1 + intro is already huge),
         hard-cut at a UTF-8 codepoint boundary (preferring a newline) to guarantee
         output ≤ cap without dropping multibyte chars.

    The pointer note appended to `head` is accounted for in the effective cap.

    Args:
        body: Full document text to split.
        cap_bytes: Maximum allowed size of the returned head, in UTF-8 bytes.

    Returns:
        `(body, None)` if no split was needed; otherwise `(head + _POINTER, overflow)`
        where `overflow` holds the text that was moved out of the head.
    """
    encoded = body.encode("utf-8")
    if len(encoded) <= cap_bytes:
        return body, None

    # Reserve room for the pointer note we'll append. Clamp at zero so a cap smaller
    # than the note itself never becomes a negative cut position.
    effective_cap = max(cap_bytes - _POINTER_BYTES, 0)

    # _split_body_fence_aware returns sections WITH the leading `## ` already in place
    # for every section after the first; the first section is the pre-heading head.
    # Joining trims trailing newlines from the head and inserts exactly one blank line
    # before the next section (no re-prepend of `## `).
    sections = _split_body_fence_aware(body) if body else [body]
    running = sections[0] if sections else body
    overflow_parts: list[str] = []
    for index, section in enumerate(sections[1:], start=1):
        candidate = running.rstrip("\n") + "\n\n" + section
        if len(candidate.encode("utf-8")) > effective_cap:
            # Stop at the first section that does not fit: letting a later, smaller
            # section slip into the head would reorder the document across the split.
            overflow_parts = list(sections[index:])
            break
        running = candidate

    # If running is still over cap (the head/H1 intro itself is too big), UTF-8-safe hard cut.
    running_encoded = running.encode("utf-8")
    if len(running_encoded) > effective_cap:
        head_bytes, tail_bytes = _utf8_safe_cut(running_encoded, effective_cap)
        # The cut-off remainder of the head precedes every overflowed section.
        overflow_parts.insert(0, tail_bytes.decode("utf-8").lstrip("\n"))
        running = head_bytes.decode("utf-8")

    if not overflow_parts:
        # Nothing overflowed at a section boundary even though the raw body is over
        # the cap (e.g. the re-joined text came out shorter than the original).
        # Fall back to a UTF-8-safe hard cut of the original bytes.
        head_bytes, tail_bytes = _utf8_safe_cut(encoded, effective_cap)
        running = head_bytes.decode("utf-8")
        overflow_parts = [tail_bytes.decode("utf-8")]

    return running + _POINTER, "\n".join(overflow_parts)
| 323 | |
| 324 | |
| 325 | # ── Adapter ────────────────────────────────────────────────────────────────── |