Chunk source code.
(text: str, language: str, desired_length: int)
| 142 | |
| 143 | |
def chunk(text: str, language: str, desired_length: int) -> list[ChunkBoundary] | None:
    """Chunk source code.

    Parses ``text`` with the parser selected for ``language`` and returns the
    boundaries that ``_merge_node`` yields for the root of the parse tree,
    converted from byte offsets in the UTF-8 encoding of ``text`` to
    character offsets in ``text`` itself.

    Args:
        text: Source code to split.
        language: Key handed to ``_cached_get_parser`` to select a parser.
        desired_length: Forwarded unchanged to ``_merge_node``; presumably
            the target chunk size -- TODO confirm its unit against
            ``_merge_node``.

    Returns:
        An empty list when ``text`` is empty or whitespace-only, ``None``
        when ``_cached_get_parser`` has no parser for ``language``, and
        otherwise one ``ChunkBoundary`` (character offsets) per boundary
        yielded by ``_merge_node``, in the order they were yielded.
    """
    if not text.strip():
        return []

    as_bytes = text.encode("utf-8")
    parser = _cached_get_parser(language)
    if parser is None:
        return None
    root = parser.parse(as_bytes).root_node

    # Cursor: the furthest byte offset converted so far and the character
    # offset it corresponds to. Decoding only the bytes past the cursor avoids
    # re-decoding the whole prefix of the file for every single boundary.
    seen_byte = 0
    seen_char = 0

    def to_char_offset(byte_offset: int) -> int:
        """Return the character offset in ``text`` for a UTF-8 byte offset."""
        nonlocal seen_byte, seen_char
        if byte_offset < seen_byte:
            # Behind the cursor (or negative): decode the full prefix so the
            # result never depends on the order the offsets arrive in.
            return len(as_bytes[:byte_offset].decode("utf-8"))
        seen_char += len(as_bytes[seen_byte:byte_offset].decode("utf-8"))
        seen_byte = byte_offset
        return seen_char

    chunks = []
    for boundary in _merge_node(root, desired_length):
        # Convert start before end so the cursor only moves forward in the
        # common case where start <= end.
        start_char = to_char_offset(boundary.start)
        end_char = to_char_offset(boundary.end)
        chunks.append(ChunkBoundary(start=start_char, end=end_char))

    return chunks