Split a text into chunks of ~CHUNK_SIZE tokens, based on punctuation and newline boundaries. Adapted from https://github.com/openai/chatgpt-retrieval-plugin/blob/main/services/chunks.py. Args: text: The text to split into chunks. Returns: A list of text chunks, each of which is a string of tokens roughly self.config.chunk_size tokens long.
(
self,
text: str,
)
| 313 | return final_docs |
| 314 | |
| 315 | def chunk_tokens( |
| 316 | self, |
| 317 | text: str, |
| 318 | ) -> List[str]: |
| 319 | """ |
| 320 | Split a text into chunks of ~CHUNK_SIZE tokens, |
| 321 | based on punctuation and newline boundaries. |
| 322 | Adapted from |
| 323 | https://github.com/openai/chatgpt-retrieval-plugin/blob/main/services/chunks.py |
| 324 | |
| 325 | Args: |
| 326 | text: The text to split into chunks. |
| 327 | |
| 328 | Returns: |
| 329 | A list of text chunks, each of which is a string of tokens |
| 330 | roughly self.config.chunk_size tokens long. |
| 331 | """ |
| 332 | # Return an empty list if the text is empty or whitespace |
| 333 | if not text or text.isspace(): |
| 334 | return [] |
| 335 | |
| 336 | # Tokenize the text |
| 337 | tokens = self.tokenizer.encode(text, disallowed_special=()) |
| 338 | |
| 339 | # Initialize an empty list of chunks |
| 340 | chunks = [] |
| 341 | |
| 342 | # Initialize a counter for the number of chunks |
| 343 | num_chunks = 0 |
| 344 | |
| 345 | # Loop until all tokens are consumed |
| 346 | while tokens and num_chunks < self.config.max_chunks: |
| 347 | # Take the first chunk_size tokens as a chunk |
| 348 | chunk = tokens[: self.config.chunk_size] |
| 349 | |
| 350 | # Decode the chunk into text |
| 351 | chunk_text = self.tokenizer.decode(chunk) |
| 352 | |
| 353 | # Skip the chunk if it is empty or whitespace |
| 354 | if not chunk_text or chunk_text.isspace(): |
| 355 | # Remove the tokens corresponding to the chunk text |
| 356 | # from remaining tokens |
| 357 | tokens = tokens[len(chunk) :] |
| 358 | # Continue to the next iteration of the loop |
| 359 | continue |
| 360 | |
| 361 | # Find the last period or punctuation mark in the chunk |
| 362 | punctuation_matches = [ |
| 363 | (m.start(), m.group()) |
| 364 | for m in re.finditer(r"(?:[.!?][\s\n]|\n)", chunk_text) |
| 365 | ] |
| 366 | |
| 367 | last_punctuation = max([pos for pos, _ in punctuation_matches] + [-1]) |
| 368 | |
| 369 | # If there is a punctuation mark, and the last punctuation index is |
| 370 | # after MIN_CHUNK_SIZE_CHARS |
| 371 | if ( |
| 372 | last_punctuation != -1 |
no outgoing calls