Will look for the words added to the trie within `text`. Output is the original string splitted along the boundaries of the words found. This trie will match the longest possible word first ! Example: ```python >>> trie = Trie() >>> trie.sp
(self, text: str)
| 318 | ref[""] = 1 |
| 319 | |
| 320 | def split(self, text: str) -> List[str]: |
| 321 | """ |
| 322 | Will look for the words added to the trie within `text`. Output is the original string splitted along the |
| 323 | boundaries of the words found. |
| 324 | |
| 325 | This trie will match the longest possible word first ! |
| 326 | |
| 327 | Example: |
| 328 | |
| 329 | ```python |
| 330 | >>> trie = Trie() |
| 331 | >>> trie.split("[CLS] This is a extra_id_100") |
| 332 | ["[CLS] This is a extra_id_100"] |
| 333 | |
| 334 | >>> trie.add("[CLS]") |
| 335 | >>> trie.add("extra_id_1") |
| 336 | >>> trie.add("extra_id_100") |
| 337 | >>> trie.split("[CLS] This is a extra_id_100") |
| 338 | ["[CLS]", " This is a ", "extra_id_100"] |
| 339 | ``` |
| 340 | """ |
| 341 | # indexes are counted left of the chars index. |
| 342 | # "hello", index 0, is left of h, index 1 is between h and e. |
| 343 | # index 5 is right of the "o". |
| 344 | |
| 345 | # States are going to capture every possible start (indexes as above) |
| 346 | # as keys, and have as values, a pointer to the position in the trie |
| 347 | # where we're at. This is a partial match for now. |
| 348 | # This enables to keep track of multiple matches while we're iterating |
| 349 | # the string |
| 350 | # If the trie contains, "blowing", and "lower" and we encounter the |
| 351 | # string "blower", we need to split into ["b", "lower"]. |
| 352 | # This is where we need to keep track of multiple possible starts. |
| 353 | states = OrderedDict() |
| 354 | |
| 355 | # This will contain every indices where we need |
| 356 | # to cut. |
| 357 | # We force to cut at offset 0 and len(text) (added later) |
| 358 | offsets = [0] |
| 359 | |
| 360 | # This is used by the lookahead which needs to skip over |
| 361 | # some text where the full match exceeded the place in the initial |
| 362 | # for loop |
| 363 | skip = 0 |
| 364 | # Main loop, Giving this algorithm O(n) complexity |
| 365 | for current, current_char in enumerate(text): |
| 366 | if skip and current < skip: |
| 367 | # Prevents the lookahead for matching twice |
| 368 | # like extra_id_100 and id_100 |
| 369 | continue |
| 370 | |
| 371 | # This will track every state |
| 372 | # that stop matching, we need to stop tracking them. |
| 373 | # If we look at "lowball", we're going to match "l" (add it to states), "o", "w", then |
| 374 | # fail on "b", we need to remove 0 from the valid states. |
| 375 | to_remove = set() |
| 376 | # Whenever we found a match, we need to drop everything |
| 377 | # this is a greedy algorithm, it will match on the first found token |