| 22 | ref[""] = 1 |
| 23 | |
| 24 | def split(self, text: str) -> List[str]: |
| 25 | states = OrderedDict() |
| 26 | |
| 27 | # This will contain every indices where we need |
| 28 | # to cut. |
| 29 | # We force to cut at offset 0 and len(text) (added later) |
| 30 | offsets = [0] |
| 31 | |
| 32 | # This is used by the lookahead which needs to skip over |
| 33 | # some text where the full match exceeded the place in the initial |
| 34 | # for loop |
| 35 | skip = 0 |
| 36 | # Main loop, Giving this algorithm O(n) complexity |
| 37 | for current, current_char in enumerate(text): |
| 38 | if skip and current < skip: |
| 39 | # Prevents the lookahead for matching twice |
| 40 | # like extra_id_100 and id_100 |
| 41 | continue |
| 42 | |
| 43 | # This will track every state |
| 44 | # that stop matching, we need to stop tracking them. |
| 45 | # If we look at "lowball", we're going to match "l" (add it to states), "o", "w", then |
| 46 | # fail on "b", we need to remove 0 from the valid states. |
| 47 | to_remove = set() |
| 48 | # Whenever we found a match, we need to drop everything |
| 49 | # this is a greedy algorithm, it will match on the first found token |
| 50 | reset = False |
| 51 | |
| 52 | # In this case, we already have partial matches (But unfinished) |
| 53 | for start, trie_pointer in states.items(): |
| 54 | if "" in trie_pointer: |
| 55 | # This is a final match, we need to reset and |
| 56 | # store the results in `offsets`. |
| 57 | |
| 58 | # Lookahead to match longest first |
| 59 | # Important in case of extra_id_1 vs extra_id_100 |
| 60 | # Here we are also actively looking for other earlier partial |
| 61 | # matches |
| 62 | # "[CLS]", "L", we need to match CLS even if L is special |
| 63 | for lookstart, looktrie_pointer in states.items(): |
| 64 | if lookstart > start: |
| 65 | # This partial match is later, we can stop looking |
| 66 | break |
| 67 | elif lookstart < start: |
| 68 | # This partial match is earlier, the trie pointer |
| 69 | # was already updated, so index is + 1 |
| 70 | lookahead_index = current + 1 |
| 71 | end = current + 1 |
| 72 | else: |
| 73 | # Here lookstart == start and |
| 74 | # looktrie_pointer == trie_pointer |
| 75 | # It wasn't updated yet so indices are current ones |
| 76 | lookahead_index = current |
| 77 | end = current |
| 78 | next_char = text[lookahead_index] if lookahead_index < len(text) else None |
| 79 | if "" in looktrie_pointer: |
| 80 | start = lookstart |
| 81 | end = lookahead_index |