| 143 | return self.cut_text(text, offsets) |
| 144 | |
def cut_text(self, text, offsets):
    """Split ``text`` into consecutive pieces at the given cut positions.

    Args:
        text: The string to split.
        offsets: Cut positions (indices into ``text``), expected in
            non-decreasing order. The sequence is only read, never modified.

    Returns:
        A list of non-empty substrings of ``text``. The first piece starts at
        index 0 and the last piece runs to the end of ``text``.
    """
    # We have all the offsets now, we just need to do the actual splitting.
    # ``len(text)`` is added as a final cut so the trailing part of the string
    # is emitted too. It goes into a fresh list so the caller's ``offsets`` is
    # left untouched (repeated calls with the same list stay side-effect free).
    cut_points = [*offsets, len(text)]

    tokens = []
    start = 0
    for end in cut_points:
        if start > end:
            # A cut that lies before the previous one cannot produce a valid
            # slice; report it and skip the cut instead of failing.
            logger.error(
                "There was a bug in Trie algorithm in tokenization. Attempting to recover. Please report it"
                " anyway."
            )
            continue
        if start == end:
            # This might happen if there's a match at index 0.
            # We're also preventing zero-width cuts in case of two
            # consecutive matches.
            continue
        tokens.append(text[start:end])
        start = end

    return tokens