| 159 | self.annotators = set() |
| 160 | |
def tokenize(self, text):
    """Split ``text`` into tokens using the tokenizer's compiled regexp.

    Every match of ``self._regexp`` in ``text`` becomes one token. For each
    token a 3-tuple is recorded:

    * the matched text itself;
    * the slice of ``text`` running from the token's start up to the start
      of the next token, so it also carries whatever separates the two
      (for the last token the slice stops at the token's own end, so any
      text after the final match is not included);
    * the ``(start, end)`` character span of the match.

    Args:
        text: The string to tokenize.

    Returns:
        A ``Tokens`` object built from the list of tuples and
        ``self.annotators``.
    """
    # Materialize the matches so each token can look ahead to its successor.
    matches = list(self._regexp.finditer(text))

    data = []
    for i, match in enumerate(matches):
        span = match.span()

        # The "text with trailing whitespace" extends to where the next
        # token begins; the final token has no successor, so it ends at
        # its own end offset.
        if i + 1 < len(matches):
            end_ws = matches[i + 1].start()
        else:
            end_ws = span[1]

        data.append((
            match.group(),
            text[span[0]:end_ws],
            span,
        ))
    return Tokens(data, self.annotators)
| 183 | |
| 184 | |
| 185 | def has_answer(tokenized_answers, text): |