Return all possible token indexes whose decoded output may contain the input token.
(self, token: str)
| 150 | return self._maybe_decode_bytes |
| 151 | |
def indexes_containing_token(self, token: str):
    """Return all the possible indexes whose decoding output may contain the
    input token.

    The vocabulary is scanned for every entry whose text contains ``token``.
    Results are remembered in ``self._indexes_tokens_deque`` keyed by the
    string the caller passed in, so repeated queries skip the scan.

    Args:
        token (str): the text to look for (e.g. a stop word). A single
            space is looked up as the marker ``'▁'``.

    Returns:
        The token ids whose vocabulary text contains ``token``. When no
        vocabulary entry matches, ``self.encode(token, False)`` is used
        instead, and an empty list is returned unless it yields exactly
        one id.
    """
    # Traversing the vocab is time consuming and can not be accelerated with
    # multi threads (computation) or multi process (can't pickle tokenizer),
    # so recently resolved tokens are kept and returned directly on a match.
    # Remember the caller's spelling: `token` may be rewritten below, and the
    # cache must be stored under the same key it is looked up with.
    query = token
    for cached_token, cached_indexes in self._indexes_tokens_deque:
        if cached_token == query:
            return cached_indexes

    # Build the text -> id table lazily on first use.
    if not self.token2id:
        # decode is slower than convert_ids_to_tokens
        if self.maybe_decode_bytes:
            for i in range(self.vocab_size):
                try:
                    self.token2id[self.model.decode(i)] = i
                except:  # noqa: E722
                    # some tokens just can't be decoded by `decode`
                    # NOTE(review): kept deliberately broad (best effort);
                    # the backend's failure type is not visible from here.
                    pass
        else:
            self.token2id = {self.model.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}

    if token == ' ':  # ' ' is special
        token = '▁'

    indexes = [i for _token, i in self.token2id.items() if token in _token]
    if len(indexes) > self.max_indexes_num:
        # multiple id decode to same token
        indexes = [i for i in indexes if self.decode([i]) == token]
        indexes = indexes[:self.max_indexes_num]
        self.logger.warning(f'There are too many(>{self.max_indexes_num}) possible '
                            f'indexes may decoding {token}, we will use {indexes} only')

    # there might be token id that exceeds self.vocab_size
    if not indexes:
        indexes = self.encode(token, False)
        if len(indexes) != 1:
            self.logger.warning(f'The token {token}, its length of indexes {indexes} is '
                                'not 1. Currently, it can not be used as stop words')
            indexes = []

    # Store under the original query so the lookup above can hit next time
    # (previously ' ' was stored as '▁' and therefore never matched).
    self._indexes_tokens_deque.append((query, indexes))
    return indexes
| 191 | |
| 192 | def encode(self, s: str, add_bos: bool = True, add_special_tokens: bool = True, **kwargs): |
| 193 | """Tokenize a prompt. |