MCPcopy
hub / github.com/InternLM/lmdeploy / indexes_containing_token

Method indexes_containing_token

lmdeploy/tokenizer.py:152–190  ·  view source on GitHub ↗

Return all possible token indexes whose decoded output may contain the input token.

(self, token: str)

Source from the content-addressed store, hash-verified

150 return self._maybe_decode_bytes
151
def indexes_containing_token(self, token: str):
    """Return the token ids whose decoded text may contain ``token``.

    Results are cached in ``self._indexes_tokens_deque`` under the token
    exactly as the caller passed it, so a repeated query (including
    ``' '``, which is searched under a different spelling) is answered
    without scanning the vocabulary again.

    Args:
        token: The text to search for, e.g. a stop word.

    Returns:
        A list of token ids. When more than ``self.max_indexes_num`` vocab
        entries contain the token, only ids that decode exactly to it are
        kept, truncated to ``self.max_indexes_num``. When nothing matches,
        the result of ``self.encode`` is used if it is a single id;
        otherwise the list is empty.
    """
    # Traversing the vocab is time consuming and can not be accelerated with
    # multi threads (computation) or multi process (can't pickle tokenizer),
    # so recently resolved tokens are kept and returned directly on a match.
    for cached_token, cached_indexes in self._indexes_tokens_deque:
        if cached_token == token:
            return cached_indexes

    if not self.token2id:
        # decode is slower than convert_ids_to_tokens
        if self.maybe_decode_bytes:
            # Build into a local dict so an interrupted build never leaves a
            # partially filled `self.token2id` behind.
            vocab = {}
            for i in range(self.vocab_size):
                try:
                    vocab[self.model.decode(i)] = i
                except (KeyboardInterrupt, SystemExit):
                    # never swallow a request to stop the process
                    raise
                except BaseException:
                    # some tokens just can't be decoded by `decode`; the
                    # catch stays deliberately broad (best effort)
                    continue
            self.token2id = vocab
        else:
            self.token2id = {self.model.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}

    # ' ' is special: it is looked up as '▁'. Keep `token` untouched so the
    # cache entry below is stored under the key the lookup above compares.
    needle = '▁' if token == ' ' else token
    indexes = [i for piece, i in self.token2id.items() if needle in piece]
    if len(indexes) > self.max_indexes_num:
        # multiple id decode to same token
        indexes = [i for i in indexes if self.decode([i]) == needle]
        indexes = indexes[:self.max_indexes_num]
        self.logger.warning(f'There are too many(>{self.max_indexes_num}) possible '
                            f'indexes may decoding {needle}, we will use {indexes} only')
    # there might be token id that exceeds self.vocab_size
    if not indexes:
        indexes = self.encode(needle, False)
        if len(indexes) != 1:
            self.logger.warning(f'The token {needle}, its length of indexes {indexes} is '
                                'not 1. Currently, it can not be used as stop words')
            indexes = []
    self._indexes_tokens_deque.append((token, indexes))
    return indexes
191
192 def encode(self, s: str, add_bos: bool = True, add_special_tokens: bool = True, **kwargs):
193 """Tokenize a prompt.

Callers 1

Calls 5

decodeMethod · 0.95
encodeMethod · 0.95
itemsMethod · 0.80
decodeMethod · 0.45
appendMethod · 0.45

Tested by 1