hub / github.com/InternLM/lmdeploy / indexes_containing_token

Method indexes_containing_token

lmdeploy/tokenizer.py:152–190 · view source on GitHub ↗

Return every token index whose decoded output may contain the input token.

(self, token: str)

Source from the content-addressed store, hash-verified

150	return self._maybe_decode_bytes
151
def indexes_containing_token(self, token: str):
    """Return all the token indexes whose decoding output may contain the
    input token.

    Args:
        token (str): the text to search for in the vocabulary. A single
            space ``' '`` is looked up as ``'▁'``.

    Returns:
        list: the matching token ids. The list is empty when no vocabulary
        entry contains ``token`` and ``self.encode`` does not map it to
        exactly one id. The returned list is the same object that is kept
        in ``self._indexes_tokens_deque``.
    """
    # ' ' is special: it is searched (and cached) as '▁'. Normalize before
    # consulting the cache, otherwise a repeated lookup of ' ' never matches
    # its own cached entry and the vocabulary is traversed again every time.
    if token == ' ':
        token = '▁'

    # traversing vocab is time consuming, can not be accelerated with
    # multi threads (computation) or multi process (can't pickle tokenizer)
    # so earlier lookups are kept in a deque and returned directly if matched
    for cached_token, cached_indexes in self._indexes_tokens_deque:
        if token == cached_token:
            return cached_indexes

    # build the text -> id table lazily, on the first uncached lookup
    if not self.token2id:
        # decode is slower than convert_ids_to_tokens
        if self.maybe_decode_bytes:
            for i in range(self.vocab_size):
                try:
                    self.token2id[self.model.decode(i)] = i
                except Exception:
                    # some tokens just can't be decoded by `decode`; skip
                    # them, but let KeyboardInterrupt / SystemExit propagate
                    continue
        else:
            self.token2id = {self.model.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}

    indexes = [i for _token, i in self.token2id.items() if token in _token]
    if len(indexes) > self.max_indexes_num:
        # multiple id decode to same token: keep only the ids whose own
        # decoding is exactly the token, then cap the amount
        indexes = [i for i in indexes if self.decode([i]) == token]
        indexes = indexes[:self.max_indexes_num]
        self.logger.warning(f'There are too many(>{self.max_indexes_num}) possible '
                            f'indexes may decoding {token}, we will use {indexes} only')
    # there might be token id that exceeds self.vocab_size
    if not indexes:
        indexes = self.encode(token, False)
        if len(indexes) != 1:
            self.logger.warning(f'The token {token}, its length of indexes {indexes} is '
                                'not 1. Currently, it can not be used as stop words')
            indexes = []
    self._indexes_tokens_deque.append((token, indexes))
    return indexes
191
192	def encode(self, s: str, add_bos: bool = True, add_special_tokens: bool = True, **kwargs):
193	"""Tokenize a prompt.

Callers 1

test_tokenizer_with_stop_wordsFunction · 0.95

Calls 5

decodeMethod · 0.95

encodeMethod · 0.95

itemsMethod · 0.80

decodeMethod · 0.45

appendMethod · 0.45

Tested by 1

test_tokenizer_with_stop_wordsFunction · 0.76