hub / github.com/THUDM/GLM / EncodeAsIds

Method EncodeAsIds

data_utils/tokenization.py:325–394 · view source on GitHub ↗

Encode text using the text tokenizer and shift ID values for command tokens.

(self, text, process_fn=None)

Source from the content-addressed store, hash-verified

323	return self._text_token_vocab
324
325	def EncodeAsIds(self, text, process_fn=None):
326	"""
327	encode text using text tokenizer and shift Id values for command tokens
328	"""
329	processed_text = text
330	if process_fn is not None:
331	processed_text = process_fn(processed_text)
332
333	def split_on_token(tok_extended: CommandToken, text):
334	result = []
335	tok = tok_extended.token
336	split_text = text.split(tok)
337	for i, sub_text in enumerate(split_text):
338	# CommandToken can control whitespace stripping around them.
339	# We use them for GPT2 and Roberta to have different behavior depending on the special token
340	# Cf. https://github.com/huggingface/transformers/pull/2778
341	# and https://github.com/huggingface/transformers/issues/3788
342	# Strip white spaces on the right
343	if tok_extended.rstrip and i > 0:
344	# A bit counter-intuitive but we strip the left of the string
345	# since tok_extended.rstrip means the special token is eating all white spaces on its right
346	sub_text = sub_text.lstrip()
347	# Strip white spaces on the left
348	if tok_extended.lstrip and i < len(split_text) - 1:
349	sub_text = sub_text.rstrip() # Opposite here
350
351	if i == 0 and not sub_text:
352	result.append(tok)
353	elif i == len(split_text) - 1:
354	if sub_text:
355	result.append(sub_text)
356	else:
357	pass
358	else:
359	if sub_text:
360	result.append(sub_text)
361	result.append(tok)
362	return result
363
364	def split_on_tokens(tok_list, text):
365	if not text.strip():
366	return []
367	if not tok_list:
368	return self.text_tokenizer.encode(text)
369
370	tokenized_text = []
371	text_list = [text]
372	for tok in tok_list:
373	tokenized_text = []
374	for sub_text in text_list:
375	if sub_text not in self._command_token_tokens:
376	tokenized_text.extend(split_on_token(tok, sub_text))
377	else:
378	tokenized_text.append(sub_text)
379	text_list = tokenized_text
380
381	return list(
382	itertools.chain.from_iterable(

Callers 15

__call__ · Method · 0.95

finetune · Function · 0.45

read_context · Function · 0.45

encode_input · Method · 0.45

encode · Method · 0.45

encode_input · Method · 0.45

get_verbalization_ids · Function · 0.45

get_tokenized_input · Method · 0.45

__getitem__ · Method · 0.45

encode · Method · 0.45

Calls 2

set_command_tokens · Method · 0.95

Tokenization · Class · 0.85

Tested by

no test coverage detected