MCPcopy Index your code
hub / github.com/THUDM/GLM / EncodeAsIds

Method EncodeAsIds

data_utils/tokenization.py:325–394  ·  view source on GitHub ↗

Encodes text using the text tokenizer and shifts the ID values for command tokens.

(self, text, process_fn=None)

Source from the content-addressed store, hash-verified

323 return self._text_token_vocab
324
325 def EncodeAsIds(self, text, process_fn=None):
326 """
327 encode text using text tokenizer and shift Id values for command tokens
328 """
329 processed_text = text
330 if process_fn is not None:
331 processed_text = process_fn(processed_text)
332
333 def split_on_token(tok_extended: CommandToken, text):
334 result = []
335 tok = tok_extended.token
336 split_text = text.split(tok)
337 for i, sub_text in enumerate(split_text):
338 # CommandToken can control whitespace stripping around them.
339 # We use them for GPT2 and Roberta to have different behavior depending on the special token
340 # Cf. https://github.com/huggingface/transformers/pull/2778
341 # and https://github.com/huggingface/transformers/issues/3788
342 # Strip white spaces on the right
343 if tok_extended.rstrip and i > 0:
344 # A bit counter-intuitive but we strip the left of the string
345 # since tok_extended.rstrip means the special token is eating all white spaces on its right
346 sub_text = sub_text.lstrip()
347 # Strip white spaces on the left
348 if tok_extended.lstrip and i < len(split_text) - 1:
349 sub_text = sub_text.rstrip() # Opposite here
350
351 if i == 0 and not sub_text:
352 result.append(tok)
353 elif i == len(split_text) - 1:
354 if sub_text:
355 result.append(sub_text)
356 else:
357 pass
358 else:
359 if sub_text:
360 result.append(sub_text)
361 result.append(tok)
362 return result
363
364 def split_on_tokens(tok_list, text):
365 if not text.strip():
366 return []
367 if not tok_list:
368 return self.text_tokenizer.encode(text)
369
370 tokenized_text = []
371 text_list = [text]
372 for tok in tok_list:
373 tokenized_text = []
374 for sub_text in text_list:
375 if sub_text not in self._command_token_tokens:
376 tokenized_text.extend(split_on_token(tok, sub_text))
377 else:
378 tokenized_text.append(sub_text)
379 text_list = tokenized_text
380
381 return list(
382 itertools.chain.from_iterable(

Callers 15

__call__ · Method · 0.95
finetune · Function · 0.45
read_context · Function · 0.45
encode_input · Method · 0.45
encode · Method · 0.45
encode_input · Method · 0.45
get_verbalization_ids · Function · 0.45
get_tokenized_input · Method · 0.45
__getitem__ · Method · 0.45
encode · Method · 0.45
encode · Method · 0.45
encode · Method · 0.45

Calls 2

set_command_tokens · Method · 0.95
Tokenization · Class · 0.85

Tested by

no test coverage detected