Method `__call__`

wan/modules/tokenizers.py:49–73 · view source on GitHub ↗

(self, sequence, **kwargs)

Source from the content-addressed store, hash-verified

47	self.vocab_size = self.tokenizer.vocab_size
48
def __call__(self, sequence, **kwargs):
    """Tokenize a string (or a collection of strings) with ``self.tokenizer``.

    Args:
        sequence: A single ``str`` or an iterable of strings. A lone string
            is wrapped in a one-element list before tokenization.
        **kwargs: Extra options forwarded to ``self.tokenizer``; they
            override the defaults assembled here. The special key
            ``return_mask`` (default ``False``) is consumed by this method
            and never forwarded.

    Returns:
        ``input_ids`` of the tokenizer result, or the pair
        ``(input_ids, attention_mask)`` when ``return_mask`` is truthy.
    """
    want_mask = kwargs.pop('return_mask', False)

    # Build the tokenizer options: defaults first, caller overrides last.
    options = {'return_tensors': 'pt'}
    if self.seq_len is not None:
        # A fixed sequence length requests padding/truncation to that size.
        options['padding'] = 'max_length'
        options['truncation'] = True
        options['max_length'] = self.seq_len
    options.update(kwargs)

    # Normalize the input to a batch and optionally clean each entry.
    texts = [sequence] if isinstance(sequence, str) else sequence
    if self.clean:
        texts = [self._clean(text) for text in texts]
    encoded = self.tokenizer(texts, **options)

    if not want_mask:
        return encoded.input_ids
    return encoded.input_ids, encoded.attention_mask
74
75	def _clean(self, text):
76	if self.clean == 'whitespace':

nothing calls this directly

_clean · Method · 0.95

update · Method · 0.80

no test coverage detected