Method tokenize

utils/data_util.py:106–124 · view source on GitHub ↗

分词函数

(self, text, add_cls=True, add_sep=True, max_length=None)

Source from the content-addressed store, hash-verified

104	self._do_lower_case = do_lower_case
105
def tokenize(self, text, add_cls=True, add_sep=True, max_length=None):
    """Split ``text`` into a list of tokens.

    When ``self._do_lower_case`` is set, the text is first NFD-normalized,
    stripped of combining marks (Unicode category ``Mn``) and lower-cased.
    The actual splitting is delegated to ``self._tokenize``.

    Args:
        text: The string to tokenize.
        add_cls: If true, ``self._token_cls`` is inserted at the front.
        add_sep: If true, ``self._token_sep`` is appended at the end.
        max_length: If not ``None``, the token list is handed to
            ``self.truncate_sequence`` together with this limit.

    Returns:
        The list returned by ``self._tokenize``, modified in place with the
        optional special tokens and truncation applied.
    """
    if self._do_lower_case:
        # NFD splits accented characters into base character + combining
        # mark, so dropping category 'Mn' removes the accents.
        text = unicodedata.normalize('NFD', text)
        text = ''.join(
            ch for ch in text if unicodedata.category(ch) != 'Mn')
        text = text.lower()

    tokens = self._tokenize(text)
    if add_cls:
        tokens.insert(0, self._token_cls)
    if add_sep:
        tokens.append(self._token_sep)

    if max_length is not None:
        # NOTE(review): assumes the last argument of truncate_sequence is the
        # index passed to list.pop() -- confirm against its definition.
        # -2 keeps the trailing separator; without a separator the last
        # token itself must be the one removed, hence -1.
        pop_index = -2 if add_sep else -1
        self.truncate_sequence(max_length, tokens, None, pop_index)

    return tokens
125
126	def token_to_id(self, token):
127	"""token转换为对应的id

encodeMethod · 0.95

extract_chinese_and_punct.pyFile · 0.45

covert_to_tokensFunction · 0.45

__init__Method · 0.45

covert_to_tokensFunction · 0.45

__init__Method · 0.45

collateMethod · 0.45

__init__Method · 0.45

collateMethod · 0.45

get_text_inputFunction · 0.45

_tokenizeMethod · 0.95

truncate_sequenceMethod · 0.95

insertMethod · 0.45

no test coverage detected