MCPcopy
hub / github.com/loujie0822/DeepIE / tokenize

Method tokenize

utils/data_util.py:106–124  ·  view source on GitHub ↗

分词函数

(self, text, add_cls=True, add_sep=True, max_length=None)

Source from the content-addressed store, hash-verified

104 self._do_lower_case = do_lower_case
105
def tokenize(self, text, add_cls=True, add_sep=True, max_length=None):
    """Split ``text`` into tokens, optionally wrapped in CLS/SEP markers.

    Args:
        text: The input string to tokenize.
        add_cls: When true, ``self._token_cls`` is inserted at the front
            of the token list.
        add_sep: When true, ``self._token_sep`` is appended to the end of
            the token list.
        max_length: If not ``None``, the token list is handed to
            ``self.truncate_sequence`` to be cut down to this length.

    Returns:
        The list produced by ``self._tokenize`` (modified in place) with
        the requested special tokens added and truncation applied.
    """
    if self._do_lower_case:
        # NFD splits accented characters into base char + combining marks;
        # dropping category 'Mn' (nonspacing marks) strips the accents.
        text = unicodedata.normalize('NFD', text)
        text = ''.join(
            [ch for ch in text if unicodedata.category(ch) != 'Mn'])
        text = text.lower()

    tokens = self._tokenize(text)
    if add_cls:
        tokens.insert(0, self._token_cls)
    if add_sep:
        tokens.append(self._token_sep)

    if max_length is not None:
        # NOTE(review): assumes the 4th argument of truncate_sequence is the
        # index popped on each truncation step -- confirm against its
        # definition. With a trailing SEP we pop at -2 so the SEP survives;
        # without one we must pop at -1, otherwise the last real token
        # would be kept while the one before it is removed.
        pop_index = -2 if add_sep else -1
        self.truncate_sequence(max_length, tokens, None, pop_index)

    return tokens
125
126 def token_to_id(self, token):
127 """token转换为对应的id

Callers 15

encodeMethod · 0.95
covert_to_tokensFunction · 0.45
covert_to_tokensFunction · 0.45
__init__Method · 0.45
covert_to_tokensFunction · 0.45
__init__Method · 0.45
collateMethod · 0.45
__init__Method · 0.45
collateMethod · 0.45
get_text_inputFunction · 0.45
get_text_inputFunction · 0.45

Calls 3

_tokenizeMethod · 0.95
truncate_sequenceMethod · 0.95
insertMethod · 0.45

Tested by

no test coverage detected