分词函数
(self, text, add_cls=True, add_sep=True, max_length=None)
| 104 | self._do_lower_case = do_lower_case |
| 105 | |
def tokenize(self, text, add_cls=True, add_sep=True, max_length=None):
    """Split ``text`` into a list of tokens.

    Args:
        text: The string to tokenize.
        add_cls: When true, ``self._token_cls`` is inserted at the front
            of the token list.
        add_sep: When true, ``self._token_sep`` is appended to the end of
            the token list.
        max_length: When not ``None``, the token list (markers included)
            is handed to ``self.truncate_sequence`` together with this
            limit.

    Returns:
        The list returned by ``self._tokenize``, with the requested
        markers added and truncation applied.
    """
    if self._do_lower_case:
        # Decompose to NFD, drop every character whose Unicode category is
        # "Mn" (nonspacing mark, e.g. combining accents), then lower-case.
        text = unicodedata.normalize('NFD', text)
        text = ''.join(
            ch for ch in text if unicodedata.category(ch) != 'Mn')
        text = text.lower()

    tokens = self._tokenize(text)
    if add_cls:
        tokens.insert(0, self._token_cls)
    if add_sep:
        tokens.append(self._token_sep)

    if max_length is not None:
        # NOTE(review): assumes the fourth positional argument of
        # truncate_sequence is the index popped on each step and that it
        # shortens ``tokens`` in place -- confirm against its definition.
        # Index -2 skips over the trailing separator; when no separator
        # was appended the last token itself must be removed, otherwise
        # the result is no longer a prefix of the input.
        pop_index = -2 if add_sep else -1
        self.truncate_sequence(max_length, tokens, None, pop_index)

    return tokens
| 125 | |
| 126 | def token_to_id(self, token): |
| 127 | """token转换为对应的id |
no test coverage detected