(self, s: str, encode_special_tokens=False)
| 46 | self.role_special_token_expression = "|".join([re.escape(token) for token in role_special_tokens]) |
| 47 | |
def tokenize(self, s: str, encode_special_tokens=False):
    """Split ``s`` into a list of SentencePiece pieces.

    Args:
        s: Text to tokenize.
        encode_special_tokens: When true, every substring matching
            ``self.role_special_token_expression`` is emitted verbatim as a
            single piece, and only the text between those matches is passed
            to ``self.sp_model.EncodeAsPieces``. When false, the whole
            string is passed to ``self.sp_model.EncodeAsPieces`` unchanged.

    Returns:
        The pieces in the order they occur in ``s``.
    """
    if not encode_special_tokens:
        return self.sp_model.EncodeAsPieces(s)

    pattern = self.role_special_token_expression
    if not pattern:
        # An empty pattern matches the empty string at every position, which
        # would feed the text to the model one character at a time and insert
        # an empty piece at each position. With nothing to protect, encode
        # the text normally.
        return self.sp_model.EncodeAsPieces(s)

    pieces = []
    cursor = 0  # end of the most recently consumed special token
    for match in re.finditer(pattern, s):
        start, end = match.span()
        if cursor < start:
            # Ordinary text between two special tokens (or before the first).
            pieces.extend(self.sp_model.EncodeAsPieces(s[cursor:start]))
        # The special token itself bypasses the SentencePiece model.
        pieces.append(s[start:end])
        cursor = end
    if cursor < len(s):
        # Ordinary text after the last special token.
        pieces.extend(self.sp_model.EncodeAsPieces(s[cursor:]))
    return pieces
| 62 | |
| 63 | def encode(self, s: str, bos: bool = False, eos: bool = False) -> list[int]: |
| 64 | assert isinstance(s, str) |
no test coverage detected