Tokenize a prompt. Args: s: a prompt. add_bos: Whether to add ``bos`` token id when encoding the prompt. add_special_tokens: Whether or not to add special tokens when encoding the prompt. Returns: list[int]: token ids.
(self, s: str, add_bos: bool = True, add_special_tokens: bool = True, **kwargs)
| 463 | return self.model.get_vocab() |
| 464 | |
def encode(self, s: str, add_bos: bool = True, add_special_tokens: bool = True, **kwargs):
    """Tokenize a prompt.

    The prompt is encoded by ``self.model.encode``. If the resulting ids
    start with two consecutive ``bos`` token ids, a warning is logged and
    one of them is dropped.

    Args:
        s: a prompt.
        add_bos: Whether to add ``bos`` token id when encoding the prompt.
        add_special_tokens: Whether or not to add special tokens
            when encoding the prompt.
        **kwargs: Extra keyword arguments forwarded unchanged to
            ``self.model.encode``.

    Returns:
        list[int]: token ids.
    """
    encoded = self.model.encode(s, add_bos, add_special_tokens, **kwargs)
    # A doubled leading bos shows up when the prompt text already carries a
    # bos token and the underlying tokenizer prepends another one.
    if encoded[:2] == [self.bos_token_id] * 2:
        # Adjacent string literals are concatenated verbatim, so each piece
        # must carry its own separating space.
        self.logger.warning(f'Detected duplicate bos token {self.bos_token_id} in prompt, '
                            'this will likely reduce response quality, one of them will be '
                            'removed')
        encoded = encoded[1:]
    return encoded
| 484 | |
| 485 | def decode( |
| 486 | self, |
no outgoing calls