Converts a sequence of ids into a string, using the tokenizer and vocabulary, with options to remove special tokens and clean up tokenization spaces. Similar to doing `self.convert_tokens_to_string(self.convert_ids_to_tokens(token_ids))`. Args: token_ids (`Union[int, List[int], np.ndarray, torch.Tensor, tf.Tensor]`): List of tokenized input ids.
(
self,
token_ids: Union[int, List[int], "np.ndarray", "torch.Tensor", "tf.Tensor"],
skip_special_tokens: bool = False,
clean_up_tokenization_spaces: bool = None,
truncate_before_pattern: Optional[List[str]] = None,
**kwargs,
)
| 308 | return (text, kwargs) |
| 309 | |
def decode(
    self,
    token_ids: Union[int, List[int], "np.ndarray", "torch.Tensor", "tf.Tensor"],
    skip_special_tokens: bool = False,
    clean_up_tokenization_spaces: bool = None,
    truncate_before_pattern: Optional[List[str]] = None,
    **kwargs,
) -> str:
    """
    Turn a sequence of token ids into a string, optionally truncating the result.

    The ids are decoded by the parent class' `_decode` method. When `truncate_before_pattern` holds at least one
    pattern, the decoded text is then handed to `self.truncate` together with those patterns.

    Similar to doing `self.convert_tokens_to_string(self.convert_ids_to_tokens(token_ids))`.

    Args:
        token_ids (`Union[int, List[int], np.ndarray, torch.Tensor, tf.Tensor]`):
            List of tokenized input ids. Can be obtained using the `__call__` method.
        skip_special_tokens (`bool`, *optional*, defaults to `False`):
            Whether or not to remove special tokens in the decoding.
        clean_up_tokenization_spaces (`bool`, *optional*):
            Whether or not to clean up the tokenization spaces. If `None`, will default to
            `self.clean_up_tokenization_spaces` (available in the `tokenizer_config`).
        truncate_before_pattern (`List[str]`, *optional*, defaults to `None`):
            A list of regular expression strings that will be used to truncate the returned string. This can be
            used to remove extra pieces of code (e.g. truncate if observing a comment symbol "#" at the beginning
            of a new line). An example pattern could be `["^#", re.escape("<|endoftext|>"), "^'''", "\n\n\n"]`.
            `None` and an empty list both leave the decoded text untouched.
        kwargs (additional keyword arguments, *optional*):
            Will be passed to the underlying model specific decode method.

    Returns:
        `str`: The decoded sentence.
    """
    text = super()._decode(
        token_ids=token_ids,
        skip_special_tokens=skip_special_tokens,
        clean_up_tokenization_spaces=clean_up_tokenization_spaces,
        **kwargs,
    )

    # Nothing to cut when no patterns were supplied (either `None` or an empty sequence).
    if truncate_before_pattern is None or len(truncate_before_pattern) == 0:
        return text

    return self.truncate(text, truncate_before_pattern)
| 353 | |
| 354 | def truncate(self, completion, truncate_before_pattern): |
| 355 | def find_re(string, pattern, start_pos): |
no test coverage detected