Get the tokenizer for the model.
(
model_args: ModelArguments, data_args: DataArguments, auto_set_chat_template: bool = True
)
| 64 | |
| 65 | |
def get_tokenizer(
    model_args: ModelArguments, data_args: DataArguments, auto_set_chat_template: bool = True
) -> PreTrainedTokenizer:
    """Get the tokenizer for the model.

    The tokenizer is loaded from ``model_args.tokenizer_name_or_path`` when it
    is set, otherwise from ``model_args.model_name_or_path``, and is then
    adjusted in place before being returned:

    * a missing pad token id falls back to the EOS token id;
    * ``data_args.truncation_side`` overrides the truncation side when given;
    * a ``model_max_length`` above 100_000 is replaced by 2048;
    * the chat template is taken from ``data_args.chat_template`` when given,
      otherwise ``DEFAULT_CHAT_TEMPLATE`` is installed if the tokenizer has no
      template and ``auto_set_chat_template`` is true.

    Args:
        model_args: Supplies the model/tokenizer path, revision and
            ``trust_remote_code`` flag passed to ``AutoTokenizer.from_pretrained``.
        data_args: Supplies the optional ``truncation_side`` and
            ``chat_template`` overrides.
        auto_set_chat_template: Whether to fall back to
            ``DEFAULT_CHAT_TEMPLATE`` when the tokenizer has no chat template.

    Returns:
        The configured tokenizer.
    """
    tokenizer = AutoTokenizer.from_pretrained(
        (
            model_args.model_name_or_path
            if model_args.tokenizer_name_or_path is None
            else model_args.tokenizer_name_or_path
        ),
        revision=model_args.model_revision,
        trust_remote_code=model_args.trust_remote_code,
    )
    if tokenizer.pad_token_id is None:
        tokenizer.pad_token_id = tokenizer.eos_token_id

    if data_args.truncation_side is not None:
        tokenizer.truncation_side = data_args.truncation_side

    # Set reasonable default for models without max length
    if tokenizer.model_max_length > 100_000:
        tokenizer.model_max_length = 2048

    if data_args.chat_template is not None:
        tokenizer.chat_template = data_args.chat_template
    elif auto_set_chat_template:
        # get_chat_template() may raise ValueError instead of returning None
        # when the tokenizer has no template configured (behavior of recent
        # transformers releases); both outcomes mean "no template available".
        try:
            has_chat_template = tokenizer.get_chat_template() is not None
        except ValueError:
            has_chat_template = False
        if not has_chat_template:
            tokenizer.chat_template = DEFAULT_CHAT_TEMPLATE

    return tokenizer
| 95 | |
| 96 | |
| 97 | def get_peft_config(model_args: ModelArguments) -> PeftConfig | None: |
no outgoing calls