(tokenizer_dir: Optional[str] = None,
vocab_file: Optional[str] = None,
model_name: str = 'GPTForCausalLM',
model_version: Optional[str] = None,
tokenizer_type: Optional[str] = None)
| 197 | |
| 198 | |
def load_tokenizer(tokenizer_dir: Optional[str] = None,
                   vocab_file: Optional[str] = None,
                   model_name: str = 'GPTForCausalLM',
                   model_version: Optional[str] = None,
                   tokenizer_type: Optional[str] = None):
    """Load a tokenizer, serialising the first load across MPI ranks.

    All arguments are forwarded positionally, unchanged, to
    ``_load_tokenizer``; this wrapper only adds MPI coordination.

    When more than one MPI process is running, rank 0 performs a warm-up
    load first while every other rank waits at a barrier. Without this,
    several processes may try to download the same tokenizer files into the
    same folder at once, which leads to intermittent failures. After the
    barrier each rank (rank 0 included) performs its own load.

    Returns:
        Whatever ``_load_tokenizer`` returns for the given arguments.
    """

    def _load():
        return _load_tokenizer(tokenizer_dir, vocab_file, model_name,
                               model_version, tokenizer_type)

    # Single-process run: nothing to coordinate.
    if mpi_world_size() <= 1:
        return _load()

    # Rank 0 warms up first (its result is discarded); the other ranks
    # wait at the barrier until that warm-up has finished.
    if mpi_rank() == 0:
        _load()
    mpi_barrier()

    return _load()
| 213 | |
| 214 | |
| 215 | def prepare_enc_dec_inputs(batch_input_ids: List[torch.Tensor], model_name: str, |
no test coverage detected