()
| 604 | |
| 605 | |
| 606 | def main(): |
| 607 | # Parse input arguments |
| 608 | # See all possible arguments in src/transformers/training_args.py |
| 609 | # If using DeepSpeed ZeRO-3, init_dist must be called before HfArgumentParser |
| 610 | launcher = os.environ.get('LAUNCHER', 'slurm') |
| 611 | init_dist(launcher=launcher, backend='nccl') |
| 612 | parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments)) |
| 613 | if len(sys.argv) == 2 and sys.argv[1].endswith('.json'): |
| 614 | # If we pass only one argument to the script, and it's the path to a json file, |
| 615 | # let's parse it to get our arguments. |
| 616 | model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) |
| 617 | else: |
| 618 | model_args, data_args, training_args = parser.parse_args_into_dataclasses() |
| 619 | |
| 620 | # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The |
| 621 | # information sent is the one passed as arguments along with your Python/PyTorch versions. |
| 622 | # send_example_telemetry('InternV-Chat', model_args, data_args) |
| 623 | |
| 624 | # Setup logging |
| 625 | logging.basicConfig( |
| 626 | format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', |
| 627 | datefmt='%m/%d/%Y %H:%M:%S', |
| 628 | handlers=[logging.StreamHandler(sys.stdout)], |
| 629 | ) |
| 630 | |
| 631 | if training_args.should_log: |
| 632 | # The default of training_args.log_level is passive, so we set log level at info here to have that default. |
| 633 | transformers.utils.logging.set_verbosity_info() |
| 634 | |
| 635 | log_level = training_args.get_process_log_level() |
| 636 | logger.setLevel(log_level) |
| 637 | set_verbosity(log_level) |
| 638 | enable_default_handler() |
| 639 | enable_explicit_format() |
| 640 | |
| 641 | # Log on each process the small summary: |
| 642 | logger.warning( |
| 643 | f'Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}' |
| 644 | + f'distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}' |
| 645 | ) |
| 646 | logger.info(f'Training/evaluation parameters {training_args}') |
| 647 | |
| 648 | # Detect the last checkpoint so that training can resume from it if one exists. |
| 649 | last_checkpoint = None |
| 650 | if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: |
| 651 | last_checkpoint = get_last_checkpoint(training_args.output_dir) |
| 652 | if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: |
| 653 | raise ValueError( |
| 654 | f'Output directory ({training_args.output_dir}) already exists and is not empty. ' |
| 655 | 'Use --overwrite_output_dir to overcome.' |
| 656 | ) |
| 657 | elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: |
| 658 | logger.info( |
| 659 | f'Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change ' |
| 660 | 'the `--output_dir` or add `--overwrite_output_dir` to train from scratch.' |
| 661 | ) |
| 662 | # Set seed before initializing model. |
| 663 | set_seed(training_args.seed) |
no test coverage detected