Make dataset and collator for supervised fine-tuning.
(tokenizer: transformers.PreTrainedTokenizer,
data_args)
| 579 | |
| 580 | |
def make_supervised_data_module(tokenizer: transformers.PreTrainedTokenizer,
                                data_args) -> Dict:
    """Build the training dataset and its collator for supervised fine-tuning.

    Args:
        tokenizer: Tokenizer handed to both the dataset and the collator.
        data_args: Object carrying the data options. The attributes
            ``lazy_preprocess``, ``data_path``, ``is_multimodal``,
            ``sep_image_conv_front``, ``image_token_len``, ``image_folder``
            and ``image_aspect_ratio`` are read directly and must exist;
            ``mm_use_im_start_end`` and ``image_processor`` are optional and
            fall back to ``False`` and ``None`` respectively.

    Returns:
        A dict with the keys ``train_dataset``, ``eval_dataset`` (always
        ``None``) and ``data_collator``.
    """
    # Pick the dataset implementation according to the lazy-preprocess flag.
    if data_args.lazy_preprocess:
        dataset_factory = LazySupervisedDataset
    else:
        dataset_factory = SupervisedDataset

    source_path = data_args.data_path

    # Multimodal options forwarded to the dataset as a single config mapping.
    multimodal_settings = {
        'is_multimodal': data_args.is_multimodal,
        'sep_image_conv_front': data_args.sep_image_conv_front,
        'image_token_len': data_args.image_token_len,
        'image_folder': data_args.image_folder,
        'image_aspect_ratio': data_args.image_aspect_ratio,
        # These two attributes may be absent from data_args, hence getattr.
        'use_im_start_end': getattr(data_args, 'mm_use_im_start_end', False),
        'image_processor': getattr(data_args, 'image_processor', None),
    }

    training_set = dataset_factory(
        tokenizer=tokenizer,
        data_path=source_path,
        multimodal_cfg=multimodal_settings,
    )
    collator = DataCollatorForSupervisedDataset(tokenizer=tokenizer)

    return {
        'train_dataset': training_set,
        'eval_dataset': None,
        'data_collator': collator,
    }
| 600 | |
| 601 | |
| 602 | def train(): |
no test coverage detected