Initialize the multimodal (mm) model and its supporting components, including the image preprocessor, the data processor, and the model configuration. Args: args (argparse.Namespace): Command-line arguments containing information such as the model name and the pretrained model path. Returns: None.
(self, args)
| 204 | self._init_mm_model(args) |
| 205 | |
| 206 | def _init_mm_model(self, args): |
| 207 | """ |
| 208 | Initialize the mm model, including the data processor, model, etc. |
| 209 | |
| 210 | Args: |
| 211 | args (argparse.Namespace): |
| 212 | Command-line arguments containing information such as the model name and pretrained model path. |
| 213 | |
| 214 | Returns: |
| 215 | None. |
| 216 | """ |
| 217 | |
| 218 | if not os.path.exists(os.path.join(args.model_name_or_path, "preprocessor_config.json")): |
| 219 | assert args.vision_model_name_or_path is not None, "vision_model_name_or_path is None" |
| 220 | vision_model_name_or_path = args.vision_model_name_or_path |
| 221 | else: |
| 222 | vision_model_name_or_path = args.model_name_or_path |
| 223 | |
| 224 | image_preprocess = AdaptiveImageProcessor.from_pretrained(vision_model_name_or_path) |
| 225 | data_processor_config = get_config( |
| 226 | args.dpconfig, |
| 227 | tokenizer=args.model_name_or_path, |
| 228 | vision_model_name_or_path=vision_model_name_or_path, |
| 229 | save_to_disk=False, |
| 230 | crop_tile_option=args.crop_tile_option, |
| 231 | crop_tile_rate=args.crop_tile_rate, |
| 232 | min_crop_flag=args.use_min_crop, |
| 233 | variable_resolution=args.variable_resolution, |
| 234 | rope_3d=args.rope_3d, |
| 235 | ) |
| 236 | data_processor_parser = PdArgumentParser(End2EndProcessorArguments) |
| 237 | self.processor = End2EndProcessor( |
| 238 | data_processor_parser.parse_dict(dict(**dict(data_processor_config.processor_args))), |
| 239 | tokenizer=self.tokenizer, |
| 240 | image_preprocess=image_preprocess, |
| 241 | ) |
| 242 | |
| 243 | self.processor.eval() |
| 244 | self.processor.sft() |
| 245 | |
| 246 | self.image_preprocess = AdaptiveImageProcessor.from_pretrained(vision_model_name_or_path) |
| 247 | |
| 248 | config = Ernie4_5_VLMoeConfig.from_pretrained( |
| 249 | args.model_name_or_path, |
| 250 | tensor_parallel_degree=self.tensor_parallel_degree, |
| 251 | tensor_parallel_rank=self.tensor_parallel_rank, |
| 252 | moe_group="dummy", |
| 253 | ) |
| 254 | config.vision_config.attn_sep = False |
| 255 | config.pixel_hidden_size = config.vision_config.hidden_size |
| 256 | config.im_patch_id = self.tokenizer.get_vocab()[ |
| 257 | MMSpecialTokensConfig.get_special_tokens_info()["image_placeholder"] |
| 258 | ] |
| 259 | config.max_text_id = config.im_patch_id |
| 260 | logger.info(f"[STAGE] image_placeholder_id: {config.im_patch_id}") |
| 261 | |
| 262 | config.moe_capacity = ( |
| 263 | [config.moe_num_experts[0] * 2] * 3 |
no test coverage detected