Build the vision part of a VLM model when the backend is turbomind, or load the whole VLM model when `self.with_llm == True`.
(self, trust_remote_code: bool = False)
| 135 | self.preprocess_func = self._preprocess_7b |
| 136 | |
def build_model(self, trust_remote_code: bool = False):
    """Create the model skeleton, load its checkpoint and store it on ``self``.

    For the turbomind backend only the vision part is kept: the ``model``
    and ``output`` submodules are deleted before the weights are loaded.
    When ``self.with_llm`` is true the whole VLM is kept and the checkpoint
    is loaded with everything mapped to the CPU.

    The instantiated module is stored in ``self.vl_model`` and, switched to
    eval mode, in ``self.model``.

    Args:
        trust_remote_code: forwarded to ``AutoModelForCausalLM.from_config``.
    """
    # NOTE(review): block nesting was reconstructed from a listing that had
    # lost its indentation -- confirm against the original file.
    from accelerate import init_empty_weights, load_checkpoint_and_dispatch
    from accelerate.utils import get_balanced_memory, infer_auto_device_map

    # Module classes handed to accelerate as ``no_split_module_classes``.
    no_split = ['CLIPEncoderLayer']

    with init_empty_weights(), \
            warnings.catch_warnings(), \
            init_empty_vit(self.model_path):
        warnings.simplefilter('ignore')
        hf_cfg = self.hf_config
        vlm = AutoModelForCausalLM.from_config(hf_cfg, trust_remote_code=trust_remote_code)
        vlm.vit.load_model()
        vlm.vit.resize_pos()
        vision_model = vlm.vit.vision_tower.vision_model
        if hasattr(hf_cfg, 'img_size'):
            vision_model.embeddings.image_size = hf_cfg.img_size
        vision_model.post_layernorm.to_empty(device='cpu').half()
        self.vl_model = vlm
        if not self.with_llm:
            # Drop the language-model submodules; only the rest is loaded.
            for llm_part in ('model', 'output'):
                delattr(vlm, llm_part)

    balanced_memory = get_balanced_memory(vlm,
                                          max_memory=self.max_memory,
                                          dtype=torch.half,
                                          no_split_module_classes=no_split)
    device_map = infer_auto_device_map(vlm,
                                       no_split_module_classes=no_split,
                                       max_memory=balanced_memory,
                                       dtype=torch.half)
    # make all tensor on same device for postprocess
    if 'plora_glb_GN' in device_map:
        device_map['plora_sub_GN'] = device_map['plora_glb_GN']

    # The whole-model case is loaded entirely on the CPU; otherwise the
    # inferred placement is used.
    dispatch_map = {'': 'cpu'} if self.with_llm else device_map
    with disable_logging():
        load_checkpoint_and_dispatch(model=vlm,
                                     checkpoint=self.model_path,
                                     device_map=dispatch_map,
                                     no_split_module_classes=no_split,
                                     dtype=torch.half)

    if 'plora_glb_GN' in device_map:

        def _to_glb_device(outputs):
            # Move the first output of the hooked layer to the device that
            # holds ``plora_glb_GN`` (looked up when the hook runs).
            return (outputs[0].to(device=device_map['plora_glb_GN']), )

        last_encoder_layer = vlm.vit.vision_tower.vision_model.encoder.layers[-1]
        add_device_hook(last_encoder_layer, device_map['plora_glb_GN'], _to_glb_device)

    self.model = vlm.eval()
| 183 | |
| 184 | def _preprocess_2d5(self, image: Image, params: dict) -> dict: |
| 185 | """Image preprocessing for internlm-xcomposer2d5-7b.""" |
nothing calls this directly
no test coverage detected