Build the vision part of a VLM model when the backend is turbomind, or load the whole VLM model when `self.with_llm==True`.
(self, trust_remote_code: bool = False)
| 24 | device_map='auto') |
| 25 | |
| 26 | def build_model(self, trust_remote_code: bool = False): |
| 27 | """Build the vision part of a VLM model when the backend is turbomind, or load the whole VLM model when `self.with_llm==True`. |
| 28 | `trust_remote_code` is forwarded to `AutoModelForCausalLM.from_config`; the loaded model is stored in eval mode on `self.model`.""" |
| 29 | from accelerate import init_empty_weights, load_checkpoint_and_dispatch |
| 30 | with init_empty_weights():  # instantiate the model structure from config only; weights are loaded from the checkpoint below |
| 31 | model = AutoModelForCausalLM.from_config(self.hf_config, trust_remote_code=trust_remote_code) |
| 32 | |
| 33 | self.vl_model = model |
| 34 | if not self.with_llm: |
| 35 | # Only the vision part is needed: drop the listed LLM submodules from the transformer (`wte`, the token embedding, is not in the list and is kept) |
| 36 | for key in ['emb_drop', 'ln_f', 'blocks', 'ff_out']: |
| 37 | del model.model.transformer[key] |
| 38 | self.token_embedding = model.model.transformer.wte |
| 39 | |
| 40 | with disable_logging(): |
| 41 | load_checkpoint_and_dispatch(model=model, |
| 42 | checkpoint=self.model_path, |
| 43 | device_map='auto' if not self.with_llm else {'': 'cpu'}, |
| 44 | max_memory=self.max_memory, |
| 45 | no_split_module_classes=['ResidualAttentionBlock', 'Embedding'], |
| 46 | dtype=torch.half) |
| 47 | |
| 48 | # Switch to eval mode so train-only behavior (e.g. dropout) is disabled, avoiding randomness in inference. |
| 49 | # NOTE(review): eval() does not freeze weights or disable gradients; it only changes the module mode. |
| 50 | self.model = model.eval() |
| 51 | |
| 52 | def preprocess(self, messages: list[dict]) -> list[dict]: |
| 53 | """Refer to the `super.preprocess() for spec.""" |
nothing calls this directly
no test coverage detected