()
| 46 | |
| 47 | @torch.no_grad() |
| 48 | def main(): |
| 49 | args = parse_args() |
| 50 | print("\n=============== Argument ===============") |
| 51 | for key in vars(args): |
| 52 | print(f"{key}: {vars(args)[key]}") |
| 53 | print("========================================") |
| 54 | |
| 55 | model_id = args.model_id |
| 56 | |
| 57 | hf_config = AutoConfig.from_pretrained(model_id, trust_remote_code=True) |
| 58 | processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True) |
| 59 | quantization_config = generate_quanto_config(hf_config, args.quant_type) |
| 60 | |
| 61 | check_params(args, hf_config) |
| 62 | |
| 63 | model_safetensors_index_path = os.path.join(model_id, "model.safetensors.index.json") |
| 64 | with open(model_safetensors_index_path, "r") as f: |
| 65 | model_safetensors_index = json.load(f) |
| 66 | weight_map = model_safetensors_index['weight_map'] |
| 67 | vision_map = {} |
| 68 | for key, value in weight_map.items(): |
| 69 | if 'vision_tower' in key or 'image_newline' in key or 'multi_modal_projector' in key: |
| 70 | new_key = key.replace('.weight','').replace('.bias','') |
| 71 | if new_key not in vision_map: |
| 72 | vision_map[new_key] = value |
| 73 | device_map = { |
| 74 | 'language_model.model.embed_tokens': 'cuda:0', |
| 75 | 'language_model.model.norm': f'cuda:{args.world_size - 1}', |
| 76 | 'language_model.lm_head': f'cuda:{args.world_size - 1}' |
| 77 | } |
| 78 | for key, value in vision_map.items(): |
| 79 | device_map[key] = f'cuda:0' |
| 80 | device_map['vision_tower.vision_model.post_layernorm'] = f'cuda:0' |
| 81 | layers_per_device = hf_config.text_config.num_hidden_layers // args.world_size |
| 82 | for i in range(args.world_size): |
| 83 | for j in range(layers_per_device): |
| 84 | device_map[f'language_model.model.layers.{i * layers_per_device + j}'] = f'cuda:{i}' |
| 85 | |
| 86 | messages = [ |
| 87 | {"role": "system", "content": [{"type": "text", "text": "You are a helpful assistant created by Minimax based on MiniMax-VL-01 model."}]}, |
| 88 | {"role": "user", "content": [{"type": "image", "image": "placeholder"},{"type": "text", "text": "Describe this image."}]}, |
| 89 | ] |
| 90 | prompt = processor.tokenizer.apply_chat_template( |
| 91 | messages, tokenize=False, add_generation_prompt=True |
| 92 | ) |
| 93 | print(f"prompt: \n{prompt}") |
| 94 | raw_image = Image.open(args.image_path) |
| 95 | model_inputs = processor(images=[raw_image], text=prompt, return_tensors='pt').to('cuda').to(torch.bfloat16) |
| 96 | |
| 97 | quantized_model = AutoModelForCausalLM.from_pretrained( |
| 98 | model_id, |
| 99 | torch_dtype="bfloat16", |
| 100 | device_map=device_map, |
| 101 | quantization_config=quantization_config, |
| 102 | trust_remote_code=True, |
| 103 | offload_buffers=True, |
| 104 | ) |
| 105 | generation_config = GenerationConfig( |
no test coverage detected