(args)
| 932 | |
| 933 | |
def build_kosmos_engine(args):
    """Build the visual engine for a Kosmos checkpoint.

    Loads the processor and model from ``args.model_path``, wraps the
    model's ``vision_model`` and ``image_to_text_projection`` in a single
    ``torch.nn.Module``, then passes that wrapper with a dummy input to
    ``export_onnx`` and the resulting directory to ``build_trt_engine``.

    Args:
        args: Options object. This function reads ``model_path``,
            ``device``, ``output_dir``, ``model_type`` and
            ``max_batch_size`` from it.
    """
    processor = AutoProcessor.from_pretrained(args.model_path)

    # A tiny placeholder image: only the processed tensor is used below,
    # as the sample input for export and as the source of the input shape.
    placeholder = Image.new('RGB', [10, 10])
    processed = processor(text="dummy",
                          images=placeholder,
                          return_tensors="pt")
    dummy_pixels = processed['pixel_values'].to(args.device, torch.float16)

    class VisionEncoderWrapper(torch.nn.Module):
        """Chains the vision encoder and the connector into one module."""

        def __init__(self, encoder, connector):
            super().__init__()
            self.encoder = encoder
            self.connector = connector

        def forward(self, images):
            """Return the connector's first output for ``images``."""
            encoder_out = self.encoder(images, output_hidden_states=True)
            # Apply the encoder's post layernorm to the last hidden state,
            # then normalize along the final dimension.
            normed = F.normalize(
                self.encoder.model.post_layernorm(
                    encoder_out.last_hidden_state),
                dim=-1)
            # The connector returns a pair; only the first item is kept.
            projected, _ = self.connector(normed)
            return projected

    model = AutoModelForVision2Seq.from_pretrained(args.model_path,
                                                   dtype=torch.float16)
    vision_encoder = model.vision_model.to(args.device)
    projection = model.image_to_text_projection.to(args.device)
    wrapper = VisionEncoderWrapper(vision_encoder, projection)

    onnx_dir = f'{args.output_dir}/onnx'
    export_onnx(wrapper, dummy_pixels, onnx_dir)

    # Dimensions 1..3 of the dummy input; dimension 0 is left out.
    # NOTE(review): presumably [3, H, W] with dim 0 as batch - confirm.
    input_shape = [
        dummy_pixels.shape[1],
        dummy_pixels.shape[2],
        dummy_pixels.shape[3],
    ]
    build_trt_engine(args.model_type, input_shape, onnx_dir,
                     args.output_dir, args.max_batch_size)
| 969 | |
| 970 | |
| 971 | def build_phi_engine(args): |
no test coverage detected