(pretrained=False,
pretrained_name=None,
model_cls=XLMRobertaCLIP,
return_transforms=False,
return_tokenizer=False,
tokenizer_padding='eos',
dtype=torch.float32,
device='cpu',
**kwargs)
| 432 | |
| 433 | |
def _clip(pretrained=False,
          pretrained_name=None,
          model_cls=XLMRobertaCLIP,
          return_transforms=False,
          return_tokenizer=False,
          tokenizer_padding='eos',
          dtype=torch.float32,
          device='cpu',
          **kwargs):
    """Instantiate a CLIP model and, optionally, its image transforms.

    Args:
        pretrained: Accepted for interface compatibility; not referenced
            in this function body.
        pretrained_name: Optional model name. It is only consulted to pick
            the normalization statistics: a name containing ``'siglip'``
            (case-insensitive) selects mean/std of 0.5 per channel. ``None``
            (the default) is treated as a non-SigLIP name.
        model_cls: Callable invoked as ``model_cls(**kwargs)`` to build the
            model. The result must support ``.to(dtype=..., device=...)``
            and, when ``return_transforms`` is true, expose ``image_size``.
        return_transforms: When true, also build and return the image
            preprocessing pipeline.
        return_tokenizer: Accepted for interface compatibility; not
            referenced in this function body.
        tokenizer_padding: Accepted for interface compatibility; not
            referenced in this function body.
        dtype: Dtype the model is converted to.
        device: Device the model is created on and moved to.
        **kwargs: Forwarded unchanged to ``model_cls``.

    Returns:
        The model alone when ``return_transforms`` is false, otherwise the
        tuple ``(model, transforms)``.
    """
    # init a model on device
    with torch.device(device):
        model = model_cls(**kwargs)

    # set device
    model = model.to(dtype=dtype, device=device)
    if not return_transforms:
        return model

    # mean and std
    # pretrained_name defaults to None, so fall back to an empty string
    # instead of raising AttributeError on `None.lower()`.
    if 'siglip' in (pretrained_name or '').lower():
        mean, std = [0.5, 0.5, 0.5], [0.5, 0.5, 0.5]
    else:
        mean = [0.48145466, 0.4578275, 0.40821073]
        std = [0.26862954, 0.26130258, 0.27577711]

    # transforms: square resize to the model's input size, tensor
    # conversion, then per-channel normalization
    transforms = T.Compose([
        T.Resize((model.image_size, model.image_size),
                 interpolation=T.InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=mean, std=std),
    ])
    return model, transforms
| 469 | |
| 470 | |
| 471 | def clip_xlm_roberta_vit_h_14( |
no test coverage detected