Get the audio embedding from the model. Parameters: data (a list of dict) - the audio input dict list from the 'get_audio_feature' method. Returns: audio_embed (torch.Tensor) - a tensor of audio_embeds with shape (N, D).
(self, data)
| 752 | return text_embeds |
| 753 | |
def get_audio_embedding(self, data):
    """Compute normalized audio embeddings for a batch of audio inputs.

    Parameters
    ----------
    data: a list of dict
        the audio input dict list from 'get_audio_feature' method;
        the keys of the first dict are looked up in every dict

    Returns
    ----------
    audio_embed: torch.Tensor
        a tensor of audio_embeds (N, D), normalized along the last dim

    """
    # Run on whichever device holds the model's first parameter.
    device = next(self.parameters()).device

    # Collate the per-sample dicts into one dict of batched tensors by
    # stacking each key's values along a new leading dimension.
    batch = {
        name: torch.stack([sample[name] for sample in data], dim=0).to(device)
        for name in data[0].keys()
    }

    encoded = self.encode_audio(batch, device=device)
    projected = self.audio_projection(encoded["embedding"])
    return F.normalize(projected, dim=-1)
| 782 | |
| 783 | def audio_infer(self, audio, hopsize=None, device=None): |
| 784 | """Forward one audio and produce the audio embedding |
no test coverage detected