(self,
audio_path,
return_all_layers=False,
dtype=torch.float32)
| 64 | self.video_rate = 30 |
| 65 | |
def extract_audio_feat(self,
                       audio_path,
                       return_all_layers=False,
                       dtype=torch.float32):
    """Compute audio features for the file at ``audio_path``.

    The audio is loaded with ``librosa.load`` at a 16 kHz sampling rate,
    converted to PyTorch input values by ``self.processor``, and fed to
    ``self.model`` with ``output_hidden_states=True``. The selected hidden
    states are then passed through ``linear_interpolation`` with
    ``input_fps=50`` and ``output_fps=self.video_rate`` and cast to
    ``dtype``.

    Args:
        audio_path: Path of the audio file handed to ``librosa.load``.
        return_all_layers: When true, every entry of the model's
            ``hidden_states`` is joined with ``torch.cat`` (default
            dimension 0); otherwise only the last entry is used.
        dtype: Torch dtype the resulting features are converted to.

    Returns:
        The interpolated features converted to ``dtype``.
    """
    waveform, sr = librosa.load(audio_path, sr=16000)

    encoded = self.processor(
        waveform, sampling_rate=sr, return_tensors="pt")
    model_input = encoded.input_values.to(self.model.device)

    # Forward pass: the hidden states are what is consumed below, not
    # logits.
    # NOTE(review): there is no torch.no_grad() here, so gradient tracking
    # depends on the caller's context -- confirm that is intended.
    outputs = self.model(model_input, output_hidden_states=True)

    layers = outputs.hidden_states
    features = torch.cat(layers) if return_all_layers else layers[-1]

    # input_fps=50 is presumably the encoder's native feature rate --
    # TODO confirm against the model in use.
    features = linear_interpolation(
        features, input_fps=50, output_fps=self.video_rate)

    return features.to(dtype)
| 90 | |
| 91 | def get_audio_embed_bucket(self, |
| 92 | audio_embed, |
no test coverage detected