Method extract_audio_feat

wan/modules/s2v/audio_encoder.py:66–89 · view source on GitHub ↗

(self,
                           audio_path,
                           return_all_layers=False,
                           dtype=torch.float32)

Source from the content-addressed store, hash-verified

64	self.video_rate = 30
65
def extract_audio_feat(self,
                       audio_path,
                       return_all_layers=False,
                       dtype=torch.float32):
    """Encode an audio file into features resampled for the video rate.

    The file is loaded with ``librosa`` at a 16000 Hz sample rate, passed
    through ``self.processor`` and ``self.model``, and the selected hidden
    states are handed to ``linear_interpolation`` with ``input_fps=50``
    and ``output_fps=self.video_rate``.

    Args:
        audio_path: Path forwarded to ``librosa.load``.
        return_all_layers: If truthy, all entries of the model's
            ``hidden_states`` are joined with ``torch.cat`` (default
            dim 0); otherwise only the last entry is used.
        dtype: Torch dtype the result is converted to before returning.

    Returns:
        The interpolated feature tensor cast to ``dtype``.
    """
    waveform, sr = librosa.load(audio_path, sr=16000)

    processed = self.processor(
        waveform, sampling_rate=sr, return_tensors="pt")
    model_input = processed.input_values.to(self.model.device)

    # Forward pass. Only the hidden states are consumed below; the model's
    # logits are never read.
    # NOTE(review): no torch.no_grad() here -- presumably the caller
    # disables autograd; confirm.
    outputs = self.model(model_input, output_hidden_states=True)
    layers = outputs.hidden_states

    features = torch.cat(layers) if return_all_layers else layers[-1]

    # Resample along time from the encoder's rate to the video frame rate.
    resampled = linear_interpolation(
        features, input_fps=50, output_fps=self.video_rate)

    return resampled.to(dtype)
90
91	def get_audio_embed_bucket(self,
92	audio_embed,

encode_audio · Method · 0.80

linear_interpolation · Function · 0.85

to · Method · 0.80

no test coverage detected