Extract filter-bank features from audio data. Args: data: Audio samples (list of numpy arrays or tensors). data_len: Lengths of each sample. data_type (str): Input type ("sound", "fbank"). frontend: Frontend instance for feature extraction. Returns: tuple of (features, feature_lengths), cast to float32 and int32 respectively.
(data, data_len=None, data_type: str = "sound", frontend=None, **kwargs)
| 275 | |
| 276 | |
def extract_fbank(data, data_len=None, data_type: str = "sound", frontend=None, **kwargs):
    """Run ``frontend`` on audio samples and return its features and lengths.

    The input is normalised before ``frontend`` is called:

    * A ``numpy.ndarray`` is converted with ``torch.from_numpy`` and then
      handled like a tensor.
    * A tensor with fewer than two dimensions gets a leading axis of size 1.
      A tensor whose first axis is larger than 1 is averaged over that axis
      (the first axis is treated as channels), leaving a first axis of size 1.
    * A list/tuple of arrays or tensors is padded to a common length with
      ``pad_sequence(..., batch_first=True)``; the lengths are recomputed from
      each item's ``shape[0]`` and any caller-supplied ``data_len`` is
      ignored.
    * Any other input type is forwarded to ``frontend`` unchanged.

    Args:
        data: Audio samples: a numpy array, a tensor, or a list/tuple of
            either.
        data_len: Lengths of the samples. Only used for array/tensor input;
            defaults to ``[data.shape[1]]`` after normalisation.
        data_type (str): Input type ("sound", "fbank"). Accepted for
            interface compatibility; this function does not read it.
        frontend: Callable invoked as ``frontend(data, data_len, **kwargs)``
            that returns ``(features, feature_lengths)``.
        **kwargs: Forwarded verbatim to ``frontend``.

    Returns:
        tuple: ``(features, feature_lengths)`` where ``features`` is the
        first frontend output cast to ``torch.float32`` and
        ``feature_lengths`` is the second cast to ``torch.int32``.

    Raises:
        TypeError: If ``frontend`` is ``None``.
    """
    # Arrays and tensors share one code path once the array is wrapped.
    if isinstance(data, np.ndarray):
        data = torch.from_numpy(data)

    if isinstance(data, torch.Tensor):
        if len(data.shape) < 2:
            data = data[None, :]  # data: [batch, N]
        elif data.shape[0] > 1:
            # Tensor.mean rejects integer/bool dtypes (e.g. int16 PCM), so
            # promote those to float first. Sample values are not rescaled.
            if not (data.is_floating_point() or data.is_complex()):
                data = data.to(torch.float32)
            data = data.mean(dim=0, keepdim=True)  # convert stereo/multi-channel to mono
        if data_len is None:
            data_len = [data.shape[1]]
    elif isinstance(data, (list, tuple)):
        samples = [
            torch.from_numpy(item) if isinstance(item, np.ndarray) else item
            for item in data
        ]
        # Lengths are taken before padding so they describe the real samples.
        data_len = [item.shape[0] for item in samples]
        data = pad_sequence(samples, batch_first=True)  # data: [batch, N]

    if frontend is None:
        # Same exception type as calling None, but with an actionable message.
        raise TypeError("extract_fbank() requires a callable `frontend`, got None")

    data, data_len = frontend(data, data_len, **kwargs)

    if isinstance(data_len, (list, tuple)):
        # NOTE(review): wrapping in an extra list yields shape [1, len(data_len)];
        # kept as-is because callers may rely on it — confirm whether a flat
        # [batch] tensor was intended.
        data_len = torch.tensor([data_len])
    return data.to(torch.float32), data_len.to(torch.int32)
| 316 | |
| 317 | |
| 318 | def _load_audio_ffmpeg(file: str, sr: int = 16000): |
no outgoing calls
no test coverage detected
searching dependent graphs…