| 196 | |
| 197 | |
def add_audio(history, audio_path, data_stored):
    """Transcribe an audio file and append the text to the chat state.

    The audio at ``audio_path`` is loaded at 16 kHz, converted to model
    input features by the module-level ``audio_processor``, decoded by
    ``audio_model`` and turned back into text.

    Args:
        history: Chat history, a list of ``(user_message, bot_message)``
            tuples. Not mutated.
        audio_path: Path of the audio file to transcribe.
        data_stored: List of per-turn records. Not mutated.

    Returns:
        A ``(history, data_stored)`` tuple of new lists: ``history`` gains
        ``(text, None)`` and ``data_stored`` gains ``{'user_input': text}``.
    """
    # Whisper training sampling rate, do not modify.
    audio, sampling_rate = librosa.load(audio_path, sr=16000)

    # sampling_rate is passed by keyword: the second positional slot of the
    # Whisper feature extractor is `truncation`, not the sampling rate.
    input_features = audio_processor(
        audio, sampling_rate=sampling_rate, return_tensors="pt"
    ).input_features
    input_features = torch.Tensor(input_features).to(device)

    predicted_ids = audio_model.generate(
        input_features, forced_decoder_ids=forced_decoder_ids
    )
    transcriptions = audio_processor.batch_decode(
        predicted_ids, skip_special_tokens=True
    )

    # batch_decode returns one string per batch item. Use the decoded string
    # itself rather than the repr of the list, which would leave quote
    # characters and escape sequences in the stored text.
    text_input = transcriptions[0].strip() if transcriptions else ""

    # Build new lists so the caller's state objects are left untouched.
    data_stored = data_stored + [{'user_input': text_input}]
    history = history + [(text_input, None)]

    return history, data_stored
| 214 | |
| 215 | |
| 216 | def add_file(history, file, txt, motion_uploaded): |