(self, inputs, chunk_length_s=0, stride_length_s=None)
| 58 | |
| 59 | # Copy of transformers `AutomaticSpeechRecognitionPipeline.preprocess` method with call to custom `chunk_iter` |
| 60 | def preprocess(self, inputs, chunk_length_s=0, stride_length_s=None): |
| 61 | if isinstance(inputs, str): |
| 62 | if inputs.startswith("http://") or inputs.startswith("https://"): |
| 63 | # We need to actually check for a real protocol, otherwise it's impossible to use a local file |
| 64 | # like http_huggingface_co.png |
| 65 | inputs = requests.get(inputs).content |
| 66 | else: |
| 67 | with open(inputs, "rb") as f: |
| 68 | inputs = f.read() |
| 69 | |
| 70 | if isinstance(inputs, bytes): |
| 71 | inputs = ffmpeg_read(inputs, self.feature_extractor.sampling_rate) |
| 72 | |
| 73 | stride = None |
| 74 | extra = {} |
| 75 | if isinstance(inputs, dict): |
| 76 | stride = inputs.pop("stride", None) |
| 77 | # Accepting `"array"` which is the key defined in `datasets` for |
| 78 | # better integration |
| 79 | if not ("sampling_rate" in inputs and ("raw" in inputs or "array" in inputs)): |
| 80 | raise ValueError( |
| 81 | "When passing a dictionary to AutomaticSpeechRecognitionPipeline, the dict needs to contain a " |
| 82 | '"raw" key containing the numpy array representing the audio and a "sampling_rate" key, ' |
| 83 | "containing the sampling_rate associated with that array" |
| 84 | ) |
| 85 | |
| 86 | _inputs = inputs.pop("raw", None) |
| 87 | if _inputs is None: |
| 88 | # Remove path which will not be used from `datasets`. |
| 89 | inputs.pop("path", None) |
| 90 | _inputs = inputs.pop("array", None) |
| 91 | in_sampling_rate = inputs.pop("sampling_rate") |
| 92 | extra = inputs |
| 93 | inputs = _inputs |
| 94 | if in_sampling_rate != self.feature_extractor.sampling_rate: |
| 95 | if is_torchaudio_available(): |
| 96 | from torchaudio import functional as F |
| 97 | else: |
| 98 | raise ImportError( |
| 99 | "torchaudio is required to resample audio samples in AutomaticSpeechRecognitionPipeline. " |
| 100 | "The torchaudio package can be installed through: `pip install torchaudio`." |
| 101 | ) |
| 102 | |
| 103 | inputs = F.resample( |
| 104 | torch.from_numpy(inputs), in_sampling_rate, self.feature_extractor.sampling_rate |
| 105 | ).numpy() |
| 106 | ratio = self.feature_extractor.sampling_rate / in_sampling_rate |
| 107 | else: |
| 108 | ratio = 1 |
| 109 | if stride is not None: |
| 110 | if stride[0] + stride[1] > inputs.shape[0]: |
| 111 | raise ValueError("Stride is too large for input") |
| 112 | |
| 113 | # Stride needs to get the chunk length here, it's going to get |
| 114 | # swallowed by the `feature_extractor` later, and then batching |
| 115 | # can add extra data in the inputs, so we need to keep track |
| 116 | # of the original length in the stride so we can cut properly. |
| 117 | stride = (inputs.shape[0], int(round(stride[0] * ratio)), int(round(stride[1] * ratio))) |
nothing calls this directly
no test coverage detected