Preprocess multimodal data and return a dict with ``input_ids`` and multimodal features. New-style models inherit this implementation. Legacy models override with `def preprocess(self, messages)`.
(self,
messages: list[dict],
input_prompt: str | list[int],
mm_processor_kwargs: dict[str, Any] | None = None)
| 121 | raise NotImplementedError() |
| 122 | |
| 123 | def preprocess(self, |
| 124 | messages: list[dict], |
| 125 | input_prompt: str | list[int], |
| 126 | mm_processor_kwargs: dict[str, Any] | None = None) -> dict[str, Any]: |
| 127 | """Preprocess multimodal data and return a dict with ``input_ids`` and |
| 128 | multimodal features. |
| 129 | |
| 130 | New-style models inherit this implementation. Legacy models override with `def preprocess(self, messages)`. |
| 131 | """ |
| 132 | |
| 133 | mm_items = self.collect_multimodal_items(messages) |
| 134 | |
| 135 | raw_images, raw_videos, video_metadatas = [], [], [] |
| 136 | raw_audios = [] |
| 137 | raw_time_series, sampling_rates = [], [] |
| 138 | for modality, data, params in mm_items: |
| 139 | if modality == Modality.IMAGE: |
| 140 | raw_images.append(data) |
| 141 | elif modality == Modality.VIDEO: |
| 142 | raw_videos.append(data) |
| 143 | video_metadatas.append(params.get('video_metadata', None)) |
| 144 | elif modality == Modality.AUDIO: |
| 145 | raw_audios.append(data[0] if isinstance(data, tuple) else data) |
| 146 | elif modality == Modality.TIME_SERIES: |
| 147 | raw_time_series.append(data) |
| 148 | sampling_rates.append(params.get('sampling_rate', None)) |
| 149 | else: |
| 150 | raise ValueError(f'unsupported modality {modality}') |
| 151 | |
| 152 | # get kwargs for processor |
| 153 | kwargs = {} |
| 154 | images_kwargs = {} |
| 155 | videos_kwargs = {} |
| 156 | audio_kwargs = {} |
| 157 | mm_processor_kwargs = mm_processor_kwargs or {} |
| 158 | if raw_images: |
| 159 | kwargs['images'] = raw_images |
| 160 | image_size = get_override_size(self.processor.image_processor, |
| 161 | mm_processor_kwargs.get('image'), |
| 162 | modality='image') |
| 163 | if image_size is not None: |
| 164 | images_kwargs['size'] = image_size |
| 165 | if raw_videos: |
| 166 | kwargs['videos'] = raw_videos |
| 167 | videos_kwargs['video_metadata'] = video_metadatas |
| 168 | # resize in the HF processor; frame sampling has already been done in the video loader |
| 169 | videos_kwargs['do_resize'] = True |
| 170 | videos_kwargs['do_sample_frames'] = False |
| 171 | video_size = get_override_size(self.processor.video_processor, |
| 172 | mm_processor_kwargs.get('video'), |
| 173 | modality='video') |
| 174 | if video_size is not None: |
| 175 | videos_kwargs['size'] = video_size |
| 176 | if raw_audios: |
| 177 | kwargs['audio'] = raw_audios |
| 178 | audio_kwargs = dict(mm_processor_kwargs.get('audio') or {}) |
| 179 | feature_extractor = getattr(self.processor, 'feature_extractor', None) |
| 180 | sampling_rate = getattr(feature_extractor, 'sampling_rate', None) |
nothing calls this directly
no test coverage detected