(self, item, video_path, visual_traces)
| 179 | """ |
| 180 | |
| 181 | def _construct_conv(self, item, video_path, visual_traces): |
| 182 | |
| 183 | # NOTE: for pretraining on video, we always set num_crops to 1 to save memory cost |
| 184 | item['num_crops'] = 1 |
| 185 | |
| 186 | if video_path is None and visual_traces is None: |
| 187 | dummy_conversations = [] |
| 188 | dummy_conversations.append({'from': 'human', 'value': f"{self.image_placeholder}\nWhat is in this image?"}) |
| 189 | dummy_conversations.append({'from': 'gpt', 'value': "This is a blank image."}) |
| 190 | item['conversations'] = dummy_conversations |
| 191 | item['image'] = None |
| 192 | return item |
| 193 | |
| 194 | if 'image_size' not in item: |
| 195 | assert '(height,width)' in item, f"image_size not in item and (height,width) not in item" |
| 196 | item['image_size'] = item['(height,width)'][::-1] |
| 197 | |
| 198 | if isinstance(item['image_size'][0], torch.Tensor): |
| 199 | width, height = item['image_size'][0].item(), item['image_size'][1].item() |
| 200 | frame_start, frame_end = item['frame_interval'][0].item(), item['frame_interval'][1].item() |
| 201 | task_description = item['global_instructions'][0] |
| 202 | gpt_response = item['gpt_response'][0] |
| 203 | else: |
| 204 | width, height = item['image_size'] |
| 205 | frame_start, frame_end = item['frame_interval'] |
| 206 | task_description = item['global_instructions'] |
| 207 | gpt_response = item['gpt_response'] |
| 208 | |
| 209 | gpt_response = self._process_gpt_response(gpt_response, task_description) |
| 210 | |
| 211 | if self.mm_use_image_history: |
| 212 | # randomly sample at most 3 unique indices in range [0, frame_start) |
| 213 | frame_idx = torch.randperm(frame_start)[:3].sort().values.tolist() + [frame_start] |
| 214 | else: |
| 215 | frame_idx = [frame_start] |
| 216 | |
| 217 | item['image'] = self._get_frames_with_idx(video_path, frame_idx, (width, height)) |
| 218 | if item['image'] is None: |
| 219 | dummy_conversations = [] |
| 220 | dummy_conversations.append({'from': 'human', 'value': f"{self.image_placeholder}\nWhat is in this image?"}) |
| 221 | dummy_conversations.append({'from': 'gpt', 'value': "This is a blank image."}) |
| 222 | item['conversations'] = dummy_conversations |
| 223 | return item |
| 224 | |
| 225 | conv_user, conv_gpt, gpt_response_todo = self._construct_conv_semantic(item, gpt_response, len(item['image'])) |
| 226 | item['conversations'] = [ |
| 227 | {'from': 'human', 'value': conv_user}, |
| 228 | {'from': 'gpt', 'value': conv_gpt} |
| 229 | ] |
| 230 | |
| 231 | if not self.use_som_tom or random.random() < 0.2: |
| 232 | return item |
| 233 | |
| 234 | if visual_traces is None: |
| 235 | return item |
| 236 | |
| 237 | if len(visual_traces['pred_tracks'].shape) == 3: |
| 238 | visual_traces['pred_tracks'] = visual_traces['pred_tracks'][None] |
no test coverage detected