MCPcopy
hub / github.com/microsoft/Magma / _construct_conv

Method _construct_conv

data/conversations.py:181–518  ·  view source on GitHub ↗
(self, item, video_path, visual_traces)

Source from the content-addressed store, hash-verified

179 """
180
181 def _construct_conv(self, item, video_path, visual_traces):
182
183 # NOTE: for pretraining on video, we always set num_crops to 1 to save memory cost
184 item['num_crops'] = 1
185
186 if video_path is None and visual_traces is None:
187 dummy_conversations = []
188 dummy_conversations.append({'from': 'human', 'value': f"{self.image_placeholder}\nWhat is in this image?"})
189 dummy_conversations.append({'from': 'gpt', 'value': "This is a blank image."})
190 item['conversations'] = dummy_conversations
191 item['image'] = None
192 return item
193
194 if 'image_size' not in item:
195 assert '(height,width)' in item, f"image_size not in item and (height,width) not in item"
196 item['image_size'] = item['(height,width)'][::-1]
197
198 if isinstance(item['image_size'][0], torch.Tensor):
199 width, height = item['image_size'][0].item(), item['image_size'][1].item()
200 frame_start, frame_end = item['frame_interval'][0].item(), item['frame_interval'][1].item()
201 task_description = item['global_instructions'][0]
202 gpt_response = item['gpt_response'][0]
203 else:
204 width, height = item['image_size']
205 frame_start, frame_end = item['frame_interval']
206 task_description = item['global_instructions']
207 gpt_response = item['gpt_response']
208
209 gpt_response = self._process_gpt_response(gpt_response, task_description)
210
211 if self.mm_use_image_history:
212 # randomly sample at most 3 unique indices in range [0, frame_start)
213 frame_idx = torch.randperm(frame_start)[:3].sort().values.tolist() + [frame_start]
214 else:
215 frame_idx = [frame_start]
216
217 item['image'] = self._get_frames_with_idx(video_path, frame_idx, (width, height))
218 if item['image'] is None:
219 dummy_conversations = []
220 dummy_conversations.append({'from': 'human', 'value': f"{self.image_placeholder}\nWhat is in this image?"})
221 dummy_conversations.append({'from': 'gpt', 'value': "This is a blank image."})
222 item['conversations'] = dummy_conversations
223 return item
224
225 conv_user, conv_gpt, gpt_response_todo = self._construct_conv_semantic(item, gpt_response, len(item['image']))
226 item['conversations'] = [
227 {'from': 'human', 'value': conv_user},
228 {'from': 'gpt', 'value': conv_gpt}
229 ]
230
231 if not self.use_som_tom or random.random() < 0.2:
232 return item
233
234 if visual_traces is None:
235 return item
236
237 if len(visual_traces['pred_tracks'].shape) == 3:
238 visual_traces['pred_tracks'] = visual_traces['pred_tracks'][None]

Callers 6

__call__ · Method · 0.80
__call__ · Method · 0.80
__call__ · Method · 0.80
__call__ · Method · 0.80
__call__ · Method · 0.80
__call__ · Method · 0.80

Calls 10

_process_gpt_response · Method · 0.95
_get_frames_with_idx · Method · 0.95
_get_frame · Method · 0.95
_construct_conv_som · Method · 0.95
som_prompting · Function · 0.90
cluster_traces_kmeans · Method · 0.80
visualize · Method · 0.45
visual_trace_length · Method · 0.45

Tested by

no test coverage detected