hub / github.com/microsoft/Magma / _construct_conv

Method _construct_conv

data/conversations.py:181–518 · view source on GitHub ↗

(self, item, video_path, visual_traces)

Source from the content-addressed store, hash-verified

179	"""
180
181	def _construct_conv(self, item, video_path, visual_traces):
182
183	# NOTE: for pretraining on video, we always set num_crops to 1 to save memory cost
184	item['num_crops'] = 1
185
186	if video_path is None and visual_traces is None:
187	dummy_conversations = []
188	dummy_conversations.append({'from': 'human', 'value': f"{self.image_placeholder}\nWhat is in this image?"})
189	dummy_conversations.append({'from': 'gpt', 'value': "This is a blank image."})
190	item['conversations'] = dummy_conversations
191	item['image'] = None
192	return item
193
194	if 'image_size' not in item:
195	assert '(height,width)' in item, f"image_size not in item and (height,width) not in item"
196	item['image_size'] = item['(height,width)'][::-1]
197
198	if isinstance(item['image_size'][0], torch.Tensor):
199	width, height = item['image_size'][0].item(), item['image_size'][1].item()
200	frame_start, frame_end = item['frame_interval'][0].item(), item['frame_interval'][1].item()
201	task_description = item['global_instructions'][0]
202	gpt_response = item['gpt_response'][0]
203	else:
204	width, height = item['image_size']
205	frame_start, frame_end = item['frame_interval']
206	task_description = item['global_instructions']
207	gpt_response = item['gpt_response']
208
209	gpt_response = self._process_gpt_response(gpt_response, task_description)
210
211	if self.mm_use_image_history:
212	# randomly sample at most 3 unique indices in range [0, frame_start)
213	frame_idx = torch.randperm(frame_start)[:3].sort().values.tolist() + [frame_start]
214	else:
215	frame_idx = [frame_start]
216
217	item['image'] = self._get_frames_with_idx(video_path, frame_idx, (width, height))
218	if item['image'] is None:
219	dummy_conversations = []
220	dummy_conversations.append({'from': 'human', 'value': f"{self.image_placeholder}\nWhat is in this image?"})
221	dummy_conversations.append({'from': 'gpt', 'value': "This is a blank image."})
222	item['conversations'] = dummy_conversations
223	return item
224
225	conv_user, conv_gpt, gpt_response_todo = self._construct_conv_semantic(item, gpt_response, len(item['image']))
226	item['conversations'] = [
227	{'from': 'human', 'value': conv_user},
228	{'from': 'gpt', 'value': conv_gpt}
229	]
230
231	if not self.use_som_tom or random.random() < 0.2:
232	return item
233
234	if visual_traces is None:
235	return item
236
237	if len(visual_traces['pred_tracks'].shape) == 3:
238	visual_traces['pred_tracks'] = visual_traces['pred_tracks'][None]

Callers 6

__call__ Method · 0.80

Calls 10

_process_gpt_response Method · 0.95

_get_frames_with_idx Method · 0.95

_construct_conv_semantic Method · 0.95

_get_frame Method · 0.95

_construct_conv_som Method · 0.95

som_prompting Function · 0.90

cluster_traces_kmeans Method · 0.80

remove_close_points_tensor Method · 0.80

visualize Method · 0.45

visual_trace_length Method · 0.45

Tested by

no test coverage detected