hub / github.com/microsoft/Magma / _construct_caption

Method _construct_caption

data/conversations.py:570–621 · view source on GitHub ↗

v4->v5: add trace of mark

(self, item, video_path, visual_traces)

Source from the content-addressed store, hash-verified

568	return None
569
def _construct_caption(self, item, video_path, visual_traces):
    """
    Attach frames and a two-turn human/gpt conversation to ``item``.

    v4->v5: add trace of mark

    ``item`` is modified in place and the same object is returned. There are
    three possible outcomes:

    * ``video_path`` and ``visual_traces`` are both ``None``: a fixed
      "blank image" conversation is stored, ``item['image']`` is set to
      ``None`` and nothing else in ``item`` is read.
    * ``self._get_frames(...)`` returns something other than ``None``: the
      prompt gets one ``self.image_placeholder`` per returned frame and the
      answer is the output of ``self._process_gpt_response`` plus a newline.
    * ``self._get_frames(...)`` returns ``None``: a "blank image"
      conversation (with trailing newlines) is stored.

    Args:
        item: Mapping that provides ``frame_interval``,
            ``global_instructions``, ``gpt_response`` and either
            ``image_size`` (unpacked as width, height) or ``(height,width)``.
            When ``item['image_size'][0]`` is a ``torch.Tensor`` the numeric
            fields are read with ``.item()`` and the text fields are indexed
            with ``[0]`` (presumably a collated batch of size one -- confirm
            against the caller).
        video_path: Forwarded unchanged to ``self._get_frames``.
        visual_traces: Only compared against ``None`` in this method.

    Returns:
        The same ``item`` object with ``'image'`` and ``'conversations'``
        (and possibly ``'image_size'``) set.

    Raises:
        KeyError: If ``item`` has neither ``image_size`` nor
            ``(height,width)``. Raised explicitly (instead of via ``assert``)
            so the check and its message survive ``python -O``.
    """
    if video_path is None and visual_traces is None:
        # Nothing to load. NOTE: unlike the fallback at the end of this
        # method, these two strings carry no trailing newline.
        item['conversations'] = [
            {'from': 'human', 'value': f"{self.image_placeholder}\nWhat is in this image?"},
            {'from': 'gpt', 'value': "This is a blank image."},
        ]
        item['image'] = None
        return item

    if 'image_size' not in item:
        if '(height,width)' not in item:
            raise KeyError("image_size not in item and (height,width) not in item")
        # Reverse so the stored order matches the (width, height) unpacking below.
        item['image_size'] = item['(height,width)'][::-1]

    image_size = item['image_size']
    if isinstance(image_size[0], torch.Tensor):
        # Tensor-valued metadata: pull out Python scalars / first entries.
        width, height = image_size[0].item(), image_size[1].item()
        frame_interval = item['frame_interval']
        frame_start, frame_end = frame_interval[0].item(), frame_interval[1].item()
        task_description = item['global_instructions'][0]
        gpt_response = item['gpt_response'][0]
    else:
        width, height = image_size
        frame_start, frame_end = item['frame_interval']
        task_description = item['global_instructions']
        gpt_response = item['gpt_response']

    # Call order is kept as in the original: response processing first,
    # then frame extraction.
    gpt_response = self._process_gpt_response(gpt_response, task_description)
    item['image'] = self._get_frames(video_path, frame_start, frame_end, (width, height))

    if item['image'] is None:
        # Frame extraction yielded nothing: fall back to the blank-image pair.
        conv_user = f'{self.image_placeholder}\nWhat is in this image?\n'
        conv_gpt = "This is a blank image.\n"
    else:
        # One placeholder token per returned frame, concatenated without separators.
        image_placeholder = self.image_placeholder * len(item['image'])
        conv_user = (
            f'{image_placeholder}\nWhat do you see in the first image? And what will the person do next?\n'
        )
        conv_gpt = gpt_response + '\n'

    item['conversations'] = [
        {'from': 'human', 'value': conv_user},
        {'from': 'gpt', 'value': conv_gpt},
    ]
    return item

Callers 1

__call__ · Method · 0.80

Calls 2

_process_gpt_response · Method · 0.95

_get_frames · Method · 0.95

Tested by

no test coverage detected