MCPcopy
hub / github.com/microsoft/Magma / _construct_caption

Method _construct_caption

data/conversations.py:570–621  ·  view source on GitHub ↗

v4->v5: add trace of mark

(self, item, video_path, visual_traces)

Source from the content-addressed store, hash-verified

568 return None
569
570 def _construct_caption(self, item, video_path, visual_traces):
571 """
572 v4->v5: add trace of mark
573 """
574 if video_path is None and visual_traces is None:
575 dummy_conversations = []
576 dummy_conversations.append({'from': 'human', 'value': f"{self.image_placeholder}\nWhat is in this image?"})
577 dummy_conversations.append({'from': 'gpt', 'value': "This is a blank image."})
578 item['conversations'] = dummy_conversations
579 item['image'] = None
580 return item
581
582 if 'image_size' not in item:
583 assert '(height,width)' in item, f"image_size not in item and (height,width) not in item"
584 item['image_size'] = item['(height,width)'][::-1]
585
586 if isinstance(item['image_size'][0], torch.Tensor):
587 width, height = item['image_size'][0].item(), item['image_size'][1].item()
588 frame_start, frame_end = item['frame_interval'][0].item(), item['frame_interval'][1].item()
589 task_description = item['global_instructions'][0]
590 gpt_response = item['gpt_response'][0]
591 else:
592 width, height = item['image_size']
593 frame_start, frame_end = item['frame_interval']
594 task_description = item['global_instructions']
595 gpt_response = item['gpt_response']
596
597 gpt_response = self._process_gpt_response(gpt_response, task_description)
598
599 item['image'] = self._get_frames(video_path, frame_start, frame_end, (width, height))
600
601 if item['image'] is not None:
602 image_placeholder = ''.join([self.image_placeholder] * len(item['image']))
603 conv_user = (
604 f'{image_placeholder}\nWhat do you see in the first image? And what will the person do next?\n'
605 )
606 conv_gpt = gpt_response + '\n'
607 item['conversations'] = [
608 {'from': 'human', 'value': conv_user},
609 {'from': 'gpt', 'value': conv_gpt}
610 ]
611 else:
612 image_placeholder = ''.join([self.image_placeholder])
613 conv_user = (
614 f'{image_placeholder}\nWhat is in this image?\n'
615 )
616 conv_gpt = "This is a blank image.\n"
617 item['conversations'] = [
618 {'from': 'human', 'value': conv_user},
619 {'from': 'gpt', 'value': conv_gpt}
620 ]
621 return item

Callers 1

__call__ (Method) · 0.80

Calls 2

_process_gpt_response (Method) · 0.95
_get_frames (Method) · 0.95

Tested by

no test coverage detected