v4->v5: add trace of mark
(self, item, video_path, visual_traces)
| 568 | return None |
| 569 | |
def _construct_caption(self, item, video_path, visual_traces):
    """
    Attach frames and a two-turn conversation to ``item`` and return it.

    Version note kept from the original docstring: v4->v5: add trace of mark.

    Args:
        item: Sample dict, updated in place. Must provide ``frame_interval``,
            ``global_instructions`` and ``gpt_response``, plus either
            ``image_size`` or ``(height,width)``.
        video_path: Forwarded to ``self._get_frames``.
        visual_traces: Only inspected for ``None`` in the first check below.

    Returns:
        The same ``item`` object, with ``image`` and ``conversations`` set
        (and ``image_size`` filled in when it was missing).

    Raises:
        AssertionError: If neither ``image_size`` nor ``(height,width)`` is
            present in ``item``.
    """
    def _two_turns(question, answer):
        # Single human -> gpt exchange in the layout stored on the item.
        return [
            {'from': 'human', 'value': question},
            {'from': 'gpt', 'value': answer},
        ]

    # Neither a video nor traces: return a blank-image sample right away.
    # These two strings intentionally carry no trailing newline.
    if video_path is None and visual_traces is None:
        item['conversations'] = _two_turns(
            f"{self.image_placeholder}\nWhat is in this image?",
            "This is a blank image.",
        )
        item['image'] = None
        return item

    # Derive image_size by reversing the (height, width) entry when needed.
    if 'image_size' not in item:
        assert '(height,width)' in item, "image_size not in item and (height,width) not in item"
        item['image_size'] = item['(height,width)'][::-1]

    size = item['image_size']
    if isinstance(size[0], torch.Tensor):
        # Tensor-valued fields: convert numbers with .item() and take the
        # first entry of the text fields.
        width, height = size[0].item(), size[1].item()
        interval = item['frame_interval']
        frame_start, frame_end = interval[0].item(), interval[1].item()
        task_description = item['global_instructions'][0]
        gpt_response = item['gpt_response'][0]
    else:
        width, height = size
        frame_start, frame_end = item['frame_interval']
        task_description = item['global_instructions']
        gpt_response = item['gpt_response']

    answer = self._process_gpt_response(gpt_response, task_description)

    frames = self._get_frames(video_path, frame_start, frame_end, (width, height))
    item['image'] = frames

    if frames is None:
        # No frames came back: fall back to the blank-image exchange
        # (this variant ends both turns with a newline).
        placeholders = ''.join([self.image_placeholder])
        question = f'{placeholders}\nWhat is in this image?\n'
        reply = "This is a blank image.\n"
    else:
        # One placeholder per returned frame.
        placeholders = ''.join([self.image_placeholder] * len(frames))
        question = (
            f'{placeholders}\nWhat do you see in the first image? And what will the person do next?\n'
        )
        reply = answer + '\n'

    item['conversations'] = _two_turns(question, reply)
    return item
no test coverage detected