MCPcopy
hub / github.com/showlab/Show-o / __init__

Method __init__

llava/llava_pretrain_data.py:59–76  ·  view source on GitHub ↗
(self, tokenizer)

Source from the content-addressed store, hash-verified

57class LLaVAPretrainCaptioningDataset(Dataset):
58
def __init__(self, tokenizer, *, data_file_path=None, image_root=None):
    """Load the LLaVA pretraining caption annotations and the image processor.

    Args:
        tokenizer: Tokenizer object; stored unchanged on ``self.tokenizer``.
        data_file_path: Optional path to the JSON annotation file. When
            omitted, the previously hard-coded location is used.
        image_root: Optional value stored on ``self.image_root``. When
            omitted, the previously hard-coded directory is used.

    Raises:
        OSError: If the annotation file cannot be opened.
        json.JSONDecodeError: If the annotation file is not valid JSON.
    """
    super(LLaVAPretrainCaptioningDataset, self).__init__()

    self.tokenizer = tokenizer

    # The defaults reproduce the original hard-coded locations, so existing
    # callers that pass only `tokenizer` behave exactly as before.
    if data_file_path is None:
        data_file_path = "/mnt/bn/vgfm2/test_dit/blip_laion_cc_sbu_558k.json"
    if image_root is None:
        image_root = "/mnt/bn/vgfm2/test_dit/pretraining_data"
    self.image_root = image_root

    # Decode explicitly as UTF-8 rather than relying on the locale default,
    # which may differ between machines.
    with open(data_file_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    # Keep only the entries that carry an 'image' key.
    # NOTE(review): assumes every entry is a dict — confirm against the data file.
    self.list_data_dict = [item for item in data if "image" in item]

    self.processor = CLIPImageProcessor.from_pretrained("openai/clip-vit-large-patch14-336")

    print("Formatting llava captioning data")
77
def __len__(self):
    """Return how many annotation entries are held in ``list_data_dict``."""
    entries = self.list_data_dict
    return len(entries)

Callers

nothing calls this directly

Calls 1

from_pretrained · Method · 0.45

Tested by

no test coverage detected