MCPcopy Index your code
hub / github.com/zai-org/CogView / StreamingRarDataset

Class StreamingRarDataset

preprocess/raw_datasets.py:147–224  ·  view source on GitHub ↗

Source from the content-addressed store, hash-verified

145# self.zip.close()
146
147class StreamingRarDataset(IterableDataset):
def __init__(self, path, transform=None, default_size=256):
    """Open the RAR archive at ``path`` and build the per-entry decoder.

    Args:
        path: Location of the RAR archive, handed to ``rarfile.RarFile``.
        transform: Optional callable applied to every decoded RGB image.
        default_size: Side length of the all-zero ``(3, size, size)``
            tensor returned in place of an entry PIL cannot identify.
    """
    from PIL import ImageFile
    # Module-level PIL flag (affects every later image load in the process):
    # tolerate truncated image files instead of failing on them.
    ImageFile.LOAD_TRUNCATED_IMAGES = True
    print("begin open rar")
    self.rar = rarfile.RarFile(path)
    print("finish open rar")
    self.transform = transform

    def callback_fn(file_buffer, filename):
        """Decode one archive entry and return ``(image, name)``.

        ``file_buffer`` must expose ``get_bytes()`` returning something
        ``Image.open`` accepts; ``filename`` is the entry's path inside
        the archive. The image is converted to RGB and passed through
        ``self.transform`` when one was supplied.

        Entries PIL cannot identify do not raise: a zero tensor and the
        sentinel name ``"not_a_image"`` are returned instead.
        """
        try:
            img = Image.open(file_buffer.get_bytes()).convert('RGB')
            # Keep only the part of the base name before the FIRST dot,
            # so "dir/a.b.jpg" yields "a" (not "a.b").
            filename = os.path.basename(filename).split('.')[0]
            if self.transform is not None:
                img = self.transform(img)
            return img, filename
        except PIL.UnidentifiedImageError:
            print("UnidentifiedImageError")
            return torch.zeros((3, default_size, default_size)), "not_a_image"

    self.callback_fn = callback_fn
    # No unrar handle yet; __next__ opens one the first time it finds
    # self.handle unset.
    self.handle = None
170
171 def __len__(self):
172 return len(self.rar.filelist)
173 def __next__(self):
174 if self.pointer >= len(self.members):
175 raise StopIteration()
176 if self.handle == None:
177 archive = unrarlib.RAROpenArchiveDataEx(
178 self.rar.filename, mode=constants.RAR_OM_EXTRACT)
179 self.handle = self.rar._open(archive)
180 # callback to memory
181 self.data_storage = _ReadIntoMemory()
182 c_callback = unrarlib.UNRARCALLBACK(self.data_storage._callback)
183 unrarlib.RARSetCallback(self.handle, c_callback, 0)
184 handle = self.handle
185 try:
186 rarinfo = self.rar._read_header(handle)
187 while rarinfo is not None:
188 if rarinfo.filename == self.members[self.pointer]:
189 self.rar._process_current(handle, constants.RAR_TEST)
190 break
191 else:
192 self.rar._process_current(handle, constants.RAR_SKIP)
193 rarinfo = self.rar._read_header(handle)
194
195 if rarinfo is None:
196 self.data_storage = None
197
198 except unrarlib.UnrarException:
199 raise BadRarFile("Bad RAR archive data.")
200
201 if self.data_storage is None:
202 raise KeyError('There is no item named %r in the archive' % self.members[self.pointer])
203
204 # return file-like object

Callers 1

Calls

no outgoing calls

Tested by

no test coverage detected