MCPcopy
hub / github.com/deepspeedai/DeepSpeedExamples / __init__

Method __init__

bing_bert/turing/dataset.py:232–260  ·  view source on GitHub ↗
(self, tokenizer: BertTokenizer, folder: str, logger, max_seq_length, index, data_type: PretrainDataType = PretrainDataType.NUMPY, max_predictions_per_seq: int = 20)

Source from the content-addressed store, hash-verified

230
class PreTrainingDataset(Dataset):
    """Dataset holding the samples of one loaded pretraining data partition.

    The constructor resolves a partition path with ``get_random_partition``
    and loads it with the creator class selected by ``data_type``.
    ``len(dataset)`` reports the number of loaded samples.
    """

    def __init__(self,
                 tokenizer: BertTokenizer,
                 folder: str,
                 logger,
                 max_seq_length,
                 index,
                 data_type: PretrainDataType = PretrainDataType.NUMPY,
                 max_predictions_per_seq: int = 20):
        """Load one partition of pretraining data.

        Args:
            tokenizer: Tokenizer whose ``vocab`` keys are cached as a list in
                ``self.vocab_words``.
            folder: Directory passed to ``get_random_partition``.
            logger: Object exposing ``info``; receives progress messages.
            max_seq_length: Stored as ``self.max_seq_length``.
            index: Passed to ``get_random_partition`` together with ``folder``.
            data_type: Selects the loader. ``VALIDATION`` loads through
                ``WikiPretrainingDataCreator``; ``NUMPY`` loads through
                ``NumpyPretrainingDataCreator``.
            max_predictions_per_seq: Stored as
                ``self.max_predictions_per_seq``.

        Raises:
            ValueError: If ``data_type`` is neither ``VALIDATION`` nor
                ``NUMPY``.
        """
        self.tokenizer = tokenizer
        self.dir_path = folder
        self.max_seq_length = max_seq_length
        self.len = 0
        # Fixed value; not configurable through the constructor.
        self.masked_lm_prob = 0.15
        self.max_predictions_per_seq = max_predictions_per_seq
        self.vocab_words = list(tokenizer.vocab.keys())

        path = get_random_partition(self.dir_path, index)

        logger.info(f"Loading Pretraining Data from {path}")
        start = time.time()
        if data_type == PretrainDataType.VALIDATION:
            self.data = WikiPretrainingDataCreator.load(path)
        elif data_type == PretrainDataType.NUMPY:
            self.data = NumpyPretrainingDataCreator.load(path)
        else:
            # Without this guard self.data is never assigned and the len()
            # call below fails with an AttributeError that hides the cause.
            raise ValueError(
                f"Unsupported pretraining data type: {data_type!r}")

        self.len = len(self.data)
        # Logged once; the timed message carries the sample count as well.
        logger.info(
            f"Data Loading Completed for Pretraining Data from {path} "
            f"with {self.len} samples took {time.time()-start:.2f}s.")

    def __len__(self):
        """Return the number of samples recorded by ``__init__``."""
        return self.len

Callers

nothing calls this directly

Calls 3

get_random_partition — Function · 0.85
info — Method · 0.80
load — Method · 0.45

Tested by

no test coverage detected