(self, tokenizer: BertTokenizer, folder: str, logger, max_seq_length, index, data_type: PretrainDataType = PretrainDataType.NUMPY, max_predictions_per_seq: int = 20)
| 230 | |
class PreTrainingDataset(Dataset):
    """Dataset backed by a single pre-training data partition.

    On construction the partition path is resolved with
    ``get_random_partition(folder, index)`` and the samples are loaded
    with the creator class that matches ``data_type``. The number of
    loaded samples is cached in ``self.len`` and reported by ``len()``.
    """

    def __init__(self,
                 tokenizer: BertTokenizer,
                 folder: str,
                 logger,
                 max_seq_length,
                 index,
                 data_type: PretrainDataType = PretrainDataType.NUMPY,
                 max_predictions_per_seq: int = 20):
        """Load one partition of pre-training data.

        Args:
            tokenizer: Tokenizer whose ``vocab`` keys are copied into
                ``self.vocab_words``.
            folder: Directory handed to ``get_random_partition``.
            logger: Object exposing ``info(msg)``; receives progress messages.
            max_seq_length: Stored on the instance as ``max_seq_length``.
            index: Forwarded to ``get_random_partition`` together with
                ``folder`` (presumably selects the partition -- confirm
                against that helper).
            data_type: Selects the loader. Only ``VALIDATION`` and ``NUMPY``
                are handled.
            max_predictions_per_seq: Stored on the instance as
                ``max_predictions_per_seq``.

        Raises:
            ValueError: If ``data_type`` is not one of the handled types.
        """
        self.tokenizer = tokenizer
        self.dir_path = folder
        self.max_seq_length = max_seq_length
        self.len = 0
        self.masked_lm_prob = 0.15
        self.max_predictions_per_seq = max_predictions_per_seq
        self.vocab_words = list(tokenizer.vocab.keys())

        path = get_random_partition(self.dir_path, index)

        logger.info(f"Loading Pretraining Data from {path}")
        start = time.time()

        # NOTE: loaders for CLEAN_BODY / WIKIPEDIA / BOOK_CORPUS were
        # previously commented out here; only the two types below are live.
        if data_type == PretrainDataType.VALIDATION:
            self.data = WikiPretrainingDataCreator.load(path)
        elif data_type == PretrainDataType.NUMPY:
            self.data = NumpyPretrainingDataCreator.load(path)
        else:
            # Fail loudly instead of hitting an AttributeError on
            # ``self.data`` a few lines further down.
            raise ValueError(
                f"Unsupported data_type for PreTrainingDataset: {data_type!r}")

        # Compute the length and report completion exactly once (the
        # original repeated both steps back to back).
        self.len = len(self.data)
        logger.info(
            f"Data Loading Completed for Pretraining Data from {path} with {self.len} samples took {time.time()-start:.2f}s.")

    def __len__(self):
        """Return the number of samples loaded from the partition."""
        return self.len
nothing calls this directly
no test coverage detected