(self, path, tokenizer: BertTokenizer, max_seq_length: int = 512, readin: int = 2000000, dupe_factor: int = 5, small_seq_prob: float = 0.1)
| 250 | |
class CleanBodyDataCreator(PretrainingDataCreator):
    """Build shuffled pre-training instances from a tab-separated text dump.

    Every input line must hold exactly three tab-separated fields, read here
    as ``url``, ``cleanbody`` and ``rand_int``; only ``cleanbody`` is used.
    Inside ``cleanbody`` the placeholders ``#TAB#``, ``#NULL#`` and ``#HASH#``
    are expanded to a space, the empty string and ``#`` respectively. The
    marker ``#R##N#`` separates documents and ``#N#`` separates the sequences
    of one document.
    """

    def __init__(self,
                 path,
                 tokenizer: BertTokenizer,
                 max_seq_length: int = 512,
                 readin: int = 2000000,
                 dupe_factor: int = 5,
                 small_seq_prob: float = 0.1,
                 min_document_tokens: int = 200):
        """Read ``path``, tokenize its documents and create training instances.

        Args:
            path: File to read; it is opened as UTF-8 text.
            tokenizer: Object whose ``tokenize(text)`` returns the tokens of
                one sequence.
            max_seq_length: Stored on the instance as ``max_seq_length``.
            readin: Accepted for signature compatibility; this constructor
                does not use it.
            dupe_factor: Number of passes made over the documents; each pass
                calls ``create_training_instance`` once per document.
            small_seq_prob: Stored on the instance as ``small_seq_prob``.
            min_document_tokens: Minimum total number of tokens a document
                must contain to be kept. The default of 200 is the threshold
                that used to be hard-coded.

        Raises:
            ValueError: If a line does not consist of exactly three
                tab-separated fields.
        """
        # NOTE(review): the parent constructor is not called here; presumably
        # it parses a different input format -- confirm before adding super().
        self.dupe_factor = dupe_factor
        self.max_seq_length = max_seq_length
        self.small_seq_prob = small_seq_prob

        documents = []
        with open(path, encoding='utf-8') as fd:
            for line_no, line in enumerate(tqdm(fd), start=1):
                fields = line.rstrip("\n").split("\t")
                if len(fields) != 3:
                    # Same exception type as the tuple-unpacking failure this
                    # replaces, but pointing at the offending row.
                    raise ValueError(
                        "{}: line {}: expected 3 tab-separated fields "
                        "(url, cleanbody, rand_int), got {}".format(
                            path, line_no, len(fields)))

                # NOTE(review): placeholders are expanded before the text is
                # split, so e.g. "#HASH#N#" becomes "#N#" and is then treated
                # as a sequence separator -- confirm this is intended.
                cleanbody = (fields[1]
                             .replace("#TAB#", " ")
                             .replace("#NULL#", "")
                             .replace("#HASH#", "#"))

                for raw_document in cleanbody.split("#R##N#"):
                    document = []
                    document_len = 0
                    for seq in raw_document.split("#N#"):
                        tok_seq = tokenizer.tokenize(seq)
                        if tok_seq:
                            document.append(tok_seq)
                            document_len += len(tok_seq)
                    # Empty documents are never kept, whatever the threshold.
                    if document and document_len >= min_document_tokens:
                        documents.append(document)

        # create_training_instance only receives an index, so it presumably
        # looks the document up in self.documents (defined outside this block).
        self.documents = documents
        instances = []
        for _ in range(self.dupe_factor):
            for index in range(len(documents)):
                instances.extend(self.create_training_instance(index))

        shuffle(instances)
        self.instances = instances
        self.len = len(instances)
        # Drop the reference to the tokenized documents once the instances exist.
        self.documents = None
| 290 | |
| 291 | |
| 292 | class WikiNBookCorpusPretrainingDataCreator(PretrainingDataCreator): |
nothing calls this directly
no test coverage detected