MCPcopy
hub / github.com/deepspeedai/DeepSpeedExamples / __init__

Method __init__

bing_bert/turing/sources.py:252–289  ·  view source on GitHub ↗
(self, path, tokenizer: BertTokenizer,  max_seq_length: int = 512, readin: int = 2000000, dupe_factor: int = 5, small_seq_prob: float = 0.1)

Source from the content-addressed store, hash-verified

250
class CleanBodyDataCreator(PretrainingDataCreator):
    """Create shuffled pretraining instances from a tab-separated "clean body" file.

    Every input line must hold exactly three tab-separated fields; only the
    second one (the clean body) is used.  Inside the clean body, ``#R##N#``
    separates documents and ``#N#`` separates the sequences of one document.
    """

    def __init__(self,
                 path,
                 tokenizer: BertTokenizer,
                 max_seq_length: int = 512,
                 readin: int = 2000000,
                 dupe_factor: int = 5,
                 small_seq_prob: float = 0.1):
        """Read ``path``, tokenize its documents and build the training instances.

        Args:
            path: Path of the UTF-8 encoded, tab-separated input file.
            tokenizer: Object whose ``tokenize(text)`` returns a sized
                sequence of tokens for one text sequence.
            max_seq_length: Stored as ``self.max_seq_length``; presumably read
                by ``create_training_instance`` (confirm in the base class).
            readin: Accepted for interface compatibility; this constructor
                does not read it.
            dupe_factor: Number of passes made over the documents when
                generating instances.
            small_seq_prob: Stored as ``self.small_seq_prob``; presumably read
                by ``create_training_instance`` (confirm in the base class).

        Raises:
            ValueError: If a line does not contain exactly three
                tab-separated fields.
        """
        self.dupe_factor = dupe_factor
        self.max_seq_length = max_seq_length
        self.small_seq_prob = small_seq_prob

        documents = []
        with open(path, encoding='utf-8') as fd:
            for line_no, line in enumerate(tqdm(fd), start=1):
                fields = line.rstrip("\n").split("\t")
                if len(fields) != 3:
                    # Same exception type as the tuple-unpacking failure this
                    # replaces, but it now says where the bad record is.
                    raise ValueError(
                        "{}: line {}: expected 3 tab-separated fields, "
                        "found {}".format(path, line_no, len(fields)))
                # fields[0] (url) and fields[2] (random integer) are unused.
                cleanbody = fields[1]
                # Undo the placeholder escaping used by the input format.
                cleanbody = cleanbody.replace("#TAB#", " ").replace(
                    "#NULL#", "").replace("#HASH#", "#")
                for raw_document in cleanbody.split("#R##N#"):
                    document = []
                    document_len = 0
                    for seq in raw_document.split("#N#"):
                        tok_seq = tokenizer.tokenize(seq)
                        if len(tok_seq) != 0:
                            document.append(tok_seq)
                            document_len += len(tok_seq)
                    # Keep only documents with at least 200 tokens in total;
                    # such a document can never be empty, so no further
                    # filtering of ``documents`` is needed afterwards.
                    if document_len >= 200:
                        documents.append(document)

        # create_training_instance() is called with a document index, so the
        # documents must be reachable through self while instances are built.
        self.documents = documents
        instances = []
        for _ in range(self.dupe_factor):
            for index in range(len(self.documents)):
                instances.extend(self.create_training_instance(index))

        shuffle(instances)
        self.instances = instances
        self.len = len(self.instances)
        # The tokenized documents are not kept once the instances exist.
        self.documents = None
290
291
292class WikiNBookCorpusPretrainingDataCreator(PretrainingDataCreator):

Callers

nothing calls this directly

Calls 4

appendMethod · 0.80
extendMethod · 0.80
tokenizeMethod · 0.45

Tested by

no test coverage detected