MCPcopy Index your code
hub / github.com/deepspeedai/DeepSpeedExamples / make_dataset

Function make_dataset

Megatron-LM/data_utils/__init__.py:61–121  ·  view source on GitHub ↗

Function to create datasets and tokenizers for common options.

(path, seq_length, text_key, label_key, lazy=False, process_fn=None, split=[1.],
                delim=',', loose=False, binarize_sent=False, drop_unlabeled=False, tokenizer=None,
                tokenizer_type='CharacterLevelTokenizer', tokenizer_model_path=None, vocab_size=None,
                model_type='bpe', pad_token=0, character_converage=1.0, non_binary_cols=None, **kwargs)

Source from the content-addressed store, hash-verified

59 return corpus_name in corpora.NAMED_CORPORA
60
61def make_dataset(path, seq_length, text_key, label_key, lazy=False, process_fn=None, split=[1.],
62 delim=',', loose=False, binarize_sent=False, drop_unlabeled=False, tokenizer=None,
63 tokenizer_type='CharacterLevelTokenizer', tokenizer_model_path=None, vocab_size=None,
64 model_type='bpe', pad_token=0, character_converage=1.0, non_binary_cols=None, **kwargs):
65 """function to create datasets+tokenizers for common options"""
66 if isinstance(process_fn, str):
67 process_fn = eval(process_fn)
68 if non_binary_cols is not None:
69 # multilabel dataset support (only for csvs)
70 label_key = non_binary_cols
71 def get_dataset_from_path(path_):
72 if lazy:
73 # get lazily loaded dataset
74 named_corpora = False
75 if supported_corpus(path_):
76 named_corpora = True
77 name = path_
78 path_ = corpora.NAMED_CORPORA[path_].PATH
79 if not exists_lazy(path_, data_type='data'):
80 # create cached version of dataset for lazy loading if it doesn't exist
81 text = get_dataset(name if named_corpora else path_, text_key=text_key, label_key=label_key, binarize_sent=binarize_sent,
82 delim=delim, drop_unlabeled=drop_unlabeled, loose_json=loose)
83 make_lazy(path_, text.X, data_type='data')
84 text = lazy_array_loader(path_, data_type='data', map_fn=process_fn)
85 else:
86 # get dataset
87 text = get_dataset(path_, text_key=text_key, label_key=label_key, binarize_sent=binarize_sent,
88 delim=delim, drop_unlabeled=drop_unlabeled, loose_json=loose, preprocess_fn=process_fn)
89 return text
90 # get one or multiple datasets and concatenate
91 if isinstance(path, str):
92 path = [path]
93 datasets = [get_dataset_from_path(p) for p in path]
94 if len(datasets) == 1:
95 ds = datasets[0]
96 else:
97 ds = ConcatDataset(datasets)
98 # make tokenizer for dataset
99 if tokenizer is None:
100 tokenizer = make_tokenizer(tokenizer_type, ds, tokenizer_model_path, vocab_size, model_type,
101 pad_token, character_converage, **kwargs)
102
103 ds_type = ''
104 if 'ds_type' in kwargs:
105 ds_type = kwargs['ds_type']
106 ds.SetTokenizer(tokenizer)
107 # Split dataset into train/val/test (and wrap bert dataset)
108 if should_split(split):
109 ds = split_ds(ds, split)
110 if ds_type.lower() == 'bert':
111 presplit_sentences = kwargs['presplit_sentences'] if 'presplit_sentences' in kwargs else False
112 ds = [bert_sentencepair_dataset(d, max_seq_len=seq_length, presplit_sentences=presplit_sentences) if d is not None else None for d in ds]
113 elif ds_type.lower() == 'gpt2':
114 ds = [GPT2Dataset(d, max_seq_len=seq_length) if d is not None else None for d in ds]
115 else:
116 if ds_type.lower() == 'bert':
117 presplit_sentences = kwargs['presplit_sentences'] if 'presplit_sentences' in kwargs else False
118 ds = bert_sentencepair_dataset(ds, max_seq_len=seq_length, presplit_sentences=presplit_sentences)

Callers

nothing calls this directly

Calls 8

get_dataset_from_path — Function · 0.85
ConcatDataset — Class · 0.85
make_tokenizer — Function · 0.85
should_split — Function · 0.85
split_ds — Function · 0.85
GPT2Dataset — Class · 0.70
SetTokenizer — Method · 0.45

Tested by

no test coverage detected