MCPcopy Index your code
hub / github.com/deepspeedai/DeepSpeedExamples / __init__

Method __init__

Megatron-LM/data_utils/datasets.py:217–254  ·  view source on GitHub ↗
(self, path, tokenizer=None, preprocess_fn=None, delim=',',
                binarize_sent=False, drop_unlabeled=False, text_key='sentence', label_key='label',
                **kwargs)

Source from the content-addressed store, hash-verified

215 Y (np.ndarray): labels to train with
216 """
def __init__(self, path, tokenizer=None, preprocess_fn=None, delim=',',
             binarize_sent=False, drop_unlabeled=False, text_key='sentence', label_key='label',
             **kwargs):
    """Load the text column (and label column(s), when present) from a delimited file.

    Arguments:
        path (str): location of the delimited file to read.
        tokenizer: forwarded unchanged to `self.SetTokenizer`.
        preprocess_fn (callable): stored on the instance as `self.preprocess_fn`.
        delim (str): field separator; replaced by a tab when `path` contains '.tsv'.
        binarize_sent: when truthy, labels are passed through
            `binarize_labels(self.Y, hard=binarize_sent)`.
        drop_unlabeled (bool): stored on the instance; not otherwise used in this method.
        text_key (str): name of the column holding the text.
        label_key (str or list): name, or list of names, of the label column(s).
        **kwargs: accepted for call compatibility; not used in this method.

    Rows containing a missing value in any loaded column are dropped. If the
    label column(s) cannot be loaded, only the text column is read and every
    label is set to -1.
    """
    self.is_lazy = False
    self.preprocess_fn = preprocess_fn
    self.SetTokenizer(tokenizer)
    self.path = path
    self.delim = delim
    self.text_key = text_key
    self.label_key = label_key
    self.drop_unlabeled = drop_unlabeled

    # Substring match (not a suffix match), so names such as 'data.tsv.gz' also count.
    if '.tsv' in self.path:
        self.delim = '\t'

    self.X = []
    self.Y = []

    label_cols = label_key if isinstance(label_key, list) else [label_key]
    try:
        data = pd.read_csv(self.path, sep=self.delim,
                           usecols=[text_key] + label_cols, encoding='latin-1')
        labels_loaded = True
    except Exception:
        # Best-effort fallback (typically the label column is missing): read the
        # text column alone. `Exception` rather than a bare `except` so that
        # KeyboardInterrupt / SystemExit still propagate.
        data = pd.read_csv(self.path, sep=self.delim, usecols=[text_key], encoding='latin-1')
        labels_loaded = False

    data = data.dropna(axis=0)

    self.X = data[text_key].values.tolist()
    if labels_loaded:
        self.Y = data[label_key].values
    else:
        # No labels available: mark every example with the placeholder label -1.
        self.Y = np.full(len(self.X), -1.0)

    if binarize_sent:
        self.Y = binarize_labels(self.Y, hard=binarize_sent)
255
256 def SetTokenizer(self, tokenizer):
257 if tokenizer is None:

Callers 1

__init__Method · 0.45

Calls 1

SetTokenizerMethod · 0.95

Tested by

no test coverage detected