MCPcopy Index your code
hub / github.com/deepspeedai/DeepSpeedExamples / __getitem__

Method __getitem__

Megatron-LM/data_utils/datasets.py:277–290  ·  view source on GitHub ↗

Processes and tokenizes the string at the given index, then returns a dict containing the text, its length, and the label.

(self, index)

Source from the content-addressed store, hash-verified

275 return len(self.X)
276
def __getitem__(self, index):
    """Return the sample at `index` as a dict with 'text', 'length' and 'label'.

    The text comes from `self.X[index]` and the label from `self.Y[index]`.
    The text is always run through the processing step below; the label is
    run through it only when it is a `str`, otherwise it is returned as-is.
    'length' is `len()` of the processed text.
    """
    def _process(raw):
        # The tokenizer takes precedence and is handed preprocess_fn itself;
        # preprocess_fn is applied directly only when no tokenizer is set.
        if self.tokenizer is not None:
            return self.tokenizer.EncodeAsIds(raw, self.preprocess_fn)
        if self.preprocess_fn is not None:
            return self.preprocess_fn(raw)
        return raw

    text = _process(self.X[index])
    label = self.Y[index]
    if isinstance(label, str):
        label = _process(label)
    return {'text': text, 'length': len(text), 'label': label}
291
292 def write(self, writer_gen=None, path=None, skip_header=False):
293 """

Callers

nothing calls this directly

Calls 1

EncodeAsIds · Method · 0.45

Tested by

no test coverage detected