MCPcopy
hub / github.com/deepspeedai/DeepSpeedExamples / __getitem__

Method __getitem__

bing_bert/turing/dataset.py:199–228  ·  view source on GitHub ↗
(self, index)

Source from the content-addressed store, hash-verified

197 return self.len
198
def __getitem__(self, index):
    """Build the ranking-batch example stored at ``index``.

    ``index`` is wrapped modulo ``self.len``, so values outside
    ``[0, self.len)`` still select a stored pair.

    The selected ``(query, instance, label)`` entry of
    ``self.data.all_pairs`` is processed as follows:

    * ``label`` is converted to ``float``.
    * ``instance`` is split on the literal ``'<sep>'``; each piece is
      tokenized on its own and the pieces are re-joined with a
      ``'[SEP]'`` token between neighbours.
    * The query is capped at ``self.max_seq_len // 2`` tokens, and the
      instance at whatever is left after reserving 3 positions for the
      SEP and CLS tokens.

    Returns:
        A 5-tuple ``(batch_type, input_ids, input_mask, sequence_ids,
        label)``. ``batch_type`` and ``label`` are whatever
        ``map_to_torch`` / ``map_to_torch_float`` produce for
        one-element lists; the middle three values are passed through
        unchanged from ``encode_sequence`` (defined elsewhere --
        presumably sized to ``self.max_seq_len``; confirm there).
    """
    row = index % self.len
    query, instance, label = self.data.all_pairs[row]
    label = float(label)

    segments = instance.split('<sep>')
    tokenize = self.tokenizer.tokenize

    query_tokens = tokenize(query)

    # Tokenize each segment separately and place a '[SEP]' between
    # neighbouring segments (never after the last one).
    instance_tokens = []
    for position, segment in enumerate(segments):
        if position > 0:
            instance_tokens.append('[SEP]')
        instance_tokens.extend(tokenize(segment))

    # The query may use at most half of the sequence.
    query_budget = self.max_seq_len // 2
    if len(query_tokens) > query_budget:
        query_tokens = query_tokens[:query_budget]

    # Removing 3 for SEP and CLS
    instance_budget = self.max_seq_len - len(query_tokens) - 3
    instance_tokens = instance_tokens[:instance_budget]

    input_ids, input_mask, sequence_ids = encode_sequence(
        query_tokens, instance_tokens, self.max_seq_len, self.tokenizer)

    return (
        map_to_torch([BatchType.RANKING_BATCH]),
        input_ids,
        input_mask,
        sequence_ids,
        map_to_torch_float([label]),
    )
229
230
231class PreTrainingDataset(Dataset):

Callers

nothing calls this directly

Calls 6

encode_sequence (Function) · 0.85
map_to_torch (Function) · 0.85
map_to_torch_float (Function) · 0.85
extend (Method) · 0.80
append (Method) · 0.80
tokenize (Method) · 0.45

Tested by

no test coverage detected