(self, index)
| 197 | return self.len |
| 198 | |
def __getitem__(self, index):
    """Build one ranking example from a (query, instance, label) triple.

    The index wraps around ``self.len``. The instance text is split on
    the literal ``'<sep>'`` marker; each piece is tokenized separately
    and the pieces are joined with a ``'[SEP]'`` token between them
    (none after the last piece). The query is capped at half of
    ``self.max_seq_len`` tokens, and the instance tokens are cut to
    whatever room is left after reserving 3 positions.

    Returns a 5-tuple: the ``BatchType.RANKING_BATCH`` marker passed
    through ``map_to_torch``, the three values produced by
    ``encode_sequence`` (input ids, input mask, sequence ids), and the
    float label passed through ``map_to_torch_float``.
    """
    row = index % self.len
    query, instance, label = self.data.all_pairs[row]
    label = float(label)

    segments = instance.split('<sep>')
    query_tokens = self.tokenizer.tokenize(query)

    # Concatenate the tokenized segments, placing '[SEP]' between
    # consecutive segments only.
    instance_tokens = []
    for position, segment in enumerate(segments):
        if position > 0:
            instance_tokens.append('[SEP]')
        instance_tokens.extend(self.tokenizer.tokenize(segment))

    # The query may use at most half of the sequence budget.
    query_limit = self.max_seq_len // 2
    if len(query_tokens) > query_limit:
        query_tokens = query_tokens[:query_limit]

    # Reserve 3 positions for the special tokens (SEP and CLS).
    instance_limit = self.max_seq_len - len(query_tokens) - 3
    if len(instance_tokens) > instance_limit:
        instance_tokens = instance_tokens[:instance_limit]

    input_ids, input_mask, sequence_ids = encode_sequence(
        query_tokens, instance_tokens, self.max_seq_len, self.tokenizer)

    batch_type = map_to_torch([BatchType.RANKING_BATCH])
    label_value = map_to_torch_float([label])
    return (batch_type, input_ids, input_mask, sequence_ids, label_value)
| 229 | |
| 230 | |
| 231 | class PreTrainingDataset(Dataset): |
nothing calls this directly
no test coverage detected