Class for loading datasets from a JSON dump. Purpose: useful for loading data for unsupervised modeling or transfer tasks. Arguments: path (str): path to the JSON file with the dataset; tokenizer (data_utils.Tokenizer): tokenizer to use when processing text. Default: None.
| 313 | c.writerow(row) |
| 314 | |
| 315 | class json_dataset(data.Dataset): |
| 316 | """ |
| 317 | Class for loading datasets from a json dump. |
| 318 | Purpose: Useful for loading data for unsupervised modeling or transfer tasks |
| 319 | Arguments: |
| 320 | path (str): path to json file with dataset. |
| 321 | tokenizer (data_utils.Tokenizer): Tokenizer to use when processing text. Default: None |
| 322 | preprocess_fn (callable): callable function that process a string into desired format. |
| 323 | Takes string, maxlen=None, encode=None as arguments. Default: process_str |
| 324 | text_key (str): key to get text from json dictionary. Default: 'sentence' |
| 325 | label_key (str): key to get label from json dictionary. Default: 'label' |
| 326 | Attributes: |
| 327 | all_strs (list): list of all strings from the dataset |
| 328 | all_labels (list): list of all labels from the dataset (if they have it) |
| 329 | """ |
def __init__(self, path, tokenizer=None, preprocess_fn=None, binarize_sent=False,
             text_key='sentence', label_key='label', loose_json=False, **kwargs):
    """
    Eagerly read every record of the json dump at `path` into memory.

    The text of each record is collected in `self.X` and its label in
    `self.Y`, in the order produced by `self.load_json_stream(path)`.
    Arguments:
        path (str): path to the json file; also stored as `self.path`.
        tokenizer: handed to `SetTokenizer`. Default: None
        preprocess_fn (callable): stored as `self.preprocess_fn`. Default: None
        binarize_sent: when truthy, `self.Y` is replaced by
            `binarize_labels(self.Y, hard=binarize_sent)`. Default: False
        text_key (str): key used to index each record for its text.
            Default: 'sentence'
        label_key (str): key used to index each record for its label.
            Default: 'label'
        loose_json (bool): stored as `self.loose_json`; not otherwise used
            here (presumably read by `load_json_stream` -- confirm).
            Default: False
        **kwargs: accepted and ignored.
    Note: every record is indexed with both `text_key` and `label_key`, so
    a record missing either key fails the lookup (a KeyError if records are
    dicts, as the class docstring suggests).
    """
    # Plain configuration first, so it is all in place before any data is read.
    self.is_lazy = False
    self.preprocess_fn = preprocess_fn
    self.path = path
    self.SetTokenizer(tokenizer)

    texts = []
    labels = []
    self.X = texts
    self.Y = labels
    self.text_key = text_key
    self.label_key = label_key
    self.loose_json = loose_json

    # Text is appended before the label lookup, one record at a time.
    for record in self.load_json_stream(self.path):
        texts.append(record[text_key])
        labels.append(record[label_key])

    if binarize_sent:
        self.Y = binarize_labels(self.Y, hard=binarize_sent)
| 349 | |
def SetTokenizer(self, tokenizer):
    """
    Install `tokenizer`, or switch tokenization off when it is None.

    Passing None only clears the `using_tokenizer` flag: a tokenizer that
    was stored earlier stays in `self._tokenizer`. The attribute is
    created (as None) only if it does not exist yet.
    """
    self.using_tokenizer = tokenizer is not None
    # Overwrite the stored tokenizer for a real one, or initialise the
    # attribute the first time this is called with None.
    if self.using_tokenizer or not hasattr(self, '_tokenizer'):
        self._tokenizer = tokenizer
| 358 | |
def GetTokenizer(self):
    """
    Return the stored tokenizer object (`self._tokenizer`).

    Unlike the `tokenizer` property, this does not look at
    `self.using_tokenizer`.
    """
    stored = self._tokenizer
    return stored
| 361 | |
@property
def tokenizer(self):
    """
    The active tokenizer: `self._tokenizer` while `self.using_tokenizer`
    is truthy, otherwise None.
    """
    return self._tokenizer if self.using_tokenizer else None
| 367 | |
| 368 | def __getitem__(self, index): |
| 369 | """gets the index'th string from the dataset""" |
| 370 | x = self.X[index] |
| 371 | if self.tokenizer is not None: |
| 372 | x = self.tokenizer.EncodeAsIds(x, self.preprocess_fn) |