Class for loading datasets from csv files. Purpose: Useful for loading data for unsupervised modeling or transfer tasks Arguments: path (str): Path to csv file with dataset. tokenizer (data_utils.Tokenizer): Tokenizer to use when processing text. Default: None pr
| 197 | return rtn_ds |
| 198 | |
| 199 | class csv_dataset(data.Dataset): |
| 200 | """ |
| 201 | Class for loading datasets from csv files. |
| 202 | Purpose: Useful for loading data for unsupervised modeling or transfer tasks |
| 203 | Arguments: |
| 204 | path (str): Path to csv file with dataset. |
| 205 | tokenizer (data_utils.Tokenizer): Tokenizer to use when processing text. Default: None |
| 206 | preprocess_fn (callable): Callable that process a string into desired format. |
| 207 | delim (str): delimiter for csv. Default: ',' |
| 208 | binarize_sent (bool): binarize label values to 0 or 1 if they're on a different scale. Default: False |
| 209 | drop_unlabeled (bool): drop rows with unlabelled values. Always fills remaining empty |
| 210 | columns with -1 (regardless if rows are dropped based on value) Default: False |
| 211 | text_key (str): key to get text from csv. Default: 'sentence' |
| 212 | label_key (str or list): key (or list of keys) to get label(s) from csv. Default: 'label' |
| 213 | Attributes: |
| 214 | X (list): all strings from the csv file |
| 215 | Y (np.ndarray): labels to train with |
| 216 | """ |
def __init__(self, path, tokenizer=None, preprocess_fn=None, delim=',',
             binarize_sent=False, drop_unlabeled=False, text_key='sentence', label_key='label',
             **kwargs):
    """
    Load the csv file at `path` into memory and populate `self.X` and `self.Y`.
    Arguments:
        path (str): Path to csv file with dataset.
        tokenizer: Passed straight to `self.SetTokenizer`. Default: None
        preprocess_fn (callable): Stored on the instance as `self.preprocess_fn`.
        delim (str): Column separator. Overridden with a tab whenever '.tsv'
            appears in `path`. Default: ','
        binarize_sent (bool): If truthy, labels are passed through
            `binarize_labels(..., hard=binarize_sent)`. Default: False
        drop_unlabeled (bool): Only stored on the instance; this constructor
            does not consult it. Default: False
        text_key (str): Column holding the text. Default: 'sentence'
        label_key (str or list): Column name, or list of column names,
            holding the label(s). Default: 'label'
        **kwargs: Accepted for call-site compatibility and ignored here.
    Notes:
        Rows with a missing value in any loaded column are always dropped.
        If the label column(s) cannot be loaded, `self.Y` is filled with -1
        (one value per remaining row).
    """
    self.is_lazy = False
    self.preprocess_fn = preprocess_fn
    self.SetTokenizer(tokenizer)
    self.path = path
    self.delim = delim
    self.text_key = text_key
    self.label_key = label_key
    self.drop_unlabeled = drop_unlabeled

    # Substring match (not a suffix test) so names like 'data.tsv.gz' also
    # switch to tab-separated parsing.
    if '.tsv' in self.path:
        self.delim = '\t'

    label_cols = label_key if isinstance(label_key, list) else [label_key]
    try:
        data = pd.read_csv(self.path, sep=self.delim,
                           usecols=[text_key] + label_cols, encoding='latin-1')
        has_labels = True
    except ValueError:
        # pandas raises ValueError when a requested `usecols` column is not
        # present in the file; treat that as an unlabeled dataset and load
        # the text column only. Anything else (missing file, interrupt, ...)
        # propagates instead of being silently retried.
        data = pd.read_csv(self.path, sep=self.delim,
                           usecols=[text_key], encoding='latin-1')
        has_labels = False

    # Drop every row that has a NaN in any of the loaded columns.
    data = data.dropna(axis=0)

    self.X = data[text_key].values.tolist()
    if has_labels:
        self.Y = data[label_key].values
    else:
        # Placeholder labels: -1 for every remaining row.
        self.Y = np.ones(len(self.X)) * -1

    if binarize_sent:
        self.Y = binarize_labels(self.Y, hard=binarize_sent)
| 255 | |
| 256 | def SetTokenizer(self, tokenizer): |