| 215 | Y (np.ndarray): labels to train with |
| 216 | """ |
def __init__(self, path, tokenizer=None, preprocess_fn=None, delim=',',
             binarize_sent=False, drop_unlabeled=False, text_key='sentence', label_key='label',
             **kwargs):
    """Load the text column (and label column(s), when present) of a delimited file.

    The whole file is read eagerly with pandas; rows containing any missing
    value in the loaded columns are dropped before `self.X` / `self.Y` are set.

    Args:
        path (str): location of the delimited file to read.
        tokenizer: forwarded unchanged to `self.SetTokenizer`.
        preprocess_fn: stored on the instance as `self.preprocess_fn`.
        delim (str): column separator. Overridden with a tab whenever the
            substring '.tsv' occurs in `path`.
        binarize_sent: when truthy, the labels are passed through
            `binarize_labels(self.Y, hard=binarize_sent)`.
        drop_unlabeled: stored on the instance; this constructor does not
            act on it itself.
        text_key (str): name of the column holding the text.
        label_key (str or list of str): name(s) of the label column(s).
        **kwargs: accepted for call compatibility and ignored here.

    Raises:
        Whatever `pd.read_csv` raises when even the text-only read fails
        (e.g. the file or the text column does not exist).
    """
    self.is_lazy = False
    self.preprocess_fn = preprocess_fn
    self.SetTokenizer(tokenizer)
    self.path = path
    self.delim = delim
    self.text_key = text_key
    self.label_key = label_key
    self.drop_unlabeled = drop_unlabeled

    # Substring test (not a suffix test) so paths like 'x.tsv.gz' still match.
    if '.tsv' in self.path:
        self.delim = '\t'

    # Start empty; both are replaced once the file has been read.
    self.X = []
    self.Y = []

    label_cols = list(label_key) if isinstance(label_key, list) else [label_key]
    try:
        data = pd.read_csv(self.path, sep=self.delim, usecols=[text_key] + label_cols,
                           encoding='latin-1')
    except ValueError:
        # pandas signals a `usecols` entry that is absent from the file with
        # ValueError: treat the file as unlabeled and load the text only.
        # Anything else (missing file, interrupt, ...) is left to propagate
        # instead of being masked by a blanket handler.
        data = pd.read_csv(self.path, sep=self.delim, usecols=[text_key], encoding='latin-1')

    data = data.dropna(axis=0)

    self.X = data[text_key].values.tolist()
    try:
        self.Y = data[label_key].values
    except KeyError:
        # Label column(s) were not loaded: use -1 as the placeholder label.
        self.Y = np.full(len(self.X), -1.0)

    if binarize_sent:
        self.Y = binarize_labels(self.Y, hard=binarize_sent)
| 255 | |
| 256 | def SetTokenizer(self, tokenizer): |
| 257 | if tokenizer is None: |