(self)
| 162 | self.feature_map = feature_mapper |
| 163 | |
| 164 | def _build_split(self): |
| 165 | full_lines = [] |
| 166 | self.data = [] |
| 167 | |
| 168 | _mkdir_if_not_exist(self.config.get("runner.train_data_dir")) |
| 169 | _mkdir_if_not_exist(self.config.get("runner.test_data_dir")) |
| 170 | |
| 171 | train_file = open( |
| 172 | os.path.join( |
| 173 | self.config.get("runner.train_data_dir"), 'train_data.txt'), |
| 174 | 'w') |
| 175 | test_file = open( |
| 176 | os.path.join( |
| 177 | self.config.get("runner.test_data_dir"), 'test_data.txt'), 'w') |
| 178 | |
| 179 | features = {} # dict for all feature columns and target column. |
| 180 | |
| 181 | feature_mapper = self.feature_map |
| 182 | sample_cnt = 0 |
| 183 | for file in [self.file_object]: |
| 184 | with open(file, "r") as rf: |
| 185 | train_cnt = 0 |
| 186 | test_cnt = 0 |
| 187 | rf.readline() |
| 188 | pbar = tqdm(rf, mininterval=1, smoothing=0.1) |
| 189 | pbar.set_description( |
| 190 | 'Split avazu dataset: train_dataset and test_dataset') |
| 191 | for line in pbar: |
| 192 | sample_cnt += 1 |
| 193 | |
| 194 | values = line.rstrip('\n').split(',') |
| 195 | |
| 196 | if len(values) != len(self.field_names) + 1: |
| 197 | continue |
| 198 | |
| 199 | features = { |
| 200 | self.idx_to_field_name[idx]: |
| 201 | feature_mapper[self.idx_to_field_name[idx]][value] |
| 202 | for idx, value in enumerate(values) |
| 203 | if self.idx_to_field_name[idx] != 'click' and value in |
| 204 | feature_mapper[self.idx_to_field_name[idx]] |
| 205 | } |
| 206 | features.update({'target': values[-1]}) |
| 207 | |
| 208 | if "14103000" in values[22]: |
| 209 | test_cnt += 1 |
| 210 | value_n = 0 |
| 211 | for k, v in features.items(): |
| 212 | value_n += 1 |
| 213 | if value_n == len(list(features.values())): |
| 214 | test_file.write(str(v) + '\n') |
| 215 | else: |
| 216 | test_file.write(str(v) + ',') |
| 217 | else: |
| 218 | train_cnt += 1 |
| 219 | value_n = 0 |
| 220 | for k, v in features.items(): |
| 221 | value_n += 1 |
no test coverage detected