| 119 | self.target_name = header[1] # target name. |
| 120 | |
def _get_feature_map(self):
    """Load the per-field feature-to-index map from cache, or build it.

    When ``self.rebuild_feature_map`` is false and the file at
    ``self.feature_map_cache`` exists, the pickled map is loaded from it.
    Otherwise the data file ``self.file_object`` is scanned once, the
    occurrences of every value are counted per field (the ``'click'``
    field is skipped), values seen fewer than ``self.min_threshold`` times
    are dropped, and each surviving value receives a consecutive integer
    index within its field.  Every value of the ``'id'`` field is kept,
    regardless of the threshold.  The freshly built map is pickled to
    ``self.feature_map_cache``.

    Indices are assigned in first-seen order, so rebuilding from the same
    file always yields the same map (enumerating a ``set`` of strings
    would not, because string hashing is randomised per process).

    The result is stored in ``self.feature_map`` as
    ``{field_name: {feature_value: index}}``; nothing is returned.
    """
    cache_path = self.feature_map_cache

    if not self.rebuild_feature_map and Path(cache_path).exists():
        # NOTE(review): unpickling can execute arbitrary code -- the cache
        # file must come from a trusted location.
        with open(cache_path, 'rb') as f:
            self.feature_map = pickle.load(f)
        return

    # field name -> feature value -> number of occurrences
    feature_cnts = defaultdict(lambda: defaultdict(int))
    with open(self.file_object) as f:
        f.readline()  # skip the header row
        pbar = tqdm(f, mininterval=1, smoothing=0.1)
        pbar.set_description('Create avazu dataset: counting features')
        expected_len = len(self.field_names) + 1
        for line in pbar:
            values = line.rstrip('\n').split(',')
            if len(values) != expected_len:
                # Row does not have the expected number of columns.
                continue
            for k, v in self.field_info.items():
                if k != 'click':
                    feature_cnts[k][values[v]] += 1

    # Keep sufficiently frequent values, preserving first-seen order so
    # that the indices assigned below are reproducible across runs.
    kept = {
        field_name: [
            feature_name
            for feature_name, c in cnt.items()
            if c >= self.min_threshold
        ]
        for field_name, cnt in feature_cnts.items()
    }
    # 'id' values are never filtered by the frequency threshold.
    kept['id'] = list(feature_cnts['id'])

    feature_mapper = {
        field_name: {
            feature_name: idx
            for idx, feature_name in enumerate(names)
        }
        for field_name, names in kept.items()
    }

    # rmtree only has an effect when a directory sits at the cache path;
    # for a regular file the error is ignored and the file is simply
    # overwritten by the open() below.
    shutil.rmtree(cache_path, ignore_errors=True)
    with open(cache_path, 'wb') as f:
        pickle.dump(feature_mapper, f)

    self.feature_map = feature_mapper
| 163 | |
| 164 | def _build_split(self): |
| 165 | full_lines = [] |