MCPcopy
hub / github.com/PaddlePaddle/PaddleRec / _get_feature_map

Method _get_feature_map

datasets/Avazu_flen/preprocess.py:121–162  ·  view source on GitHub ↗
(self)

Source from the content-addressed store, hash-verified

119 self.target_name = header[1] # target name.
120
def _get_feature_map(self):
    """Load or build the per-field ``feature value -> index`` mapping.

    When ``self.rebuild_feature_map`` is falsy and ``self.feature_map_cache``
    exists, the mapping is unpickled from that cache. Otherwise the file at
    ``self.file_object`` is scanned once (its first line is skipped), every
    value of every field in ``self.field_info`` except ``'click'`` is
    counted, values seen fewer than ``self.min_threshold`` times are dropped
    (the ``'id'`` field keeps all of its values), the surviving values of
    each field are numbered from 0, and the result is pickled to
    ``self.feature_map_cache``.

    Indices are assigned in the order values are first seen in the file, so
    rebuilding from the same input always yields the same mapping (a set
    would make the numbering depend on string hash randomization).

    The result is stored on ``self.feature_map``; nothing is returned.
    """
    cache_path = self.feature_map_cache
    if not self.rebuild_feature_map and Path(cache_path).exists():
        # NOTE(review): unpickling can execute arbitrary code; the cache
        # file must come from a trusted source (normally a previous run).
        with open(cache_path, 'rb') as f:
            feature_mapper = pickle.load(f)
    else:
        feature_cnts = defaultdict(lambda: defaultdict(int))
        # Loop invariants, hoisted out of the per-line hot loop.
        expected_columns = len(self.field_names) + 1
        counted_fields = [(k, v) for k, v in self.field_info.items()
                          if k != 'click']
        with open(self.file_object) as f:
            f.readline()  # skip the first (header) line
            pbar = tqdm(f, mininterval=1, smoothing=0.1)
            pbar.set_description('Create avazu dataset: counting features')
            for line in pbar:
                values = line.rstrip('\n').split(',')
                # Rows with an unexpected number of columns are ignored.
                if len(values) != expected_columns:
                    continue
                for k, v in counted_fields:
                    feature_cnts[k][values[v]] += 1

        # Lists (not sets) keep the dicts' insertion order, i.e. the order
        # in which values were first seen, so numbering is reproducible.
        kept_features = {
            field_name: [
                feature_name for feature_name, c in cnt.items()
                if c >= self.min_threshold
            ]
            for field_name, cnt in feature_cnts.items()
        }
        # The 'id' field is exempt from the frequency threshold. Indexing
        # the defaultdict also guarantees an 'id' entry (possibly empty).
        kept_features['id'] = list(feature_cnts['id'])
        feature_mapper = {
            field_name: {
                feature_name: idx
                for idx, feature_name in enumerate(names)
            }
            for field_name, names in kept_features.items()
        }

        # Clears the cache path if it is a directory; for a regular file
        # this fails silently and the open() below truncates the file.
        shutil.rmtree(cache_path, ignore_errors=True)
        with open(cache_path, 'wb') as f:
            pickle.dump(feature_mapper, f)

    self.feature_map = feature_mapper
163
164 def _build_split(self):
165 full_lines = []

Callers 1

initMethod · 0.95

Calls 2

loadMethod · 0.80
dumpMethod · 0.80

Tested by

no test coverage detected