MCPcopy
hub / github.com/PaddlePaddle/PaddleRec / _build_split

Method _build_split

datasets/Avazu_flen/preprocess.py:164–229  ·  view source on GitHub ↗
(self)

Source from the content-addressed store, hash-verified

162 self.feature_map = feature_mapper
163
164 def _build_split(self):
165 full_lines = []
166 self.data = []
167
168 _mkdir_if_not_exist(self.config.get("runner.train_data_dir"))
169 _mkdir_if_not_exist(self.config.get("runner.test_data_dir"))
170
171 train_file = open(
172 os.path.join(
173 self.config.get("runner.train_data_dir"), 'train_data.txt'),
174 'w')
175 test_file = open(
176 os.path.join(
177 self.config.get("runner.test_data_dir"), 'test_data.txt'), 'w')
178
179 features = {} # dict for all feature columns and target column.
180
181 feature_mapper = self.feature_map
182 sample_cnt = 0
183 for file in [self.file_object]:
184 with open(file, "r") as rf:
185 train_cnt = 0
186 test_cnt = 0
187 rf.readline()
188 pbar = tqdm(rf, mininterval=1, smoothing=0.1)
189 pbar.set_description(
190 'Split avazu dataset: train_dataset and test_dataset')
191 for line in pbar:
192 sample_cnt += 1
193
194 values = line.rstrip('\n').split(',')
195
196 if len(values) != len(self.field_names) + 1:
197 continue
198
199 features = {
200 self.idx_to_field_name[idx]:
201 feature_mapper[self.idx_to_field_name[idx]][value]
202 for idx, value in enumerate(values)
203 if self.idx_to_field_name[idx] != 'click' and value in
204 feature_mapper[self.idx_to_field_name[idx]]
205 }
206 features.update({'target': values[-1]})
207
208 if "14103000" in values[22]:
209 test_cnt += 1
210 value_n = 0
211 for k, v in features.items():
212 value_n += 1
213 if value_n == len(list(features.values())):
214 test_file.write(str(v) + '\n')
215 else:
216 test_file.write(str(v) + ',')
217 else:
218 train_cnt += 1
219 value_n = 0
220 for k, v in features.items():
221 value_n += 1

Callers 1

init (Method) · 0.95

Calls 2

update (Method) · 0.80
_mkdir_if_not_exist (Function) · 0.70

Tested by

no test coverage detected