hub / github.com/PaddlePaddle/PaddleRec / _build_split

Method _build_split

datasets/Avazu_flen/preprocess.py:164–229 · view source on GitHub ↗

(self)

Source from the content-addressed store, hash-verified

162	self.feature_map = feature_mapper
163
164	def _build_split(self):
165	full_lines = []
166	self.data = []
167
168	_mkdir_if_not_exist(self.config.get("runner.train_data_dir"))
169	_mkdir_if_not_exist(self.config.get("runner.test_data_dir"))
170
171	train_file = open(
172	os.path.join(
173	self.config.get("runner.train_data_dir"), 'train_data.txt'),
174	'w')
175	test_file = open(
176	os.path.join(
177	self.config.get("runner.test_data_dir"), 'test_data.txt'), 'w')
178
179	features = {} # dict for all feature columns and target column.
180
181	feature_mapper = self.feature_map
182	sample_cnt = 0
183	for file in [self.file_object]:
184	with open(file, "r") as rf:
185	train_cnt = 0
186	test_cnt = 0
187	rf.readline()
188	pbar = tqdm(rf, mininterval=1, smoothing=0.1)
189	pbar.set_description(
190	'Split avazu dataset: train_dataset and test_dataset')
191	for line in pbar:
192	sample_cnt += 1
193
194	values = line.rstrip('\n').split(',')
195
196	if len(values) != len(self.field_names) + 1:
197	continue
198
199	features = {
200	self.idx_to_field_name[idx]:
201	feature_mapper[self.idx_to_field_name[idx]][value]
202	for idx, value in enumerate(values)
203	if self.idx_to_field_name[idx] != 'click' and value in
204	feature_mapper[self.idx_to_field_name[idx]]
205	}
206	features.update({'target': values[-1]})
207
208	if "14103000" in values[22]:
209	test_cnt += 1
210	value_n = 0
211	for k, v in features.items():
212	value_n += 1
213	if value_n == len(list(features.values())):
214	test_file.write(str(v) + '\n')
215	else:
216	test_file.write(str(v) + ',')
217	else:
218	train_cnt += 1
219	value_n = 0
220	for k, v in features.items():
221	value_n += 1

Callers 1

init · Method · 0.95

Calls 2

update · Method · 0.80

_mkdir_if_not_exist · Function · 0.70

Tested by

no test coverage detected