hub / github.com/PaddlePaddle/PaddleRec / process_train

Method process_train

datasets/ali-cpp_aitm/process_public_data.py:41–124 · view source on GitHub ↗

(self)

Source from the content-addressed store, hash-verified

39	pass
40
def process_train(self):
    """Join training samples with their common features and build the vocabulary.

    The method works in three stages:

    1. Read ``common_feat_path.format('train')`` and map column 0 of every
       row to the feature dict parsed from column 2.
    2. Stream ``data_path.format('train')``. Rows whose click column
       (column 1) is ``'0'`` while the purchase column (column 2) is ``'1'``
       are dropped. For every other row the features parsed from column 5
       are merged with the common features looked up by column 3 (common
       features win on key collisions), and one CSV line
       ``click,purchase,<use_columns...>`` is written to
       ``data_path.format('train') + '.tmp'``; columns missing from the
       merged features are written as ``'0'``.
    3. Tally how often each value occurs per column of ``use_columns``,
       keep only the frequent values, and dump the resulting
       ``{column: set(values)}`` mapping to ``enum_path`` with joblib.

    Progress (a running row count every 100000 rows) and the vocabulary
    sizes before/after filtering are printed to stdout.

    Raises:
        KeyError: if a sample references a common-feature id (column 3)
            that does not appear in the common-feature file.
        IndexError: if a row has fewer comma-separated columns than the
            indices read above.
    """
    # Any ONE of the three control characters separates tokens. The previous
    # pattern escaped the pipes ('\|'), which turns the alternation into the
    # literal sequence \x01 | \x02 | \x03, so feature strings were never
    # split. A character class states the intent unambiguously.
    separator = re.compile(r'[\x01\x02\x03]')

    def parse_feats(field):
        """Return {token[0]: token[1]} for each group of three tokens in *field*."""
        # Tokens come in groups of three; the first is used as the key, the
        # second as the value and the third is ignored.
        # NOTE(review): presumably (field id, feature id, weight) per the
        # dataset description -- confirm against the data documentation.
        tokens = separator.split(field)
        return dict(zip(tokens[0::3], tokens[1::3]))

    # Stage 1: load the shared ("common") features keyed by their id.
    common_feat_dict = {}
    with open(common_feat_path.format('train'), 'r') as fr:
        for c, line in enumerate(fr, start=1):
            line_list = line.strip().split(',')
            common_feat_dict[line_list[0]] = parse_feats(line_list[2])
            if c % 100000 == 0:
                print(c)

    # Stage 2: join every sample with its common features and count values.
    print('join feats...')
    wanted_columns = set(use_columns)  # O(1) membership test in the hot loop
    vocabulary = {column: {} for column in use_columns}
    train_path = data_path.format('train')
    c = 0
    with open(train_path + '.tmp', 'w') as fw, open(train_path, 'r') as fr:
        fw.write('click,purchase,' + ','.join(use_columns) + '\n')
        for line in fr:
            line_list = line.strip().split(',')
            # Skip rows recorded as purchased without having been clicked.
            if line_list[1] == '0' and line_list[2] == '1':
                continue
            feat_dict = parse_feats(line_list[5])
            feat_dict.update(common_feat_dict[line_list[3]])
            feats = line_list[1:3]
            feats.extend(feat_dict.get(k, '0') for k in use_columns)
            fw.write(','.join(feats) + '\n')
            for k, v in feat_dict.items():
                if k in wanted_columns:
                    counts = vocabulary[k]
                    # NOTE: the tally deliberately starts at 0 on first
                    # sight (stored count == occurrences - 1). This is kept
                    # as-is so the dumped vocabulary stays unchanged.
                    counts[v] = counts.get(v, -1) + 1
            # Only rows that were actually written are counted.
            c += 1
            if c % 100000 == 0:
                print(c)

    # Stage 3: drop rare values and persist the vocabulary.
    print('before filter low freq:')
    for k, v in vocabulary.items():
        print(k + ':' + str(len(v)))
    # With the zero-based tally above, "> 10" keeps values seen >= 12 times.
    vocabulary = {
        column: {value for value, count in counts.items() if count > 10}
        for column, counts in vocabulary.items()
    }
    print('after filter low freq:')
    for k, v in vocabulary.items():
        print(k + ':' + str(len(v)))
    joblib.dump(vocabulary, enum_path, compress=3)
98

Callers 1

process_public_data.pyFile · 0.80

Calls 3

updateMethod · 0.80

dumpMethod · 0.80

loadMethod · 0.80

Tested by

no test coverage detected