MCPcopy
hub / github.com/PaddlePaddle/PaddleRec / process_test

Method process_test

datasets/ali-cpp_aitm/process_public_data.py:126–181  ·  view source on GitHub ↗
(self)

Source from the content-addressed store, hash-verified

124 print(c)
125
def process_test(self):
    """Build the encoded test split from the raw test files.

    The work is done in three passes:

    1. Read ``common_feat_path.format('test')`` into memory, mapping the
       first CSV column of every row to the key/value pairs parsed from
       its third column.
    2. Read ``data_path.format('test')``, merge each row's own key/value
       pairs (sixth column) with the common ones referenced by its fourth
       column, and write columns 2-3 plus the ``use_columns`` values to
       ``data_path.format('test') + '.tmp'``. Rows whose second column is
       ``'0'`` and third column is ``'1'`` are dropped.
    3. Replace every raw value in the temporary file by its 1-based
       position in the vocabulary loaded from ``enum_path`` (``0`` when
       the value is not in the vocabulary) and write the result to
       ``write_path + '.test'``.

    Progress is printed every 100000 processed rows.

    Raises:
        KeyError: if a row references a common-feature key that was not
            present in the common-feature file.
    """
    # Compile once; the same separator pattern is applied to every line of
    # both input files.
    split_tokens = re.compile('\x01|\x02|\x03').split

    def parse_feats(field):
        # Tokens arrive in groups of three: the first is used as the key,
        # the second as the value, and the third is ignored.
        tokens = split_tokens(field)
        return dict(zip(tokens[0::3], tokens[1::3]))

    # Pass 1: load the common features.
    common_feat_dict = {}
    with open(common_feat_path.format('test'), 'r') as fr:
        for c, line in enumerate(fr, 1):
            line_list = line.strip().split(',')
            common_feat_dict[line_list[0]] = parse_feats(line_list[2])
            if c % 100000 == 0:
                print(c)

    # Pass 2: join per-row features with the common features.
    print('join feats...')
    header = 'click,purchase,' + ','.join(use_columns) + '\n'
    tmp_path = data_path.format('test') + '.tmp'
    c = 0
    with open(tmp_path, 'w') as fw, \
            open(data_path.format('test'), 'r') as fr:
        fw.write(header)
        for line in fr:
            line_list = line.strip().split(',')
            if line_list[1] == '0' and line_list[2] == '1':
                continue
            feat_dict = parse_feats(line_list[5])
            # Common features win when a key appears in both sources.
            feat_dict.update(common_feat_dict[line_list[3]])
            feats = line_list[1:3]
            feats.extend(str(feat_dict.get(k, '0')) for k in use_columns)
            fw.write(','.join(feats) + '\n')
            # Only rows that were actually written are counted.
            c += 1
            if c % 100000 == 0:
                print(c)

    # Pass 3: map raw values to vocabulary indices.
    print('encode feats...')
    vocabulary = joblib.load(enum_path)
    # One lookup table per column, in the same order as use_columns;
    # index 0 is reserved for values missing from the vocabulary.
    encoders = [
        dict(zip(vocabulary[feat], range(1, len(vocabulary[feat]) + 1)))
        for feat in use_columns
    ]
    with open(write_path + '.test', 'w') as fw, open(tmp_path, 'r') as fr:
        fw.write(header)
        fr.readline()  # remove header
        for c, line in enumerate(fr, 1):
            line_list = line.strip().split(',')
            new_line = line_list[:2]
            new_line.extend(
                str(encoder.get(value, '0'))
                for value, encoder in zip(line_list[2:], encoders))
            fw.write(','.join(new_line) + '\n')
            if c % 100000 == 0:
                print(c)
182
183

Callers 1

Calls 2

updateMethod · 0.80
loadMethod · 0.80

Tested by

no test coverage detected