(self)
| 124 | print(c) |
| 125 | |
def process_test(self):
    """Join, filter and integer-encode the 'test' split.

    Runs three sequential passes over files on disk:

    1. Read ``common_feat_path.format('test')`` and build an in-memory
       lookup from the first CSV column of each row to the feature dict
       parsed from its third column.
    2. Read ``data_path.format('test')``, drop rows whose column 1 is
       ``'0'`` while column 2 is ``'1'``, merge each remaining row's own
       features (column 5) with the shared features referenced by
       column 3, and write one value per entry of ``use_columns`` to
       ``data_path.format('test') + '.tmp'``. Columns 1 and 2 are copied
       through under the header names ``click`` and ``purchase``.
    3. Load the vocabulary from ``enum_path`` and rewrite the temporary
       file to ``write_path + '.test'`` with every feature value replaced
       by its 1-based position in that feature's vocabulary.

    Features absent from a row are written as ``'0'`` in pass 2, and
    values absent from the vocabulary are written as ``'0'`` in pass 3.
    The temporary file is left on disk. Progress is printed every
    100000 rows.

    Raises:
        KeyError: if a row references (via column 3) an id that was not
            present in the common-feature file.
    """

    def parse_feats(raw):
        # The raw field is a flat token stream split on the control
        # characters 0x01 / 0x02 / 0x03. Tokens are consumed in triples:
        # position 0 is the key, position 1 the value, and position 2 is
        # ignored.
        tokens = np.array(re.split('\x01|\x02|\x03', raw))
        return dict(zip(tokens[0::3], tokens[1::3]))

    # ---- pass 1: load the shared feature table ----
    shared_feats = {}
    with open(common_feat_path.format('test'), 'r') as src:
        for seen, row in enumerate(src, start=1):
            cols = row.strip().split(',')
            shared_feats[cols[0]] = parse_feats(cols[2])
            if seen % 100000 == 0:
                print(seen)

    # ---- pass 2: join per-row features with the shared ones ----
    print('join feats...')
    written = 0  # counts only rows that survive the filter below
    with open(data_path.format('test') + '.tmp', 'w') as dst:
        dst.write('click,purchase,' + ','.join(use_columns) + '\n')
        with open(data_path.format('test'), 'r') as src:
            for row in src:
                cols = row.strip().split(',')
                # Skip rows labelled purchase=1 without click=1.
                if cols[1] == '0' and cols[2] == '1':
                    continue
                merged = parse_feats(cols[5])
                # Shared features overwrite same-named per-row features.
                merged.update(shared_feats[cols[3]])
                out_cols = cols[1:3] + [
                    str(merged.get(name, '0')) for name in use_columns
                ]
                dst.write(','.join(out_cols) + '\n')
                written += 1
                if written % 100000 == 0:
                    print(written)

    # ---- pass 3: map raw feature values to vocabulary indices ----
    print('encode feats...')
    vocabulary = joblib.load(enum_path)
    # Index 0 is reserved for "unknown"; known values start at 1.
    index_of = {
        name: {token: idx for idx, token in enumerate(vocabulary[name], start=1)}
        for name in use_columns
    }
    with open(write_path + '.test', 'w') as dst:
        dst.write('click,purchase,' + ','.join(use_columns) + '\n')
        with open(data_path.format('test') + '.tmp', 'r') as src:
            src.readline()  # discard the header written in pass 2
            for seen, row in enumerate(src, start=1):
                cols = row.strip().split(',')
                encoded = cols[:2] + [
                    str(index_of[name].get(raw_value, '0'))
                    for raw_value, name in zip(cols[2:], use_columns)
                ]
                dst.write(','.join(encoded) + '\n')
                if seen % 100000 == 0:
                    print(seen)
| 182 | |
| 183 |
no test coverage detected