(self)
| 39 | pass |
| 40 | |
def process_train(self):
    """Join common features into the training samples and build the vocabulary.

    Steps:
      1. Load ``common_feat_path.format('train')`` fully into memory, keyed
         by the first comma-separated field of each line.
      2. Stream ``data_path.format('train')``; for every kept sample, merge
         its own features with the referenced common features and write one
         CSV row (``click,purchase`` followed by ``use_columns``) to
         ``data_path.format('train') + '.tmp'``. Features missing from a
         sample are written as ``'0'``.
      3. Count how often each value appears for the columns in
         ``use_columns``, drop the low-frequency values, and dump the
         resulting ``{column: set(values)}`` mapping to ``enum_path`` with
         joblib.

    Progress (a running line count every 100000 lines) and vocabulary sizes
    before/after filtering are printed to stdout.

    Raises:
        KeyError: if a sample references a common-feature id that is not
            present in the common-feature file.
        IndexError: if a line has fewer comma-separated fields than the
            positions read here.
    """
    # Compiled once instead of being looked up for every input line.
    splitter = re.compile('\x01|\x02|\x03')

    def parse_feats(field):
        # Tokens come in groups of three; position 0 of each group is the
        # feature key and position 1 its value (position 2 is not used).
        # Plain list slicing keeps keys/values as built-in ``str``.
        tokens = splitter.split(field)
        return dict(zip(tokens[0::3], tokens[1::3]))

    common_feat_dict = {}
    with open(common_feat_path.format('train'), 'r') as fr:
        for c, line in enumerate(fr, 1):
            line_list = line.strip().split(',')
            common_feat_dict[line_list[0]] = parse_feats(line_list[2])
            if c % 100000 == 0:
                print(c)

    print('join feats...')
    # Set for O(1) membership tests inside the per-feature loop.
    tracked_columns = set(use_columns)
    vocabulary = {column: {} for column in use_columns}
    c = 0
    with open(data_path.format('train') + '.tmp', 'w') as fw:
        fw.write('click,purchase,' + ','.join(use_columns) + '\n')
        with open(data_path.format('train'), 'r') as fr:
            for line in fr:
                line_list = line.strip().split(',')
                # Fields 1 and 2 are the click / purchase labels (see the
                # header written above); skip purchase-without-click rows.
                if line_list[1] == '0' and line_list[2] == '1':
                    continue
                feat_dict = parse_feats(line_list[5])
                # Common features override same-named sample features.
                feat_dict.update(common_feat_dict[line_list[3]])
                feats = line_list[1:3]
                feats.extend(feat_dict.get(k, '0') for k in use_columns)
                fw.write(','.join(feats) + '\n')
                for k, v in feat_dict.items():
                    if k in tracked_columns:
                        counts = vocabulary[k]
                        # NOTE(review): the first occurrence is recorded as
                        # 0, so the stored number is (occurrences - 1).
                        # Kept as-is so the generated vocabulary does not
                        # change; confirm whether this is intended.
                        counts[v] = counts[v] + 1 if v in counts else 0
                # Only kept (non-skipped) samples are counted for progress.
                c += 1
                if c % 100000 == 0:
                    print(c)

    print('before filter low freq:')
    for k, v in vocabulary.items():
        print(k + ':' + str(len(v)))

    # Keep only values whose stored count exceeds 10 (i.e. seen at least
    # 12 times, given the zero-based counting above).
    vocabulary = {
        column: {value for value, count in counts.items() if count > 10}
        for column, counts in vocabulary.items()
    }

    print('after filter low freq:')
    for k, v in vocabulary.items():
        print(k + ':' + str(len(v)))
    joblib.dump(vocabulary, enum_path, compress=3)
| 98 |
no test coverage detected