MCPcopy
hub / github.com/PaddlePaddle/PaddleRec / process

Class process

datasets/ali-cpp_aitm/process_public_data.py:37–181  ·  view source on GitHub ↗

Source from the content-addressed store, hash-verified

35
36
37class process(object):
38 def __init__(self):
39 pass
40
41 def process_train(self):
42 c = 0
43 common_feat_dict = {}
44 with open(common_feat_path.format('train'), 'r') as fr:
45 for line in fr:
46 line_list = line.strip().split(',')
47 kv = np.array(re.split('\x01|\x02|\x03', line_list[2]))
48 key = kv[range(0, len(kv), 3)]
49 value = kv[range(1, len(kv), 3)]
50 feat_dict = dict(zip(key, value))
51 common_feat_dict[line_list[0]] = feat_dict
52 c += 1
53 if c % 100000 == 0:
54 print(c)
55 print('join feats...')
56 c = 0
57 vocabulary = dict(
58 zip(use_columns, [{} for _ in range(len(use_columns))]))
59 with open(data_path.format('train') + '.tmp', 'w') as fw:
60 fw.write('click,purchase,' + ','.join(use_columns) + '\n')
61 with open(data_path.format('train'), 'r') as fr:
62 for line in fr:
63 line_list = line.strip().split(',')
64 if line_list[1] == '0' and line_list[2] == '1':
65 continue
66 kv = np.array(re.split('\x01|\x02|\x03', line_list[5]))
67 key = kv[range(0, len(kv), 3)]
68 value = kv[range(1, len(kv), 3)]
69 feat_dict = dict(zip(key, value))
70 feat_dict.update(common_feat_dict[line_list[3]])
71 feats = line_list[1:3]
72 for k in use_columns:
73 feats.append(feat_dict.get(k, '0'))
74 fw.write(','.join(feats) + '\n')
75 for k, v in feat_dict.items():
76 if k in use_columns:
77 if v in vocabulary[k]:
78 vocabulary[k][v] += 1
79 else:
80 vocabulary[k][v] = 0
81 c += 1
82 if c % 100000 == 0:
83 print(c)
84 print('before filter low freq:')
85 for k, v in vocabulary.items():
86 print(k + ':' + str(len(v)))
87 new_vocabulary = dict(
88 zip(use_columns, [set() for _ in range(len(use_columns))]))
89 for k, v in vocabulary.items():
90 for k1, v1 in v.items():
91 if v1 > 10:
92 new_vocabulary[k].add(k1)
93 vocabulary = new_vocabulary
94 print('after filter low freq:')

Callers 1

Calls

no outgoing calls

Tested by

no test coverage detected