MCPcopy
hub / github.com/PaddlePaddle/PaddleRec / FreqVocab

Class FreqVocab

datasets/Beauty/data_augment_candi_gen.py:54–106  ·  view source on GitHub ↗

Runs end-to-end tokenization.

Source from the content-addressed store, hash-verified

52
53
class FreqVocab(object):
    """Frequency-ordered vocabulary built from per-user item lists.

    Every distinct item seen in the input receives an integer id starting
    at 1, in the order returned by ``Counter.most_common()`` (most frequent
    first). The special tokens are assigned the ids that follow the last
    item id. Id 0 is never assigned.
    """

    # Fixed order in which special tokens receive their ids. Iterating a
    # set of strings is subject to per-process hash randomization, which
    # would make the special-token ids differ from run to run.
    _SPECIAL_TOKEN_ORDER = ("[pad]", "[MASK]", "[NO_USE]")

    def __init__(self, user_to_list):
        """Build the vocabulary.

        Args:
            user_to_list: mapping of user key -> iterable of item tokens.
                User keys are stored as ``str(key)``; item tokens are
                counted as-is.
        """
        self.counter = Counter()
        self.user_set = set()
        for u, item_list in user_to_list.items():
            self.counter.update(item_list)
            self.user_set.add(str(u))

        self.user_count = len(self.user_set)
        self.item_count = len(self.counter)
        # Kept as a set so get_special_token() returns the same type as
        # before; id assignment below uses the fixed-order tuple instead.
        self.special_tokens = set(self._SPECIAL_TOKEN_ORDER)
        self.token_to_ids = {}  # index begin from 1
        for token, _ in self.counter.most_common():
            self.token_to_ids[token] = len(self.token_to_ids) + 1

        for token in self._SPECIAL_TOKEN_ORDER:
            self.token_to_ids[token] = len(self.token_to_ids) + 1

        self.id_to_tokens = {v: k for k, v in self.token_to_ids.items()}
        self.vocab_words = list(self.token_to_ids.keys())

    def convert_tokens_to_ids(self, tokens):
        """Map tokens to ids via ``convert_by_vocab`` (defined elsewhere in the file)."""
        return convert_by_vocab(self.token_to_ids, tokens)

    def convert_ids_to_tokens(self, ids):
        """Map ids to tokens via ``convert_by_vocab`` (defined elsewhere in the file)."""
        return convert_by_vocab(self.id_to_tokens, ids)

    def get_vocab_words(self):
        """Return the list of all tokens (items and special tokens).

        Callers should not rely on the order of the returned list.
        """
        return self.vocab_words

    def get_item_count(self):
        """Return the number of distinct item tokens."""
        return self.item_count

    def get_user_count(self):
        """Return the number of distinct users (after ``str()`` conversion)."""
        return self.user_count

    def get_items(self):
        """Return a new list of the distinct item tokens."""
        return list(self.counter.keys())

    def get_users(self):
        """Return the set of user keys, each stored as a string."""
        return self.user_set

    def get_special_token_count(self):
        """Return the number of special tokens."""
        return len(self.special_tokens)

    def get_special_token(self):
        """Return the set of special tokens."""
        return self.special_tokens

    def get_vocab_size(self):
        """Return item count + special-token count + 1.

        Assigned ids run from 1 upward, so the extra 1 makes the size
        cover the unassigned id 0 as well.
        """
        return self.get_item_count() + self.get_special_token_count() + 1
107
108
109random_seed = 12345

Callers 1

mainFunction · 0.85

Calls

no outgoing calls

Tested by

no test coverage detected