Runs end-to-end tokenization.
| 52 | |
| 53 | |
class FreqVocab(object):
    """Frequency-ordered vocabulary built from per-user item sequences.

    Item tokens are numbered starting at 1, most frequent first. The
    special tokens receive the ids that follow the last item id. Id 0 is
    never assigned to any token.
    """

    # Fixed order in which special tokens receive their ids. Iterating the
    # `special_tokens` set instead would tie the ids to string hashing,
    # which can differ from one interpreter run to the next.
    _SPECIAL_TOKEN_ORDER = ("[pad]", "[MASK]", "[NO_USE]")

    def __init__(self, user_to_list):
        """Count items and users, then assign an id to every token.

        Args:
            user_to_list: mapping from a user key to an iterable of item
                tokens. User keys are stored as ``str(key)``.
        """
        self.counter = Counter()
        self.user_set = set()
        for user, item_list in user_to_list.items():
            self.counter.update(item_list)
            self.user_set.add(str(user))

        self.user_count = len(self.user_set)
        self.item_count = len(self.counter)
        self.special_tokens = set(self._SPECIAL_TOKEN_ORDER)

        # Ids begin at 1; items come first, ordered by descending count.
        self.token_to_ids = {}
        for token, _ in self.counter.most_common():
            self.token_to_ids[token] = len(self.token_to_ids) + 1

        # Special tokens follow the items, in a fixed (reproducible) order.
        for token in self._SPECIAL_TOKEN_ORDER:
            self.token_to_ids[token] = len(self.token_to_ids) + 1

        self.id_to_tokens = {v: k for k, v in self.token_to_ids.items()}
        self.vocab_words = list(self.token_to_ids.keys())

    def convert_tokens_to_ids(self, tokens):
        """Translate `tokens` through `token_to_ids` using `convert_by_vocab`."""
        return convert_by_vocab(self.token_to_ids, tokens)

    def convert_ids_to_tokens(self, ids):
        """Translate `ids` through `id_to_tokens` using `convert_by_vocab`."""
        return convert_by_vocab(self.id_to_tokens, ids)

    def get_vocab_words(self):
        """Return the list of all tokens (items and special tokens)."""
        return self.vocab_words  # not in order

    def get_item_count(self):
        """Return the number of distinct item tokens."""
        return self.item_count

    def get_user_count(self):
        """Return the number of distinct users (after `str()` conversion)."""
        return self.user_count

    def get_items(self):
        """Return a new list of the distinct item tokens."""
        return list(self.counter.keys())

    def get_users(self):
        """Return the set of user keys, each stored as a string."""
        return self.user_set

    def get_special_token_count(self):
        """Return the number of special tokens."""
        return len(self.special_tokens)

    def get_special_token(self):
        """Return the set of special tokens."""
        return self.special_tokens

    def get_vocab_size(self):
        """Return item count + special-token count + 1.

        Ids start at 1, so the extra 1 covers the unassigned id 0.
        """
        return self.get_item_count() + self.get_special_token_count() + 1
| 107 | |
| 108 | |
# Fixed seed constant. Presumably consumed by random-number setup elsewhere
# in the file -- confirm against its usage.
random_seed = 12345