MCPcopy
hub / github.com/lucidrains/DALLE-pytorch / bpe

Method bpe

dalle_pytorch/tokenizer.py:78–117  ·  view source on GitHub ↗
(self, token)

Source from the content-addressed store, hash-verified

76 re.IGNORECASE)
77
def bpe(self, token):
    """Apply the learned byte-pair merges to a single token.

    The token is split into a tuple of symbols (one per element of
    ``token[:-1]``, plus the last element with the ``'</w>'`` marker
    appended). The candidate pair with the lowest rank in
    ``self.bpe_ranks`` is then merged repeatedly until no remaining
    candidate is ranked or the word has collapsed to a single symbol.
    Results of the merge loop are memoised in ``self.cache``.

    Args:
        token: Non-empty string to segment.

    Returns:
        The resulting symbols joined by single spaces.

    Raises:
        IndexError: If ``token`` is empty (``token[-1]`` is evaluated).
    """
    if token in self.cache:
        return self.cache[token]

    word = tuple(token[:-1]) + (token[-1] + '</w>',)
    # get_pairs is defined elsewhere in this module; presumably it returns
    # the set of adjacent symbol pairs in `word` -- confirm against its source.
    pairs = get_pairs(word)

    if not pairs:
        # Nothing to merge. Note that this early result is not cached.
        return token + '</w>'

    while True:
        # Unranked pairs sort last via the +inf default, so if the best
        # candidate is unranked there is nothing left to merge.
        bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float('inf')))
        if bigram not in self.bpe_ranks:
            break
        first, second = bigram

        new_word = []
        i = 0
        while i < len(word):
            try:
                j = word.index(first, i)
            except ValueError:
                # `first` does not occur again: copy the tail unchanged.
                new_word.extend(word[i:])
                break
            new_word.extend(word[i:j])
            i = j

            # word[i] is `first` here (index() just found it), so only the
            # following symbol needs checking.
            if i < len(word) - 1 and word[i + 1] == second:
                new_word.append(first + second)
                i += 2
            else:
                new_word.append(word[i])
                i += 1

        word = tuple(new_word)
        if len(word) == 1:
            break
        pairs = get_pairs(word)

    word = ' '.join(word)
    self.cache[token] = word
    return word
118
119 def encode(self, text):
120 bpe_tokens = []

Callers 1

encode (Method) · 0.95

Calls 1

get_pairs (Function) · 0.85

Tested by

no test coverage detected