Transform a single text string to a list of byte-pair IDs
(self, text)
| 308 | return [self._transform(string) for string in text] |
| 309 | |
def _transform(self, text):
    """Transform a single text string to a list of byte-pair IDs.

    Every item returned by ``tokenize_bytes_raw`` is parsed as a
    space-separated run of integer byte values and encoded greedily from
    left to right: the longest multi-byte span starting at the current
    position that is a key of ``self.byte2token`` is emitted as its token
    ID, and a span of exactly one byte is emitted as the byte value itself.

    Parameters
    ----------
    text : str
        The text to encode. It is handed to ``tokenize_bytes_raw`` together
        with ``self.parameters["encoding"]``.

    Returns
    -------
    encoded : list
        The token IDs for `text`, in order.
    """
    P = self.parameters
    _bytes = tokenize_bytes_raw(text, encoding=P["encoding"])

    encoded = []
    for w in _bytes:
        # Parse before sizing the window: the bounds must count byte values,
        # not the characters of the space-separated string. Sizing from the
        # string made the loop re-test the same full-word candidate once per
        # surplus character before the window actually started shrinking.
        w = [int(i) for i in w.split(" ")]
        n_bytes = len(w)
        l, r = 0, n_bytes

        while l < n_bytes:
            candidate = tuple(w[l:r])

            if len(candidate) > 1 and candidate in self.byte2token:
                # candidate is a collection of several bytes and is in our
                # vocab
                encoded.append(self.byte2token[candidate])
                l, r = r, n_bytes
            elif len(candidate) == 1:
                # candidate is a single byte and should always be in our
                # vocab
                encoded.append(candidate[0])
                l, r = r, n_bytes
            else:
                # candidate is not in vocab, so we decrease our context
                # window by 1 and try again
                r -= 1
    return encoded
| 338 | |
| 339 | def inverse_transform(self, codes): |
| 340 | """ |
no test coverage detected