(tokenIds, add_bos_token = true, add_preceding_space = true)
| 385 | } |
| 386 | |
| 387 | decode(tokenIds, add_bos_token = true, add_preceding_space = true) { |
| 388 | const utf8byteVals = []; |
| 389 | const startIndex = add_bos_token ? 1 : 0; |
| 390 | for (let i = startIndex; i < tokenIds.length; i++) { |
| 391 | const tokenId = tokenIds[i]; |
| 392 | const tokenString = this.vocabById[tokenId]; |
| 393 | if (tokenString.startsWith("<0x") && tokenString.endsWith(">")) { |
| 394 | // Special case |
| 395 | const utf8byte = hexToUtf8Byte(tokenString); |
| 396 | utf8byteVals.push(utf8byte); |
| 397 | } else { |
| 398 | // Typical case |
| 399 | const utf8bytes = this.utf8Encoder.encode(tokenString); |
| 400 | utf8bytes.forEach((utf8Byte) => utf8byteVals.push(utf8Byte)); |
| 401 | } |
| 402 | } |
| 403 | const uint8Array = new Uint8Array(utf8byteVals); |
| 404 | const decodedString = this.utf8Decoder.decode(uint8Array); |
| 405 | const spacesFixed = decodedString.replaceAll(this.vocabById[29871], " "); |
| 406 | // Note that preceding space must be removed here at string level, not earlier at token level, because multiple consecutive spaces are represented as single token. |
| 407 | return add_preceding_space ? spacesFixed.slice(1) : spacesFixed; |
| 408 | } |
| 409 | |
| 410 | defaultTests(tokenizer) { |
| 411 | function isEqual(arr1, arr2) { |
nothing calls this directly
no test coverage detected