Method decode

core/llm/llamaTokenizer.js:387–408 · view source on GitHub ↗

(tokenIds, add_bos_token = true, add_preceding_space = true)

Source from the content-addressed store, hash-verified

385	}
386
387	decode(tokenIds, add_bos_token = true, add_preceding_space = true) {
388	const utf8byteVals = [];
389	const startIndex = add_bos_token ? 1 : 0;
390	for (let i = startIndex; i < tokenIds.length; i++) {
391	const tokenId = tokenIds[i];
392	const tokenString = this.vocabById[tokenId];
393	if (tokenString.startsWith("<0x") && tokenString.endsWith(">")) {
394	// Special case
395	const utf8byte = hexToUtf8Byte(tokenString);
396	utf8byteVals.push(utf8byte);
397	} else {
398	// Typical case
399	const utf8bytes = this.utf8Encoder.encode(tokenString);
400	utf8bytes.forEach((utf8Byte) => utf8byteVals.push(utf8Byte));
401	}
402	}
403	const uint8Array = new Uint8Array(utf8byteVals);
404	const decodedString = this.utf8Decoder.decode(uint8Array);
405	const spacesFixed = decodedString.replaceAll(this.vocabById[29871], " ");
406	// Note that preceding space must be removed here at string level, not earlier at token level, because multiple consecutive spaces are represented as single token.
407	return add_preceding_space ? spacesFixed.slice(1) : spacesFixed;
408	}
409
410	defaultTests(tokenizer) {
411	function isEqual(arr1, arr2) {

nothing calls this directly

hexToUtf8ByteFunction · 0.70

pushMethod · 0.65

encodeMethod · 0.65

decodeMethod · 0.65

no test coverage detected