| 241 | } |
| 242 | |
| 243 | encode( |
| 244 | prompt, |
| 245 | add_bos_token = true, |
| 246 | add_preceding_space = true, |
| 247 | log_performance = false, |
| 248 | ) { |
| 249 | let startTime = null; |
| 250 | if (log_performance) { |
| 251 | startTime = performance.now(); |
| 252 | } |
| 253 | |
| 254 | if (!this.vocabById || !this.vocabByString || !this.merges) { |
| 255 | console.log("Tokenizer not initialized properly!"); |
| 256 | return; |
| 257 | } |
| 258 | if (prompt.length === 0) { |
| 259 | return []; |
| 260 | } |
| 261 | // Initially each character is transformed to a tokenId, later there will be merges of these. |
| 262 | const tokenIds = this.mapCharactersToTokenIds( |
| 263 | prompt, |
| 264 | add_bos_token, |
| 265 | add_preceding_space, |
| 266 | ); |
| 267 | |
| 268 | // Set up priority queue to efficiently iterate merge possibilities in priority order |
| 269 | const mergeQueue = new PriorityQueue((a, b) => { |
| 270 | return a.mergePrio < b.mergePrio; |
| 271 | }); |
| 272 | |
/**
 * Queue the merge of `leftNode` with its right-hand neighbour, if the
 * merges table lists that token pair.
 *
 * On success the node is annotated in place with `mergePrio` and
 * `mergeToString`, then pushed onto `mergeQueue`. When the pair is not a
 * known merge, nothing happens.
 *
 * @param {object} leftNode - Linked-list node; reads `tokenId`, `origPos`
 *   and `next.tokenId`. Callers must ensure `leftNode.next` is set.
 */
const addToMergeQueue = (leftNode) => {
  const mergeIdentifierString = this.getMergeIdentifierString(
    leftNode.tokenId,
    leftNode.next.tokenId,
  );
  // Merge priority is primarily determined by the location of the merge in the "merges" data,
  // secondarily determined by the relative position of the node in the linked list
  // (We want to perform equal merges from left to right)
  const mergePrio =
    this.merges.get(mergeIdentifierString) +
    leftNode.origPos / prompt.length;
  // A pair missing from `merges` yields `undefined + number`, i.e. NaN:
  // that merge is not possible according to the vocabulary.
  // Test for NaN explicitly rather than relying on truthiness, so that a
  // legitimate priority of exactly 0 (rank 0 at position 0) is not dropped.
  if (Number.isNaN(mergePrio)) {
    return;
  }
  leftNode.mergePrio = mergePrio;
  // NOTE(review): only the first space is removed — presumably the separator
  // inserted by getMergeIdentifierString; confirm tokens cannot contain an
  // earlier space.
  leftNode.mergeToString = mergeIdentifierString.replace(" ", "");
  mergeQueue.push(leftNode);
};
| 291 | |
| 292 | // Fill merge queue from initial merge possibilities and construct linked list |
| 293 | let firstTokenNode = { |
| 294 | origPos: 0, |
| 295 | tokenId: tokenIds[0], |
| 296 | prev: null, |
| 297 | next: null, |
| 298 | }; |
| 299 | let prevTokenNode = firstTokenNode; |
| 300 | for (let i = 1; i < tokenIds.length; i++) { |