(cls, tiktoken_name)
| 198 | |
@classmethod
def from_pretrained(cls, tiktoken_name):
    """Alternate constructor: wrap a tiktoken encoding looked up by name.

    Args:
        tiktoken_name: Encoding name forwarded to ``tiktoken.get_encoding``.

    Returns:
        The result of ``cls(encoding, "<|endoftext|>")``.
    """
    # Reference for the public encodings:
    # https://github.com/openai/tiktoken/blob/eedc8563/tiktoken_ext/openai_public.py
    #
    # Naming note: tiktoken spells its special document-delimiter token
    # "<|endoftext|>". The name is misleading, because the token is almost
    # always PREPENDED to a document and is most often used to tell the LLM
    # that a new sequence is starting (e.g. during inference). nanoChat
    # therefore calls it "<|bos|>" ("beginning of sequence"), while the
    # historical name remains "<|endoftext|>".
    document_delimiter = "<|endoftext|>"
    pretrained_encoding = tiktoken.get_encoding(tiktoken_name)
    return cls(pretrained_encoding, document_delimiter)
| 208 | |
def get_vocab_size(self):
    """Return the vocabulary size reported by the wrapped encoding (``self.enc.n_vocab``)."""
    wrapped_encoding = self.enc
    return wrapped_encoding.n_vocab
no outgoing calls
no test coverage detected