()
| 93 | |
| 94 | |
def o200k_base():
    """Assemble the constructor arguments for the ``o200k_base`` encoding.

    Returns:
        A dict with the keys ``name``, ``pat_str``, ``mergeable_ranks`` and
        ``special_tokens``. ``mergeable_ranks`` is whatever
        ``load_tiktoken_bpe`` returns for the public ``o200k_base.tiktoken``
        URL when called with the pinned ``expected_hash``.
    """
    # Individual alternatives of the split pattern; they are joined with "|"
    # below, in exactly this order.
    # NOTE: carried over from the original author: this regex could be made
    # more efficient, and other aspects of the encoding (e.g. how tokens are
    # allocated across languages) could arguably have been done differently.
    split_alternatives = (
        r"""[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?""",
        r"""[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?""",
        r"""\p{N}{1,3}""",
        r""" ?[^\s\p{L}\p{N}]+[\r\n/]*""",
        r"""\s*[\r\n]+""",
        r"""\s+(?!\S)""",
        r"""\s+""",
    )

    # Load the ranks before touching the special-token constants so the
    # order of externally visible operations matches the original.
    ranks = load_tiktoken_bpe(
        "https://openaipublic.blob.core.windows.net/encodings/o200k_base.tiktoken",
        expected_hash="446a9538cb6c348e3516120d7c08b09f57c36495e2acfffe59a5bf8b0cfb1a2d",
    )

    return dict(
        name="o200k_base",
        pat_str="|".join(split_alternatives),
        mergeable_ranks=ranks,
        special_tokens={ENDOFTEXT: 199999, ENDOFPROMPT: 200018},
    )
| 121 | |
| 122 | |
| 123 | def o200k_harmony(): |
no test coverage detected
searching dependent graphs…