MCPcopy
hub / github.com/openai/tiktoken / o200k_base

Function o200k_base

tiktoken_ext/openai_public.py:95–120  ·  view source on GitHub ↗
()

Source from the content-addressed store, hash-verified

93
94
def o200k_base():
    """Assemble the construction parameters for the ``o200k_base`` encoding.

    Returns:
        A dict with four entries, in this order: ``"name"`` (the string
        ``"o200k_base"``), ``"pat_str"`` (the splitting regex, formed by
        joining the alternatives below with ``|``), ``"mergeable_ranks"``
        (whatever ``load_tiktoken_bpe`` returns for the published
        ``o200k_base.tiktoken`` file) and ``"special_tokens"`` (the ids
        assigned to ``ENDOFTEXT`` and ``ENDOFPROMPT``).
    """
    # NOTE(review): expected_hash looks like a SHA-256 hex digest that the
    # loader checks the file against -- confirm in load_tiktoken_bpe.
    bpe_ranks = load_tiktoken_bpe(
        "https://openaipublic.blob.core.windows.net/encodings/o200k_base.tiktoken",
        expected_hash="446a9538cb6c348e3516120d7c08b09f57c36495e2acfffe59a5bf8b0cfb1a2d",
    )
    specials = {ENDOFTEXT: 199999, ENDOFPROMPT: 200018}

    # Note carried over from the original author: this regex could be made
    # more efficient, and tokens could probably be allocated more efficiently
    # across languages.
    #
    # Each alternative is kept as one untouched literal; their order in the
    # final join is significant because regex alternation tries them in turn.

    # Optional single leading char that is not CR/LF, a letter or a number;
    # then any run of upper/title/modifier/other letters or marks; then at
    # least one lower/modifier/other letter or mark; then an optional
    # case-insensitive contraction suffix ('s, 't, 're, 've, 'm, 'll, 'd).
    word_ending_lower = r"""[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?"""
    # Same shape, but the first class must match at least once and the
    # second class becomes optional.
    word_starting_upper = r"""[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?"""
    # One to three characters from the Unicode number category.
    short_number = r"""\p{N}{1,3}"""
    # Optional space, a run of chars that are neither whitespace, letters
    # nor numbers, then any trailing CR, LF or '/' characters.
    symbol_run = r""" ?[^\s\p{L}\p{N}]+[\r\n/]*"""
    # Whitespace that ends in one or more CR/LF characters.
    line_breaks = r"""\s*[\r\n]+"""
    # Whitespace that is not followed by a non-whitespace character.
    trailing_space = r"""\s+(?!\S)"""
    # Any other whitespace run.
    any_space = r"""\s+"""

    split_regex = "|".join(
        (
            word_ending_lower,
            word_starting_upper,
            short_number,
            symbol_run,
            line_breaks,
            trailing_space,
            any_space,
        )
    )

    return dict(
        name="o200k_base",
        pat_str=split_regex,
        mergeable_ranks=bpe_ranks,
        special_tokens=specials,
    )
121
122
123def o200k_harmony():

Callers 1

o200k_harmony — Function · 0.85

Calls 1

load_tiktoken_bpe — Function · 0.90

Tested by

no test coverage detected

Used in the wild — real call sites across dependent graphs

searching dependent graphs…