MCPcopy
hub / github.com/city96/ComfyUI-GGUF / gguf_tekken_tokenizer_loader

Function gguf_tekken_tokenizer_loader

loader.py:385–426  ·  view source on GitHub ↗
(path, temb_shape)

Source from the content-addressed store, hash-verified

383 return torch.ByteTensor(list(spm.SerializeToString()))
384
def gguf_tekken_tokenizer_loader(path, temb_shape):
    """Rebuild a tekken-style tokenizer blob from a GGUF file's metadata.

    Reads the ``tokenizer.ggml.*`` fields of the GGUF file, splits the token
    list into control tokens and regular vocabulary entries, and serializes
    the result as JSON.

    Args:
        path: Location of the GGUF file; handed to ``gguf.GGUFReader``.
        temb_shape: Shape used to recognise the model. Only
            ``(131072, 5120)`` is accepted (probably Mistral).

    Returns:
        A ``torch.ByteTensor`` holding the UTF-8 bytes of the JSON document
        with the keys ``config``, ``vocab`` and ``special_tokens``.

    Raises:
        NotImplementedError: If ``tokenizer.ggml.model`` is not ``"gpt2"`` or
            ``temb_shape`` is not the recognised shape.
    """
    # convert ggml (hf) tokenizer metadata to tekken/comfy data
    logging.info("Attempting to recreate tekken tokenizer from GGUF file metadata...")
    import json
    import base64
    from transformers.convert_slow_tokenizer import bytes_to_unicode

    reader = gguf.GGUFReader(path)

    # Both unsupported cases report the same message.
    unsupported = "Unknown model, can't set tokenizer!"
    tokenizer_kind = get_field(reader, "tokenizer.ggml.model", str)
    if tokenizer_kind != "gpt2":
        raise NotImplementedError(unsupported)
    looks_like_mistral = temb_shape == (131072, 5120)  # probably Mistral
    if not looks_like_mistral:
        raise NotImplementedError(unsupported)

    vocab = []
    special_tokens = []
    data = {
        "config": {"num_vocab_tokens": 150000, "default_vocab_size": 131072},
        "vocab": vocab,
        "special_tokens": special_tokens,
    }

    tokens = get_list_field(reader, "tokenizer.ggml.tokens", str)
    toktypes = get_list_field(reader, "tokenizer.ggml.token_type", int)

    # Invert the byte -> printable-character table so each character of a
    # stored token maps back to the raw byte it stands for.
    char_to_byte = {char: byte for byte, char in bytes_to_unicode().items()}
    for idx, (token, toktype) in enumerate(zip(tokens, toktypes)):
        # NOTE(review): type 3 is presumably the GGUF "control" token type;
        # confirm against gguf.TokenType.
        if toktype == 3:
            special_tokens.append(
                {'rank': idx, 'token_str': token, 'is_control': True}
            )
            continue

        raw = bytes(char_to_byte[char] for char in token)
        vocab.append({
            # Regular tokens are ranked by their position among regular
            # tokens only, not by their index in the GGUF token list.
            "rank": len(vocab),
            "token_bytes": base64.b64encode(raw).decode("ascii"),
            # Byte sequences that are not valid UTF-8 get U+FFFD here;
            # token_bytes keeps the exact bytes.
            "token_str": raw.decode("utf-8", errors="replace"),
        })

    logging.info(f"Created tekken tokenizer with vocab size of {len(data['vocab'])} (+{len(data['special_tokens'])})")
    del reader
    payload = json.dumps(data).encode('utf-8')
    return torch.ByteTensor(list(payload))
427
428def gguf_gemma3_tokenizer_loader(path):
429 #TODO: merge into gguf_tokenizer_loader

Callers 1

gguf_clip_loaderFunction · 0.85

Calls 2

get_fieldFunction · 0.85
get_list_fieldFunction · 0.85

Tested by

no test coverage detected