hub / github.com/city96/ComfyUI-GGUF / gguf_tekken_tokenizer_loader

Function gguf_tekken_tokenizer_loader

loader.py:385–426 · view source on GitHub ↗

(path, temb_shape)

Source from the content-addressed store, hash-verified

383	return torch.ByteTensor(list(spm.SerializeToString()))
384
def gguf_tekken_tokenizer_loader(path, temb_shape):
    """Rebuild a tekken-style tokenizer definition from GGUF tokenizer metadata.

    Reads the ``tokenizer.ggml.*`` fields of the GGUF file at ``path`` and
    turns them into a dict with ``config``, ``vocab`` and ``special_tokens``
    entries, which is then serialized as JSON.

    Args:
        path: Location of the GGUF file, handed to ``gguf.GGUFReader``.
        temb_shape: Shape compared against ``(131072, 5120)``; it is the only
            shape accepted here (original note: "probably Mistral").

    Returns:
        A ``torch.ByteTensor`` holding the UTF-8 bytes of the JSON document.

    Raises:
        NotImplementedError: If the tokenizer model is not ``"gpt2"`` or
            ``temb_shape`` is not the supported shape.
    """
    # convert ggml (hf) tokenizer metadata to tekken/comfy data
    logging.info("Attempting to recreate tekken tokenizer from GGUF file metadata...")
    import base64
    import json
    from transformers.convert_slow_tokenizer import bytes_to_unicode

    reader = gguf.GGUFReader(path)

    # Only one combination is supported; the shape is inspected only once the
    # tokenizer model has been confirmed to be "gpt2".
    tokenizer_model = get_field(reader, "tokenizer.ggml.model", str)
    if tokenizer_model != "gpt2" or not (temb_shape == (131072, 5120)):
        raise NotImplementedError("Unknown model, can't set tokenizer!")

    token_strings = get_list_field(reader, "tokenizer.ggml.tokens", str)
    token_kinds = get_list_field(reader, "tokenizer.ggml.token_type", int)

    # Inverse of bytes_to_unicode(): printable stand-in character -> byte value.
    byte_of_char = {char: byte for byte, char in bytes_to_unicode().items()}

    vocab_entries = []
    control_entries = []
    for token_id, (token_text, token_kind) in enumerate(zip(token_strings, token_kinds)):
        if token_kind == 3:
            # Type 3 entries are emitted as control tokens; they keep their
            # position in the GGUF token list as rank.
            control_entries.append(
                {'rank': token_id, 'token_str': token_text, 'is_control': True}
            )
            continue

        # Everything else is mapped back to raw bytes and ranked by its
        # position among the non-control tokens.
        raw_token = bytes(byte_of_char[char] for char in token_text)
        vocab_entries.append({
            "rank": len(vocab_entries),
            "token_bytes": base64.b64encode(raw_token).decode("ascii"),
            "token_str": raw_token.decode("utf-8", errors="replace"),  # ?
        })

    # Key order is kept as config / vocab / special_tokens so the serialized
    # JSON matches the layout produced before.
    data = {
        "config": {"num_vocab_tokens": 150000, "default_vocab_size": 131072},
        "vocab": vocab_entries,
        "special_tokens": control_entries,
    }

    logging.info(f"Created tekken tokenizer with vocab size of {len(data['vocab'])} (+{len(data['special_tokens'])})")
    del reader

    serialized = json.dumps(data).encode('utf-8')
    return torch.ByteTensor(list(serialized))
427
428	def gguf_gemma3_tokenizer_loader(path):
429	#TODO: merge into gguf_tokenizer_loader

Callers 1

gguf_clip_loaderFunction · 0.85

Calls 2

get_fieldFunction · 0.85

get_list_fieldFunction · 0.85

Tested by

no test coverage detected