(path, temb_shape)
| 383 | return torch.ByteTensor(list(spm.SerializeToString())) |
| 384 | |
def gguf_tekken_tokenizer_loader(path, temb_shape):
    """Rebuild a tekken-style tokenizer JSON blob from GGUF tokenizer metadata.

    Reads ``tokenizer.ggml.model``, ``tokenizer.ggml.tokens`` and
    ``tokenizer.ggml.token_type`` from the GGUF file at ``path`` and converts
    them into a dict with ``config``, ``vocab`` and ``special_tokens`` keys.

    Args:
        path: Location of the GGUF file, passed straight to ``gguf.GGUFReader``.
        temb_shape: Shape of the token embedding; only ``(131072, 5120)`` is
            accepted (flagged in the code as "probably Mistral").

    Returns:
        A ``torch.ByteTensor`` holding the UTF-8 bytes of the JSON-serialized
        tokenizer data.

    Raises:
        NotImplementedError: If the tokenizer model is not ``"gpt2"`` or the
            embedding shape is not the one recognized above.
    """
    logging.info("Attempting to recreate tekken tokenizer from GGUF file metadata...")
    import json
    import base64
    from transformers.convert_slow_tokenizer import bytes_to_unicode

    reader = gguf.GGUFReader(path)

    # Bail out early for anything other than the single supported combination.
    model_str = get_field(reader, "tokenizer.ggml.model", str)
    if model_str != "gpt2":
        raise NotImplementedError("Unknown model, can't set tokenizer!")
    looks_like_mistral = temb_shape == (131072, 5120)  # probably Mistral
    if not looks_like_mistral:
        raise NotImplementedError("Unknown model, can't set tokenizer!")

    token_strings = get_list_field(reader, "tokenizer.ggml.tokens", str)
    token_kinds = get_list_field(reader, "tokenizer.ggml.token_type", int)

    # Inverse of the byte -> printable-character table, so each character of a
    # stored token string maps back to the raw byte value it stands for.
    char_to_byte = {char: byte for byte, char in bytes_to_unicode().items()}

    vocab_entries = []
    special_entries = []
    for position, (text, kind) in enumerate(zip(token_strings, token_kinds)):
        if kind == 3:
            # Token type 3 entries are emitted as control tokens; their rank is
            # the token's index in the GGUF token list.
            special_entries.append(
                {'rank': position, 'token_str': text, 'is_control': True}
            )
            continue

        raw = bytes(char_to_byte[ch] for ch in text)
        vocab_entries.append({
            # Regular tokens are ranked by their order among non-type-3 tokens.
            "rank": len(vocab_entries),
            "token_bytes": base64.b64encode(raw).decode("ascii"),
            # Undecodable byte sequences fall back to the replacement character.
            "token_str": raw.decode("utf-8", errors="replace"),
        })

    data = {
        "config": {"num_vocab_tokens": 150000, "default_vocab_size": 131072},
        "vocab": vocab_entries,
        "special_tokens": special_entries,
    }

    logging.info(f"Created tekken tokenizer with vocab size of {len(vocab_entries)} (+{len(special_entries)})")
    del reader
    payload = json.dumps(data).encode('utf-8')
    return torch.ByteTensor(list(payload))
| 427 | |
| 428 | def gguf_gemma3_tokenizer_loader(path): |
| 429 | #TODO: merge into gguf_tokenizer_loader |
no test coverage detected