MCPcopy
hub / github.com/city96/ComfyUI-GGUF / gguf_tokenizer_loader

Function gguf_tokenizer_loader

loader.py:337–383  ·  view source on GitHub ↗
(path, temb_shape)

Source from the content-addressed store, hash-verified

335 return vsd
336
def gguf_tokenizer_loader(path, temb_shape):
    """Rebuild a sentencepiece tokenizer model from GGUF file metadata.

    Reads the ``tokenizer.ggml.*`` metadata fields from the GGUF file at
    ``path``, fills a sentencepiece ``ModelProto`` with them and returns the
    serialized proto.

    Args:
        path: Location of the GGUF file; passed straight to ``gguf.GGUFReader``.
        temb_shape: Shape of the token embedding tensor. Only
            ``(256384, 4096)`` is currently recognised.

    Returns:
        A ``torch.ByteTensor`` holding the bytes of the serialized
        ``ModelProto``.

    Raises:
        ImportError: If ``sentencepiece`` (or its protobuf module) cannot be
            imported.
        NotImplementedError: If the file's tokenizer model is not ``"t5"`` or
            ``temb_shape`` is not a recognised shape.
    """
    # convert gguf tokenizer to spiece
    logging.info("Attempting to recreate sentencepiece tokenizer from GGUF file metadata...")
    try:
        from sentencepiece import sentencepiece_model_pb2 as model
    except ImportError as err:
        raise ImportError(
            "Please make sure sentencepiece and protobuf are installed.\npip install sentencepiece protobuf"
        ) from err
    spm = model.ModelProto()

    reader = gguf.GGUFReader(path)

    if get_field(reader, "tokenizer.ggml.model", str) == "t5":
        if temb_shape == (256384, 4096):  # probably UMT5
            # Must be an assignment: the previous `== 1` was a comparison whose
            # result was discarded, so the model type was never actually set.
            spm.trainer_spec.model_type = 1  # Unigram (do we have a T5 w/ BPE?)
        else:
            raise NotImplementedError("Unknown model, can't set tokenizer!")
    else:
        raise NotImplementedError("Unknown model, can't set tokenizer!")

    spm.normalizer_spec.add_dummy_prefix = get_field(reader, "tokenizer.ggml.add_space_prefix", bool)
    spm.normalizer_spec.remove_extra_whitespaces = get_field(reader, "tokenizer.ggml.remove_extra_whitespaces", bool)

    tokens = get_list_field(reader, "tokenizer.ggml.tokens", str)
    scores = get_list_field(reader, "tokenizer.ggml.scores", float)
    toktypes = get_list_field(reader, "tokenizer.ggml.token_type", int)

    # NOTE: a filter skipping `toktype == 5` entries with
    # `idx >= temb_shape[0] % 1000` was considered here ("These aren't present
    # in the original?") but is left disabled; every token is copied over.
    for token, score, toktype in zip(tokens, scores, toktypes):
        piece = spm.SentencePiece()
        piece.piece = token
        piece.score = score
        piece.type = toktype
        spm.pieces.append(piece)

    # unsure if any of these are correct
    spm.trainer_spec.byte_fallback = True
    spm.trainer_spec.vocab_size = len(tokens)  # split off unused?
    spm.trainer_spec.max_sentence_length = 4096
    spm.trainer_spec.eos_id = get_field(reader, "tokenizer.ggml.eos_token_id", int)
    spm.trainer_spec.pad_id = get_field(reader, "tokenizer.ggml.padding_token_id", int)

    logging.info(f"Created tokenizer with vocab size of {len(spm.pieces)}")
    # Drop the reader reference before returning.
    del reader
    return torch.ByteTensor(list(spm.SerializeToString()))
384
385def gguf_tekken_tokenizer_loader(path, temb_shape):
386 # convert ggml (hf) tokenizer metadata to tekken/comfy data

Callers 1

gguf_clip_loaderFunction · 0.85

Calls 2

get_fieldFunction · 0.85
get_list_fieldFunction · 0.85

Tested by

no test coverage detected