MCPcopy
hub / github.com/city96/ComfyUI-GGUF / gguf_gemma3_tokenizer_loader

Function gguf_gemma3_tokenizer_loader

loader.py:428–468  ·  view source on GitHub ↗
(path)

Source from the content-addressed store, hash-verified

426 return torch.ByteTensor(list(json.dumps(data).encode('utf-8')))
427
def gguf_gemma3_tokenizer_loader(path):
    """Rebuild a SentencePiece tokenizer model from GGUF file metadata.

    Reads the ``tokenizer.ggml.tokens``, ``tokenizer.ggml.scores`` and
    ``tokenizer.ggml.token_type`` fields from the GGUF file at *path*,
    fills a SentencePiece ``ModelProto`` with one piece per token, and
    returns the serialized proto as a byte tensor.

    Args:
        path: Location of the GGUF file, handed to ``gguf.GGUFReader``.

    Returns:
        A ``torch.ByteTensor`` containing the bytes of the serialized
        ``ModelProto``.

    Raises:
        ImportError: If ``sentencepiece_model_pb2`` cannot be imported from
            ``sentencepiece``.
        ValueError: If any of the three tokenizer fields is missing or
            empty, or if the scores / token types hold fewer entries than
            the token list.
    """
    #TODO: merge into gguf_tokenizer_loader
    logging.info("Attempting to recreate sentencepiece tokenizer from GGUF file metadata...")
    try:
        from sentencepiece import sentencepiece_model_pb2 as model
    except ImportError as err:
        # Keep the original failure attached so the root cause stays visible.
        raise ImportError("Please install sentencepiece and protobuf.\npip install sentencepiece protobuf") from err
    spm = model.ModelProto()
    reader = gguf.GGUFReader(path)

    spm.normalizer_spec.name = "identity"
    spm.normalizer_spec.add_dummy_prefix = False
    # NOTE(review): 2 is presumably the BPE value of sentencepiece's
    # TrainerSpec.ModelType enum -- confirm against the proto definition.
    spm.trainer_spec.model_type = 2
    spm.trainer_spec.input_format = "tsv"
    spm.trainer_spec.byte_fallback = True
    spm.trainer_spec.max_sentence_length = 4192
    spm.trainer_spec.bos_piece = "<bos>"

    tokens = get_list_field(reader, "tokenizer.ggml.tokens", str)
    scores = get_list_field(reader, "tokenizer.ggml.scores", float)
    toktype = get_list_field(reader, "tokenizer.ggml.token_type", int)

    if not tokens or not scores or not toktype:
        raise ValueError("Missing tokenizer metadata")

    # Every token needs a score and a type; fail up front with a clear
    # message instead of an IndexError part-way through the loop.
    if len(scores) < len(tokens) or len(toktype) < len(tokens):
        raise ValueError(
            f"Tokenizer metadata length mismatch: {len(tokens)} tokens, "
            f"{len(scores)} scores, {len(toktype)} token types"
        )

    # zip() stops at the end of `tokens`; surplus scores/types are ignored.
    for idx, (token, score, token_kind) in enumerate(zip(tokens, scores, toktype)):
        piece = spm.SentencePiece()
        piece.piece = token
        if idx == 3:  # UNK position
            piece.type = 2  # UNK Token
            piece.score = 0.0  # UNK Score
        else:
            piece.type = token_kind
            piece.score = score
        spm.pieces.append(piece)

    spm.trainer_spec.vocab_size = len(spm.pieces)
    logging.info(f"Created tokenizer with vocab size of {len(spm.pieces)}")

    # Drop the reader reference before building the returned tensor.
    del reader
    return torch.ByteTensor(list(spm.SerializeToString()))
469
470def gguf_clip_loader(path):
471 sd, extra = gguf_sd_loader(path, is_text_model=True)

Callers 1

gguf_clip_loader · Function · 0.85

Calls 1

get_list_field · Function · 0.85

Tested by

no test coverage detected