(path, temb_shape)
| 335 | return vsd |
| 336 | |
def gguf_tokenizer_loader(path, temb_shape):
    """Recreate a sentencepiece tokenizer model from GGUF file metadata.

    The ``tokenizer.ggml.*`` metadata fields of the GGUF file are copied into
    a sentencepiece ``ModelProto``, which is then serialized.

    Args:
        path: Location of the GGUF file; passed straight to ``gguf.GGUFReader``.
        temb_shape: Shape of the token embedding weight. It is only used to
            recognise the model: ``(256384, 4096)`` is treated as (probably)
            UMT5, anything else is rejected.

    Returns:
        A ``torch.ByteTensor`` holding the bytes of the serialized
        ``ModelProto``.

    Raises:
        ImportError: If ``sentencepiece`` (or its protobuf module) cannot be
            imported.
        NotImplementedError: If the GGUF tokenizer model is not ``"t5"`` or
            the embedding shape is not a recognised one.
    """
    # convert gguf tokenizer to spiece
    logging.info("Attempting to recreate sentencepiece tokenizer from GGUF file metadata...")
    try:
        from sentencepiece import sentencepiece_model_pb2 as model
    except ImportError as err:
        raise ImportError("Please make sure sentencepiece and protobuf are installed.\npip install sentencepiece protobuf") from err
    spm = model.ModelProto()

    reader = gguf.GGUFReader(path)

    # Only one tokenizer family / embedding shape is supported so far; bail
    # out early on anything else.
    if get_field(reader, "tokenizer.ggml.model", str) != "t5":
        raise NotImplementedError("Unknown model, can't set tokenizer!")
    if temb_shape != (256384, 4096):  # anything else is not (probably) UMT5
        raise NotImplementedError("Unknown model, can't set tokenizer!")

    # Unigram (do we have a T5 w/ BPE?)
    # This used to be written with `==`, i.e. a comparison whose result was
    # thrown away, so the model type was never actually assigned.
    spm.trainer_spec.model_type = 1

    spm.normalizer_spec.add_dummy_prefix = get_field(reader, "tokenizer.ggml.add_space_prefix", bool)
    spm.normalizer_spec.remove_extra_whitespaces = get_field(reader, "tokenizer.ggml.remove_extra_whitespaces", bool)

    tokens = get_list_field(reader, "tokenizer.ggml.tokens", str)
    scores = get_list_field(reader, "tokenizer.ggml.scores", float)
    toktypes = get_list_field(reader, "tokenizer.ggml.token_type", int)

    # NOTE: every token is kept. Whether some toktype == 5 entries should be
    # skipped ("These aren't present in the original?") is still an open
    # question inherited from the original code.
    for token, score, toktype in zip(tokens, scores, toktypes):
        piece = spm.SentencePiece()
        piece.piece = token
        piece.score = score
        piece.type = toktype
        spm.pieces.append(piece)

    # unsure if any of these are correct
    spm.trainer_spec.byte_fallback = True
    spm.trainer_spec.vocab_size = len(tokens)  # split off unused?
    spm.trainer_spec.max_sentence_length = 4096
    spm.trainer_spec.eos_id = get_field(reader, "tokenizer.ggml.eos_token_id", int)
    spm.trainer_spec.pad_id = get_field(reader, "tokenizer.ggml.padding_token_id", int)

    logging.info(f"Created tokenizer with vocab size of {len(spm.pieces)}")
    del reader
    # The serialized protobuf is handed back as a tensor of raw byte values.
    return torch.ByteTensor(list(spm.SerializeToString()))
| 384 | |
| 385 | def gguf_tekken_tokenizer_loader(path, temb_shape): |
| 386 | # convert ggml (hf) tokenizer metadata to tekken/comfy data |
no test coverage detected