(path)
| 426 | return torch.ByteTensor(list(json.dumps(data).encode('utf-8'))) |
| 427 | |
def gguf_gemma3_tokenizer_loader(path):
    """Rebuild a serialized sentencepiece model from GGUF tokenizer metadata.

    Reads the ``tokenizer.ggml.tokens``, ``tokenizer.ggml.scores`` and
    ``tokenizer.ggml.token_type`` list fields from the GGUF file at *path*,
    fills a sentencepiece ``ModelProto`` with one piece per token, and
    returns the serialized protobuf.

    Args:
        path: Location of the GGUF file; passed straight to ``gguf.GGUFReader``.

    Returns:
        A ``torch.ByteTensor`` holding the bytes of the serialized ``ModelProto``.

    Raises:
        ImportError: If ``sentencepiece`` (with its protobuf module) cannot be
            imported.
        ValueError: If any of the three metadata lists is missing or empty, or
            if ``scores`` / ``token_type`` hold fewer entries than ``tokens``.
    """
    #TODO: merge into gguf_tokenizer_loader
    logging.info("Attempting to recreate sentencepiece tokenizer from GGUF file metadata...")
    try:
        from sentencepiece import sentencepiece_model_pb2 as model
    except ImportError as err:
        # Chain explicitly so the original import failure stays visible as the cause.
        raise ImportError("Please install sentencepiece and protobuf.\npip install sentencepiece protobuf") from err

    # Numeric enum values taken from sentencepiece_model.proto.
    bpe_model_type = 2   # TrainerSpec.ModelType.BPE
    unk_piece_type = 2   # ModelProto.SentencePiece.Type.UNKNOWN
    unk_index = 3        # vocabulary slot this loader treats as the UNK token

    spm = model.ModelProto()
    reader = gguf.GGUFReader(path)

    spm.normalizer_spec.name = "identity"
    spm.normalizer_spec.add_dummy_prefix = False
    spm.trainer_spec.model_type = bpe_model_type
    spm.trainer_spec.input_format = "tsv"
    spm.trainer_spec.byte_fallback = True
    spm.trainer_spec.max_sentence_length = 4192
    spm.trainer_spec.bos_piece = "<bos>"

    tokens = get_list_field(reader, "tokenizer.ggml.tokens", str)
    scores = get_list_field(reader, "tokenizer.ggml.scores", float)
    toktype = get_list_field(reader, "tokenizer.ggml.token_type", int)

    if not tokens or not scores or not toktype:
        raise ValueError("Missing tokenizer metadata")

    # Every token needs a score and a type. Fail up front with a clear message
    # instead of an IndexError halfway through building the vocabulary.
    if len(scores) < len(tokens) or len(toktype) < len(tokens):
        raise ValueError(
            "Inconsistent tokenizer metadata: "
            f"{len(tokens)} tokens, {len(scores)} scores, {len(toktype)} token types"
        )

    # zip() stops at len(tokens) because the check above guarantees the other
    # two sequences are at least that long.
    for idx, (token, score, kind) in enumerate(zip(tokens, scores, toktype)):
        piece = spm.SentencePiece()
        piece.piece = token
        if idx == unk_index:
            # The UNK slot ignores the stored metadata and gets a fixed type/score.
            piece.type = unk_piece_type
            piece.score = 0.0
        else:
            piece.type = kind
            piece.score = score
        spm.pieces.append(piece)

    spm.trainer_spec.vocab_size = len(spm.pieces)
    logging.info(f"Created tokenizer with vocab size of {len(spm.pieces)}")

    # Drop the local reference to the reader before serializing.
    del reader
    return torch.ByteTensor(list(spm.SerializeToString()))
| 469 | |
| 470 | def gguf_clip_loader(path): |
| 471 | sd, extra = gguf_sd_loader(path, is_text_model=True) |
no test coverage detected