MCPcopy Index your code
hub / github.com/microsoft/BitNet / __init__

Method __init__

utils/convert.py:498–526  ·  view source on GitHub ↗
(self, base_path: Path)

Source from the content-addressed store, hash-verified

496 name = "spm"
497
def __init__(self, base_path: Path):
    """Set up the SentencePiece tokenizer and any added tokens for *base_path*.

    ``tokenizer.model`` is looked up in ``base_path`` first and then in
    ``base_path.parent``. The added-tokens file (``ADDED_TOKENS_FILE``) is
    only read when the model is found directly in ``base_path``; a missing
    added-tokens file is treated as "no added tokens".

    Raises:
        FileNotFoundError: ``tokenizer.model`` exists in neither location.
        ValueError: the added tokens with ids at or above the base vocab
            size do not form one contiguous run starting at that size.
    """
    extra_tokens: dict[str, int] = {}
    tokenizer_path = base_path / 'tokenizer.model'
    if tokenizer_path.exists():
        # Normal location: the added-tokens file, if any, sits next to the model.
        try:
            with open(base_path / ADDED_TOKENS_FILE, encoding="utf-8") as f:
                extra_tokens = json.load(f)
        except FileNotFoundError:
            # The added-tokens file is optional.
            pass
    else:
        # Alternate location: one directory up. No added tokens are read here.
        tokenizer_path = base_path.parent / 'tokenizer.model'
        if not tokenizer_path.exists():
            raise FileNotFoundError('Cannot find tokenizer.model')

    self.sentencepiece_tokenizer = SentencePieceProcessor(str(tokenizer_path))
    base_size = self.sentencepiece_tokenizer.vocab_size()

    # Keep only the entries whose id lies at or past the end of the base
    # vocabulary, keyed by id so they can be ordered.
    beyond_base = {}
    for piece, token_id in extra_tokens.items():
        if token_id >= base_size:
            beyond_base[token_id] = piece

    found_ids = sorted(beyond_base)
    wanted_ids = list(range(base_size, base_size + len(beyond_base)))
    if wanted_ids != found_ids:
        raise ValueError(f"Expected new token IDs {wanted_ids} to be sequential; got {found_ids}")

    # Token pieces that were added to the base vocabulary.
    self.added_tokens_dict = extra_tokens
    self.added_tokens_list = [beyond_base[token_id] for token_id in found_ids]
    self.vocab_size_base = base_size
    self.vocab_size = self.vocab_size_base + len(self.added_tokens_list)
    self.fname_tokenizer = tokenizer_path
527
528 def sentencepiece_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
529 tokenizer = self.sentencepiece_tokenizer

Callers

nothing calls this directly

Calls 1

loadMethod · 0.45

Tested by

no test coverage detected