| 496 | name = "spm" |
| 497 | |
def __init__(self, base_path: Path):
    """Load the SentencePiece model and any added tokens for a model directory.

    ``tokenizer.model`` is looked up in ``base_path`` first and in
    ``base_path.parent`` second.  The added-tokens JSON file is only read
    when the model was found directly in ``base_path``; if that file is
    missing, no added tokens are used.

    Args:
        base_path: Directory expected to contain ``tokenizer.model``.

    Raises:
        FileNotFoundError: ``tokenizer.model`` is in neither location.
        ValueError: The added token IDs at or above the base vocabulary
            size do not form a gap-free run starting at that size.
    """
    added_tokens: dict[str, int] = {}

    fname_tokenizer = base_path / 'tokenizer.model'
    if fname_tokenizer.exists():
        # Normal location: added tokens, if any, sit next to the model.
        try:
            with open(base_path / ADDED_TOKENS_FILE, encoding="utf-8") as f:
                added_tokens = json.load(f)
        except FileNotFoundError:
            pass
    else:
        # Alternate location: one directory up.
        fname_tokenizer = base_path.parent / 'tokenizer.model'
        if not fname_tokenizer.exists():
            raise FileNotFoundError('Cannot find tokenizer.model')

    self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer))
    base_size = self.sentencepiece_tokenizer.vocab_size()

    # Keep only entries whose ID lies beyond the base vocabulary, keyed by ID.
    piece_by_id = {
        token_id: piece
        for piece, token_id in added_tokens.items()
        if token_id >= base_size
    }
    actual_new_ids = sorted(piece_by_id)
    expected_new_ids = list(range(base_size, base_size + len(piece_by_id)))

    # The extra IDs must directly extend the base vocabulary without holes.
    if actual_new_ids != expected_new_ids:
        raise ValueError(f"Expected new token IDs {expected_new_ids} to be sequential; got {actual_new_ids}")

    # Token pieces that were added to the base vocabulary.
    self.added_tokens_dict = added_tokens
    self.added_tokens_list = [piece_by_id[token_id] for token_id in actual_new_ids]
    self.vocab_size_base = base_size
    self.vocab_size = self.vocab_size_base + len(self.added_tokens_list)
    self.fname_tokenizer = fname_tokenizer
| 527 | |
| 528 | def sentencepiece_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: |
| 529 | tokenizer = self.sentencepiece_tokenizer |