# Short string tag for this vocabulary class ("bpe", i.e. byte-pair encoding).
# NOTE(review): presumably used by callers to select or report the vocab
# type -- confirm against usage elsewhere in the file.
name = "bpe"
| 422 | |
def __init__(self, base_path: Path):
    """Load a GPT-2 style byte-level BPE vocabulary from ``base_path``.

    Two on-disk layouts are supported:

    * "slow" tokenizer: ``vocab.json`` holds the base vocabulary, and the
      optional file named by ``ADDED_TOKENS_FILE`` maps extra token text
      to token ids.
    * "fast" tokenizer: the file named by ``FAST_TOKENIZER_FILE`` holds both
      the model vocabulary and an optional ``added_tokens`` list.

    Args:
        base_path: Directory containing the tokenizer files.

    Raises:
        FileNotFoundError: If no tokenizer file exists, or if the fast
            tokenizer file does not describe a byte-level BPE tokenizer
            without byte fallback (a missing or null ``decoder`` entry
            counts as "not byte-level").
        ValueError: If the added-token ids are not exactly the contiguous
            range that starts at the base vocabulary size.
    """
    added_tokens: dict[str, int] = {}

    if (fname_tokenizer := base_path / 'vocab.json').exists():
        # "slow" tokenizer
        with open(fname_tokenizer, encoding="utf-8") as f:
            self.vocab = json.load(f)

        try:
            # FIXME: Verify that added tokens here _cannot_ overlap with the main vocab.
            with open(base_path / ADDED_TOKENS_FILE, encoding="utf-8") as f:
                added_tokens = json.load(f)
        except FileNotFoundError:
            # The added-tokens file is optional.
            pass
    else:
        # "fast" tokenizer
        fname_tokenizer = base_path / FAST_TOKENIZER_FILE

        # if this fails, FileNotFoundError propagates to caller
        with open(fname_tokenizer, encoding="utf-8") as f:
            tokenizer_json = json.load(f)

        tokenizer_model: dict[str, Any] = tokenizer_json['model']

        # The decoder entry may be absent or null in the tokenizer JSON.
        # Treat that as "not a byte-level decoder" so the mismatch is
        # reported with the same FileNotFoundError as every other
        # unsupported tokenizer, instead of a TypeError/KeyError.
        decoder = tokenizer_json.get('decoder')
        decoder_type = decoder.get('type') if isinstance(decoder, dict) else None

        if (
            tokenizer_model['type'] != 'BPE' or tokenizer_model.get('byte_fallback', False)
            or decoder_type != 'ByteLevel'
        ):
            raise FileNotFoundError('Cannot find GPT-2 BPE tokenizer')

        self.vocab = tokenizer_model["vocab"]

        if (added := tokenizer_json.get('added_tokens')) is not None:
            # Added tokens here can be duplicates of the main vocabulary.
            added_tokens = {item['content']: item['id']
                            for item in added
                            if item['content'] not in self.vocab}

    # Added tokens must occupy the ids immediately after the base vocabulary,
    # with no gaps and no repeats.
    vocab_size = len(self.vocab)
    expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
    actual_ids = sorted(added_tokens.values())
    if expected_ids != actual_ids:
        expected_end_id = vocab_size + len(actual_ids) - 1
        raise ValueError(f"Expected the {len(actual_ids)} added token ID(s) to be sequential in the range "
                         f"{vocab_size} - {expected_end_id}; got {actual_ids}")

    # Order the added token texts by id.
    items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1])
    self.added_tokens_dict = added_tokens
    self.added_tokens_list = [text for text, _ in items]
    self.vocab_size_base = vocab_size
    self.vocab_size = self.vocab_size_base + len(self.added_tokens_list)
    self.fname_tokenizer = fname_tokenizer
| 474 | |
| 475 | def bpe_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: |
| 476 | reverse_vocab = {id: encoded_tok for encoded_tok, id in self.vocab.items()} |