MCPcopy Index your code
hub / github.com/microsoft/BitNet / __init__

Method __init__

utils/convert.py:423–473  ·  view source on GitHub ↗
(self, base_path: Path)

Source from the content-addressed store, hash-verified

421 name = "bpe"
422
def __init__(self, base_path: Path):
    """Load a GPT-2 style BPE vocabulary from a model directory.

    Two on-disk layouts are tried, in this order:

    * "slow" tokenizer: ``vocab.json`` holds the base vocabulary and the
      optional ``ADDED_TOKENS_FILE`` holds the extra tokens.
    * "fast" tokenizer: ``FAST_TOKENIZER_FILE`` holds both the base
      vocabulary (``model.vocab``) and the optional ``added_tokens`` list.

    On success the instance exposes ``vocab``, ``added_tokens_dict``,
    ``added_tokens_list`` (ordered by token ID), ``vocab_size_base``,
    ``vocab_size`` and ``fname_tokenizer``.

    Args:
        base_path: Directory containing the tokenizer files.

    Raises:
        FileNotFoundError: If neither tokenizer file can be opened, or the
            fast tokenizer file does not describe a BPE model without byte
            fallback that uses a ``ByteLevel`` decoder.
        ValueError: If the added token IDs are not exactly the contiguous
            range that starts right after the base vocabulary.
    """
    added_tokens: dict[str, int] = {}

    if (fname_tokenizer := base_path / 'vocab.json').exists():
        # "slow" tokenizer
        with open(fname_tokenizer, encoding="utf-8") as f:
            self.vocab = json.load(f)

        try:
            # FIXME: Verify that added tokens here _cannot_ overlap with the main vocab.
            with open(base_path / ADDED_TOKENS_FILE, encoding="utf-8") as f:
                added_tokens = json.load(f)
        except FileNotFoundError:
            # The added-tokens file is optional; keep the empty default.
            pass
    else:
        # "fast" tokenizer
        fname_tokenizer = base_path / FAST_TOKENIZER_FILE

        # if this fails, FileNotFoundError propagates to caller
        with open(fname_tokenizer, encoding="utf-8") as f:
            tokenizer_json = json.load(f)

        tokenizer_model: dict[str, Any] = tokenizer_json['model']

        # The decoder entry may be missing or null. Treat that the same as
        # any other non-ByteLevel decoder so the mismatch is reported with
        # the FileNotFoundError below rather than a KeyError/TypeError.
        decoder = tokenizer_json.get('decoder')
        decoder_type = decoder.get('type') if isinstance(decoder, dict) else None

        if (
            tokenizer_model['type'] != 'BPE' or tokenizer_model.get('byte_fallback', False)
            or decoder_type != 'ByteLevel'
        ):
            raise FileNotFoundError('Cannot find GPT-2 BPE tokenizer')

        self.vocab = tokenizer_model["vocab"]

        if (added := tokenizer_json.get('added_tokens')) is not None:
            # Added tokens here can be duplicates of the main vocabulary.
            added_tokens = {item['content']: item['id']
                            for item in added
                            if item['content'] not in self.vocab}

    # The added tokens must occupy exactly the IDs that directly follow the
    # base vocabulary, with no gaps and no duplicates.
    vocab_size = len(self.vocab)
    expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
    actual_ids = sorted(added_tokens.values())
    if expected_ids != actual_ids:
        expected_end_id = vocab_size + len(actual_ids) - 1
        raise ValueError(f"Expected the {len(actual_ids)} added token ID(s) to be sequential in the range "
                         f"{vocab_size} - {expected_end_id}; got {actual_ids}")

    # Order the added token texts by their token ID.
    items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1])
    self.added_tokens_dict = added_tokens
    self.added_tokens_list = [text for text, _ in items]
    self.vocab_size_base = vocab_size
    self.vocab_size = self.vocab_size_base + len(self.added_tokens_list)
    self.fname_tokenizer = fname_tokenizer
474
475 def bpe_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
476 reverse_vocab = {id: encoded_tok for encoded_tok, id in self.vocab.items()}

Callers

nothing calls this directly

Calls 1

loadMethod · 0.45

Tested by

no test coverage detected