Method `__init__`

utils/convert.py:498–526 · view source on GitHub ↗

(self, base_path: Path)

Source from the content-addressed store, hash-verified

496	name = "spm"
497
def __init__(self, base_path: Path):
    """Load the SentencePiece model for *base_path* and record its added tokens.

    ``tokenizer.model`` is looked up in ``base_path`` first and, failing
    that, in ``base_path.parent``. The added-tokens file
    (``ADDED_TOKENS_FILE``) is only read when the model is found in
    ``base_path`` itself; a missing added-tokens file means "no added
    tokens".

    Raises:
        FileNotFoundError: ``tokenizer.model`` exists in neither location.
        ValueError: the added tokens whose IDs are at or above the base
            vocabulary size do not form a gap-free run starting exactly
            at that size.
    """
    extra_tokens: dict[str, int] = {}

    model_path = base_path / 'tokenizer.model'
    if model_path.exists():
        # Normal location: an added-tokens file may sit next to the model.
        try:
            with open(base_path / ADDED_TOKENS_FILE, encoding="utf-8") as fh:
                extra_tokens = json.load(fh)
        except FileNotFoundError:
            pass
    else:
        # Alternate location: one directory up. Added tokens are not read here.
        model_path = base_path.parent / 'tokenizer.model'
        if not model_path.exists():
            raise FileNotFoundError('Cannot find tokenizer.model')

    processor = SentencePieceProcessor(str(model_path))
    self.sentencepiece_tokenizer = processor
    base_size = processor.vocab_size()

    # Only entries at or beyond the base vocabulary extend it; index them by ID.
    piece_by_id = {}
    for piece, token_id in extra_tokens.items():
        if token_id >= base_size:
            piece_by_id[token_id] = piece

    expected_new_ids = list(range(base_size, base_size + len(piece_by_id)))
    actual_new_ids = sorted(piece_by_id)

    if actual_new_ids != expected_new_ids:
        raise ValueError(f"Expected new token IDs {expected_new_ids} to be sequential; got {actual_new_ids}")

    # Token pieces that were added to the base vocabulary, in ID order.
    self.added_tokens_dict = extra_tokens
    self.added_tokens_list = [piece_by_id[token_id] for token_id in actual_new_ids]
    self.vocab_size_base = base_size
    self.vocab_size = base_size + len(self.added_tokens_list)
    self.fname_tokenizer = model_path
527
528	def sentencepiece_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
529	tokenizer = self.sentencepiece_tokenizer

nothing calls this directly

loadMethod · 0.45

no test coverage detected