Function get_tokenizer

src/alignment/model_utils.py:66–94 · view source on GitHub ↗

Get the tokenizer for the model.

(
    model_args: ModelArguments, data_args: DataArguments, auto_set_chat_template: bool = True
)

Source from the content-addressed store, hash-verified

64
65
def get_tokenizer(
    model_args: ModelArguments, data_args: DataArguments, auto_set_chat_template: bool = True
) -> PreTrainedTokenizer:
    """Load the model's tokenizer and apply the training-time defaults.

    Args:
        model_args: Supplies ``tokenizer_name_or_path`` (used when it is not
            ``None``), ``model_name_or_path`` (the fallback), ``model_revision``
            and ``trust_remote_code``.
        data_args: Supplies the optional ``truncation_side`` and
            ``chat_template`` overrides; each is applied only when not ``None``.
        auto_set_chat_template: When ``True`` and neither ``data_args`` nor the
            loaded tokenizer provides a chat template, install
            ``DEFAULT_CHAT_TEMPLATE``.

    Returns:
        The configured tokenizer.
    """
    # A dedicated tokenizer path takes precedence over the model path.
    if model_args.tokenizer_name_or_path is None:
        tokenizer_source = model_args.model_name_or_path
    else:
        tokenizer_source = model_args.tokenizer_name_or_path

    tokenizer = AutoTokenizer.from_pretrained(
        tokenizer_source,
        revision=model_args.model_revision,
        trust_remote_code=model_args.trust_remote_code,
    )

    # Fall back to the EOS token for padding when no pad token is defined.
    if tokenizer.pad_token_id is None:
        tokenizer.pad_token_id = tokenizer.eos_token_id

    if data_args.truncation_side is not None:
        tokenizer.truncation_side = data_args.truncation_side

    # Set reasonable default for models without max length
    if tokenizer.model_max_length > 100_000:
        tokenizer.model_max_length = 2048

    if data_args.chat_template is not None:
        # An explicitly requested template always wins.
        tokenizer.chat_template = data_args.chat_template
    elif auto_set_chat_template and tokenizer.chat_template is None:
        # Check the attribute rather than calling `get_chat_template()`: in
        # recent transformers releases that method raises ValueError instead of
        # returning None when no template is set, so the fallback never ran.
        # The attribute check also leaves a dict of named templates untouched.
        tokenizer.chat_template = DEFAULT_CHAT_TEMPLATE

    return tokenizer
95
96
97	def get_peft_config(model_args: ModelArguments) -> PeftConfig \| None:

mainFunction · 0.90

test_right_truncation_sideMethod · 0.90

test_left_truncation_sideMethod · 0.90

test_default_chat_templateMethod · 0.90

test_chatml_chat_templateMethod · 0.90

setUpMethod · 0.90

no outgoing calls

test_right_truncation_sideMethod · 0.72

test_left_truncation_sideMethod · 0.72

test_default_chat_templateMethod · 0.72

test_chatml_chat_templateMethod · 0.72

setUpMethod · 0.72