MCPcopy
hub / github.com/deepseek-ai/DeepSeek-MoE / _tokenize_fn

Function _tokenize_fn

finetune/finetune.py:115–136  ·  view source on GitHub ↗

Tokenize a list of strings.

(strings: Sequence[str], tokenizer: transformers.PreTrainedTokenizer)

Source from the content-addressed store, hash-verified

113
114
115def _tokenize_fn(strings: Sequence[str], tokenizer: transformers.PreTrainedTokenizer) -> Dict:
116 """Tokenize a list of strings."""
117 tokenized_list = [
118 tokenizer(
119 text,
120 # return_tensors="pt",
121 max_length=tokenizer.model_max_length,
122 truncation=True,
123 )
124 for text in strings
125 ]
126 input_ids = labels = [np.array(tokenized.input_ids) for tokenized in tokenized_list]
127 input_ids_lens = labels_lens = [
128 len(tokenized.input_ids) for tokenized in tokenized_list
129 ]
130
131 return dict(
132 input_ids=input_ids,
133 labels=labels,
134 input_ids_lens=input_ids_lens,
135 labels_lens=labels_lens,
136 )
137
138
139def preprocess(

Callers 1

preprocess (Function) · 0.85

Calls

no outgoing calls

Tested by

no test coverage detected