MCPcopy
hub / github.com/policy-gradient/GRPO-Zero / Tokenizer

Class Tokenizer

tokenizer.py:10–38  ·  view source on GitHub ↗

Tokenizer with chat-template support, rendered using the jinja2 engine.

Source from the content-addressed store, hash-verified

8
9
class Tokenizer:
    """Tokenizer with chat-template support, rendered using the jinja2 engine.

    Wraps a ``TokenizerBase`` loaded from a tokenizer file together with the
    ``tokenizer_config.json`` located in the same directory as that file.

    Attributes set by ``__init__``:
        tokenizer_config: Parsed contents of ``tokenizer_config.json``.
        tokenizer: The object returned by ``TokenizerBase.from_file``.
        chat_template: Template compiled from the config's ``"chat_template"``
            entry.
        eos_token / pad_token: The config's ``"eos_token"`` / ``"pad_token"``
            entries.
        eos_token_id / pad_token_id: Result of looking those tokens up with
            ``tokenizer.token_to_id``.
    """

    def __init__(self, tokenizer_path: str):
        """Load the tokenizer and its sibling ``tokenizer_config.json``.

        Args:
            tokenizer_path: Path to the tokenizer file handed to
                ``TokenizerBase.from_file``. The config file is looked up in
                the same directory.

        Raises:
            OSError: If ``tokenizer_config.json`` cannot be opened.
            KeyError: If the config lacks ``"chat_template"``, ``"eos_token"``
                or ``"pad_token"``.
        """
        super().__init__()
        tokenizer_config_path = Path(tokenizer_path).parent / "tokenizer_config.json"
        # Use a context manager so the file handle is closed deterministically,
        # and decode explicitly as UTF-8 (the JSON interchange encoding) rather
        # than relying on the platform's locale default.
        with open(tokenizer_config_path, encoding="utf-8") as config_file:
            self.tokenizer_config = json.load(config_file)
        self.tokenizer = TokenizerBase.from_file(tokenizer_path)
        self.chat_template = Environment().from_string(
            self.tokenizer_config["chat_template"]
        )
        self.eos_token = self.tokenizer_config["eos_token"]
        self.eos_token_id = self.tokenizer.token_to_id(self.eos_token)
        self.pad_token = self.tokenizer_config["pad_token"]
        self.pad_token_id = self.tokenizer.token_to_id(self.pad_token)

    def encode_chat(self, messages: List[Dict[str, str]]) -> str:
        """Render ``messages`` through the chat template.

        The template is always rendered with ``add_generation_prompt=True``.

        Args:
            messages: Chat messages passed to the template as ``messages``.

        Returns:
            The rendered template text.
        """
        return self.chat_template.render(messages=messages, add_generation_prompt=True)

    def encode_chat_with_response_prompt(
        self, messages: List[Dict[str, str]], prompt: str
    ) -> str:
        """Render ``messages`` and append ``prompt`` to the rendered text.

        Args:
            messages: Chat messages, rendered exactly as in ``encode_chat``.
            prompt: Text concatenated directly after the rendered chat.

        Returns:
            ``encode_chat(messages) + prompt``.
        """
        return self.encode_chat(messages) + prompt

    def tokenize(self, text: str) -> Encoding:
        """Encode ``text`` with the underlying tokenizer and return its result."""
        return self.tokenizer.encode(text)

    def detokenize(self, token_ids: List[int]) -> str:
        """Decode ``token_ids`` back to text, keeping special tokens.

        Special tokens are kept because decoding is done with
        ``skip_special_tokens=False``.
        """
        return self.tokenizer.decode(token_ids, skip_special_tokens=False)

Callers 1

mainFunction · 0.90

Calls

no outgoing calls

Tested by

no test coverage detected