Encode text to token ids (simple simulation).
(self, text: str, add_special_tokens: bool = True)
| 31 | self.bos_token_id = 1 |
| 32 | |
def encode(self, text: str, add_special_tokens: bool = True) -> List[int]:
    """Encode text to token ids (simple simulation).

    ``text`` is split on whitespace and every resulting word becomes exactly
    one token id in the range ``[10, self.vocab_size)``. The id is derived
    from a CRC-32 checksum of the word's UTF-8 bytes, so the same word maps
    to the same id in every call *and* in every interpreter process (the
    built-in ``hash()`` is salted per process for strings and therefore is
    not reproducible between runs).

    Args:
        text: The text to encode. Empty or whitespace-only text yields no
            word tokens.
        add_special_tokens: When true, ``self.bos_token_id`` is placed at
            the front of the returned list.

    Returns:
        The list of token ids, optionally starting with the BOS id.

    Raises:
        ZeroDivisionError: If ``self.vocab_size`` equals 10 and ``text``
            contains at least one word (the usable id range is empty).
    """
    # Imported locally so this method does not depend on a module-level import.
    import zlib

    # Word ids start at 10; lower ids are never produced here
    # (presumably reserved for special tokens -- confirm against the class).
    id_span = self.vocab_size - 10

    tokens = [self.bos_token_id] if add_special_tokens else []
    # "surrogatepass" keeps lone surrogates encodable, so any str that the
    # previous hash()-based version accepted is still accepted.
    tokens.extend(
        zlib.crc32(word.encode("utf-8", "surrogatepass")) % id_span + 10
        for word in text.split()
    )
    return tokens
| 45 | |
| 46 | def decode( |
| 47 | self, |
no test coverage detected