(self, model_type='bpe', vocab_size=None, corpus=None, model_path=None, character_coverage=1.0,
**kwargs)
| 670 | """Trains and uses sentencepiece for text tokenization""" |
| 671 | |
def __init__(self, model_type='bpe', vocab_size=None, corpus=None, model_path=None, character_coverage=1.0,
             **kwargs):
    """Set up the tokenizer, training a sentencepiece model first if none exists.

    Args:
        model_type: Model type name; stored lower-cased on ``self.model_type``.
        vocab_size: Stored on ``self.num_text_tokens`` and passed to ``Train``.
            Required when a model has to be trained.
        corpus: Passed to ``Train``. Required when a model has to be trained.
        model_path: Stored on ``self.spm_model`` and checked with
            ``SentencePieceTokenizer.exists`` to decide whether to train.
        character_coverage: Stored on ``self.character_coverage``.
        **kwargs: Accepted for call compatibility; not used in this method.

    Raises:
        ValueError: If training is required but ``corpus`` or ``vocab_size``
            is ``None``.
    """
    self.character_coverage = character_coverage
    self.model_type = model_type.lower()
    self.spm_model = model_path
    self.num_text_tokens = vocab_size

    make_train = not SentencePieceTokenizer.exists(self.spm_model)
    if make_train:
        # Explicit raise instead of `assert`: assertions are removed under
        # `python -O`, which would let Train() run with None arguments.
        missing = [
            name
            for name, value in (('corpus', corpus), ('vocab_size', self.num_text_tokens))
            if value is None
        ]
        if missing:
            raise ValueError(
                'training a sentencepiece model requires: ' + ', '.join(missing)
            )
        self.Train(corpus, self.num_text_tokens)

    # Initialised before load_spm_model() runs, matching the original order.
    self._tokens = []
    self._vocab = {}
    self.load_spm_model()
    super(SentencePieceTokenizer, self).__init__()
| 686 | |
def __len__(self):
    """Report the size recorded in ``self.num_text_tokens``."""
    size = self.num_text_tokens
    return size
nothing calls this directly
no test coverage detected