Helper function to instantiate a tokenizer given common combinations of options.
make_tokenizer(tokenizer_type, corpus, model_path=None, vocab_size=None, model_type='bpe', pad_token=0, character_coverage=1.0, command_tokens=None, type_tokens=None, **kwargs)
| 29 | import regex as re |
| 30 | |
def make_tokenizer(tokenizer_type, corpus, model_path=None, vocab_size=None, model_type='bpe', pad_token=0, character_coverage=1.0, command_tokens=None, type_tokens=None, **kwargs):
    """
    Build a tokenizer from a tokenizer class (or the name of one) and a set of common options.

    `tokenizer_type` is either a class object or a string. A string is resolved
    to an object with `eval`.

    * If the resolved class is `BertWordPieceTokenizer`, it is constructed as
      `BertWordPieceTokenizer(model_type, **kwargs)` and returned directly.
    * If the resolved class is `GPT2BPETokenizer`, it is constructed as
      `GPT2BPETokenizer(**kwargs)` and returned directly.
    * Otherwise the class is called with `corpus`, `vocab_size`, `model_path`,
      `model_type`, `pad_token` and `character_coverage` as keyword arguments,
      and the result is wrapped as `Tokenizer(text_tokenizer, command_tokens, type_tokens)`.
      `**kwargs` is not used on this path.
    """
    tokenizer_class = tokenizer_type
    if isinstance(tokenizer_type, str):
        # NOTE(review): eval() will execute any expression contained in the string.
        # Confirm that callers only ever pass trusted tokenizer class names here.
        tokenizer_class = eval(tokenizer_type)

    # These two tokenizer classes take their own arguments and are returned unwrapped.
    if tokenizer_class is BertWordPieceTokenizer:
        return BertWordPieceTokenizer(model_type, **kwargs)
    if tokenizer_class is GPT2BPETokenizer:
        return GPT2BPETokenizer(**kwargs)

    # Generic path: build the text tokenizer, then wrap it together with the
    # command and type tokens.
    text_tokenizer_options = dict(
        corpus=corpus,
        vocab_size=vocab_size,
        model_path=model_path,
        model_type=model_type,
        pad_token=pad_token,
        character_coverage=character_coverage,
    )
    text_tokenizer = tokenizer_class(**text_tokenizer_options)
    return Tokenizer(text_tokenizer, command_tokens, type_tokens)
| 45 | |
| 46 | class Tokenization(object): |
| 47 | """ |
no test coverage detected