| 8 | |
| 9 | |
class TextTokenizer:
    """Split text into word or sentence tokens using a selectable backend.

    The backend is chosen by ``type`` and looked up in two per-instance
    registries, ``WORD_TOKENIZER_FUNCTIONS`` and
    ``SENTENCE_TOKENIZER_FUNCTIONS``, which map a backend name to the
    callable that implements it. Only ``'nltk'`` is registered by default.

    The built-in nltk backends return a list of dicts of the form
    ``{'value': <token text>, 'tag': ''}``.
    """

    type: str

    def __init__(self, type: str = 'nltk'):
        """Store the backend name and build the backend registries.

        ``type`` is deliberately not validated here, so a backend can still
        be registered on the instance after construction; an unknown name is
        reported when tokenization is requested.
        """
        self.type = type
        self.WORD_TOKENIZER_FUNCTIONS = {
            'nltk': self.__tokenize_word_nltk,
        }
        self.SENTENCE_TOKENIZER_FUNCTIONS = {
            'nltk': self.__tokenize_sentence_nltk,
        }

    def __tokenize_word_nltk(self, text) -> list:
        """Tokenize ``text`` into words, one paragraph (line) at a time.

        Empty lines are skipped. A ``'\\n'`` token is appended after every
        non-empty paragraph, including the last one.
        """
        result = []
        for paragraph in text.split('\n'):
            if not paragraph:
                continue
            result.extend(
                {'value': token, 'tag': ''}
                for token in word_tokenize(paragraph)
            )
            # Paragraph boundary marker, emitted as an ordinary token.
            result.append({'value': '\n', 'tag': ''})
        return result

    def __tokenize_sentence_nltk(self, text) -> list:
        """Tokenize ``text`` into sentences with ``sent_tokenize``."""
        return [
            {'value': sentence, 'tag': ''}
            for sentence in sent_tokenize(text)
        ]

    def _unsupported_type_error(self, kind: str, registry: dict) -> ValueError:
        """Build the error raised when ``self.type`` has no registered backend."""
        return ValueError(
            f"Unsupported {kind} tokenizer type {self.type!r}; "
            f"available types: {list(registry)}"
        )

    def get_word_tokenizer_function(self) -> Callable:
        """Return the word tokenizer registered for ``self.type``.

        Returns ``None`` when no word tokenizer is registered for that type.
        """
        return self.WORD_TOKENIZER_FUNCTIONS.get(self.type)

    def get_sentence_tokenizer_function(self) -> Callable:
        """Return the sentence tokenizer registered for ``self.type``.

        Returns ``None`` when no sentence tokenizer is registered for that
        type.
        """
        return self.SENTENCE_TOKENIZER_FUNCTIONS.get(self.type)

    def tokenize_words(self, text) -> list:
        """Tokenize ``text`` into words with the configured backend.

        Raises:
            ValueError: if no word tokenizer is registered for ``self.type``.
        """
        tokenizer = self.get_word_tokenizer_function()
        if tokenizer is None:
            # Fail with a descriptive error instead of calling None.
            raise self._unsupported_type_error(
                'word', self.WORD_TOKENIZER_FUNCTIONS
            )
        return tokenizer(text)

    def tokenize_sentences(self, text) -> list:
        """Tokenize ``text`` into sentences with the configured backend.

        Raises:
            ValueError: if no sentence tokenizer is registered for
                ``self.type``.
        """
        tokenizer = self.get_sentence_tokenizer_function()
        if tokenizer is None:
            # Fail with a descriptive error instead of calling None.
            raise self._unsupported_type_error(
                'sentence', self.SENTENCE_TOKENIZER_FUNCTIONS
            )
        return tokenizer(text)
no outgoing calls