MCPcopy
hub / github.com/diffgram/diffgram / TextTokenizer

Class TextTokenizer

walrus/methods/text_data/text_tokenizer.py:10–53  ·  view source on GitHub ↗

Source from the content-addressed store, hash-verified

8
9
class TextTokenizer:
    """Split raw text into word-level or sentence-level token dictionaries.

    The backend is chosen by ``type`` and looked up in two per-instance
    registries (``WORD_TOKENIZER_FUNCTIONS`` and
    ``SENTENCE_TOKENIZER_FUNCTIONS``). Only the ``'nltk'`` backend is
    registered here.

    Every token is returned as ``{'value': <text>, 'tag': ''}``.
    """
    type: str

    def __init__(self, type: str = 'nltk'):
        """Store the backend name and build the backend lookup tables.

        :param type: key of the tokenizer backend to use. Unknown keys are
            accepted here; they are reported when tokenizing is attempted.
        """
        self.type = type
        self.WORD_TOKENIZER_FUNCTIONS = {
            'nltk': self.__tokenize_word_nltk
        }
        self.SENTENCE_TOKENIZER_FUNCTIONS = {
            'nltk': self.__tokenize_sentence_nltk
        }

    def __unsupported_type_error(self, kind: str, registry: dict) -> TypeError:
        """Build the error raised when ``self.type`` has no ``kind`` backend."""
        # TypeError (not ValueError) is deliberate: calling the missing
        # backend used to fail with "'NoneType' object is not callable",
        # which is a TypeError, so existing handlers keep working.
        return TypeError(
            f"Unsupported {kind} tokenizer type {self.type!r}; "
            f"supported types: {sorted(registry)}"
        )

    def __tokenize_word_nltk(self, text) -> list:
        """Tokenize ``text`` into words, paragraph by paragraph.

        ``text`` is split on ``'\\n'``; empty paragraphs are skipped. Each
        remaining paragraph is passed to ``word_tokenize`` (presumably
        NLTK's; imported outside this class) and is followed by a
        ``'\\n'`` token, including the last paragraph.
        """
        result = []
        for paragraph in text.split('\n'):
            if not paragraph:
                continue
            result.extend(
                {'value': token, 'tag': ''} for token in word_tokenize(paragraph)
            )
            # Paragraph separator token, emitted after every paragraph.
            result.append({'value': '\n', 'tag': ''})
        return result

    def __tokenize_sentence_nltk(self, text) -> list:
        """Tokenize ``text`` into sentences via ``sent_tokenize``."""
        return [{'value': sentence, 'tag': ''} for sentence in sent_tokenize(text)]

    def get_word_tokenizer_function(self) -> Callable:
        """Return the word tokenizer for ``self.type``, or ``None`` if unknown."""
        return self.WORD_TOKENIZER_FUNCTIONS.get(self.type)

    def get_sentence_tokenizer_function(self) -> Callable:
        """Return the sentence tokenizer for ``self.type``, or ``None`` if unknown."""
        return self.SENTENCE_TOKENIZER_FUNCTIONS.get(self.type)

    def tokenize_words(self, text) -> list:
        """Return the word tokens of ``text`` as ``{'value', 'tag'}`` dicts.

        :raises TypeError: if ``self.type`` has no registered word tokenizer.
        """
        tokenizer = self.get_word_tokenizer_function()
        if tokenizer is None:
            raise self.__unsupported_type_error('word', self.WORD_TOKENIZER_FUNCTIONS)
        return tokenizer(text)

    def tokenize_sentences(self, text) -> list:
        """Return the sentence tokens of ``text`` as ``{'value', 'tag'}`` dicts.

        :raises TypeError: if ``self.type`` has no registered sentence tokenizer.
        """
        tokenizer = self.get_sentence_tokenizer_function()
        if tokenizer is None:
            raise self.__unsupported_type_error(
                'sentence', self.SENTENCE_TOKENIZER_FUNCTIONS
            )
        return tokenizer(text)

Calls

no outgoing calls