Class TextTokenizer

walrus/methods/text_data/text_tokenizer.py:10–53 · view source on GitHub ↗

Source from the content-addressed store, hash-verified

8
9
class TextTokenizer:
    """Split text into word-level or sentence-level tokens.

    The backend is selected by ``type`` and looked up in two per-instance
    dispatch tables (``WORD_TOKENIZER_FUNCTIONS`` and
    ``SENTENCE_TOKENIZER_FUNCTIONS``). Only ``'nltk'`` is registered.

    Every token is returned as a dict of the form
    ``{'value': <token text>, 'tag': ''}``.
    """

    # Backend name; used as the key into the dispatch tables below.
    type: str

    def __init__(self, type: str = 'nltk'):
        """Store the backend name and build the dispatch tables.

        Args:
            type: Key of the tokenizer backend to use. The value is not
                validated here; an unregistered key is reported when
                ``tokenize_words`` / ``tokenize_sentences`` is called.
        """
        self.type = type
        self.WORD_TOKENIZER_FUNCTIONS = {
            'nltk': self.__tokenize_word_nltk,
        }
        self.SENTENCE_TOKENIZER_FUNCTIONS = {
            'nltk': self.__tokenize_sentence_nltk,
        }

    def __tokenize_word_nltk(self, text: str) -> list:
        """Tokenize ``text`` into words, one newline-separated paragraph at a time.

        Empty paragraphs are skipped. A ``'\\n'`` token is appended after
        every non-empty paragraph, including the last one.

        NOTE(review): ``word_tokenize`` is presumably NLTK's, imported at
        module level outside this class -- confirm.
        """
        result = []
        for paragraph in text.split('\n'):
            if not paragraph:
                continue
            result.extend(
                {'value': token, 'tag': ''}
                for token in word_tokenize(paragraph)
            )
            # Paragraph boundary marker.
            result.append({'value': '\n', 'tag': ''})
        return result

    def __tokenize_sentence_nltk(self, text: str) -> list:
        """Tokenize ``text`` into sentences.

        NOTE(review): ``sent_tokenize`` is presumably NLTK's, imported at
        module level outside this class -- confirm.
        """
        return [{'value': token, 'tag': ''} for token in sent_tokenize(text)]

    def __unsupported_type_error(self, table: dict) -> ValueError:
        """Build the error raised when ``self.type`` is not a key of ``table``."""
        supported = ', '.join(repr(name) for name in table)
        return ValueError(
            f"Unsupported tokenizer type {self.type!r}; "
            f"supported types: {supported}"
        )

    def get_word_tokenizer_function(self) -> Callable:
        """Return the word tokenizer registered for ``self.type``.

        Returns ``None`` when no tokenizer is registered for that type.
        """
        return self.WORD_TOKENIZER_FUNCTIONS.get(self.type)

    def get_sentence_tokenizer_function(self) -> Callable:
        """Return the sentence tokenizer registered for ``self.type``.

        Returns ``None`` when no tokenizer is registered for that type.
        """
        return self.SENTENCE_TOKENIZER_FUNCTIONS.get(self.type)

    def tokenize_words(self, text) -> list:
        """Tokenize ``text`` into word tokens with the configured backend.

        Returns:
            A list of ``{'value': ..., 'tag': ''}`` dicts.

        Raises:
            ValueError: If ``self.type`` has no registered word tokenizer.
        """
        tokenizer = self.get_word_tokenizer_function()
        if tokenizer is None:
            # Fail with an explicit message instead of calling None.
            raise self.__unsupported_type_error(self.WORD_TOKENIZER_FUNCTIONS)
        return tokenizer(text)

    def tokenize_sentences(self, text) -> list:
        """Tokenize ``text`` into sentence tokens with the configured backend.

        Returns:
            A list of ``{'value': ..., 'tag': ''}`` dicts.

        Raises:
            ValueError: If ``self.type`` has no registered sentence tokenizer.
        """
        tokenizer = self.get_sentence_tokenizer_function()
        if tokenizer is None:
            # Fail with an explicit message instead of calling None.
            raise self.__unsupported_type_error(
                self.SENTENCE_TOKENIZER_FUNCTIONS
            )
        return tokenizer(text)

test___tokenize_word_nltkMethod · 0.90

test__tokenize_sentence_nltkMethod · 0.90

test_get_word_tokenizer_functionMethod · 0.90

test_get_sentence_tokenizer_functionMethod · 0.90

test_tokenize_wordsMethod · 0.90

test_tokenize_sentencesMethod · 0.90

save_text_tokensMethod · 0.90

no outgoing calls

test___tokenize_word_nltkMethod · 0.72

test__tokenize_sentence_nltkMethod · 0.72

test_get_word_tokenizer_functionMethod · 0.72

test_get_sentence_tokenizer_functionMethod · 0.72

test_tokenize_wordsMethod · 0.72

test_tokenize_sentencesMethod · 0.72