Method SplitTextToWords

segmenter.go:211–251 · view source on GitHub ↗

SplitTextToWords splits a string into word tokens.

(text Text)

Source from the content-addressed store, hash-verified

209
210	// SplitTextToWords splits a string to token words
211	func (seg *Segmenter) SplitTextToWords(text Text) []Text {
212	output := make([]Text, 0, len(text)/3)
213	current, alphanumericStart := 0, 0
214	inAlphanumeric := true
215
216	for current < len(text) {
217	r, size := utf8.DecodeRune(text[current:])
218	isNum := unicode.IsNumber(r) && !seg.Num
219	isAlpha := unicode.IsLetter(r) && !seg.Alpha
220	if size <= 2 && (isAlpha \|\| isNum) {
221	// Currently is Latin alphabet or numbers (not in CJK)
222	if !inAlphanumeric {
223	alphanumericStart = current
224	inAlphanumeric = true
225	}
226
227	if seg.AlphaNum {
228	output = append(output, toLow(text[current:current+size]))
229	}
230	} else {
231	if inAlphanumeric {
232	inAlphanumeric = false
233	if current != 0 && !seg.AlphaNum {
234	output = append(output, toLow(text[alphanumericStart:current]))
235	}
236	}
237
238	output = append(output, text[current:current+size])
239	}
240	current += size
241	}
242
243	// process last byte is alpha and num
244	if inAlphanumeric && !seg.AlphaNum {
245	if current != 0 {
246	output = append(output, toLow(text[alphanumericStart:current]))
247	}
248	}
249
250	return output
251	}
252
253	func toLow(text []byte) []byte {
254	if ToLower {

TestSplitFunction · 0.95

ToTokenMethod · 0.95

RemoveTokenMethod · 0.95

LoadDictMapMethod · 0.95

ReaderMethod · 0.95

ReaderTFIDFMethod · 0.95

LoadDictStrMethod · 0.95

LoadTFIDFDictStrMethod · 0.95

internalSegmentMethod · 0.95

SplitWordsFunction · 0.95

toLowFunction · 0.85

TestSplitFunction · 0.76