MCPcopy
hub / github.com/go-ego/gse / SplitTextToWords

Method SplitTextToWords

segmenter.go:211–251  ·  view source on GitHub ↗

SplitTextToWords splits a string into token words.

(text Text)

Source from the content-addressed store, hash-verified

209
210// SplitTextToWords splits a string to token words
211func (seg *Segmenter) SplitTextToWords(text Text) []Text {
212 output := make([]Text, 0, len(text)/3)
213 current, alphanumericStart := 0, 0
214 inAlphanumeric := true
215
216 for current < len(text) {
217 r, size := utf8.DecodeRune(text[current:])
218 isNum := unicode.IsNumber(r) && !seg.Num
219 isAlpha := unicode.IsLetter(r) && !seg.Alpha
220 if size <= 2 && (isAlpha || isNum) {
221 // Currently is Latin alphabet or numbers (not in CJK)
222 if !inAlphanumeric {
223 alphanumericStart = current
224 inAlphanumeric = true
225 }
226
227 if seg.AlphaNum {
228 output = append(output, toLow(text[current:current+size]))
229 }
230 } else {
231 if inAlphanumeric {
232 inAlphanumeric = false
233 if current != 0 && !seg.AlphaNum {
234 output = append(output, toLow(text[alphanumericStart:current]))
235 }
236 }
237
238 output = append(output, text[current:current+size])
239 }
240 current += size
241 }
242
243 // process last byte is alpha and num
244 if inAlphanumeric && !seg.AlphaNum {
245 if current != 0 {
246 output = append(output, toLow(text[alphanumericStart:current]))
247 }
248 }
249
250 return output
251}
252
253func toLow(text []byte) []byte {
254 if ToLower {

Callers 10

TestSplitFunction · 0.95
ToTokenMethod · 0.95
RemoveTokenMethod · 0.95
LoadDictMapMethod · 0.95
ReaderMethod · 0.95
ReaderTFIDFMethod · 0.95
LoadDictStrMethod · 0.95
LoadTFIDFDictStrMethod · 0.95
internalSegmentMethod · 0.95
SplitWordsFunction · 0.95

Calls 1

toLowFunction · 0.85

Tested by 1

TestSplitFunction · 0.76