SplitTextToWords splits a string into token words.
(text Text)
| 209 | |
| 210 | // SplitTextToWords splits a string to token words |
| 211 | func (seg *Segmenter) SplitTextToWords(text Text) []Text { |
| 212 | output := make([]Text, 0, len(text)/3) |
| 213 | current, alphanumericStart := 0, 0 |
| 214 | inAlphanumeric := true |
| 215 | |
| 216 | for current < len(text) { |
| 217 | r, size := utf8.DecodeRune(text[current:]) |
| 218 | isNum := unicode.IsNumber(r) && !seg.Num |
| 219 | isAlpha := unicode.IsLetter(r) && !seg.Alpha |
| 220 | if size <= 2 && (isAlpha || isNum) { |
| 221 | // Currently is Latin alphabet or numbers (not in CJK) |
| 222 | if !inAlphanumeric { |
| 223 | alphanumericStart = current |
| 224 | inAlphanumeric = true |
| 225 | } |
| 226 | |
| 227 | if seg.AlphaNum { |
| 228 | output = append(output, toLow(text[current:current+size])) |
| 229 | } |
| 230 | } else { |
| 231 | if inAlphanumeric { |
| 232 | inAlphanumeric = false |
| 233 | if current != 0 && !seg.AlphaNum { |
| 234 | output = append(output, toLow(text[alphanumericStart:current])) |
| 235 | } |
| 236 | } |
| 237 | |
| 238 | output = append(output, text[current:current+size]) |
| 239 | } |
| 240 | current += size |
| 241 | } |
| 242 | |
| 243 | // process last byte is alpha and num |
| 244 | if inAlphanumeric && !seg.AlphaNum { |
| 245 | if current != 0 { |
| 246 | output = append(output, toLow(text[alphanumericStart:current])) |
| 247 | } |
| 248 | } |
| 249 | |
| 250 | return output |
| 251 | } |
| 252 | |
| 253 | func toLow(text []byte) []byte { |
| 254 | if ToLower { |