将文本划分成字元
(text Text)
| 256 | |
| 257 | // 将文本划分成字元 |
| 258 | func splitTextToWords(text Text) []Text { |
| 259 | output := make([]Text, 0, len(text)/3) |
| 260 | current := 0 |
| 261 | inAlphanumeric := true |
| 262 | alphanumericStart := 0 |
| 263 | for current < len(text) { |
| 264 | r, size := utf8.DecodeRune(text[current:]) |
| 265 | if size <= 2 && (unicode.IsLetter(r) || unicode.IsNumber(r)) { |
| 266 | // 当前是拉丁字母或数字(非中日韩文字) |
| 267 | if !inAlphanumeric { |
| 268 | alphanumericStart = current |
| 269 | inAlphanumeric = true |
| 270 | } |
| 271 | } else { |
| 272 | if inAlphanumeric { |
| 273 | inAlphanumeric = false |
| 274 | if current != 0 { |
| 275 | output = append(output, toLower(text[alphanumericStart:current])) |
| 276 | } |
| 277 | } |
| 278 | output = append(output, text[current:current+size]) |
| 279 | } |
| 280 | current += size |
| 281 | } |
| 282 | |
| 283 | // 处理最后一个字元是英文的情况 |
| 284 | if inAlphanumeric { |
| 285 | if current != 0 { |
| 286 | output = append(output, toLower(text[alphanumericStart:current])) |
| 287 | } |
| 288 | } |
| 289 | |
| 290 | return output |
| 291 | } |
| 292 | |
| 293 | // 将英文词转化为小写 |
| 294 | func toLower(text []byte) []byte { |