MCPcopy Index your code
hub / github.com/huichen/sego / splitTextToWords

Function splitTextToWords

segmenter.go:258–291  ·  view source on GitHub ↗

将文本划分成字元

(text Text)

Source from the content-addressed store, hash-verified

256
257// 将文本划分成字元
258func splitTextToWords(text Text) []Text {
259 output := make([]Text, 0, len(text)/3)
260 current := 0
261 inAlphanumeric := true
262 alphanumericStart := 0
263 for current < len(text) {
264 r, size := utf8.DecodeRune(text[current:])
265 if size <= 2 && (unicode.IsLetter(r) || unicode.IsNumber(r)) {
266 // 当前是拉丁字母或数字(非中日韩文字)
267 if !inAlphanumeric {
268 alphanumericStart = current
269 inAlphanumeric = true
270 }
271 } else {
272 if inAlphanumeric {
273 inAlphanumeric = false
274 if current != 0 {
275 output = append(output, toLower(text[alphanumericStart:current]))
276 }
277 }
278 output = append(output, text[current:current+size])
279 }
280 current += size
281 }
282
283 // 处理最后一个字元是英文的情况
284 if inAlphanumeric {
285 if current != 0 {
286 output = append(output, toLower(text[alphanumericStart:current]))
287 }
288 }
289
290 return output
291}
292
293// 将英文词转化为小写
294func toLower(text []byte) []byte {

Callers 3

TestSplitFunction · 0.85
LoadDictionaryMethod · 0.85
internalSegmentMethod · 0.85

Calls 1

toLowerFunction · 0.85

Tested by 1

TestSplitFunction · 0.68