hub / github.com/huichen/sego / LoadDictionary

Method LoadDictionary

segmenter.go:44–126 · view source on GitHub ↗

从文件中载入词典。可以载入多个词典文件，文件名用","分隔，排在前面的词典优先载入分词，比如 "用户词典.txt,通用词典.txt"。当一个分词既出现在用户词典也出现在通用词典中，则优先使用用户词典。词典的格式为（每个分词一行，字段之间用空格分隔）：分词文本 频率 词性

(files string)

Source from the content-addressed store, hash-verified

42	// 词典的格式为（每个分词一行）：
43	// 分词文本 频率 词性
44	func (seg *Segmenter) LoadDictionary(files string) {
45	seg.dict = NewDictionary()
46	for _, file := range strings.Split(files, ",") {
47	log.Printf("载入sego词典 %s", file)
48	dictFile, err := os.Open(file)
49	defer dictFile.Close()
50	if err != nil {
51	log.Fatalf("无法载入字典文件 \"%s\" \n", file)
52	}
53
54	reader := bufio.NewReader(dictFile)
55	var text string
56	var freqText string
57	var frequency int
58	var pos string
59
60	// 逐行读入分词
61	for {
62	size, _ := fmt.Fscanln(reader, &text, &freqText, &pos)
63
64	if size == 0 {
65	// 文件结束
66	break
67	} else if size < 2 {
68	// 无效行
69	continue
70	} else if size == 2 {
71	// 没有词性标注时设为空字符串
72	pos = ""
73	}
74
75	// 解析词频
76	var err error
77	frequency, err = strconv.Atoi(freqText)
78	if err != nil {
79	continue
80	}
81
82	// 过滤频率太小的词
83	if frequency < minTokenFrequency {
84	continue
85	}
86
87	// 将分词添加到字典中
88	words := splitTextToWords([]byte(text))
89	token := Token{text: words, frequency: frequency, pos: pos}
90	seg.dict.addToken(token)
91	}
92	}
93
94	// 计算每个分词的路径值，路径值含义见Token结构体的注释
95	logTotalFrequency := float32(math.Log2(float64(seg.dict.totalFrequency)))
96	for i := range seg.dict.tokens {
97	token := &seg.dict.tokens[i]
98	token.distance = logTotalFrequency - float32(math.Log2(float64(token.frequency)))
99	}
100
101	// 对每个分词进行细致划分，用于搜索引擎模式，该模式用法见Token结构体的注释。

Callers 7

TestSegmentFunction · 0.95

Test_Token_SplitFunction · 0.95

mainFunction · 0.95

TestLargeDictionaryFunction · 0.80

mainFunction · 0.80

Calls 5

segmentWordsMethod · 0.95

NewDictionaryFunction · 0.85

splitTextToWordsFunction · 0.85

addTokenMethod · 0.80

CloseMethod · 0.45

Tested by 3

TestSegmentFunction · 0.76

Test_Token_SplitFunction · 0.76

TestLargeDictionaryFunction · 0.64