processTokens handles the common preprocessing steps for both QueryTokens and Tokens
(v interface{})
| 445 | |
| 446 | // processTokens handles the common preprocessing steps for both QueryTokens and Tokens |
| 447 | func (t NGramTokenizer) processTokens(v interface{}) (analysis.TokenStream, error) { |
| 448 | str, ok := v.(string) |
| 449 | if !ok || str == "" { |
| 450 | return nil, nil |
| 451 | } |
| 452 | lang := LangBase(t.lang) |
| 453 | |
| 454 | // Step 1: Lowercase, normalize, basic tokenization |
| 455 | tokens := fulltextAnalyzer.Analyze([]byte(str)) |
| 456 | |
| 457 | // Step 2: Remove stopwords |
| 458 | tokens = filterStopwords(lang, tokens) |
| 459 | |
| 460 | // Step 3: Apply stemming |
| 461 | tokens = filterStemmers(lang, tokens) |
| 462 | |
| 463 | return tokens, nil |
| 464 | } |
| 465 | |
| 466 | // generateNGrams creates n-grams from tokens and returns deduplicated results |
| 467 | func (t NGramTokenizer) generateNGrams(tokens analysis.TokenStream, ngramFunc func(analysis.TokenStream, func(string))) []string { |
no test coverage detected