MCPcopy
hub / github.com/dgraph-io/dgraph / processTokens

Method processTokens

tok/tok.go:447–464  ·  view source on GitHub ↗

processTokens handles the common preprocessing steps for both QueryTokens and Tokens.

(v interface{})

Source from the content-addressed store, hash-verified

445
446// processTokens handles the common preprocessing steps for both QueryTokens and Tokens
447func (t NGramTokenizer) processTokens(v interface{}) (analysis.TokenStream, error) {
448 str, ok := v.(string)
449 if !ok || str == "" {
450 return nil, nil
451 }
452 lang := LangBase(t.lang)
453
454 // Step 1: Lowercase, normalize, basic tokenization
455 tokens := fulltextAnalyzer.Analyze([]byte(str))
456
457 // Step 2: Remove stopwords
458 tokens = filterStopwords(lang, tokens)
459
460 // Step 3: Apply stemming
461 tokens = filterStemmers(lang, tokens)
462
463 return tokens, nil
464}
465
466// generateNGrams creates n-grams from tokens and returns deduplicated results
467func (t NGramTokenizer) generateNGrams(tokens analysis.TokenStream, ngramFunc func(analysis.TokenStream, func(string))) []string {

Callers 2

QueryTokens (Method) · 0.95
Tokens (Method) · 0.95

Calls 3

LangBase (Function) · 0.85
filterStopwords (Function) · 0.85
filterStemmers (Function) · 0.85

Tested by

no test coverage detected