Split a string at any whitespace characters, optionally removing punctuation and stop-words in the process.
(
line, lowercase=True, filter_stopwords=True, filter_punctuation=True, **kwargs,
)
| 62 | |
| 63 | |
def tokenize_whitespace(
    line, lowercase=True, filter_stopwords=True, filter_punctuation=True, **kwargs,
):
    """
    Split a string at any whitespace characters, optionally removing
    punctuation and stop-words in the process.

    Parameters
    ----------
    line : str
        The text to tokenize.
    lowercase : bool
        If True, lowercase `line` before splitting it. Default is True.
    filter_stopwords : bool
        If True, pass the token list through `remove_stop_words` before
        returning it. Default is True.
    filter_punctuation : bool
        If True, apply `strip_punctuation` to every token. Default is True.
    **kwargs
        Accepted and ignored; kept so callers can pass extra keyword
        arguments without error.

    Returns
    -------
    tokens : list of str
        The whitespace-delimited tokens of `line` after the requested
        filters. When `filter_stopwords` is True this is whatever
        `remove_stop_words` returns for the token list.
    """
    if lowercase:
        line = line.lower()

    words = line.split()

    if filter_punctuation:
        # The stripped tokens must be bound back to `words`: previously they
        # were assigned to `line`, which is never read again, so punctuation
        # filtering silently had no effect on the returned tokens.
        stripped = (strip_punctuation(w) for w in words)
        # A token made up solely of punctuation may strip down to an empty
        # string; drop those rather than emitting empty tokens.
        words = [w for w in stripped if w]

    return remove_stop_words(words) if filter_stopwords else words
| 75 | |
| 76 | |
| 77 | def tokenize_words( |
nothing calls this directly
no test coverage detected