Split a string into individual words, optionally removing punctuation and stop-words in the process.
tokenize_words(line, lowercase=True, filter_stopwords=True, filter_punctuation=True, **kwargs)
| 75 | |
| 76 | |
def tokenize_words(
    line,
    lowercase=True,
    filter_stopwords=True,
    filter_punctuation=True,
    **kwargs,
):
    """
    Break `line` into word tokens.

    Tokens are matched with `_WORD_REGEX` when `filter_punctuation` is true
    and with `_WORD_REGEX_W_PUNC` otherwise. The text is lowercased before
    matching when `lowercase` is true. When `filter_stopwords` is true the
    matches are passed through `remove_stop_words` before being returned.
    Extra keyword arguments are accepted and ignored.
    """
    # Pick the pattern first, then prepare the text, mirroring the order in
    # which the two inputs to `findall` are evaluated.
    if filter_punctuation:
        pattern = _WORD_REGEX
    else:
        pattern = _WORD_REGEX_W_PUNC
    text = line.lower() if lowercase else line
    tokens = pattern.findall(text)
    if not filter_stopwords:
        return tokens
    return remove_stop_words(tokens)
| 87 | |
| 88 | |
| 89 | def tokenize_words_bytes( |