Split a string into individual words, optionally removing punctuation and stop-words in the process. Translate each word into a space-separated string of its encoded byte values.
(
line,
lowercase=True,
filter_stopwords=True,
filter_punctuation=True,
encoding="utf-8",
**kwargs,
)
| 87 | |
| 88 | |
def tokenize_words_bytes(
    line,
    lowercase=True,
    filter_stopwords=True,
    filter_punctuation=True,
    encoding="utf-8",
    **kwargs,
):
    """
    Tokenize `line` into words and render each word as its encoded bytes.

    The splitting itself is delegated to `tokenize_words`, which receives
    `line`, the `lowercase`, `filter_stopwords` and `filter_punctuation`
    flags, and any extra keyword arguments unchanged.

    Each word returned by `tokenize_words` is encoded with `encoding`, and
    the resulting byte values are written out as decimal integers separated
    by single spaces. For example, under UTF-8 the word "hi" becomes the
    string "104 105".

    Returns a list of such strings, one per word.
    """
    tokens = tokenize_words(
        line,
        lowercase=lowercase,
        filter_stopwords=filter_stopwords,
        filter_punctuation=filter_punctuation,
        **kwargs,
    )

    byte_strings = []
    for token in tokens:
        # Iterating over a bytes object yields its integer byte values.
        raw = token.encode(encoding)
        byte_strings.append(" ".join(map(str, raw)))
    return byte_strings
| 110 | |
| 111 | |
| 112 | def tokenize_bytes_raw(line, encoding="utf-8", splitter=None, **kwargs): |
nothing calls this directly
no test coverage detected