r"""Basic preprocessing of text. Steps: * Replace \n and \t with whitespace * Only keep alpha-numerical characters
(self, documents: np.ndarray)
| 4802 | return probabilities |
| 4803 | |
| 4804 | def _preprocess_text(self, documents: np.ndarray) -> List[str]: |
| 4805 | r"""Basic preprocessing of text. |
| 4806 | |
| 4807 | Steps: |
| 4808 | * Replace \n and \t with whitespace |
| 4809 | * Only keep alpha-numerical characters |
| 4810 | """ |
| 4811 | cleaned_documents = [doc.replace("\n", " ") for doc in documents] |
| 4812 | cleaned_documents = [doc.replace("\t", " ") for doc in cleaned_documents] |
| 4813 | if self.language == "english": |
| 4814 | cleaned_documents = [re.sub(r"[^A-Za-z0-9 ]+", "", doc) for doc in cleaned_documents] |
| 4815 | cleaned_documents = [doc if doc != "" else "emptydoc" for doc in cleaned_documents] |
| 4816 | return cleaned_documents |
| 4817 | |
| 4818 | @staticmethod |
| 4819 | def _top_n_idx_sparse(matrix: csr_matrix, n: int) -> np.ndarray: |
no outgoing calls