Transform a sequence of documents to a document-term matrix. Parameters ---------- X : iterable over raw text documents, length = n_samples Samples. Each sample must be a text document (either bytes or unicode strings, file name or file object dependi
(self, X)
| 863 | return self |
| 864 | |
def transform(self, X):
    """Hash raw text documents into a sparse document-term matrix.

    Parameters
    ----------
    X : iterable over raw text documents, length = n_samples
        Samples. Each sample must be a text document (either bytes or
        unicode strings, file name or file object depending on the
        constructor argument) which will be tokenized and hashed.

    Returns
    -------
    X : sparse matrix of shape (n_samples, n_features)
        Document-term matrix.

    Raises
    ------
    ValueError
        If ``X`` is a single ``str`` rather than an iterable of documents.
    """
    # A lone string is itself iterable (character by character), so it
    # would otherwise be silently treated as many one-character documents.
    if isinstance(X, str):
        raise ValueError(
            "Iterable over raw text documents expected, string object received."
        )

    self._validate_ngram_range()

    analyze = self.build_analyzer()
    hash_documents = self._get_hasher().transform
    # Documents are analyzed lazily, one at a time, as the hasher consumes them.
    matrix = hash_documents(analyze(doc) for doc in X)

    if self.binary:
        # Binary mode: overwrite every stored entry with 1.
        matrix.data.fill(1)

    normalized = (
        matrix
        if self.norm is None
        else normalize(matrix, norm=self.norm, copy=False)
    )
    return _align_api_if_sparse(normalized)
| 894 | |
| 895 | def fit_transform(self, X, y=None): |
| 896 | """Transform a sequence of documents to a document-term matrix. |