| 1006 | |
| 1007 | |
| 1008 | class Vocabulary: |
| 1009 | def __init__( |
| 1010 | self, |
| 1011 | lowercase=True, |
| 1012 | min_count=None, |
| 1013 | max_tokens=None, |
| 1014 | filter_stopwords=True, |
| 1015 | filter_punctuation=True, |
| 1016 | tokenizer="words", |
| 1017 | ): |
| 1018 | """ |
| 1019 | An object for compiling and encoding the unique tokens in a text corpus. |
| 1020 | |
| 1021 | Parameters |
| 1022 | ---------- |
| 1023 | lowercase : bool |
| 1024 | Whether to convert each string to lowercase before tokenization. |
| 1025 | Default is True. |
| 1026 | min_count : int |
| 1027 | Minimum number of times a token must occur in order to be included |
| 1028 | in vocab. If `None`, include all tokens from `corpus_fp` in vocab. |
| 1029 | Default is None. |
| 1030 | max_tokens : int |
| 1031 | Only add the `max_tokens` most frequent tokens that occur more |
| 1032 | than `min_count` to the vocabulary. If None, add all tokens |
| 1033 | that occur more than than `min_count`. Default is None. |
| 1034 | filter_stopwords : bool |
| 1035 | Whether to remove stopwords before encoding the words in the |
| 1036 | corpus. Default is True. |
| 1037 | filter_punctuation : bool |
| 1038 | Whether to remove punctuation before encoding the words in the |
| 1039 | corpus. Default is True. |
| 1040 | tokenizer : {'whitespace', 'words', 'characters', 'bytes'} |
| 1041 | Strategy to follow when mapping strings to tokens. The |
| 1042 | `'whitespace'` tokenizer splits strings at whitespace characters. |
| 1043 | The `'words'` tokenizer splits strings using a "word" regex. The |
| 1044 | `'characters'` tokenizer splits strings into individual characters. |
| 1045 | The `'bytes'` tokenizer splits strings into a collection of |
| 1046 | individual bytes. |
| 1047 | """ |
| 1048 | self.hyperparameters = { |
| 1049 | "id": "Vocabulary", |
| 1050 | "encoding": None, |
| 1051 | "corpus_fps": None, |
| 1052 | "lowercase": lowercase, |
| 1053 | "min_count": min_count, |
| 1054 | "max_tokens": max_tokens, |
| 1055 | "filter_stopwords": filter_stopwords, |
| 1056 | "filter_punctuation": filter_punctuation, |
| 1057 | "tokenizer": tokenizer, |
| 1058 | } |
| 1059 | |
| 1060 | def __len__(self): |
| 1061 | """Return the number of tokens in the vocabulary""" |
| 1062 | return len(self._tokens) |
| 1063 | |
| 1064 | def __iter__(self): |
| 1065 | """Return an iterator over the tokens in the vocabulary""" |