| 138 | ) |
| 139 | |
| 140 | def process(self): |
| 141 | from nltk.corpus.reader import BracketParseCorpusReader |
| 142 | |
| 143 | # load vocab file |
| 144 | self._vocab = OrderedDict() |
| 145 | vocab_file = ( |
| 146 | self._vocab_file |
| 147 | if self._vocab_file is not None |
| 148 | else os.path.join(self.raw_path, "vocab.txt") |
| 149 | ) |
| 150 | with open(vocab_file, encoding="utf-8") as vf: |
| 151 | for line in vf.readlines(): |
| 152 | line = line.strip() |
| 153 | self._vocab[line] = len(self._vocab) |
| 154 | |
| 155 | # filter glove |
| 156 | if self._glove_embed_file is not None and os.path.exists( |
| 157 | self._glove_embed_file |
| 158 | ): |
| 159 | glove_emb = {} |
| 160 | with open(self._glove_embed_file, "r", encoding="utf-8") as pf: |
| 161 | for line in pf.readlines(): |
| 162 | sp = line.split(" ") |
| 163 | if sp[0].lower() in self._vocab: |
| 164 | glove_emb[sp[0].lower()] = np.asarray( |
| 165 | [float(x) for x in sp[1:]] |
| 166 | ) |
| 167 | files = ["{}.txt".format(self.mode)] |
| 168 | corpus = BracketParseCorpusReader(self.raw_path, files) |
| 169 | sents = corpus.parsed_sents(files[0]) |
| 170 | |
| 171 | # initialize with glove |
| 172 | pretrained_emb = [] |
| 173 | fail_cnt = 0 |
| 174 | for line in self._vocab.keys(): |
| 175 | if self._glove_embed_file is not None and os.path.exists( |
| 176 | self._glove_embed_file |
| 177 | ): |
| 178 | if not line.lower() in glove_emb: |
| 179 | fail_cnt += 1 |
| 180 | pretrained_emb.append( |
| 181 | glove_emb.get( |
| 182 | line.lower(), np.random.uniform(-0.05, 0.05, 300) |
| 183 | ) |
| 184 | ) |
| 185 | |
| 186 | self._pretrained_emb = None |
| 187 | if self._glove_embed_file is not None and os.path.exists( |
| 188 | self._glove_embed_file |
| 189 | ): |
| 190 | self._pretrained_emb = F.tensor(np.stack(pretrained_emb, 0)) |
| 191 | print( |
| 192 | "Miss word in GloVe {0:.4f}".format( |
| 193 | 1.0 * fail_cnt / len(self._pretrained_emb) |
| 194 | ) |
| 195 | ) |
| 196 | # build trees |
| 197 | self._trees = [] |