MCPcopy
hub / github.com/dmlc/dgl / process

Method process

python/dgl/data/tree.py:140–199  ·  view source on GitHub ↗
(self)

Source from the content-addressed store, hash-verified

138 )
139
140 def process(self):
141 from nltk.corpus.reader import BracketParseCorpusReader
142
143 # load vocab file
144 self._vocab = OrderedDict()
145 vocab_file = (
146 self._vocab_file
147 if self._vocab_file is not None
148 else os.path.join(self.raw_path, "vocab.txt")
149 )
150 with open(vocab_file, encoding="utf-8") as vf:
151 for line in vf.readlines():
152 line = line.strip()
153 self._vocab[line] = len(self._vocab)
154
155 # filter glove
156 if self._glove_embed_file is not None and os.path.exists(
157 self._glove_embed_file
158 ):
159 glove_emb = {}
160 with open(self._glove_embed_file, "r", encoding="utf-8") as pf:
161 for line in pf.readlines():
162 sp = line.split(" ")
163 if sp[0].lower() in self._vocab:
164 glove_emb[sp[0].lower()] = np.asarray(
165 [float(x) for x in sp[1:]]
166 )
167 files = ["{}.txt".format(self.mode)]
168 corpus = BracketParseCorpusReader(self.raw_path, files)
169 sents = corpus.parsed_sents(files[0])
170
171 # initialize with glove
172 pretrained_emb = []
173 fail_cnt = 0
174 for line in self._vocab.keys():
175 if self._glove_embed_file is not None and os.path.exists(
176 self._glove_embed_file
177 ):
178 if not line.lower() in glove_emb:
179 fail_cnt += 1
180 pretrained_emb.append(
181 glove_emb.get(
182 line.lower(), np.random.uniform(-0.05, 0.05, 300)
183 )
184 )
185
186 self._pretrained_emb = None
187 if self._glove_embed_file is not None and os.path.exists(
188 self._glove_embed_file
189 ):
190 self._pretrained_emb = F.tensor(np.stack(pretrained_emb, 0))
191 print(
192 "Miss word in GloVe {0:.4f}".format(
193 1.0 * fail_cnt / len(self._pretrained_emb)
194 )
195 )
196 # build trees
197 self._trees = []

Callers

nothing calls this directly

Calls 7

_build_tree · Method · 0.95
format · Method · 0.80
append · Method · 0.80
uniform · Method · 0.80
join · Method · 0.45
keys · Method · 0.45
get · Method · 0.45

Tested by

no test coverage detected