Method process

python/dgl/data/tree.py:140–199 · view source on GitHub ↗

(self)

Source from the content-addressed store, hash-verified

138	)
139
140	def process(self):
141	from nltk.corpus.reader import BracketParseCorpusReader
142
143	# load vocab file
144	self._vocab = OrderedDict()
145	vocab_file = (
146	self._vocab_file
147	if self._vocab_file is not None
148	else os.path.join(self.raw_path, "vocab.txt")
149	)
150	with open(vocab_file, encoding="utf-8") as vf:
151	for line in vf.readlines():
152	line = line.strip()
153	self._vocab[line] = len(self._vocab)
154
155	# filter glove
156	if self._glove_embed_file is not None and os.path.exists(
157	self._glove_embed_file
158	):
159	glove_emb = {}
160	with open(self._glove_embed_file, "r", encoding="utf-8") as pf:
161	for line in pf.readlines():
162	sp = line.split(" ")
163	if sp[0].lower() in self._vocab:
164	glove_emb[sp[0].lower()] = np.asarray(
165	[float(x) for x in sp[1:]]
166	)
167	files = ["{}.txt".format(self.mode)]
168	corpus = BracketParseCorpusReader(self.raw_path, files)
169	sents = corpus.parsed_sents(files[0])
170
171	# initialize with glove
172	pretrained_emb = []
173	fail_cnt = 0
174	for line in self._vocab.keys():
175	if self._glove_embed_file is not None and os.path.exists(
176	self._glove_embed_file
177	):
178	if not line.lower() in glove_emb:
179	fail_cnt += 1
180	pretrained_emb.append(
181	glove_emb.get(
182	line.lower(), np.random.uniform(-0.05, 0.05, 300)
183	)
184	)
185
186	self._pretrained_emb = None
187	if self._glove_embed_file is not None and os.path.exists(
188	self._glove_embed_file
189	):
190	self._pretrained_emb = F.tensor(np.stack(pretrained_emb, 0))
191	print(
192	"Miss word in GloVe {0:.4f}".format(
193	1.0 * fail_cnt / len(self._pretrained_emb)
194	)
195	)
196	# build trees
197	self._trees = []

Callers

nothing calls this directly

Calls 7

_build_tree · Method · 0.95

format · Method · 0.80

append · Method · 0.80

uniform · Method · 0.80

join · Method · 0.45

keys · Method · 0.45

get · Method · 0.45

Tested by

no test coverage detected