MCPcopy Index your code
hub / github.com/THUDM/GLM / create_examples

Method create_examples

tasks/seq2seq/dataset.py:195–230  ·  view source on GitHub ↗
(self, split)

Source from the content-addressed store, hash-verified

193 self.tokenizer = tokenizer
194
def create_examples(self, split):
    """Build one ``InputExample`` per distinct answer text of a split file.

    Reads ``<self.data_dir>/<split>.json``. The parsed JSON is iterated as a
    sequence of entries, each holding a ``'paragraphs'`` list; every
    paragraph supplies a ``'context'`` string and a ``'qas'`` list whose
    items have a ``'question'`` and a list of ``'answers'`` with ``'text'``
    and ``'answer_start'`` fields.

    Args:
        split: One of ``"train"``, ``"dev"`` or ``"test"``.

    Returns:
        A list of ``InputExample`` objects. ``text_a`` is the paragraph
        context and ``meta`` carries ``answer_start``, ``answer``,
        ``question`` and ``ref`` (the question after an encode/decode round
        trip through ``self.tokenizer``). Guids are ``"<split>-<n>"`` with
        ``n`` counting examples from 0 in file order.

    Raises:
        NotImplementedError: If ``split`` is not a supported split name.
    """
    if split not in ("train", "dev", "test"):
        raise NotImplementedError(split)
    filename = f"{split}.json"
    print_rank_0(f"Creating SQuAD-{split} dataset from {self.data_dir}")

    # Parse the whole file first so the handle is released before the
    # (potentially long) example-building loop runs.
    with open(os.path.join(self.data_dir, filename), encoding='utf-8') as file:
        dataset = json.load(file)

    example_list = []
    idx = 0
    for paragraphs in dataset:
        for paragraph in paragraphs['paragraphs']:
            context = paragraph['context']
            for qa in paragraph['qas']:
                question = qa["question"]
                # Dict keys de-duplicate answer texts while keeping the
                # order of first appearance; for a repeated text the last
                # listed answer_start wins. Iterating this dict instead of a
                # set keeps example order and guids identical across
                # processes, since set order of strings depends on the
                # per-process hash seed.
                answer_starts = {answer["text"]: answer["answer_start"] for answer in qa["answers"]}
                if not answer_starts:
                    continue
                # The reference text depends only on the question, so the
                # tokenizer round trip is done once per question rather
                # than once per answer.
                ref = self.tokenizer.DecodeIds(self.tokenizer.EncodeAsIds(question).tokenization)
                for answer, answer_start in answer_starts.items():
                    guid = f"{split}-{idx}"
                    meta = {
                        "answer_start": answer_start,
                        "answer": answer,
                        "question": question,
                        "ref": ref,
                    }
                    example = InputExample(guid=guid, text_a=context, meta=meta)
                    # Log the first few examples as a sanity check.
                    if idx < 10:
                        print_rank_0(
                            (context.encode('utf-8'), answer.encode('utf-8'), meta["ref"].encode('utf-8')))
                    example_list.append(example)
                    idx += 1
    print_rank_0(f"Creating {len(example_list)} examples for {split}")
    return example_list
231
232
233def generate_token_to_char_map(tokens, raw_text, tokenizer):

Callers

nothing calls this directly

Calls 7

print_rank_0 — Function · 0.90
InputExample — Class · 0.90
load — Method · 0.80
append — Method · 0.80
DecodeIds — Method · 0.45
EncodeAsIds — Method · 0.45
encode — Method · 0.45

Tested by

no test coverage detected