MCPcopy Index your code
hub / github.com/THUDM/GLM / create_examples

Method create_examples

tasks/seq2seq/dataset.py:113–146  ·  view source on GitHub ↗
(self, split)

Source from the content-addressed store, hash-verified

111 self.tokenizer = tokenizer
112
def create_examples(self, split):
    """Build the list of ``InputExample`` objects for one dataset split.

    Reads ``train.json``, ``dev.json`` or ``test.json`` from ``self.data_dir``
    and walks ``data -> paragraphs -> qas``. One example is emitted per
    distinct answer text of each question. For the ``"test"`` split the
    answers in the file are ignored and a single ``"FAKE_ANSWER"``
    placeholder is used instead.

    Args:
        split: One of ``"train"``, ``"dev"`` or ``"test"``.

    Returns:
        A list of ``InputExample``. Each has ``guid`` set to
        ``"<split>-<running index>"``, ``text_a`` set to the paragraph
        context, and ``meta`` holding ``"answer"``, ``"question"`` and
        ``"ref"`` (the answer after an encode/decode round trip through
        ``self.tokenizer``).

    Raises:
        NotImplementedError: If ``split`` is not a supported split name.
    """
    if split == "train":
        filename = "train.json"
    elif split == "dev":
        filename = "dev.json"
    elif split == "test":
        filename = "test.json"
    else:
        raise NotImplementedError(split)
    print_rank_0(f"Creating CMRC-{split} dataset from {self.data_dir}")

    # Only the parsed JSON is needed afterwards, so the file handle is
    # released before the examples are built.
    with open(os.path.join(self.data_dir, filename), encoding='utf-8') as file:
        dataset = json.load(file)

    example_list = []
    idx = 0
    for article in dataset['data']:
        for paragraph in article['paragraphs']:
            context = paragraph['context']
            for qa in paragraph['qas']:
                question = qa["question"]
                if split != 'test':
                    # De-duplicate while keeping first-occurrence order.
                    # Iterating a set of str depends on per-process hash
                    # randomization, which would make idx/guid and the
                    # example order differ between runs and processes.
                    answers = list(dict.fromkeys(
                        answer['text'] for answer in qa["answers"]))
                else:
                    answers = ["FAKE_ANSWER"]
                for answer in answers:
                    guid = f"{split}-{idx}"
                    meta = {
                        "answer": answer,
                        "question": question,
                        "ref": self.tokenizer.DecodeIds(
                            self.tokenizer.EncodeAsIds(answer).tokenization),
                    }
                    example = InputExample(guid=guid, text_a=context, meta=meta)
                    # Log the first few examples as a sanity check.
                    if idx < 10:
                        print_rank_0(
                            (context.encode('utf-8'), answer.encode('utf-8'),
                             meta["ref"].encode('utf-8')))
                    example_list.append(example)
                    idx += 1
    print_rank_0(f"Creating {len(example_list)} examples for {split}")
    return example_list
147
148
149class SQuADGenerationProcessor:

Callers 1

__init__ · Method · 0.45

Calls 7

print_rank_0 · Function · 0.90
InputExample · Class · 0.90
load · Method · 0.80
append · Method · 0.80
DecodeIds · Method · 0.45
EncodeAsIds · Method · 0.45
encode · Method · 0.45

Tested by

no test coverage detected