MCPcopy
hub / github.com/bigcode-project/starcoder / __iter__

Method __iter__

finetune/finetune.py:163–191  ·  view source on GitHub ↗
(self)

Source from the content-addressed store, hash-verified

161 self.output_column_name = output_column_name
162
def __iter__(self):
    """Yield fixed-length token blocks packed from the dataset's examples.

    Examples are pulled from ``self.dataset``, converted with
    ``prepare_sample_text`` and collected in a buffer until the summed
    ``len()`` of the prepared samples reaches ``self.max_buffer_size``.
    The buffer is tokenized in one call, every tokenized sample is followed
    by ``self.concat_token_id``, and the resulting flat id list is cut into
    consecutive chunks of ``self.seq_length`` ids.  A trailing chunk that is
    shorter than ``self.seq_length`` is dropped; it is not carried over to
    the next buffer.

    When ``self.infinite`` is true the dataset iterator is recreated each
    time it is exhausted; otherwise iteration stops after one pass.
    ``self.current_size`` is incremented once per yielded item.

    Yields:
        dict: ``{"input_ids": LongTensor, "labels": LongTensor}``, both built
        from the same ``self.seq_length`` token ids.
    """
    iterator = iter(self.dataset)
    more_examples = True
    # True once the current pass over the dataset has produced an example.
    # Used to detect a dataset that yields nothing, which would otherwise
    # make the infinite mode restart the iterator forever.
    read_in_pass = False
    while more_examples:
        buffer, buffer_len = [], 0
        while buffer_len < self.max_buffer_size:
            # Keep the try body minimal so that only the dataset iterator's
            # own exhaustion is treated as end-of-data.
            try:
                example = next(iterator)
            except StopIteration:
                if self.infinite and read_in_pass:
                    # Start another pass over the dataset.
                    iterator = iter(self.dataset)
                    read_in_pass = False
                    continue
                more_examples = False
                break
            read_in_pass = True
            buffer.append(prepare_sample_text(example, self.input_column_name, self.output_column_name))
            buffer_len += len(buffer[-1])

        if not buffer:
            # Nothing was collected in this round (the dataset ended exactly
            # on a buffer boundary, yielded nothing, or max_buffer_size is
            # not positive), so there is nothing to tokenize or yield.
            break

        tokenized_inputs = self.tokenizer(buffer, truncation=False)["input_ids"]
        all_token_ids = []
        for tokenized_input in tokenized_inputs:
            # Separate consecutive samples with the concatenation token.
            all_token_ids.extend(tokenized_input + [self.concat_token_id])
        for i in range(0, len(all_token_ids), self.seq_length):
            input_ids = all_token_ids[i : i + self.seq_length]
            # Only full-length chunks are emitted; the short tail is dropped.
            if len(input_ids) == self.seq_length:
                self.current_size += 1
                yield {
                    "input_ids": torch.LongTensor(input_ids),
                    "labels": torch.LongTensor(input_ids),
                }
192
193
194def create_datasets(tokenizer, args):

Callers

nothing calls this directly

Calls 1

prepare_sample_text · Function · 0.85

Tested by

no test coverage detected