If a single sample is bigger than the context window, break it up into smaller chunks
(self, sample)
| 2216 | return new_sources |
| 2217 | |
def chunk_large_sample(self, sample):
    """Split a sample whose text exceeds the context window into smaller chunks.

    The text under ``sample["text"]`` is tokenized with ``self.tokenize``, cut
    into consecutive slices of at most ``self.context_window_size`` tokens, and
    each slice is decoded back to text with ``self.tokenizer.decode``.

    Args:
        sample: Dict-like object with at least a ``"text"`` entry. Every other
            entry is copied (shallowly) into each chunk unchanged.

    Returns:
        A list of new dicts, one per chunk, in original text order. Each dict
        holds the same keys as ``sample`` with ``"text"`` replaced by the
        decoded chunk. The list is empty when the text yields no tokens.
        ``sample`` itself is not modified.

    Raises:
        ValueError: If ``self.context_window_size`` is not a positive number.
    """

    max_size = self.context_window_size
    if max_size <= 0:
        # A non-positive window can never hold a token; fail loudly instead of
        # dividing by zero or silently returning no chunks.
        raise ValueError(
            f"context_window_size must be positive, got {max_size!r}"
        )

    # Tokenize once and derive the chunk boundaries from the very sequence that
    # is sliced below, so the boundaries always match the available tokens.
    sample_tokens = self.tokenize(sample["text"])
    token_total = len(sample_tokens)

    chunks = []
    for start in range(0, token_total, max_size):
        chunk_tokens = sample_tokens[start:start + max_size]

        # Each chunk needs its OWN dict: reusing a single dict object would
        # make every list entry end up with the text of the final chunk.
        chunk = dict(sample)
        chunk["text"] = self.tokenizer.decode(chunk_tokens)
        chunks.append(chunk)

    return chunks
| 2246 |
no test coverage detected