(self, requests, profiler: Profiler, concurrency, temperature, top_p, top_k, stream_output,
skip_tokenize, skip_detokenize, cancel_rate)
| 203 | session_id += concurrency |
| 204 | |
def process_request(self, requests, profiler: Profiler, concurrency, temperature, top_p, top_k, stream_output,
                    skip_tokenize, skip_detokenize, cancel_rate):
    """Queue every request, then run ``concurrency`` inference workers to completion.

    Args:
        requests: Sized iterable of ``(prompt, input_len, output_len)`` triples.
        profiler: Gets one ``new_session(input_len, output_len)`` call per
            request; ``start()`` / ``finish()`` bracket the gathered run.
        concurrency: Number of ``self._inference`` workers created, and the
            number of ``None`` markers appended to the queue.
        temperature: Forwarded unchanged to ``self._inference``.
        top_p: Forwarded unchanged to ``self._inference``.
        top_k: Forwarded unchanged to ``self._inference``.
        stream_output: Forwarded unchanged to ``self._inference``.
        skip_tokenize: When truthy, each prompt is replaced by
            ``self.tokenizer.encode(prompt)`` before it is queued. Also
            forwarded to ``self._inference``.
        skip_detokenize: Forwarded unchanged to ``self._inference``.
        cancel_rate: When greater than 0, each request independently gets,
            with this probability, a random ``cancel_after`` value drawn from
            ``[0, output_len + 1]`` (inclusive).

    Side effects:
        Sets ``self.pbar`` to a ``tqdm`` bar sized to ``len(requests)`` and
        closes it before returning, including when the run raises.
    """
    req_queue = Queue()

    # Fill the queue up front; every entry is
    # [prompt, input_len, output_len, cancel_after, session].
    for prompt, input_len, output_len in requests:
        # Default is output_len + 1.
        # NOTE(review): presumably this means "never cancel" -- confirm in _inference.
        cancel_after = output_len + 1
        # Short-circuit keeps random.random() from being called when cancel_rate <= 0.
        if cancel_rate > 0 and random.random() < cancel_rate:
            cancel_after = random.randint(0, cancel_after)
        sess = profiler.new_session(input_len, output_len)
        req = [prompt, input_len, output_len, cancel_after, sess]
        if skip_tokenize:
            req[0] = self.tokenizer.encode(prompt)
        req_queue.put(req)

    # One None per worker.
    # NOTE(review): presumably the stop marker each _inference worker consumes -- confirm.
    for _ in range(concurrency):
        req_queue.put(None)

    # One awaitable per worker; they only run once gathered inside asyncio.run().
    tasks = [
        self._inference(req_queue, i, temperature, top_p, top_k, stream_output, skip_tokenize, skip_detokenize,
                        concurrency) for i in range(concurrency)
    ]

    async def _gather_tasks(tasks):
        profiler.start()
        ret = await asyncio.gather(*tasks)
        profiler.finish()
        return ret

    self.pbar = tqdm(total=len(requests))
    try:
        asyncio.run(_gather_tasks(tasks))
    finally:
        # Close the bar even if a worker raised, so it is not left open.
        self.pbar.close()
| 241 | |
| 242 | |
| 243 | def parse_args(): |
no test coverage detected