(self, requests, profiler: Profiler, temperature, top_p, top_k,
stream_output, ignore_eos, response_format=None)
| 206 | self.return_routed_experts = getattr(self.pipe.backend_config, 'enable_return_routed_experts', False) |
| 207 | |
def process_request(self, requests, profiler: Profiler, temperature, top_p, top_k,
                    stream_output, ignore_eos, response_format=None):
    """Run all requests through the pipeline and record per-request progress.

    Args:
        requests: Iterable of ``(prompt, input_len, output_len)`` triples.
        profiler: Profiler that creates one session per request and brackets
            the run with ``start()`` / ``finish()``.
        temperature: Forwarded to every ``GenerationConfig``.
        top_p: Forwarded to every ``GenerationConfig``.
        top_k: Forwarded to every ``GenerationConfig``.
        stream_output: When truthy, consume ``self.pipe.stream_infer``;
            otherwise call ``self.pipe`` directly.
        ignore_eos: Forwarded to every ``GenerationConfig``.
        response_format: Forwarded to every ``GenerationConfig``.

    Returns:
        A list with one entry per request, in request order: ``ns[-1]`` of the
        request's session when its status is ``Session.SUCCESS``, otherwise 0.
    """
    prompts = [prompt for prompt, _, _ in requests]
    # One config per request: only ``max_new_tokens`` differs between them.
    # ``do_sample`` is hard-coded to False regardless of the sampling arguments.
    gen_configs = [
        GenerationConfig(temperature=temperature,
                         top_p=top_p,
                         top_k=top_k,
                         ignore_eos=ignore_eos,
                         do_sample=False,
                         return_routed_experts=self.return_routed_experts,
                         response_format=response_format,
                         max_new_tokens=output_len) for _, _, output_len in requests
    ]

    sess: list[Session] = [
        profiler.new_session(input_len, output_len) for _, input_len, output_len in requests
    ]

    def _to_status(finish_reason):
        """Map a finish reason to a session status; only 'length'/'stop' succeed."""
        if finish_reason in ('length', 'stop'):
            return Session.SUCCESS
        return Session.FAIL

    profiler.start()
    # Record the zero-token starting point of every session right after start.
    for s in sess:
        s.tick(0)

    if stream_output:
        # The context manager closes the progress bar even if streaming raises;
        # previously an exception skipped ``pbar.close()``.
        with tqdm(total=len(requests)) as pbar:
            for output in self.pipe.stream_infer(prompts, gen_config=gen_configs, do_preprocess=False):
                idx = output.index
                n_token = output.generate_token_len
                finish_reason = output.finish_reason
                sess[idx].tick(n_token)
                # Intermediate chunks carry no finish reason; only the chunk
                # that has one completes the request.
                if finish_reason is not None:
                    sess[idx].finish(_to_status(finish_reason))
                    pbar.update(1)
    else:
        # Every output yielded here is treated as the final one for its request.
        for output in self.pipe(prompts, gen_configs, do_preprocess=False, use_tqdm=True):
            idx = output.index
            n_token = output.generate_token_len
            finish_reason = output.finish_reason
            sess[idx].tick(n_token)
            sess[idx].finish(_to_status(finish_reason))

    profiler.finish()

    # Collect per-request output lengths for detailed comparison; requests that
    # did not end successfully are reported as 0 and logged.
    actual_output_lens = []
    for i, s in enumerate(sess):
        actual_output_lens.append(s.ns[-1] if s.status == Session.SUCCESS else 0)
        if s.status != Session.SUCCESS:
            logger.warning(f'Request {i}: {s.ns[-1]}/{s.req_output_len} tokens, finish != length/stop')
    return actual_output_lens
| 264 | |
| 265 |
no test coverage detected