(self, req_queue: Queue, session_id: int, temperature: float, top_p: float, top_k: int,
stream_output: bool, skip_tokenize: bool, skip_detokenize: bool, concurrency: int)
| 155 | self.pbar = None |
| 156 | |
async def _inference(self, req_queue: Queue, session_id: int, temperature: float, top_p: float, top_k: int,
                     stream_output: bool, skip_tokenize: bool, skip_detokenize: bool, concurrency: int):
    """Consume requests from ``req_queue`` and run each one through a model instance.

    Requests are dequeued with ``get_nowait`` until a ``None`` item is
    returned, which ends the loop. Each request is a 5-item sequence
    ``(prompt, _, output_seqlen, cancel_after, sess)``; the second item is
    ignored here.

    Args:
        req_queue: Source of requests; must eventually return ``None``.
            (presumably the caller enqueues one ``None`` per worker -
            verify against ``process_request``.)
        session_id: Session id used for the first request; it is advanced
            by ``concurrency`` after every request.
        temperature: Forwarded to ``GenerationConfig``.
        top_p: Forwarded to ``GenerationConfig``.
        top_k: Forwarded to ``GenerationConfig``.
        stream_output: Forwarded to ``async_stream_infer``.
        skip_tokenize: When true, ``prompt`` is used directly as the input
            ids instead of being passed through ``self.tokenizer``.
        skip_detokenize: When true, incremental detokenization of the
            generated ids is skipped.
        concurrency: Stride added to ``session_id`` after each request.
    """
    engine = self.tm_model.create_instance()
    pending = iter(req_queue.get_nowait, None)

    for request in pending:
        prompt, _, max_new_tokens, cancel_after, sess = request

        # Report zero tokens to the session before any work is done.
        sess.tick(0)

        input_ids = prompt if skip_tokenize else self.tokenizer(prompt).input_ids

        state = DetokenizeState(len(input_ids))
        all_ids = input_ids.copy()
        generated = 0

        # EOS is ignored so generation is bounded by max_new_tokens
        # (or by the cancel_after check below).
        gen_config = GenerationConfig(max_new_tokens=max_new_tokens,
                                      temperature=temperature,
                                      top_p=top_p,
                                      top_k=top_k,
                                      ignore_eos=True)
        stream = engine.async_stream_infer(session_id,
                                           input_ids=input_ids,
                                           gen_config=gen_config,
                                           sequence_start=True,
                                           sequence_end=True,
                                           stream_output=stream_output)
        try:
            async for chunk in stream:
                new_ids = chunk.token_ids
                generated += len(new_ids)
                all_ids += new_ids
                if not skip_detokenize:
                    _, state = self.tokenizer.detokenize_incrementally(all_ids, state)
                sess.tick(generated)
                # Stop consuming once more than cancel_after tokens have arrived.
                if generated > cancel_after:
                    break
            # Reached both on normal exhaustion and after the early break.
            sess.finish(Session.SUCCESS)
        finally:
            # Always close the async generator, including on break or error.
            await stream.aclose()

        # for pytorch engine to restart a session
        if self.backend == 'pytorch':
            await engine.async_end(session_id)

        self.pbar.update(1)
        session_id += concurrency
| 204 | |
| 205 | def process_request(self, requests, profiler: Profiler, concurrency, temperature, top_p, top_k, stream_output, |
| 206 | skip_tokenize, skip_detokenize, cancel_rate): |
no test coverage detected