MCPcopy
hub / github.com/InternLM/lmdeploy / _inference

Method _inference

benchmark/profile_throughput.py:157–203  ·  view source on GitHub ↗
(self, req_queue: Queue, session_id: int, temperature: float, top_p: float, top_k: int,
                         stream_output: bool, skip_tokenize: bool, skip_detokenize: bool, concurrency: int)

Source from the content-addressed store, hash-verified

155 self.pbar = None
156
async def _inference(self, req_queue: Queue, session_id: int, temperature: float, top_p: float, top_k: int,
                     stream_output: bool, skip_tokenize: bool, skip_detokenize: bool, concurrency: int):
    """Drain ``req_queue`` and run every request through one model instance.

    Requests are fetched with ``req_queue.get_nowait`` until a ``None``
    sentinel is dequeued. Each request is unpacked as a 5-tuple
    ``(prompt, _, output_seqlen, cancel_after, sess)``; the second field is
    ignored here. For every request the per-request session object is
    ticked at start and after each streamed chunk, and marked finished with
    ``Session.SUCCESS`` once streaming stops (including when it is stopped
    early by the cancel threshold).

    Args:
        req_queue: Queue of request tuples, terminated by ``None``.
        session_id: Id passed to the engine for the first request; it is
            advanced by ``concurrency`` after each request.
        temperature: Forwarded to ``GenerationConfig``.
        top_p: Forwarded to ``GenerationConfig``.
        top_k: Forwarded to ``GenerationConfig``.
        stream_output: Forwarded to ``async_stream_infer``.
        skip_tokenize: When true, ``prompt`` is used directly as the input
            ids instead of being passed through ``self.tokenizer``.
        skip_detokenize: When true, incremental detokenization of the
            streamed tokens is skipped.
        concurrency: Stride added to ``session_id`` after each request
            (presumably the number of concurrent workers, so ids never
            collide -- confirm against the caller).
    """
    engine = self.tm_model.create_instance()
    pending = iter(req_queue.get_nowait, None)

    for prompt, _, max_new_tokens, cancel_threshold, session in pending:
        session.tick(0)

        input_ids = prompt if skip_tokenize else self.tokenizer(prompt).input_ids
        detok_state = DetokenizeState(len(input_ids))

        generated = 0
        # Running list of prompt + generated ids fed to the incremental detokenizer.
        all_ids = input_ids.copy()

        sampling = GenerationConfig(
            max_new_tokens=max_new_tokens,
            temperature=temperature,
            top_p=top_p,
            top_k=top_k,
            ignore_eos=True,
        )
        stream = engine.async_stream_infer(
            session_id,
            input_ids=input_ids,
            gen_config=sampling,
            sequence_start=True,
            sequence_end=True,
            stream_output=stream_output,
        )
        try:
            async for chunk in stream:
                generated += len(chunk.token_ids)
                all_ids += chunk.token_ids
                if not skip_detokenize:
                    _, detok_state = self.tokenizer.detokenize_incrementally(all_ids, detok_state)
                session.tick(generated)
                # Stop consuming once the request's cancel threshold is exceeded.
                if generated > cancel_threshold:
                    break
            session.finish(Session.SUCCESS)
        finally:
            # Always close the async generator, even on early exit or error.
            await stream.aclose()

        # for pytorch engine to restart a session
        if self.backend == 'pytorch':
            await engine.async_end(session_id)

        self.pbar.update(1)
        session_id += concurrency
204
205 def process_request(self, requests, profiler: Profiler, concurrency, temperature, top_p, top_k, stream_output,
206 skip_tokenize, skip_detokenize, cancel_rate):

Callers 1

process_requestMethod · 0.95

Calls 10

DetokenizeStateClass · 0.90
GenerationConfigClass · 0.90
create_instanceMethod · 0.45
tickMethod · 0.45
copyMethod · 0.45
async_stream_inferMethod · 0.45
finishMethod · 0.45
async_endMethod · 0.45
updateMethod · 0.45

Tested by

no test coverage detected