(
self, tools: list[dict], history: list[Conversation], **parameters
)
| 28 | self.engine = LLMEngine.from_engine_args(self.engine_args) |
| 29 | |
def generate_stream(
    self, tools: list[dict], history: list[Conversation], **parameters
) -> Generator[tuple[str | dict, list[dict]]]:
    """Stream the model's reply to a conversation, one engine step at a time.

    ``history`` and ``tools`` are converted by ``process_input``, rendered to
    a prompt string with the tokenizer's chat template, and submitted to
    ``self.engine`` as a single request. After each engine step, the text of
    that request's first output is passed through ``process_response``
    together with the processed chat history, and the result is yielded.

    Args:
        tools: Tool definitions forwarded to ``process_input``.
        history: Conversation turns forwarded to ``process_input``.
        **parameters: Sampling options passed to ``SamplingParams``. They
            override the defaults set below. ``max_new_tokens``, when given,
            is renamed to ``max_tokens``; when omitted, ``SamplingParams``
            falls back to its own default instead of raising ``KeyError``.

    Yields:
        Whatever ``process_response`` returns for each intermediate output of
        this request.
    """
    import uuid  # local import: only needed here to mint a request id

    chat_history = process_input(history, tools)
    model_inputs = self.tokenizer.apply_chat_template(
        chat_history, add_generation_prompt=True, tokenize=False
    )

    # Callers use the "max_new_tokens" name; SamplingParams expects
    # "max_tokens". Only rename when the caller actually supplied it.
    if "max_new_tokens" in parameters:
        parameters["max_tokens"] = parameters.pop("max_new_tokens")

    params_dict = {
        "n": 1,
        "best_of": 1,
        "top_p": 1,
        "top_k": -1,
        "length_penalty": 1,
        # NOTE(review): presumably the model's end-of-turn special token
        # ids -- confirm against the tokenizer's vocabulary.
        "stop_token_ids": [151329, 151336, 151338],
    }
    params_dict.update(parameters)
    sampling_params = SamplingParams(**params_dict)

    # A random id cannot collide the way a wall-clock timestamp can when two
    # requests are created in quick succession.
    request_id = uuid.uuid4().hex
    self.engine.add_request(
        request_id=request_id, inputs=model_inputs, params=sampling_params
    )
    try:
        while self.engine.has_unfinished_requests():
            for request_output in self.engine.step():
                # Skip outputs that belong to any other request still queued
                # in the engine so they are not reported as this reply.
                if request_output.request_id != request_id:
                    continue
                yield process_response(request_output.outputs[0].text, chat_history)
    finally:
        # If the consumer stops iterating early (or a step fails), remove the
        # request so it does not linger in the engine and leak into the next
        # call. Expected to be a no-op once the request has finished.
        self.engine.abort_request(request_id)
Nothing calls this directly.
No test coverage detected.