| 368 | return backend_pb2.EmbeddingResult(embeddings=embeds) |
| 369 | |
| 370 | async def _predict(self, request, context, streaming=False): |
| 371 | set_seed(request.Seed) |
| 372 | if request.TopP < 0 or request.TopP > 1: |
| 373 | request.TopP = 1 |
| 374 | |
| 375 | if request.TopK <= 0: |
| 376 | request.TopK = 50 |
| 377 | |
| 378 | if request.Temperature > 0 : |
| 379 | sample=True |
| 380 | else: |
| 381 | sample=False |
| 382 | request.TopP == None |
| 383 | request.TopK == None |
| 384 | request.Temperature == None |
| 385 | |
| 386 | prompt = request.Prompt |
| 387 | if not request.Prompt and request.UseTokenizerTemplate and request.Messages: |
| 388 | prompt = self.tokenizer.apply_chat_template(request.Messages, tokenize=False, add_generation_prompt=True) |
| 389 | |
| 390 | inputs = self.tokenizer(prompt, return_tensors="pt") |
| 391 | |
| 392 | if request.Tokens > 0: |
| 393 | max_tokens = request.Tokens |
| 394 | else: |
| 395 | max_tokens = self.max_tokens - inputs["input_ids"].size()[inputs["input_ids"].dim()-1] |
| 396 | |
| 397 | if self.CUDA: |
| 398 | inputs = inputs.to("cuda") |
| 399 | if XPU and self.OV == False: |
| 400 | inputs = inputs.to("xpu") |
| 401 | streaming = False |
| 402 | |
| 403 | criteria=[] |
| 404 | if request.StopPrompts: |
| 405 | criteria = StoppingCriteriaList( |
| 406 | [ |
| 407 | StopStringCriteria(tokenizer=self.tokenizer, stop_strings=request.StopPrompts), |
| 408 | ] |
| 409 | ) |
| 410 | |
| 411 | if streaming: |
| 412 | streamer=TextIteratorStreamer(self.tokenizer, |
| 413 | skip_prompt=True, |
| 414 | skip_special_tokens=True) |
| 415 | config=dict(inputs, |
| 416 | max_new_tokens=max_tokens, |
| 417 | temperature=request.Temperature, |
| 418 | top_p=request.TopP, |
| 419 | top_k=request.TopK, |
| 420 | do_sample=sample, |
| 421 | attention_mask=inputs["attention_mask"], |
| 422 | eos_token_id=self.tokenizer.eos_token_id, |
| 423 | pad_token_id=self.tokenizer.eos_token_id, |
| 424 | streamer=streamer, |
| 425 | stopping_criteria=criteria, |
| 426 | use_cache=True, |
| 427 | ) |