| 620 | |
| 621 | |
| 622 | class StaticInferencePredictor(InferencePredictorMixin, BasePredictor): |
def __init__(
    self,
    config: PredictorArgument,
    cache_kvs_shape: list[list[int]],
    tokenizer: PretrainedTokenizer = None,
):
    """Initialize the predictor and build its Paddle Inference backend.

    Args:
        config: Predictor arguments. Forwarded unchanged to both base-class
            initializers and to ``_create_predictor``.
        cache_kvs_shape: Stored as ``self.cache_kvs_shape``. Annotated as a
            list of integer lists; presumably one shape per KV-cache
            tensor -- TODO confirm against the caller.
        tokenizer: Forwarded to both base-class initializers. Defaults to
            ``None``.
    """
    # Assigned before the base initializers run.
    # NOTE(review): presumably InferencePredictorMixin.__init__ reads this
    # attribute, which would make the ordering load-bearing -- confirm
    # before moving this line.
    self.cache_kvs_shape = cache_kvs_shape
    # Both bases are initialized explicitly (not via super()), each with the
    # same (config, tokenizer) pair.
    BasePredictor.__init__(self, config, tokenizer)
    InferencePredictorMixin.__init__(self, config, tokenizer)

    # Built last: _create_predictor reads self.tensor_parallel_degree and
    # self.tensor_parallel_rank, which are not set in this method and are
    # presumably provided by the base initializers above -- verify.
    self.predictor = self._create_predictor(config)
| 634 | |
def _create_predictor(self, predictor_args: PredictorArgument):
    """Build a Paddle Inference predictor for the exported static model.

    The custom ``paddlenlp_ops`` modules are imported first, then an
    inference config is assembled for the ``.pdmodel`` / ``.pdiparams``
    pair resolved from ``predictor_args`` and handed to
    ``paddle.inference.create_predictor``.

    Args:
        predictor_args: Supplies ``model_name_or_path``, ``model_prefix``
            and ``dtype``.

    Returns:
        The object returned by ``paddle.inference.create_predictor``.

    Raises:
        ValueError: If ``is_paddlenlp_ops_available()`` reports that the
            custom ops are not installed.
    """
    if not is_paddlenlp_ops_available():
        raise ValueError(
            "you should install the paddlenlp ops to run inference predictor, "
            "https://github.com/PaddlePaddle/PaddleNLP/blob/develop/csrc/README.md"
        )

    # Register the custom ops; importing each module is done for its side
    # effect only, in the same order as before.
    custom_op_modules = (
        "paddlenlp_ops.encode_rotary_qk",
        "paddlenlp_ops.get_padding_offset",
        "paddlenlp_ops.qkv_transpose_split",
        "paddlenlp_ops.rebuild_padding",
        "paddlenlp_ops.transpose_remove_padding",
        "paddlenlp_ops.write_cache_kv",
    )
    for module_name in custom_op_modules:
        import_module(module_name)

    model_path_prefix = get_infer_model_path(predictor_args.model_name_or_path, predictor_args.model_prefix)
    model_file = model_path_prefix + ".pdmodel"
    params_file = model_path_prefix + ".pdiparams"
    inference_config = paddle.inference.Config(model_file, params_file)

    inference_config.switch_ir_optim(True)
    if predictor_args.dtype == "bfloat16":
        # Drop `gpu_cpu_map_matmul_v2_to_matmul_pass` so matmul_v2 ops are
        # not mapped to the matmul op.
        inference_config.delete_pass("gpu_cpu_map_matmul_v2_to_matmul_pass")

    # GPU index comes from FLAGS_selected_gpus, falling back to 0 when the
    # variable is unset.
    gpu_id = int(os.environ.get("FLAGS_selected_gpus", 0))
    inference_config.enable_use_gpu(100, gpu_id)
    inference_config.enable_new_executor()

    if self.tensor_parallel_degree > 1:
        # Distributed setup: this rank's endpoint is looked up by its
        # tensor-parallel rank in the fleet worker endpoint list.
        endpoints = fleet.worker_endpoints()
        own_endpoint = endpoints[self.tensor_parallel_rank]
        rank_mapping_file = os.path.join(predictor_args.model_name_or_path, "rank_mapping.csv")

        dist_cfg = inference_config.dist_config()
        dist_cfg.set_ranks(self.tensor_parallel_degree, self.tensor_parallel_rank)
        dist_cfg.set_endpoints(endpoints, own_endpoint)
        dist_cfg.enable_dist_model(True)

        dist_cfg.set_comm_init_config(rank_mapping_file)
        inference_config.set_dist_config(dist_cfg)

    return paddle.inference.create_predictor(inference_config)
| 677 | |
| 678 | @paddle.no_grad() |
| 679 | def _infer(self, inputs): |
no outgoing calls
no test coverage detected
searching dependent graphs…