MCPcopy
hub / github.com/PaddlePaddle/PaddleNLP / StaticInferencePredictor

Class StaticInferencePredictor

llm/predictor.py:622–696  ·  view source on GitHub ↗

Source from the content-addressed store, hash-verified

620
621
622class StaticInferencePredictor(InferencePredictorMixin, BasePredictor):
def __init__(
    self,
    config: PredictorArgument,
    cache_kvs_shape: list[list[int]],
    tokenizer: PretrainedTokenizer = None,
):
    """Set up the predictor state and build the static inference predictor.

    Args:
        config: Predictor arguments, forwarded to both base initializers and
            to `_create_predictor`.
        cache_kvs_shape: Stored on the instance as `self.cache_kvs_shape`.
        tokenizer: Optional tokenizer, forwarded to both base initializers.
    """
    # Stored before the base initializers run; presumably one of them reads
    # it -- confirm against InferencePredictorMixin before reordering.
    self.cache_kvs_shape = cache_kvs_shape

    # Both bases are initialized explicitly, BasePredictor first.
    for base_cls in (BasePredictor, InferencePredictorMixin):
        base_cls.__init__(self, config, tokenizer)

    self.predictor = self._create_predictor(config)
632
633 self.predictor = self._create_predictor(config)
634
def _create_predictor(self, predictor_args: PredictorArgument):
    """Build the Paddle Inference predictor for the exported static model.

    Imports the `paddlenlp_ops` custom-op modules, points a
    `paddle.inference.Config` at the `.pdmodel` / `.pdiparams` pair resolved
    from `predictor_args`, enables GPU execution and the new executor, and,
    when `self.tensor_parallel_degree` is greater than 1, attaches a
    distributed config before creating the predictor.

    Args:
        predictor_args: Supplies `model_name_or_path`, `model_prefix` and
            `dtype`.

    Returns:
        The object produced by `paddle.inference.create_predictor`.

    Raises:
        ValueError: If `is_paddlenlp_ops_available()` returns a falsy value.
    """
    if not is_paddlenlp_ops_available():
        raise ValueError(
            "you should install the paddlenlp ops to run inference predictor, "
            "https://github.com/PaddlePaddle/PaddleNLP/blob/develop/csrc/README.md"
        )

    # Register the custom ops: importing each module is what registers it.
    custom_op_names = (
        "encode_rotary_qk",
        "get_padding_offset",
        "qkv_transpose_split",
        "rebuild_padding",
        "transpose_remove_padding",
        "write_cache_kv",
    )
    for op_name in custom_op_names:
        import_module("paddlenlp_ops." + op_name)

    model_path = get_infer_model_path(predictor_args.model_name_or_path, predictor_args.model_prefix)
    model_file = model_path + ".pdmodel"
    params_file = model_path + ".pdiparams"
    infer_config = paddle.inference.Config(model_file, params_file)

    infer_config.switch_ir_optim(True)
    if predictor_args.dtype == "bfloat16":
        # Drop this pass for bfloat16 so matmul_v2 ops are not mapped to matmul.
        infer_config.delete_pass("gpu_cpu_map_matmul_v2_to_matmul_pass")

    # GPU index comes from FLAGS_selected_gpus and falls back to 0 when unset.
    gpu_id = int(os.environ.get("FLAGS_selected_gpus", 0))
    # NOTE(review): 100 is presumably the initial GPU memory pool size in MB
    # -- confirm against the Paddle Inference Config documentation.
    infer_config.enable_use_gpu(100, gpu_id)
    infer_config.enable_new_executor()

    if self.tensor_parallel_degree > 1:
        endpoints = fleet.worker_endpoints()
        own_endpoint = endpoints[self.tensor_parallel_rank]

        dist_cfg = infer_config.dist_config()
        dist_cfg.set_ranks(self.tensor_parallel_degree, self.tensor_parallel_rank)
        dist_cfg.set_endpoints(endpoints, own_endpoint)
        dist_cfg.enable_dist_model(True)

        rank_mapping_file = os.path.join(predictor_args.model_name_or_path, "rank_mapping.csv")
        dist_cfg.set_comm_init_config(rank_mapping_file)
        infer_config.set_dist_config(dist_cfg)

    return paddle.inference.create_predictor(infer_config)
677
678 @paddle.no_grad()
679 def _infer(self, inputs):

Callers 1

create_predictor · Function · 0.85

Calls

no outgoing calls

Tested by

no test coverage detected

Used in the wild — real call sites across dependent graphs

searching dependent graphs…