(self)
| 641 | return model_obj.model_dir |
| 642 | |
| 643 | def __call__(self) -> Tuple[Path, Union[Path, None]]: |
| 644 | |
| 645 | if self.llm_args.model_format is _ModelFormatKind.TLLM_ENGINE: |
| 646 | return Path(self.llm_args.model), None |
| 647 | |
| 648 | # Download speculative model from HuggingFace if needed (all backends) |
| 649 | if (self.llm_args.speculative_config is not None and |
| 650 | self.llm_args.speculative_config.speculative_model is not None): |
| 651 | spec_model_obj = _ModelWrapper( |
| 652 | self.llm_args.speculative_config.speculative_model) |
| 653 | spec_model_dir = self._download_hf_model_if_needed(spec_model_obj) |
| 654 | self.llm_args.speculative_config.speculative_model = spec_model_dir |
| 655 | |
| 656 | # AutoDeploy doesn't use ModelLoader |
| 657 | if self.llm_args.backend == "_autodeploy": |
| 658 | return None, "" |
| 659 | |
| 660 | self.engine_cache_stage: Optional[CachedStage] = None |
| 661 | self._hf_model_dir = None |
| 662 | self.model_loader = ModelLoader(self.llm_args) |
| 663 | |
| 664 | if self.llm_args.backend is not None: |
| 665 | if self.llm_args.backend not in ["pytorch", "_autodeploy"]: |
| 666 | raise ValueError( |
| 667 | f'backend {self.llm_args.backend} is not supported.') |
| 668 | |
| 669 | self._hf_model_dir = self._download_hf_model_if_needed( |
| 670 | self.model_loader.model_obj, revision=self.llm_args.revision) |
| 671 | |
| 672 | if self.llm_args.quant_config.quant_algo is not None: |
| 673 | logger.warning( |
| 674 | "QuantConfig for pytorch backend is ignored. You can load" |
| 675 | "quantized model with hf_quant_config.json directly.") |
| 676 | # Currently, this makes the updated quant_config visible through llm.args.quant_config. |
| 677 | # TODO: Unify this logic with that in tensorrt_llm/_torch/model_config.py |
| 678 | self.model_loader._update_from_hf_quant_config() |
| 679 | |
| 680 | return None, self._hf_model_dir |
| 681 | |
| 682 | if self.model_loader.model_obj.is_hub_model: |
| 683 | # This downloads config.json from the HF model hub, which helps to create a PretrainedConfig for the |
| 684 | # cache key. |
| 685 | self._hf_model_dir = download_hf_pretrained_config( |
| 686 | self.model_loader.model_obj.model_name, |
| 687 | revision=self.llm_args.revision) |
| 688 | |
| 689 | elif self.model_loader.model_obj.is_local_model: |
| 690 | self._hf_model_dir = self.model_loader.model_obj.model_dir if self.llm_args.model_format is _ModelFormatKind.HF else None |
| 691 | |
| 692 | if self.build_cache_enabled: |
| 693 | print_colored("Build cache is enabled.\n", 'yellow') |
| 694 | |
| 695 | self.engine_cache_stage = self._get_engine_cache_stage() |
| 696 | if self.engine_cache_stage.is_cached(): |
| 697 | self.llm_build_stats.cache_hitted = True |
| 698 | print_colored( |
| 699 | f"Reusing cached engine in {self.engine_cache_stage.get_engine_path()}\n\n", |
| 700 | 'grey') |
nothing calls this directly
no test coverage detected