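Load a quantized model from local disk (Hugging Face Hub loading options such as `cache_dir`, `revision` and `local_files_only` are also accepted via `**kwargs`).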
(
cls,
model_name_or_path: Optional[str] = None,
save_dir: Optional[str] = None,
device_map: Optional[Union[str, Dict[str, Union[int, str]]]] = None,
max_memory: Optional[dict] = None,
device: Optional[Union[str, int]] = None,
low_cpu_mem_usage: bool = False,
use_triton: bool = False,
torch_dtype: torch.dtype = torch.float16,
inject_fused_attention: bool = True,
inject_fused_mlp: bool = True,
use_cuda_fp16: bool = True,
quantize_config: Optional[BaseQuantizeConfig] = None,
model_basename: Optional[str] = None,
use_safetensors: bool = False,
trust_remote_code: bool = False,
warmup_triton: bool = False,
trainable: bool = False,
**kwargs
)
| 100 | |
| 101 | @classmethod |
| 102 | def from_quantized( |
| 103 | cls, |
| 104 | model_name_or_path: Optional[str] = None, |
| 105 | save_dir: Optional[str] = None, |
| 106 | device_map: Optional[Union[str, Dict[str, Union[int, str]]]] = None, |
| 107 | max_memory: Optional[dict] = None, |
| 108 | device: Optional[Union[str, int]] = None, |
| 109 | low_cpu_mem_usage: bool = False, |
| 110 | use_triton: bool = False, |
| 111 | torch_dtype: torch.dtype = torch.float16, |
| 112 | inject_fused_attention: bool = True, |
| 113 | inject_fused_mlp: bool = True, |
| 114 | use_cuda_fp16: bool = True, |
| 115 | quantize_config: Optional[BaseQuantizeConfig] = None, |
| 116 | model_basename: Optional[str] = None, |
| 117 | use_safetensors: bool = False, |
| 118 | trust_remote_code: bool = False, |
| 119 | warmup_triton: bool = False, |
| 120 | trainable: bool = False, |
| 121 | **kwargs |
| 122 | ): |
| 123 | """load quantized model from local disk""" |
| 124 | |
| 125 | # Parameters related to loading from Hugging Face Hub |
| 126 | cache_dir = kwargs.pop("cache_dir", None) |
| 127 | force_download = kwargs.pop("force_download", False) |
| 128 | resume_download = kwargs.pop("resume_download", False) |
| 129 | proxies = kwargs.pop("proxies", None) |
| 130 | local_files_only = kwargs.pop("local_files_only", False) |
| 131 | use_auth_token = kwargs.pop("use_auth_token", None) |
| 132 | revision = kwargs.pop("revision", None) |
| 133 | subfolder = kwargs.pop("subfolder", "") |
| 134 | commit_hash = kwargs.pop("_commit_hash", None) |
| 135 | |
| 136 | if use_triton and not TRITON_AVAILABLE: |
| 137 | logger.warning("triton is not installed, reset use_triton to False") |
| 138 | use_triton = False |
| 139 | |
| 140 | # == step1: prepare configs and file names == # |
| 141 | if model_name_or_path and save_dir: |
| 142 | logger.warning("save_dir will be ignored because model_name_or_path is explicit specified.") |
| 143 | if not model_name_or_path and save_dir: |
| 144 | model_name_or_path = save_dir |
| 145 | logger.warning("save_dir is deprecated and will be removed in version 0.3.0", PendingDeprecationWarning, |
| 146 | stacklevel=2) |
| 147 | if not model_name_or_path and not save_dir: |
| 148 | raise ValueError("at least one of model_name_or_path or save_dir should be specified.") |
| 149 | |
| 150 | config = AutoConfig.from_pretrained(model_name_or_path, trust_remote_code=trust_remote_code) |
| 151 | |
| 152 | if quantize_config is None: |
| 153 | quantize_config = BaseQuantizeConfig.from_pretrained(model_name_or_path, **kwargs) |
| 154 | |
| 155 | if model_basename is None: |
| 156 | if quantize_config.model_file_base_name: |
| 157 | model_basename = quantize_config.model_file_base_name |
| 158 | else: |
| 159 | model_basename = f"gptq_model-{quantize_config.bits}bit-{quantize_config.group_size}g" |
no test coverage detected