| 36 | |
| 37 | |
class ModelWorker(BaseModelWorker):
    """Worker that loads a model/tokenizer pair via ``load_model`` at construction."""

    def __init__(
        self,
        controller_addr: str,
        worker_addr: str,
        worker_id: str,
        model_path: str,
        model_names: List[str],
        limit_worker_concurrency: int,
        no_register: bool,
        device: str,
        num_gpus: int,
        max_gpu_memory: str,
        revision: Optional[str] = None,
        dtype: Optional[torch.dtype] = None,
        load_8bit: bool = False,
        cpu_offloading: bool = False,
        gptq_config: Optional[GptqConfig] = None,
        awq_config: Optional[AWQConfig] = None,
        exllama_config: Optional[ExllamaConfig] = None,
        xft_config: Optional[XftConfig] = None,
        stream_interval: int = 2,
        conv_template: Optional[str] = None,
        embed_in_truncate: bool = False,
        seed: Optional[int] = None,
        debug: bool = False,
        **kwargs,
    ):
        """Initialise the base worker, then load the model and tokenizer.

        Args:
            controller_addr: Forwarded positionally to ``BaseModelWorker.__init__``.
            worker_addr: Forwarded positionally to ``BaseModelWorker.__init__``.
            worker_id: Forwarded to the base class and included in the
                "Loading the model" log line.
            model_path: Forwarded to the base class and passed as the first
                argument to ``load_model``.
            model_names: Forwarded positionally to ``BaseModelWorker.__init__``.
            limit_worker_concurrency: Forwarded positionally to
                ``BaseModelWorker.__init__``.
            device: Passed to ``load_model`` and stored as ``self.device``.
            num_gpus, max_gpu_memory, revision, dtype, load_8bit,
            cpu_offloading, gptq_config, awq_config, exllama_config,
            xft_config, debug: Passed through unchanged to ``load_model`` as
                keyword arguments.
            conv_template: Forwarded to the base class as a keyword argument.
            no_register, stream_interval, embed_in_truncate, seed, **kwargs:
                Accepted but not referenced in this portion of the
                constructor.
                NOTE(review): confirm whether they are consumed elsewhere.
        """
        super().__init__(
            controller_addr,
            worker_addr,
            worker_id,
            model_path,
            model_names,
            limit_worker_concurrency,
            conv_template=conv_template,
        )

        logger.info(f"Loading the model {self.model_names} on worker {worker_id} ...")
        self.model, self.tokenizer = load_model(
            model_path,
            revision=revision,
            device=device,
            num_gpus=num_gpus,
            max_gpu_memory=max_gpu_memory,
            dtype=dtype,
            load_8bit=load_8bit,
            cpu_offloading=cpu_offloading,
            gptq_config=gptq_config,
            awq_config=awq_config,
            exllama_config=exllama_config,
            xft_config=xft_config,
            debug=debug,
        )
        self.device = device
        # Fall back to the EOS token when the tokenizer defines no pad token.
        # Identity comparison: `== None` could be intercepted by a custom __eq__.
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token
        self.context_len = get_context_length(self.model.config)
no outgoing calls
no test coverage detected
searching dependent graphs…