MCPcopy
hub / github.com/lm-sys/FastChat / ModelWorker

Class ModelWorker

fastchat/serve/model_worker.py:38–300  ·  view source on GitHub ↗

Source from the content-addressed store, hash-verified

36
37
38class ModelWorker(BaseModelWorker):
39 def __init__(
40 self,
41 controller_addr: str,
42 worker_addr: str,
43 worker_id: str,
44 model_path: str,
45 model_names: List[str],
46 limit_worker_concurrency: int,
47 no_register: bool,
48 device: str,
49 num_gpus: int,
50 max_gpu_memory: str,
51 revision: str = None,
52 dtype: Optional[torch.dtype] = None,
53 load_8bit: bool = False,
54 cpu_offloading: bool = False,
55 gptq_config: Optional[GptqConfig] = None,
56 awq_config: Optional[AWQConfig] = None,
57 exllama_config: Optional[ExllamaConfig] = None,
58 xft_config: Optional[XftConfig] = None,
59 stream_interval: int = 2,
60 conv_template: Optional[str] = None,
61 embed_in_truncate: bool = False,
62 seed: Optional[int] = None,
63 debug: bool = False,
64 **kwargs,
65 ):
66 super().__init__(
67 controller_addr,
68 worker_addr,
69 worker_id,
70 model_path,
71 model_names,
72 limit_worker_concurrency,
73 conv_template=conv_template,
74 )
75
76 logger.info(f"Loading the model {self.model_names} on worker {worker_id} ...")
77 self.model, self.tokenizer = load_model(
78 model_path,
79 revision=revision,
80 device=device,
81 num_gpus=num_gpus,
82 max_gpu_memory=max_gpu_memory,
83 dtype=dtype,
84 load_8bit=load_8bit,
85 cpu_offloading=cpu_offloading,
86 gptq_config=gptq_config,
87 awq_config=awq_config,
88 exllama_config=exllama_config,
89 xft_config=xft_config,
90 debug=debug,
91 )
92 self.device = device
93 if self.tokenizer.pad_token == None:
94 self.tokenizer.pad_token = self.tokenizer.eos_token
95 self.context_len = get_context_length(self.model.config)

Callers 2

create_model_worker · Function · 0.85

Calls

no outgoing calls

Tested by

no test coverage detected

Used in the wild — real call sites across dependent graphs

searching dependent graphs…