Class ModelWorker

fastchat/serve/model_worker.py:38–300 · view source on GitHub ↗

Source from the content-addressed store, hash-verified

36
37
38	class ModelWorker(BaseModelWorker):
39	def __init__(
40	self,
41	controller_addr: str,
42	worker_addr: str,
43	worker_id: str,
44	model_path: str,
45	model_names: List[str],
46	limit_worker_concurrency: int,
47	no_register: bool,
48	device: str,
49	num_gpus: int,
50	max_gpu_memory: str,
51	revision: str = None,
52	dtype: Optional[torch.dtype] = None,
53	load_8bit: bool = False,
54	cpu_offloading: bool = False,
55	gptq_config: Optional[GptqConfig] = None,
56	awq_config: Optional[AWQConfig] = None,
57	exllama_config: Optional[ExllamaConfig] = None,
58	xft_config: Optional[XftConfig] = None,
59	stream_interval: int = 2,
60	conv_template: Optional[str] = None,
61	embed_in_truncate: bool = False,
62	seed: Optional[int] = None,
63	debug: bool = False,
64	**kwargs,
65	):
66	super().__init__(
67	controller_addr,
68	worker_addr,
69	worker_id,
70	model_path,
71	model_names,
72	limit_worker_concurrency,
73	conv_template=conv_template,
74	)
75
76	logger.info(f"Loading the model {self.model_names} on worker {worker_id} ...")
77	self.model, self.tokenizer = load_model(
78	model_path,
79	revision=revision,
80	device=device,
81	num_gpus=num_gpus,
82	max_gpu_memory=max_gpu_memory,
83	dtype=dtype,
84	load_8bit=load_8bit,
85	cpu_offloading=cpu_offloading,
86	gptq_config=gptq_config,
87	awq_config=awq_config,
88	exllama_config=exllama_config,
89	xft_config=xft_config,
90	debug=debug,
91	)
92	self.device = device
93	if self.tokenizer.pad_token == None:
94	self.tokenizer.pad_token = self.tokenizer.eos_token
95	self.context_len = get_context_length(self.model.config)

create_multi_model_worker · Function · 0.90

create_model_worker · Function · 0.85

no outgoing calls

no test coverage detected

searching dependent graphs…