Single-step decode. Prepares the hidden states, possibly replicates them for CFG, and then delegates to `_compute_logits`. Below we wrap this function with a simple CUDA Graph capturing mechanism, doing 3 warmup steps if needed and then capturing or replaying the graph. We only recapture if the batch size changes.
(
self,
input_ids: torch.Tensor,
inference_params: InferenceParams,
cfg_scale: float,
allow_cudagraphs: bool = True,
)
| 116 | return logits |
| 117 | |
| 118 | def _decode_one_token( |
| 119 | self, |
| 120 | input_ids: torch.Tensor, |
| 121 | inference_params: InferenceParams, |
| 122 | cfg_scale: float, |
| 123 | allow_cudagraphs: bool = True, |
| 124 | ) -> torch.Tensor: |
| 125 | """ |
| 126 | Single-step decode. Prepares the hidden states, possibly replicates them |
| 127 | for CFG, and then delegates to `_compute_logits`. |
| 128 | |
| 129 | Below we wrap this function with a simple CUDA Graph capturing mechanism, |
| 130 | doing 3 warmup steps if needed and then capturing or replaying the graph. |
| 131 | We only recapture if the batch size changes. |
| 132 | """ |
| 133 | # TODO: support cfg_scale==1 |
| 134 | if cfg_scale == 1.0: |
| 135 | hidden_states = self.embed_codes(input_ids) |
| 136 | return self._compute_logits(hidden_states, inference_params, cfg_scale) |
| 137 | |
| 138 | bsz = input_ids.size(0) |
| 139 | |
| 140 | if not allow_cudagraphs or input_ids.device.type != "cuda": |
| 141 | hidden_states_local = self.embed_codes(input_ids) |
| 142 | hidden_states_local = hidden_states_local.repeat(2, 1, 1) |
| 143 | return self._compute_logits(hidden_states_local, inference_params, cfg_scale) |
| 144 | |
| 145 | need_capture = (self._cg_graph is None) or (self._cg_batch_size != bsz) |
| 146 | |
| 147 | if need_capture: |
| 148 | self._cg_graph = None |
| 149 | |
| 150 | self._cg_batch_size = bsz |
| 151 | self._cg_inference_params = inference_params |
| 152 | self._cg_scale = cfg_scale |
| 153 | |
| 154 | for _ in range(3): |
| 155 | hidden_states = self.embed_codes(input_ids) |
| 156 | hidden_states = hidden_states.repeat(2, 1, 1) # because cfg != 1.0 |
| 157 | logits = self._compute_logits(hidden_states, inference_params, cfg_scale) |
| 158 | |
| 159 | self._cg_input_ids = input_ids.clone() |
| 160 | self._cg_logits = torch.empty_like(logits) |
| 161 | |
| 162 | g = torch.cuda.CUDAGraph() |
| 163 | |
| 164 | def capture_region(): |
| 165 | hidden_states_local = self.embed_codes(self._cg_input_ids) |
| 166 | hidden_states_local = hidden_states_local.repeat(2, 1, 1) |
| 167 | self._cg_logits = self._compute_logits(hidden_states_local, self._cg_inference_params, self._cg_scale) |
| 168 | |
| 169 | with torch.cuda.graph(g): |
| 170 | capture_region() |
| 171 | |
| 172 | self._cg_graph = g |
| 173 | |
| 174 | else: |
| 175 | self._cg_input_ids.copy_(input_ids) |
nothing calls this directly
no test coverage detected