Set batch bucket to use speculative decoding. This signals that the lengths of inputs should be adjusted during modeling, and lets the main model verify tokens in parallel.
(self, num_tokens_to_verify: int = 5)
| 147 | return updated_block_ids |
| 148 | |
| 149 | def set_use_spec_dec(self, num_tokens_to_verify: int = 5) -> None: |
| 150 | """Set batch bucket to use speculative decoding. |
| 151 | This signals that the lengths of inputs should be adjusted during modeling, |
| 152 | and lets the main model verify tokens in parallel. |
| 153 | """ |
| 154 | self._use_spec_dec = True |
| 155 | self._num_tokens_to_verify = num_tokens_to_verify |
| 156 | |
| 157 | def reset_use_spec_dec(self) -> None: |
| 158 | """Reset the usage of speculative decoding for the batch bucket""" |
no outgoing calls
no test coverage detected