| 222 | |
| 223 | |
| 224 | class AttentionBackendName(str, Enum): |
| | # String identifiers for the attention backends, grouped below by the library that provides them. |
| | # Because the enum mixes in `str`, each member compares equal to its plain string value |
| | # (e.g. `AttentionBackendName.FLASH == "flash"`) and can be looked up by value with |
| | # `AttentionBackendName("flash")`. Every value is the lowercase form of its member name. |
| | # NOTE(review): underscore-prefixed members look like internal / less stable variants — confirm |
| | # before relying on them. |
| 225 | # EAGER = "eager" |
| 226 | |
| 227 | # `flash-attn` backends |
| 228 | FLASH = "flash" |
| 229 | FLASH_HUB = "flash_hub" |
| 230 | FLASH_VARLEN = "flash_varlen" |
| 231 | FLASH_VARLEN_HUB = "flash_varlen_hub" |
| 232 | FLASH_4_HUB = "flash_4_hub" |
| 233 | _FLASH_3 = "_flash_3" |
| 234 | _FLASH_VARLEN_3 = "_flash_varlen_3" |
| 235 | _FLASH_3_HUB = "_flash_3_hub" |
| 236 | _FLASH_3_VARLEN_HUB = "_flash_3_varlen_hub" |
| 237 | |
| 238 | # `aiter` backend |
| 239 | AITER = "aiter" |
| 240 | |
| 241 | # PyTorch-native backends |
| 242 | FLEX = "flex" |
| 243 | NATIVE = "native" |
| 244 | _NATIVE_CUDNN = "_native_cudnn" |
| 245 | _NATIVE_EFFICIENT = "_native_efficient" |
| 246 | _NATIVE_FLASH = "_native_flash" |
| 247 | _NATIVE_MATH = "_native_math" |
| 248 | _NATIVE_NPU = "_native_npu" |
| 249 | _NATIVE_XLA = "_native_xla" |
| 250 | |
| 251 | # `sageattention` backends |
| 252 | SAGE = "sage" |
| 253 | SAGE_HUB = "sage_hub" |
| 254 | SAGE_VARLEN = "sage_varlen" |
| 255 | _SAGE_QK_INT8_PV_FP8_CUDA = "_sage_qk_int8_pv_fp8_cuda" |
| 256 | _SAGE_QK_INT8_PV_FP8_CUDA_SM90 = "_sage_qk_int8_pv_fp8_cuda_sm90" |
| 257 | _SAGE_QK_INT8_PV_FP16_CUDA = "_sage_qk_int8_pv_fp16_cuda" |
| 258 | _SAGE_QK_INT8_PV_FP16_TRITON = "_sage_qk_int8_pv_fp16_triton" |
| 259 | # TODO: Sparge Attention is intentionally not supported yet because it requires per-model tuning. |
| 260 | # Some form of autotuning could be explored in the future to make it usable. |
| 261 | # SPARGE = "sparge" |
| 262 | |
| 263 | # `xformers` backend |
| 264 | XFORMERS = "xformers" |
| 266 | |
| 267 | class _AttentionBackendRegistry: |
no outgoing calls
no test coverage detected
searching dependent graphs…