[platform] fix attn backend for cuda
Signed-off-by: Mengqing Cao <[email protected]>
MengqingCao committed Jan 9, 2025
1 parent 7e83803 commit 4e02460
Showing 2 changed files with 58 additions and 55 deletions.
19 changes: 14 additions & 5 deletions tests/kernels/test_attention_selector.py
@@ -4,14 +4,21 @@
import torch

from tests.kernels.utils import override_backend_env_variable
-from vllm.attention.selector import get_attn_backend
+from vllm.attention.selector import _cached_get_attn_backend, get_attn_backend
from vllm.platforms.cpu import CpuPlatform
from vllm.platforms.cuda import CudaPlatform
from vllm.platforms.openvino import OpenVinoPlatform
from vllm.platforms.rocm import RocmPlatform
from vllm.utils import STR_FLASH_ATTN_VAL, STR_INVALID_VAL


+@pytest.fixture(autouse=True)
+def clear_cache():
+    """Clear lru cache to ensure each test case runs without caching.
+    """
+    _cached_get_attn_backend.cache_clear()


@pytest.mark.parametrize(
"name", ["TORCH_SDPA", "ROCM_FLASH", "XFORMERS", "FLASHINFER", "OPENVINO"])
@pytest.mark.parametrize("device", ["cpu", "openvino", "hip", "cuda"])
@@ -39,10 +46,12 @@ def test_env(name: str, device: str, monkeypatch):
False)
assert backend.get_name() == "OPENVINO"
else:
with patch("vllm.attention.selector.current_platform", CudaPlatform()):
backend = get_attn_backend(16, torch.float16, torch.float16, 16,
False)
assert backend.get_name() == name
if name in ["XFORMERS", "FLASHINFER"]:
with patch("vllm.attention.selector.current_platform",
CudaPlatform()):
backend = get_attn_backend(16, torch.float16, torch.float16,
16, False)
assert backend.get_name() == name


def test_flash_attn(monkeypatch):
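The new autouse fixture exists because the selector is evidently memoized: `_cached_get_attn_backend` exposes `cache_clear()`, which is the `functools.lru_cache` interface, so a backend resolved for one parametrized case would otherwise leak into the next. A minimal standalone sketch of the same pattern (the `pick_backend` function and the test below are illustrative stand-ins, not vLLM code):

```python
# Standalone illustration of the caching problem the fixture solves.
# `pick_backend` is a hypothetical stand-in for a memoized selector.
from functools import lru_cache

import pytest


@lru_cache(maxsize=None)
def pick_backend(name: str) -> str:
    # Imagine expensive, environment-dependent backend resolution here.
    return f"backend-for-{name}"


@pytest.fixture(autouse=True)
def clear_cache():
    # Without this, the first parametrized case would populate the cache
    # and later cases could silently reuse its result.
    pick_backend.cache_clear()


@pytest.mark.parametrize("name", ["XFORMERS", "FLASHINFER"])
def test_pick_backend(name):
    assert pick_backend(name).endswith(name)
```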
94 changes: 44 additions & 50 deletions vllm/platforms/cuda.py
@@ -147,75 +147,69 @@ def get_attn_backend_cls(cls, selected_backend, head_size, dtype,
if use_v1:
logger.info("Using Flash Attention backend on V1 engine.")
return "vllm.v1.attention.backends.flash_attn.FlashAttentionBackend"
-        if selected_backend == _Backend.FLASHINFER:
-            logger.info("Using FlashInfer backend.")
-            return "vllm.attention.backends.flashinfer.FlashInferBackend"
-        elif selected_backend == _Backend.XFORMERS:
-            logger.info("Using XFormers backend.")
-            return "vllm.attention.backends.xformers.XFormersBackend"
-        elif selected_backend == _Backend.FLASH_ATTN:
-            logger.info("Using FlashAttention backend.")
-            return "vllm.attention.backends.flash_attn.FlashAttentionBackend"
-        elif selected_backend:
-            raise ValueError(
-                f"Invalid attention backend for {cls.device_name}")

-        target_backend = _Backend.FLASH_ATTN
-        if not cls.has_device_capability(80):
-            # Volta and Turing NVIDIA GPUs.
-            logger.info(
-                "Cannot use FlashAttention-2 backend for Volta and Turing "
-                "GPUs.")
-            target_backend = _Backend.XFORMERS
-        elif dtype not in (torch.float16, torch.bfloat16):
-            logger.info(
-                "Cannot use FlashAttention-2 backend for dtype other than "
-                "torch.float16 or torch.bfloat16.")
-            target_backend = _Backend.XFORMERS
-        elif kv_cache_dtype is not None and \
-            kv_cache_dtype.startswith("fp8"):
-            logger.info(
-                "Cannot use FlashAttention-2 backend for FP8 KV cache.")
-            logger.warning(
-                "Please use FlashInfer backend with FP8 KV Cache for "
-                "better performance by setting environment variable "
-                "VLLM_ATTENTION_BACKEND=FLASHINFER")
-            target_backend = _Backend.XFORMERS
-        elif block_size % 16 != 0:
-            logger.info(
-                "Cannot use FlashAttention-2 backend for block size not "
-                "divisible by 16.")
-            target_backend = _Backend.XFORMERS

-        # FlashAttn is valid for the model, checking if the package is
-        # installed.
-        if target_backend == _Backend.FLASH_ATTN:

+        if selected_backend == _Backend.FLASH_ATTN:
+            if not cls.has_device_capability(80):
+                # Volta and Turing NVIDIA GPUs.
+                logger.info(
+                    "Cannot use FlashAttention-2 backend for Volta and Turing "
+                    "GPUs.")
+                selected_backend = _Backend.XFORMERS
+            elif dtype not in (torch.float16, torch.bfloat16):
+                logger.info(
+                    "Cannot use FlashAttention-2 backend for dtype other than "
+                    "torch.float16 or torch.bfloat16.")
+                selected_backend = _Backend.XFORMERS
+            elif kv_cache_dtype is not None and kv_cache_dtype.startswith(
+                    "fp8"):
+                logger.info(
+                    "Cannot use FlashAttention-2 backend for FP8 KV cache.")
+                logger.warning(
+                    "Please use FlashInfer backend with FP8 KV Cache for "
+                    "better performance by setting environment variable "
+                    "VLLM_ATTENTION_BACKEND=FLASHINFER")
+                selected_backend = _Backend.XFORMERS
+            elif block_size % 16 != 0:
+                logger.info(
+                    "Cannot use FlashAttention-2 backend for block size not "
+                    "divisible by 16.")
+                selected_backend = _Backend.XFORMERS

+        if selected_backend == _Backend.FLASH_ATTN:
try:
import vllm.vllm_flash_attn # noqa: F401
from vllm.attention.backends.flash_attn import ( # noqa: F401
FlashAttentionBackend)

-                supported_sizes = \
+                flash_attn_supported_sizes = \
FlashAttentionBackend.get_supported_head_sizes()
-                if head_size not in supported_sizes:
+                if head_size not in flash_attn_supported_sizes:
logger.info(
"Cannot use FlashAttention-2 backend for head size %d.",
head_size)
-                    target_backend = _Backend.XFORMERS
+                    selected_backend = _Backend.XFORMERS
except ImportError:
logger.info(
"Cannot use FlashAttention-2 backend because the "
"vllm.vllm_flash_attn package is not found. "
"Make sure that vllm_flash_attn was built and installed "
"(on by default).")
-                target_backend = _Backend.XFORMERS
+                selected_backend = _Backend.XFORMERS

-        if target_backend == _Backend.XFORMERS:
+        if selected_backend == _Backend.FLASHINFER:
+            logger.info("Using FlashInfer backend.")
+            return "vllm.attention.backends.flashinfer.FlashInferBackend"
+        elif selected_backend == _Backend.XFORMERS:
logger.info("Using XFormers backend.")
return "vllm.attention.backends.xformers.XFormersBackend"
+        elif selected_backend == _Backend.FLASH_ATTN:
+            return "vllm.attention.backends.flash_attn.FlashAttentionBackend"
+        elif selected_backend:
+            raise ValueError(
+                f"Invalid attention backend {selected_backend} for"
+                f"{cls.device_name}")

logger.info("Using Flash Attention backend.")
return "vllm.attention.backends.flash_attn.FlashAttentionBackend"
return ""


# NVML utils
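The restructured CUDA logic follows a "validate, downgrade, then dispatch" pattern: a FLASH_ATTN choice is checked against device capability, dtype, KV-cache format, block size, and head size, demoted to XFORMERS whenever a check fails, and the name-to-class mapping happens once at the end instead of returning early at the top. A condensed sketch of that control flow is below; it is a simplification, and the check conditions, helper signature, and backend table are placeholders rather than the actual vLLM implementation.

```python
# Simplified sketch of the "downgrade then dispatch" selection flow above.
# The enum, table, and constraint checks are illustrative placeholders.
from enum import Enum, auto
from typing import Optional


class Backend(Enum):
    FLASH_ATTN = auto()
    XFORMERS = auto()
    FLASHINFER = auto()


_BACKEND_CLASSES = {
    Backend.FLASH_ATTN:
    "vllm.attention.backends.flash_attn.FlashAttentionBackend",
    Backend.XFORMERS: "vllm.attention.backends.xformers.XFormersBackend",
    Backend.FLASHINFER: "vllm.attention.backends.flashinfer.FlashInferBackend",
}


def select_backend(selected: Optional[Backend], *, compute_capability: int,
                   head_size: int) -> str:
    # Step 1: downgrade FLASH_ATTN to XFORMERS when a constraint fails
    # (placeholder constraints, standing in for the real capability checks).
    if selected is Backend.FLASH_ATTN:
        if compute_capability < 80 or head_size % 32 != 0:
            selected = Backend.XFORMERS

    # Step 2: dispatch by name only once, at the end.
    if selected in _BACKEND_CLASSES:
        return _BACKEND_CLASSES[selected]
    if selected is not None:
        raise ValueError(f"Invalid attention backend {selected}")
    return ""
```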

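Note also that `get_attn_backend_cls` returns the backend as a dotted import path string rather than a class object, presumably so the caller can import the concrete backend lazily. A generic sketch of resolving such a string with importlib (independent of whatever helper vLLM itself uses for this):

```python
# Generic resolution of a "package.module.ClassName" string, illustrating
# how a dotted backend path like the ones returned above can become a class.
import importlib


def resolve_qualname(qualname: str):
    module_name, _, attr = qualname.rpartition(".")
    module = importlib.import_module(module_name)
    return getattr(module, attr)


# Example (assumes the module is importable in the current environment):
# backend_cls = resolve_qualname(
#     "vllm.attention.backends.xformers.XFormersBackend")
```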