Refactor get_punica_wrapper() into the Platform interface so each platform selects its own Punica wrapper

Signed-off-by: Shanshan Shen <[email protected]>
vllm-project · Dec 26, 2024 · abf1f7e · abf1f7e
1 parent aa25985
commit abf1f7e
Show file tree

Hide file tree

Showing 5 changed files with 33 additions and 13 deletions.
diff --git a/vllm/lora/punica_wrapper/punica_selector.py b/vllm/lora/punica_wrapper/punica_selector.py
@@ -1,19 +1,7 @@
 from vllm.platforms import current_platform
-from vllm.utils import print_info_once
 
 from .punica_base import PunicaWrapperBase
 
 
 def get_punica_wrapper(*args, **kwargs) -> PunicaWrapperBase:
-    if current_platform.is_cuda_alike():
-        # Lazy import to avoid ImportError
-        from vllm.lora.punica_wrapper.punica_gpu import PunicaWrapperGPU
-        print_info_once("Using PunicaWrapperGPU.")
-        return PunicaWrapperGPU(*args, **kwargs)
-    elif current_platform.is_hpu():
-        # Lazy import to avoid ImportError
-        from vllm.lora.punica_wrapper.punica_hpu import PunicaWrapperHPU
-        print_info_once("Using PunicaWrapperHPU.")
-        return PunicaWrapperHPU(*args, **kwargs)
-    else:
-        raise NotImplementedError
+    return current_platform.get_punica_wrapper(*args, **kwargs)
diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py
@@ -15,6 +15,9 @@
 import vllm._C  # noqa
 import vllm.envs as envs
 from vllm.logger import init_logger
+from vllm.lora.punica_wrapper.punica_base import PunicaWrapperBase
+from vllm.lora.punica_wrapper.punica_gpu import PunicaWrapperGPU
+from vllm.utils import print_info_once
 
 from .interface import DeviceCapability, Platform, PlatformEnum
 
@@ -141,6 +144,11 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
         if cache_config and cache_config.block_size is None:
             cache_config.block_size = 16
 
+    @classmethod
+    def get_punica_wrapper(cls, *args, **kwargs) -> PunicaWrapperBase:
+        print_info_once("Using PunicaWrapperGPU.")
+        return PunicaWrapperGPU(*args, **kwargs)
+
 
 # NVML utils
 # Note that NVML is not affected by `CUDA_VISIBLE_DEVICES`,

diff --git a/vllm/platforms/hpu.py b/vllm/platforms/hpu.py
@@ -3,6 +3,9 @@
 import torch
 
 from vllm.logger import init_logger
+from vllm.lora.punica_wrapper.punica_base import PunicaWrapperBase
+from vllm.lora.punica_wrapper.punica_hpu import PunicaWrapperHPU
+from vllm.utils import print_info_once
 
 from .interface import Platform, PlatformEnum, _Backend
 
@@ -58,3 +61,8 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
     def is_pin_memory_available(cls):
         logger.warning("Pin memory is not supported on HPU.")
         return False
+
+    @classmethod
+    def get_punica_wrapper(cls, *args, **kwargs) -> PunicaWrapperBase:
+        print_info_once("Using PunicaWrapperHPU.")
+        return PunicaWrapperHPU(*args, **kwargs)
diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py
@@ -8,6 +8,7 @@
 import torch
 
 from vllm.logger import init_logger
+from vllm.lora.punica_wrapper.punica_base import PunicaWrapperBase
 
 if TYPE_CHECKING:
     from vllm.config import VllmConfig
@@ -238,6 +239,13 @@ def is_pin_memory_available(cls) -> bool:
             return False
         return True
 
+    @classmethod
+    def get_punica_wrapper(cls, *args, **kwargs) -> PunicaWrapperBase:
+        """
+        Return the punica wrapper for current platform.
+        """
+        raise NotImplementedError
+
 
 class UnspecifiedPlatform(Platform):
     _enum = PlatformEnum.UNSPECIFIED

diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py
@@ -6,6 +6,9 @@
 
 import vllm.envs as envs
 from vllm.logger import init_logger
+from vllm.lora.punica_wrapper.punica_base import PunicaWrapperBase
+from vllm.lora.punica_wrapper.punica_gpu import PunicaWrapperGPU
+from vllm.utils import print_info_once
 
 from .interface import DeviceCapability, Platform, PlatformEnum, _Backend
 
@@ -110,3 +113,8 @@ def verify_quantization(cls, quant: str) -> None:
                 "Using AWQ quantization with ROCm, but VLLM_USE_TRITON_AWQ"
                 " is not set, enabling VLLM_USE_TRITON_AWQ.")
         envs.VLLM_USE_TRITON_AWQ = True
+
+    @classmethod
+    def get_punica_wrapper(cls, *args, **kwargs) -> PunicaWrapperBase:
+        print_info_once("Using PunicaWrapperGPU.")
+        return PunicaWrapperGPU(*args, **kwargs)