diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py
index af5481671b7cf..4d323375c8489 100644
--- a/vllm/platforms/cuda.py
+++ b/vllm/platforms/cuda.py
@@ -15,8 +15,6 @@
 import vllm._C  # noqa
 import vllm.envs as envs
 from vllm.logger import init_logger
-from vllm.lora.punica_wrapper.punica_base import PunicaWrapperBase
-from vllm.lora.punica_wrapper.punica_gpu import PunicaWrapperGPU
 
 from .interface import DeviceCapability, Platform, PlatformEnum, _Backend
 
@@ -219,9 +217,8 @@ def get_attn_backend_cls(cls, selected_backend, head_size, dtype,
         return "vllm.attention.backends.flash_attn.FlashAttentionBackend"
 
     @classmethod
-    def get_punica_wrapper(cls, *args, **kwargs) -> PunicaWrapperBase:
-        logger.info_once("Using PunicaWrapperGPU.")
-        return PunicaWrapperGPU(*args, **kwargs)
+    def get_punica_wrapper(cls) -> str:
+        return "vllm.lora.punica_wrapper.punica_gpu.PunicaWrapperGPU"
 
 
 # NVML utils
diff --git a/vllm/platforms/hpu.py b/vllm/platforms/hpu.py
index da42fbeca0bfe..d70b5ccc24d56 100644
--- a/vllm/platforms/hpu.py
+++ b/vllm/platforms/hpu.py
@@ -3,8 +3,6 @@
 import torch
 
 from vllm.logger import init_logger
-from vllm.lora.punica_wrapper.punica_base import PunicaWrapperBase
-from vllm.lora.punica_wrapper.punica_hpu import PunicaWrapperHPU
 
 from .interface import Platform, PlatformEnum, _Backend
 
@@ -65,6 +63,5 @@ def is_pin_memory_available(cls):
         return False
 
     @classmethod
-    def get_punica_wrapper(cls, *args, **kwargs) -> PunicaWrapperBase:
-        logger.info_once("Using PunicaWrapperHPU.")
-        return PunicaWrapperHPU(*args, **kwargs)
+    def get_punica_wrapper(cls) -> str:
+        return "vllm.lora.punica_wrapper.punica_hpu.PunicaWrapperHPU"
diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py
index 47200efa0eb98..d06491d1cd6f6 100644
--- a/vllm/platforms/interface.py
+++ b/vllm/platforms/interface.py
@@ -8,7 +8,6 @@
 import torch
 
 from vllm.logger import init_logger
-from vllm.lora.punica_wrapper.punica_base import PunicaWrapperBase
 
 if TYPE_CHECKING:
     from vllm.config import VllmConfig
@@ -264,7 +263,7 @@ def is_pin_memory_available(cls) -> bool:
         return True
 
     @classmethod
-    def get_punica_wrapper(cls, *args, **kwargs) -> PunicaWrapperBase:
+    def get_punica_wrapper(cls) -> str:
         """
         Return the punica wrapper for current platform.
         """
diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py
index 7024d0949371e..a77203085e2e8 100644
--- a/vllm/platforms/rocm.py
+++ b/vllm/platforms/rocm.py
@@ -6,8 +6,6 @@
 import vllm.envs as envs
 from vllm.logger import init_logger
-from vllm.lora.punica_wrapper.punica_base import PunicaWrapperBase
-from vllm.lora.punica_wrapper.punica_gpu import PunicaWrapperGPU
 
 from .interface import DeviceCapability, Platform, PlatformEnum, _Backend
 
 
@@ -153,6 +151,5 @@ def verify_quantization(cls, quant: str) -> None:
             envs.VLLM_USE_TRITON_AWQ = True
 
     @classmethod
-    def get_punica_wrapper(cls, *args, **kwargs) -> PunicaWrapperBase:
-        logger.info_once("Using PunicaWrapperGPU.")
-        return PunicaWrapperGPU(*args, **kwargs)
+    def get_punica_wrapper(cls) -> str:
+        return "vllm.lora.punica_wrapper.punica_gpu.PunicaWrapperGPU"