diff --git a/src/_bentoml_impl/server/allocator.py b/src/_bentoml_impl/server/allocator.py
index c822ba30b42..f49fc3e8edb 100644
--- a/src/_bentoml_impl/server/allocator.py
+++ b/src/_bentoml_impl/server/allocator.py
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import os
 import warnings
 from typing import Any
 
@@ -12,6 +13,7 @@
 from bentoml.exceptions import BentoMLConfigException
 
 NVIDIA_GPU = "nvidia.com/gpu"
+DISABLE_GPU_ALLOCATION_ENV = "BENTOML_DISABLE_GPU_ALLOCATION"
 
 
 class ResourceAllocator:
@@ -26,7 +28,9 @@ def __init__(self) -> None:
     def assign_gpus(self, count: float) -> list[int]:
         if count > self.remaining_gpus:
             warnings.warn(
-                f"Requested {count} GPUs, but only {self.remaining_gpus} are remaining.",
+                f"Requested {count} GPUs, but only {self.remaining_gpus} are remaining. "
+                f"Serving may fail due to inadequate GPUs. Set {DISABLE_GPU_ALLOCATION_ENV}=1 "
+                "to disable automatic allocation and allocate GPUs manually.",
                 ResourceWarning,
                 stacklevel=3,
             )
@@ -97,7 +101,7 @@ def get_worker_env(
                 return num_workers, worker_env
             else:  # workers is a number
                 num_workers = workers
-        if num_gpus:
+        if num_gpus and DISABLE_GPU_ALLOCATION_ENV not in os.environ:
             assigned = self.assign_gpus(num_gpus)
             # assign gpus to all workers
             worker_env = [