From 14bb76dd963f13bcca775dda92e05970860295f9 Mon Sep 17 00:00:00 2001
From: Xiaoyu Zhang <35585791+BBuf@users.noreply.github.com>
Date: Thu, 5 Dec 2024 17:39:31 +0800
Subject: [PATCH 1/4] optimize cuda graph max_bs_settings on low-end gpus

---
 python/sglang/srt/server_args.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py
index 7b337500fd7..1c9779903d7 100644
--- a/python/sglang/srt/server_args.py
+++ b/python/sglang/srt/server_args.py
@@ -181,10 +181,12 @@ def __post_init__(self):
 
         # Set cuda graph max batch size
         if self.cuda_graph_max_bs is None:
-            if gpu_mem is not None and gpu_mem < 25_000:
-                self.cuda_graph_max_bs = 8
-            else:
-                self.cuda_graph_max_bs = 160
+            # Based on detailed statistics, when serving TP1/TP2 models on lower-end GPUs with HBM<25G, you can either disable cuda graph or set `cuda_graph_max_bs` to a very small value to reduce the memory overhead of creating cuda graphs, with almost no impact on performance. However, when serving models with TP4 or TP8, we need to enable cuda graph to maintain high performance. In this case, we can set `cuda_graph_max_bs` to 80 (half of the default value 160) to reduce the memory overhead of creating cuda graphs. Looking at the logs from TP4 serving of qwen2-72b, a value of 80 is sufficient and can reduce the memory overhead of creating cuda graphs on lower-end GPUs compared to the original 160, avoiding OOM issues.
+            if gpu_mem < 25_000:
+                if self.tp_size < 4:
+                    self.cuda_graph_max_bs = 8
+                else:
+                    self.cuda_graph_max_bs = 80
 
         # Choose kernel backends
         if self.attention_backend is None:

From 684715fd70fbf2e9f65b32789781265d90fe3332 Mon Sep 17 00:00:00 2001
From: BBuf <1182563586@qq.com>
Date: Thu, 5 Dec 2024 17:50:07 +0800
Subject: [PATCH 2/4] lint

---
 python/sglang/srt/server_args.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py
index 1c9779903d7..2450650a2a9 100644
--- a/python/sglang/srt/server_args.py
+++ b/python/sglang/srt/server_args.py
@@ -184,9 +184,9 @@ def __post_init__(self):
             # Based on detailed statistics, when serving TP1/TP2 models on lower-end GPUs with HBM<25G, you can either disable cuda graph or set `cuda_graph_max_bs` to a very small value to reduce the memory overhead of creating cuda graphs, with almost no impact on performance. However, when serving models with TP4 or TP8, we need to enable cuda graph to maintain high performance. In this case, we can set `cuda_graph_max_bs` to 80 (half of the default value 160) to reduce the memory overhead of creating cuda graphs. Looking at the logs from TP4 serving of qwen2-72b, a value of 80 is sufficient and can reduce the memory overhead of creating cuda graphs on lower-end GPUs compared to the original 160, avoiding OOM issues.
             if gpu_mem < 25_000:
                 if self.tp_size < 4:
-                    self.cuda_graph_max_bs = 8 
+                    self.cuda_graph_max_bs = 8
                 else:
-                    self.cuda_graph_max_bs = 80 
+                    self.cuda_graph_max_bs = 80
 
         # Choose kernel backends
         if self.attention_backend is None:

From 7328ae992fa28d106d1e13ad2e228c271152d679 Mon Sep 17 00:00:00 2001
From: BBuf <1182563586@qq.com>
Date: Thu, 5 Dec 2024 18:15:11 +0800
Subject: [PATCH 3/4] refine

---
 python/sglang/srt/server_args.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py
index 2450650a2a9..13421549dc6 100644
--- a/python/sglang/srt/server_args.py
+++ b/python/sglang/srt/server_args.py
@@ -182,7 +182,7 @@ def __post_init__(self):
         # Set cuda graph max batch size
         if self.cuda_graph_max_bs is None:
             # Based on detailed statistics, when serving TP1/TP2 models on lower-end GPUs with HBM<25G, you can either disable cuda graph or set `cuda_graph_max_bs` to a very small value to reduce the memory overhead of creating cuda graphs, with almost no impact on performance. However, when serving models with TP4 or TP8, we need to enable cuda graph to maintain high performance. In this case, we can set `cuda_graph_max_bs` to 80 (half of the default value 160) to reduce the memory overhead of creating cuda graphs. Looking at the logs from TP4 serving of qwen2-72b, a value of 80 is sufficient and can reduce the memory overhead of creating cuda graphs on lower-end GPUs compared to the original 160, avoiding OOM issues.
-            if gpu_mem < 25_000:
+            if gpu_mem is not None and gpu_mem < 25_000:
                 if self.tp_size < 4:
                     self.cuda_graph_max_bs = 8
                 else:

From 400037a6378e069a5ea2f4e53fc0df35dd98fa18 Mon Sep 17 00:00:00 2001
From: BBuf <1182563586@qq.com>
Date: Thu, 5 Dec 2024 18:16:14 +0800
Subject: [PATCH 4/4] refine

---
 python/sglang/srt/server_args.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py
index 13421549dc6..50cf0b2ee9b 100644
--- a/python/sglang/srt/server_args.py
+++ b/python/sglang/srt/server_args.py
@@ -187,6 +187,8 @@ def __post_init__(self):
                 self.cuda_graph_max_bs = 8
             else:
                 self.cuda_graph_max_bs = 80
+        else:
+            self.cuda_graph_max_bs = 160
 
         # Choose kernel backends
         if self.attention_backend is None:
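
Note that PATCH 4/4 is load-bearing: PATCH 1/4 removed the old `else: self.cuda_graph_max_bs = 160` branch, so between patches 1 and 3 the attribute stayed `None` on high-memory GPUs; patch 4 restores the 160 default. The sketch below is a minimal standalone reproduction of the heuristic as it stands after all four patches; the function name `pick_cuda_graph_max_bs` and its explicit parameters are illustrative (the real logic reads `ServerArgs` fields inside `__post_init__`), and `gpu_mem` is assumed to be in MB given the `25_000` threshold paired with "HBM<25G".

```python
# Minimal standalone sketch of the final heuristic (after PATCH 4/4).
# Illustrative only: the real code lives in ServerArgs.__post_init__ in
# python/sglang/srt/server_args.py and reads instance fields instead of
# taking explicit arguments.
from typing import Optional


def pick_cuda_graph_max_bs(
    cuda_graph_max_bs: Optional[int],
    gpu_mem: Optional[int],  # total GPU memory, assumed MB (25_000 ~ 25G HBM)
    tp_size: int,
) -> int:
    if cuda_graph_max_bs is not None:
        # An explicit user setting is never overridden.
        return cuda_graph_max_bs
    if gpu_mem is not None and gpu_mem < 25_000:
        if tp_size < 4:
            # TP1/TP2 on a low-end GPU: keep the cuda graph pool tiny;
            # per the commit message this has almost no performance cost.
            return 8
        # TP4/TP8 still needs cuda graphs for throughput; 80 (half the
        # default) sufficed in the qwen2-72b TP4 logs cited above.
        return 80
    # High-memory (or undetected) GPUs keep the original default,
    # restored by PATCH 4/4.
    return 160


# e.g. qwen2-72b on 24G cards: TP4 -> 80, TP2 -> 8; an 80G card -> 160
assert pick_cuda_graph_max_bs(None, 24_000, 4) == 80
assert pick_cuda_graph_max_bs(None, 24_000, 2) == 8
assert pick_cuda_graph_max_bs(None, 80_000, 8) == 160
```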