From 547e7e95f95fadd0c0c34b3cb2e1f41d2a519745 Mon Sep 17 00:00:00 2001 From: Ata Fatahi Date: Wed, 11 Dec 2024 15:51:21 -0800 Subject: [PATCH 1/5] Clean up GPU memory after killing sglang processes Signed-off-by: Ata Fatahi --- scripts/killall_sglang.sh | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/scripts/killall_sglang.sh b/scripts/killall_sglang.sh index 3696a1c35f4..d9899c95b6a 100755 --- a/scripts/killall_sglang.sh +++ b/scripts/killall_sglang.sh @@ -8,7 +8,6 @@ kill -9 $(ps aux | grep 'sglang::' | grep -v 'grep' | awk '{print $2}') 2>/dev/n kill -9 $(ps aux | grep 'sglang.launch_server' | grep -v 'grep' | awk '{print $2}') 2>/dev/null kill -9 $(ps aux | grep 'sglang.bench' | grep -v 'grep' | awk '{print $2}') 2>/dev/null -# Clean all GPU processes if any argument is provided -if [ $# -gt 0 ]; then - kill -9 $(nvidia-smi | sed -n '/Processes:/,$p' | grep " [0-9]" | awk '{print $5}') 2>/dev/null -fi +# Clean all GPU processes +kill -9 $(nvidia-smi | sed -n '/Processes:/,$p' | grep " [0-9]" | awk '{print $5}') 2>/dev/null +lsof /dev/nvidia* | awk '{print $2}' | xargs kill -9 2>/dev/null From 2ade51e22134c4f1e421a9a7be78e4e00f842c1d Mon Sep 17 00:00:00 2001 From: Ata Fatahi Date: Wed, 11 Dec 2024 19:35:55 -0800 Subject: [PATCH 2/5] show gpu status after clean up Signed-off-by: Ata Fatahi --- scripts/killall_sglang.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/scripts/killall_sglang.sh b/scripts/killall_sglang.sh index d9899c95b6a..da9327e3fd1 100755 --- a/scripts/killall_sglang.sh +++ b/scripts/killall_sglang.sh @@ -11,3 +11,6 @@ kill -9 $(ps aux | grep 'sglang.bench' | grep -v 'grep' | awk '{print $2}') 2>/d # Clean all GPU processes kill -9 $(nvidia-smi | sed -n '/Processes:/,$p' | grep " [0-9]" | awk '{print $5}') 2>/dev/null lsof /dev/nvidia* | awk '{print $2}' | xargs kill -9 2>/dev/null + +# Show GPU status after clean up +nvidia-smi From 9c579054a8700afd8a01324fe80642e0bd0146d3 Mon Sep 17 00:00:00 2001 From: Ata Fatahi Date: Wed, 11 Dec 2024 23:32:51 -0800 Subject: [PATCH 3/5] make cuda process killer conditional Signed-off-by: Ata Fatahi --- scripts/killall_sglang.sh | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/scripts/killall_sglang.sh b/scripts/killall_sglang.sh index da9327e3fd1..69ad190564b 100755 --- a/scripts/killall_sglang.sh +++ b/scripts/killall_sglang.sh @@ -9,8 +9,11 @@ kill -9 $(ps aux | grep 'sglang.launch_server' | grep -v 'grep' | awk '{print $2 kill -9 $(ps aux | grep 'sglang.bench' | grep -v 'grep' | awk '{print $2}') 2>/dev/null # Clean all GPU processes -kill -9 $(nvidia-smi | sed -n '/Processes:/,$p' | grep " [0-9]" | awk '{print $5}') 2>/dev/null -lsof /dev/nvidia* | awk '{print $2}' | xargs kill -9 2>/dev/null +if [ $# -gt 0 ]; then + kill -9 $(nvidia-smi | sed -n '/Processes:/,$p' | grep " [0-9]" | awk '{print $5}') 2>/dev/null + lsof /dev/nvidia* | awk '{print $2}' | xargs kill -9 2>/dev/null +fi + # Show GPU status after clean up nvidia-smi From 007d352b39511482dd33ebe47c826a150d7e7111 Mon Sep 17 00:00:00 2001 From: Ata Fatahi Date: Wed, 11 Dec 2024 23:33:54 -0800 Subject: [PATCH 4/5] fix comment Signed-off-by: Ata Fatahi --- scripts/killall_sglang.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/killall_sglang.sh b/scripts/killall_sglang.sh index 69ad190564b..4057d2be2fb 100755 --- a/scripts/killall_sglang.sh +++ b/scripts/killall_sglang.sh @@ -8,7 +8,7 @@ kill -9 $(ps aux | grep 'sglang::' | grep -v 'grep' | awk '{print $2}') 2>/dev/n kill -9 $(ps aux | grep 'sglang.launch_server' | grep -v 'grep' | awk '{print $2}') 2>/dev/null kill -9 $(ps aux | grep 'sglang.bench' | grep -v 'grep' | awk '{print $2}') 2>/dev/null -# Clean all GPU processes +# Clean all GPU processes if any argument is provided if [ $# -gt 0 ]; then kill -9 $(nvidia-smi | sed -n '/Processes:/,$p' | grep " [0-9]" | awk '{print $5}') 2>/dev/null lsof /dev/nvidia* | awk '{print $2}' | xargs kill -9 2>/dev/null From 3b5c456fd67b682c17e82a23bd50d64f3042062a Mon Sep 17 00:00:00 2001 From: Ata Fatahi Date: Thu, 12 Dec 2024 15:41:41 -0800 Subject: [PATCH 5/5] add arg for cuda process killer in rust tests Signed-off-by: Ata Fatahi --- .github/workflows/pr-test-rust.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pr-test-rust.yml b/.github/workflows/pr-test-rust.yml index 9cfb5f6d961..928d0efa5b3 100644 --- a/.github/workflows/pr-test-rust.yml +++ b/.github/workflows/pr-test-rust.yml @@ -60,7 +60,7 @@ jobs: pip install --force-reinstall dist/*.whl - name: Run e2e test run: | - bash scripts/killall_sglang.sh + bash scripts/killall_sglang.sh "nuk_gpus" cd sgl-router/py_test python3 run_suite.py