[Misc] Update disaggregation benchmark scripts and test logs #11456

Merged
13 changes: 7 additions & 6 deletions benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh
@@ -10,7 +10,8 @@ set -ex
 
 kill_gpu_processes() {
   # kill all processes on GPU.
-  pkill -f pt_main_thread
+  pgrep pt_main_thread | xargs -r kill -9
+  pgrep python3 | xargs -r kill -9
   sleep 10
 
   # remove vllm config file
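
A note on the cleanup change above: the script runs under `set -ex`, and `pkill` exits nonzero when no process matches, which would abort the whole benchmark on an already-clean GPU. The `pgrep | xargs` pipeline avoids this because the pipeline's exit status is that of `xargs`, and GNU `xargs -r` (`--no-run-if-empty`) skips running `kill -9` entirely when there is nothing to kill. A minimal sketch of the failure mode and the fix:

    #!/bin/bash
    set -e

    # Fragile: pkill returns 1 when no process matched, and `set -e`
    # turns that nonzero status into a fatal error for the script.
    # pkill -f pt_main_thread

    # Robust: pgrep's nonzero status is masked by the pipe, and `-r`
    # makes xargs skip `kill -9` entirely on empty input.
    pgrep pt_main_thread | xargs -r kill -9
    echo "cleanup finished"
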
@@ -54,7 +55,7 @@ benchmark() {
 
   CUDA_VISIBLE_DEVICES=0 python3 \
     -m vllm.entrypoints.openai.api_server \
-    --model meta-llama/Meta-Llama-3.1-8B-Instruct \
+    --model $model \
     --port 8100 \
     --max-model-len 10000 \
     --gpu-memory-utilization 0.6 \
@@ -64,7 +65,7 @@
 
   CUDA_VISIBLE_DEVICES=1 python3 \
     -m vllm.entrypoints.openai.api_server \
-    --model meta-llama/Meta-Llama-3.1-8B-Instruct \
+    --model $model \
     --port 8200 \
     --max-model-len 10000 \
     --gpu-memory-utilization 0.6 \
@@ -87,7 +88,7 @@
     --port 8100 \
     --save-result \
     --result-dir $results_folder \
-    --result-filename disagg_prefill_2xtp4.json \
+    --result-filename disagg_prefill_tp1.json \
     --request-rate "inf"
 
 
@@ -105,7 +106,7 @@
     --port 8200 \
     --save-result \
     --result-dir $results_folder \
-    --result-filename disagg_prefill_2xtp4.json \
+    --result-filename disagg_prefill_tp1_overhead.json \
     --request-rate "$qps"
   kill_gpu_processes
 
@@ -118,7 +119,7 @@ main() {
   (which jq) || (apt-get -y install jq)
   (which socat) || (apt-get -y install socat)
 
-  pip install quart httpx
+  pip install quart httpx datasets
 
   cd "$(dirname "$0")"
 
13 changes: 6 additions & 7 deletions benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh
@@ -1,13 +1,12 @@
 #!/bin/bash
 
-# Requirement: 8x H100 GPUs.
+# Requirement: 2x GPUs.
 
 
-# Model: neuralmagic/Meta-Llama-3-70B-Instruct-FP8-KV
-# Query: 2048 input tokens, 11 output tokens, QPS 4, 500 requests
-# Resource: 8x H100
+# Model: meta-llama/Meta-Llama-3.1-8B-Instruct
+# Query: 1024 input tokens, 6 output tokens, QPS 2/4/6/8, 100 requests
+# Resource: 2x GPU
 # Approaches:
 # 1. Chunked prefill: 1 vllm instance with tp=8
-# 2. Chunked prefill: 2 vllm instance with tp=4, equivalent to 1 tp=4 instance with QPS 4
 # 3. Disaggregated prefill: 1 prefilling instance and 1 decoding instance
 #    Prefilling instance: max_output_token=1
@@ -114,7 +113,6 @@ benchmark() {
     --request-rate "$qps"
 
   sleep 2
-
 }
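
The header now advertises a QPS sweep (2/4/6/8) rather than a single rate. As a sketch of how such a sweep drives the `benchmark` function above (the real loop lives in the collapsed portion of `main()`, so the exact loop body here is an assumption):

    # Sweep the request rates listed in the header comment; each call
    # benchmarks one QPS point and writes its own result file.
    for qps in 2 4 6 8; do
      benchmark "$qps"
    done
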


@@ -123,8 +121,9 @@ main() {
   (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
   (which jq) || (apt-get -y install jq)
   (which socat) || (apt-get -y install socat)
+  (which lsof) || (apt-get -y install lsof)
 
-  pip install quart httpx matplotlib aiohttp
+  pip install quart httpx matplotlib aiohttp datasets
 
   cd "$(dirname "$0")"
 
4 changes: 2 additions & 2 deletions tests/kv_transfer/test_lookup_buffer.py
@@ -48,7 +48,7 @@ def test_run(my_rank, buffer, device):
     assert buffer.buffer_size == 0
     assert len(buffer.buffer) == 0
 
-    print("Test run passed!")
+    print("My rank: %d, Test run passed!" % (my_rank))
 
 
 def stress_test(my_rank, buf, device):
@@ -108,7 +108,7 @@ def stress_test(my_rank, buf, device):
     else:
         torch.distributed.send(torch.tensor([n]), 0)
 
-    print("Passed stress test!")
+    print("My rank: %d, Passed stress test!" % (my_rank))
 
 
 if __name__ == "__main__":
9 changes: 7 additions & 2 deletions tests/kv_transfer/test_lookup_buffer.sh
@@ -1,3 +1,8 @@
 #!/bin/bash
-RANK=0 python test_lookup_buffer.py &
-RANK=1 python test_lookup_buffer.py &
+RANK=0 python3 test_lookup_buffer.py &
+PID0=$!
+RANK=1 python3 test_lookup_buffer.py &
+PID1=$!
+
+wait $PID0
+wait $PID1
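
For context on the pattern introduced here (and mirrored in test_send_recv.sh below): `$!` holds the PID of the most recently launched background job, and `wait <pid>` blocks until that job exits and returns its exit status. The old version launched both ranks and returned immediately, always with status 0; the new version only succeeds if both ranks do. A minimal sketch of the semantics:

    #!/bin/bash

    false &      # a background job that fails
    PID=$!

    wait $PID    # blocks, then propagates the job's exit status
    echo "background job exited with status $?"   # prints 1
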
25 changes: 14 additions & 11 deletions tests/kv_transfer/test_send_recv.py
@@ -10,39 +10,42 @@
 
 
 def test_run(my_rank, pipe):
+    print(f"rank {my_rank} test_run starts....")
     # test run
     x = torch.tensor([1]).to(pipe.device)
     y = torch.tensor([[2., 3., 4., 8.]]).to(pipe.device)
     if my_rank == 0:
         pipe.send_tensor(x)
-        print("sent tensor x")
+        print(f"rank {my_rank} sent tensor x")
         pipe.send_tensor(y)
-        print("sent tensor y")
+        print(f"rank {my_rank} sent tensor y")
         x2 = pipe.recv_tensor()
-        print("received x2 = ", x2)
+        print(f"rank {my_rank} received x2 = ", x2)
         y2 = pipe.recv_tensor()
-        print("received y2 = ", x2)
+        print(f"rank {my_rank} received y2 = ", x2)
 
     else:
         x2 = pipe.recv_tensor()
-        print("received x2 = ", x2)
+        print(f"rank {my_rank} received x2 = ", x2)
         y2 = pipe.recv_tensor()
-        print("received y2 = ", x2)
+        print(f"rank {my_rank} received y2 = ", x2)
         pipe.send_tensor(x)
-        print("sent tensor x")
+        print(f"rank {my_rank} sent tensor x")
         pipe.send_tensor(y)
-        print("sent tensor y")
+        print(f"rank {my_rank} sent tensor y")
 
     assert torch.allclose(x, x2)
     assert torch.allclose(y, y2)
 
+    print(f"rank {my_rank} test_run passed!")
 
-def stress_test(my_rank, pipe):
-
-    torch.distributed.barrier()
+def stress_test(my_rank, pipe):
+    print(f"rank {my_rank} stress_test starts....")
 
     tensors: List[torch.Tensor] = []
 
+    torch.distributed.barrier()
     torch.manual_seed(0)
 
     for i in tqdm(range(500)):
@@ -86,7 +89,6 @@ def stress_test(my_rank, pipe):
 
 
 def latency_test(my_rank, pipe, nelement, ntensor):
-
     latencies = []
 
     torch.distributed.barrier()
@@ -149,6 +151,7 @@ def latency_test(my_rank, pipe, nelement, ntensor):
     )
 
     test_run(my_rank, pipe)
+
     stress_test(my_rank, pipe)
 
     # Use this function if you want to test the latency of pipe impl.
8 changes: 7 additions & 1 deletion tests/kv_transfer/test_send_recv.sh
@@ -1,3 +1,9 @@
 #!/bin/bash
+
 RANK=0 python3 test_send_recv.py &
-RANK=1 python3 test_send_recv.py &
+PID0=$!
+RANK=1 python3 test_send_recv.py &
+PID1=$!
+
+wait $PID0
+wait $PID1
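
Same two-rank pattern as test_lookup_buffer.sh above. One possible hardening, not part of this PR, is to stop as soon as either rank dies instead of waiting on each PID in order; with bash 4.3+ this can be done with `wait -n` (an assumption, shown only as a sketch):

    #!/bin/bash

    RANK=0 python3 test_send_recv.py &
    RANK=1 python3 test_send_recv.py &

    # `wait -n` returns when whichever job finishes first exits,
    # with that job's status, so the first failure aborts the script.
    wait -n || exit 1
    wait -n || exit 1
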