diff --git a/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh b/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh index 2924ea4a49f54..94999630bae12 100644 --- a/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh +++ b/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh @@ -10,7 +10,8 @@ set -ex kill_gpu_processes() { # kill all processes on GPU. - pkill -f pt_main_thread + pgrep pt_main_thread | xargs -r kill -9 + pgrep python3 | xargs -r kill -9 sleep 10 # remove vllm config file @@ -54,7 +55,7 @@ benchmark() { CUDA_VISIBLE_DEVICES=0 python3 \ -m vllm.entrypoints.openai.api_server \ - --model meta-llama/Meta-Llama-3.1-8B-Instruct \ + --model "$model" \ --port 8100 \ --max-model-len 10000 \ --gpu-memory-utilization 0.6 \ @@ -64,7 +65,7 @@ benchmark() { CUDA_VISIBLE_DEVICES=1 python3 \ -m vllm.entrypoints.openai.api_server \ - --model meta-llama/Meta-Llama-3.1-8B-Instruct \ + --model "$model" \ --port 8200 \ --max-model-len 10000 \ --gpu-memory-utilization 0.6 \ @@ -87,7 +88,7 @@ benchmark() { --port 8100 \ --save-result \ --result-dir $results_folder \ - --result-filename disagg_prefill_2xtp4.json \ + --result-filename disagg_prefill_tp1.json \ --request-rate "inf" @@ -105,7 +106,7 @@ benchmark() { --port 8200 \ --save-result \ --result-dir $results_folder \ - --result-filename disagg_prefill_2xtp4.json \ + --result-filename disagg_prefill_tp1_overhead.json \ --request-rate "$qps" kill_gpu_processes @@ -118,7 +119,7 @@ main() { (which jq) || (apt-get -y install jq) (which socat) || (apt-get -y install socat) - pip install quart httpx + pip install quart httpx datasets cd "$(dirname "$0")" diff --git a/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh b/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh index d8d9e976dce76..eb5d891d0d4a5 100644 --- a/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh +++ b/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh @@ -1,13 +1,12 @@ #!/bin/bash -# 
Requirement: 8x H100 GPUs. +# Requirement: 2x GPUs. -# Model: neuralmagic/Meta-Llama-3-70B-Instruct-FP8-KV -# Query: 2048 input tokens, 11 output tokens, QPS 4, 500 requests -# Resource: 8x H100 +# Model: meta-llama/Meta-Llama-3.1-8B-Instruct +# Query: 1024 input tokens, 6 output tokens, QPS 2/4/6/8, 100 requests +# Resource: 2x GPU # Approaches: -# 1. Chunked prefill: 1 vllm instance with tp=8 # 2. Chunked prefill: 2 vllm instance with tp=4, equivalent to 1 tp=4 instance with QPS 4 # 3. Disaggregated prefill: 1 prefilling instance and 1 decoding instance # Prefilling instance: max_output_token=1 @@ -114,7 +113,6 @@ benchmark() { --request-rate "$qps" sleep 2 - } @@ -123,8 +121,9 @@ main() { (which wget && which curl) || (apt-get update && apt-get install -y wget curl) (which jq) || (apt-get -y install jq) (which socat) || (apt-get -y install socat) + (which lsof) || (apt-get -y install lsof) - pip install quart httpx matplotlib aiohttp + pip install quart httpx matplotlib aiohttp datasets cd "$(dirname "$0")" diff --git a/tests/kv_transfer/test_lookup_buffer.py b/tests/kv_transfer/test_lookup_buffer.py index 96b0e58713332..718730bb8cbbe 100644 --- a/tests/kv_transfer/test_lookup_buffer.py +++ b/tests/kv_transfer/test_lookup_buffer.py @@ -48,7 +48,7 @@ def test_run(my_rank, buffer, device): assert buffer.buffer_size == 0 assert len(buffer.buffer) == 0 - print("Test run passed!") + print("My rank: %d, Test run passed!" % (my_rank)) def stress_test(my_rank, buf, device): @@ -108,7 +108,7 @@ def stress_test(my_rank, buf, device): else: torch.distributed.send(torch.tensor([n]), 0) - print("Passed stress test!") + print("My rank: %d, Passed stress test!" 
% (my_rank)) if __name__ == "__main__": diff --git a/tests/kv_transfer/test_lookup_buffer.sh b/tests/kv_transfer/test_lookup_buffer.sh index 09d7ee018c3f4..f2aeaee9ca6d5 100644 --- a/tests/kv_transfer/test_lookup_buffer.sh +++ b/tests/kv_transfer/test_lookup_buffer.sh @@ -1,3 +1,8 @@ #!/bin/bash -RANK=0 python test_lookup_buffer.py & -RANK=1 python test_lookup_buffer.py & \ No newline at end of file +RANK=0 python3 test_lookup_buffer.py & +PID0=$! +RANK=1 python3 test_lookup_buffer.py & +PID1=$! + +wait $PID0 +wait $PID1 diff --git a/tests/kv_transfer/test_send_recv.py b/tests/kv_transfer/test_send_recv.py index 65973bf10a4d7..4beba4dc05dde 100644 --- a/tests/kv_transfer/test_send_recv.py +++ b/tests/kv_transfer/test_send_recv.py @@ -10,39 +10,42 @@ def test_run(my_rank, pipe): + print(f"rank {my_rank} test_run starts....") # test run x = torch.tensor([1]).to(pipe.device) y = torch.tensor([[2., 3., 4., 8.]]).to(pipe.device) if my_rank == 0: pipe.send_tensor(x) - print("sent tensor x") + print(f"rank {my_rank} sent tensor x") pipe.send_tensor(y) - print("sent tensor y") + print(f"rank {my_rank} sent tensor y") x2 = pipe.recv_tensor() - print("received x2 = ", x2) + print(f"rank {my_rank} received x2 = ", x2) y2 = pipe.recv_tensor() - print("received y2 = ", x2) + print(f"rank {my_rank} received y2 = ", y2) else: x2 = pipe.recv_tensor() - print("received x2 = ", x2) + print(f"rank {my_rank} received x2 = ", x2) y2 = pipe.recv_tensor() - print("received y2 = ", x2) + print(f"rank {my_rank} received y2 = ", y2) pipe.send_tensor(x) - print("sent tensor x") + print(f"rank {my_rank} sent tensor x") pipe.send_tensor(y) - print("sent tensor y") + print(f"rank {my_rank} sent tensor y") assert torch.allclose(x, x2) assert torch.allclose(y, y2) + print(f"rank {my_rank} test_run passed!") -def stress_test(my_rank, pipe): - torch.distributed.barrier() +def stress_test(my_rank, pipe): + print(f"rank {my_rank} stress_test starts....") tensors: List[torch.Tensor] = [] + 
torch.distributed.barrier() torch.manual_seed(0) for i in tqdm(range(500)): @@ -86,7 +89,6 @@ def stress_test(my_rank, pipe): def latency_test(my_rank, pipe, nelement, ntensor): - latencies = [] torch.distributed.barrier() @@ -149,6 +151,7 @@ def latency_test(my_rank, pipe, nelement, ntensor): ) test_run(my_rank, pipe) + stress_test(my_rank, pipe) # Use this function if you want to test the latency of pipe impl. diff --git a/tests/kv_transfer/test_send_recv.sh b/tests/kv_transfer/test_send_recv.sh index 1e89e246b4992..54e0604806841 100644 --- a/tests/kv_transfer/test_send_recv.sh +++ b/tests/kv_transfer/test_send_recv.sh @@ -1,3 +1,9 @@ #!/bin/bash + RANK=0 python3 test_send_recv.py & -RANK=1 python3 test_send_recv.py & \ No newline at end of file +PID0=$! +RANK=1 python3 test_send_recv.py & +PID1=$! + +wait $PID0 +wait $PID1