Skip to content

Commit

Permalink
[Misc] Update disaggregation benchmark scripts and test logs (vllm-project#11456)
Browse files Browse the repository at this point in the history

Signed-off-by: Jiaxin Shan <[email protected]>
  • Loading branch information
Jeffwan authored and BKitor committed Dec 30, 2024
1 parent 64bfd45 commit 216540a
Show file tree
Hide file tree
Showing 6 changed files with 43 additions and 29 deletions.
13 changes: 7 additions & 6 deletions benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,8 @@ set -ex

kill_gpu_processes() {
# kill all processes on GPU.
pkill -f pt_main_thread
pgrep pt_main_thread | xargs -r kill -9
pgrep python3 | xargs -r kill -9
sleep 10

# remove vllm config file
Expand Down Expand Up @@ -54,7 +55,7 @@ benchmark() {

CUDA_VISIBLE_DEVICES=0 python3 \
-m vllm.entrypoints.openai.api_server \
--model meta-llama/Meta-Llama-3.1-8B-Instruct \
--model $model \
--port 8100 \
--max-model-len 10000 \
--gpu-memory-utilization 0.6 \
Expand All @@ -64,7 +65,7 @@ benchmark() {

CUDA_VISIBLE_DEVICES=1 python3 \
-m vllm.entrypoints.openai.api_server \
--model meta-llama/Meta-Llama-3.1-8B-Instruct \
--model $model \
--port 8200 \
--max-model-len 10000 \
--gpu-memory-utilization 0.6 \
Expand All @@ -87,7 +88,7 @@ benchmark() {
--port 8100 \
--save-result \
--result-dir $results_folder \
--result-filename disagg_prefill_2xtp4.json \
--result-filename disagg_prefill_tp1.json \
--request-rate "inf"


Expand All @@ -105,7 +106,7 @@ benchmark() {
--port 8200 \
--save-result \
--result-dir $results_folder \
--result-filename disagg_prefill_2xtp4.json \
--result-filename disagg_prefill_tp1_overhead.json \
--request-rate "$qps"
kill_gpu_processes

Expand All @@ -118,7 +119,7 @@ main() {
(which jq) || (apt-get -y install jq)
(which socat) || (apt-get -y install socat)

pip install quart httpx
pip install quart httpx datasets

cd "$(dirname "$0")"

Expand Down
13 changes: 6 additions & 7 deletions benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh
Original file line number Diff line number Diff line change
@@ -1,13 +1,12 @@
#!/bin/bash

# Requirement: 8x H100 GPUs.
# Requirement: 2x GPUs.


# Model: neuralmagic/Meta-Llama-3-70B-Instruct-FP8-KV
# Query: 2048 input tokens, 11 output tokens, QPS 4, 500 requests
# Resource: 8x H100
# Model: meta-llama/Meta-Llama-3.1-8B-Instruct
# Query: 1024 input tokens, 6 output tokens, QPS 2/4/6/8, 100 requests
# Resource: 2x GPU
# Approaches:
# 1. Chunked prefill: 1 vllm instance with tp=8
# 2. Chunked prefill: 2 vllm instance with tp=4, equivalent to 1 tp=4 instance with QPS 4
# 3. Disaggregated prefill: 1 prefilling instance and 1 decoding instance
# Prefilling instance: max_output_token=1
Expand Down Expand Up @@ -114,7 +113,6 @@ benchmark() {
--request-rate "$qps"

sleep 2

}


Expand All @@ -123,8 +121,9 @@ main() {
(which wget && which curl) || (apt-get update && apt-get install -y wget curl)
(which jq) || (apt-get -y install jq)
(which socat) || (apt-get -y install socat)
(which lsof) || (apt-get -y install lsof)

pip install quart httpx matplotlib aiohttp
pip install quart httpx matplotlib aiohttp datasets

cd "$(dirname "$0")"

Expand Down
4 changes: 2 additions & 2 deletions tests/kv_transfer/test_lookup_buffer.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ def test_run(my_rank, buffer, device):
assert buffer.buffer_size == 0
assert len(buffer.buffer) == 0

print("Test run passed!")
print("My rank: %d, Test run passed!" % (my_rank))


def stress_test(my_rank, buf, device):
Expand Down Expand Up @@ -108,7 +108,7 @@ def stress_test(my_rank, buf, device):
else:
torch.distributed.send(torch.tensor([n]), 0)

print("Passed stress test!")
print("My rank: %d, Passed stress test!" % (my_rank))


if __name__ == "__main__":
Expand Down
9 changes: 7 additions & 2 deletions tests/kv_transfer/test_lookup_buffer.sh
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
#!/bin/bash
RANK=0 python test_lookup_buffer.py &
RANK=1 python test_lookup_buffer.py &
RANK=0 python3 test_lookup_buffer.py &
PID0=$!
RANK=1 python3 test_lookup_buffer.py &
PID1=$!

wait $PID0
wait $PID1
25 changes: 14 additions & 11 deletions tests/kv_transfer/test_send_recv.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,39 +10,42 @@


def test_run(my_rank, pipe):
print(f"rank {my_rank} test_run starts....")
# test run
x = torch.tensor([1]).to(pipe.device)
y = torch.tensor([[2., 3., 4., 8.]]).to(pipe.device)
if my_rank == 0:
pipe.send_tensor(x)
print("sent tensor x")
print(f"rank {my_rank} sent tensor x")
pipe.send_tensor(y)
print("sent tensor y")
print(f"rank {my_rank} sent tensor y")
x2 = pipe.recv_tensor()
print("received x2 = ", x2)
print(f"rank {my_rank} received x2 = ", x2)
y2 = pipe.recv_tensor()
print("received y2 = ", x2)
print(f"rank {my_rank} received y2 = ", x2)

else:
x2 = pipe.recv_tensor()
print("received x2 = ", x2)
print(f"rank {my_rank} received x2 = ", x2)
y2 = pipe.recv_tensor()
print("received y2 = ", x2)
print(f"rank {my_rank} received y2 = ", x2)
pipe.send_tensor(x)
print("sent tensor x")
print(f"rank {my_rank} sent tensor x")
pipe.send_tensor(y)
print("sent tensor y")
print(f"rank {my_rank} sent tensor y")

assert torch.allclose(x, x2)
assert torch.allclose(y, y2)

print(f"rank {my_rank} test_run passed!")

def stress_test(my_rank, pipe):

torch.distributed.barrier()
def stress_test(my_rank, pipe):
print(f"rank {my_rank} stress_test starts....")

tensors: List[torch.Tensor] = []

torch.distributed.barrier()
torch.manual_seed(0)

for i in tqdm(range(500)):
Expand Down Expand Up @@ -86,7 +89,6 @@ def stress_test(my_rank, pipe):


def latency_test(my_rank, pipe, nelement, ntensor):

latencies = []

torch.distributed.barrier()
Expand Down Expand Up @@ -149,6 +151,7 @@ def latency_test(my_rank, pipe, nelement, ntensor):
)

test_run(my_rank, pipe)

stress_test(my_rank, pipe)

# Use this function if you want to test the latency of pipe impl.
Expand Down
8 changes: 7 additions & 1 deletion tests/kv_transfer/test_send_recv.sh
Original file line number Diff line number Diff line change
@@ -1,3 +1,9 @@
#!/bin/bash

RANK=0 python3 test_send_recv.py &
RANK=1 python3 test_send_recv.py &
PID0=$!
RANK=1 python3 test_send_recv.py &
PID1=$!

wait $PID0
wait $PID1

0 comments on commit 216540a

Please sign in to comment.