[Misc] Update disaggregation benchmark scripts and test logs #11456

Merged
13 changes: 7 additions & 6 deletions benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh
@@ -10,7 +10,8 @@ set -ex
 
 kill_gpu_processes() {
   # kill all processes on GPU.
-  pkill -f pt_main_thread
+  pgrep pt_main_thread | xargs -r kill -9
+  pgrep python3 | xargs -r kill -9
   sleep 10
 
   # remove vllm config file
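
A note on the cleanup change above: the script runs under `set -ex`, and `pkill` exits nonzero when no process matches, which would abort the whole benchmark on an already-clean GPU. The `pgrep | xargs` pipeline avoids this because the pipeline's exit status is that of `xargs`, and GNU `xargs -r` (`--no-run-if-empty`) skips running `kill -9` entirely when there is nothing to kill. A minimal sketch of the failure mode and the fix:

    #!/bin/bash
    set -e

    # Fragile: pkill returns 1 when no process matched, and `set -e`
    # turns that nonzero status into a fatal error for the script.
    # pkill -f pt_main_thread

    # Robust: pgrep's nonzero status is masked by the pipe, and `-r`
    # makes xargs skip `kill -9` entirely on empty input.
    pgrep pt_main_thread | xargs -r kill -9
    echo "cleanup finished"
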
@@ -54,7 +55,7 @@ benchmark() {
 
   CUDA_VISIBLE_DEVICES=0 python3 \
     -m vllm.entrypoints.openai.api_server \
-    --model meta-llama/Meta-Llama-3.1-8B-Instruct \
+    --model $model \
     --port 8100 \
     --max-model-len 10000 \
     --gpu-memory-utilization 0.6 \
@@ -64,7 +65,7 @@
 
   CUDA_VISIBLE_DEVICES=1 python3 \
     -m vllm.entrypoints.openai.api_server \
-    --model meta-llama/Meta-Llama-3.1-8B-Instruct \
+    --model $model \
     --port 8200 \
     --max-model-len 10000 \
     --gpu-memory-utilization 0.6 \
@@ -87,7 +88,7 @@
     --port 8100 \
     --save-result \
     --result-dir $results_folder \
-    --result-filename disagg_prefill_2xtp4.json \
+    --result-filename disagg_prefill_tp1.json \
     --request-rate "inf"
 
 
@@ -105,7 +106,7 @@
     --port 8200 \
     --save-result \
     --result-dir $results_folder \
-    --result-filename disagg_prefill_2xtp4.json \
+    --result-filename disagg_prefill_tp1_overhead.json \
     --request-rate "$qps"
   kill_gpu_processes
 
@@ -118,7 +119,7 @@ main() {
   (which jq) || (apt-get -y install jq)
   (which socat) || (apt-get -y install socat)
 
-  pip install quart httpx
+  pip install quart httpx datasets
 
   cd "$(dirname "$0")"
 
13 changes: 6 additions & 7 deletions benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh
@@ -1,13 +1,12 @@
 #!/bin/bash
 
-# Requirement: 8x H100 GPUs.
+# Requirement: 2x GPUs.
 
 
-# Model: neuralmagic/Meta-Llama-3-70B-Instruct-FP8-KV
-# Query: 2048 input tokens, 11 output tokens, QPS 4, 500 requests
-# Resource: 8x H100
+# Model: meta-llama/Meta-Llama-3.1-8B-Instruct
+# Query: 1024 input tokens, 6 output tokens, QPS 2/4/6/8, 100 requests
+# Resource: 2x GPU
 # Approaches:
 # 1. Chunked prefill: 1 vllm instance with tp=8
-# 2. Chunked prefill: 2 vllm instance with tp=4, equivalent to 1 tp=4 instance with QPS 4
 # 3. Disaggregated prefill: 1 prefilling instance and 1 decoding instance
 #    Prefilling instance: max_output_token=1
@@ -114,7 +113,6 @@ benchmark() {
     --request-rate "$qps"
 
   sleep 2
-
 }
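
The header now advertises a QPS sweep (2/4/6/8) rather than a single rate. As a sketch of how such a sweep drives the `benchmark` function above (the real loop lives in the collapsed portion of `main()`, so the exact loop body here is an assumption):

    # Sweep the request rates listed in the header comment; each call
    # benchmarks one QPS point and writes its own result file.
    for qps in 2 4 6 8; do
      benchmark "$qps"
    done
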


@@ -123,8 +121,9 @@ main() {
   (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
   (which jq) || (apt-get -y install jq)
   (which socat) || (apt-get -y install socat)
+  (which lsof) || (apt-get -y install lsof)
 
-  pip install quart httpx matplotlib aiohttp
+  pip install quart httpx matplotlib aiohttp datasets
 
   cd "$(dirname "$0")"
 
4 changes: 2 additions & 2 deletions tests/kv_transfer/test_lookup_buffer.py
@@ -48,7 +48,7 @@ def test_run(my_rank, buffer, device):
     assert buffer.buffer_size == 0
     assert len(buffer.buffer) == 0
 
-    print("Test run passed!")
+    print("My rank: %d, Test run passed!" % (my_rank))
 
 
 def stress_test(my_rank, buf, device):
@@ -108,7 +108,7 @@ def stress_test(my_rank, buf, device):
     else:
         torch.distributed.send(torch.tensor([n]), 0)
 
-    print("Passed stress test!")
+    print("My rank: %d, Passed stress test!" % (my_rank))
 
 
 if __name__ == "__main__":
9 changes: 7 additions & 2 deletions tests/kv_transfer/test_lookup_buffer.sh
@@ -1,3 +1,8 @@
 #!/bin/bash
-RANK=0 python test_lookup_buffer.py &
-RANK=1 python test_lookup_buffer.py &
+RANK=0 python3 test_lookup_buffer.py &
+PID0=$!
+RANK=1 python3 test_lookup_buffer.py &
+PID1=$!
+
+wait $PID0
+wait $PID1
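
For context on the pattern introduced here (and mirrored in test_send_recv.sh below): `$!` holds the PID of the most recently launched background job, and `wait <pid>` blocks until that job exits and returns its exit status. The old version launched both ranks and returned immediately, always with status 0; the new version only succeeds if both ranks do. A minimal sketch of the semantics:

    #!/bin/bash

    false &      # a background job that fails
    PID=$!

    wait $PID    # blocks, then propagates the job's exit status
    echo "background job exited with status $?"   # prints 1
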
25 changes: 14 additions & 11 deletions tests/kv_transfer/test_send_recv.py
@@ -10,39 +10,42 @@
 
 
 def test_run(my_rank, pipe):
+    print(f"rank {my_rank} test_run starts....")
     # test run
     x = torch.tensor([1]).to(pipe.device)
     y = torch.tensor([[2., 3., 4., 8.]]).to(pipe.device)
     if my_rank == 0:
         pipe.send_tensor(x)
-        print("sent tensor x")
+        print(f"rank {my_rank} sent tensor x")
         pipe.send_tensor(y)
-        print("sent tensor y")
+        print(f"rank {my_rank} sent tensor y")
         x2 = pipe.recv_tensor()
-        print("received x2 = ", x2)
+        print(f"rank {my_rank} received x2 = ", x2)
         y2 = pipe.recv_tensor()
-        print("received y2 = ", x2)
+        print(f"rank {my_rank} received y2 = ", x2)
 
     else:
         x2 = pipe.recv_tensor()
-        print("received x2 = ", x2)
+        print(f"rank {my_rank} received x2 = ", x2)
         y2 = pipe.recv_tensor()
-        print("received y2 = ", x2)
+        print(f"rank {my_rank} received y2 = ", x2)
         pipe.send_tensor(x)
-        print("sent tensor x")
+        print(f"rank {my_rank} sent tensor x")
         pipe.send_tensor(y)
-        print("sent tensor y")
+        print(f"rank {my_rank} sent tensor y")
 
     assert torch.allclose(x, x2)
     assert torch.allclose(y, y2)
 
+    print(f"rank {my_rank} test_run passed!")
 
-def stress_test(my_rank, pipe):
-
-    torch.distributed.barrier()
+def stress_test(my_rank, pipe):
+    print(f"rank {my_rank} stress_test starts....")
 
     tensors: List[torch.Tensor] = []
 
+    torch.distributed.barrier()
     torch.manual_seed(0)
 
     for i in tqdm(range(500)):
@@ -86,7 +89,6 @@ def stress_test(my_rank, pipe):
 
 
 def latency_test(my_rank, pipe, nelement, ntensor):
-
     latencies = []
 
     torch.distributed.barrier()
@@ -149,6 +151,7 @@ def latency_test(my_rank, pipe, nelement, ntensor):
     )
 
     test_run(my_rank, pipe)
+
     stress_test(my_rank, pipe)
 
     # Use this function if you want to test the latency of pipe impl.
8 changes: 7 additions & 1 deletion tests/kv_transfer/test_send_recv.sh
@@ -1,3 +1,9 @@
 #!/bin/bash
+
 RANK=0 python3 test_send_recv.py &
-RANK=1 python3 test_send_recv.py &
+PID0=$!
+RANK=1 python3 test_send_recv.py &
+PID1=$!
+
+wait $PID0
+wait $PID1
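
Same two-rank pattern as test_lookup_buffer.sh above. One possible hardening, not part of this PR, is to stop as soon as either rank dies instead of waiting on each PID in order; with bash 4.3+ this can be done with `wait -n` (an assumption, shown only as a sketch):

    #!/bin/bash

    RANK=0 python3 test_send_recv.py &
    RANK=1 python3 test_send_recv.py &

    # `wait -n` returns when whichever job finishes first exits,
    # with that job's status, so the first failure aborts the script.
    wait -n || exit 1
    wait -n || exit 1
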