diff --git a/sglang/experiments/deepseek_v3.sh b/sglang/experiments/deepseek_v3.sh
index ab8f061..1b0efd9 100644
--- a/sglang/experiments/deepseek_v3.sh
+++ b/sglang/experiments/deepseek_v3.sh
@@ -61,28 +61,9 @@ docker run --gpus all \
 python3 -m sglang.bench_serving --backend sglang --dataset-name random --random-input 1 --random-output 512 --random-range-ratio 1 --num-prompts 1 --host 0.0.0.0 --port 40000 '
-# 8xH200 FP8/BF16
+# 8xH200/2x8xH200 FP8/BF16
 # Online
-python3 -m sglang.bench_serving --backend sglang --dataset-name random --random-range-ratio 1 --num-prompt 300 --request-rate 1 --random-input 1024 --random-output 1024 --output-file deepseek_v3_8xh200_online_output.jsonl
-python3 -m sglang.bench_serving --backend sglang --dataset-name random --random-range-ratio 1 --num-prompt 600 --request-rate 2 --random-input 1024 --random-output 1024 --output-file deepseek_v3_8xh200_online_output.jsonl
-python3 -m sglang.bench_serving --backend sglang --dataset-name random --random-range-ratio 1 --num-prompt 1200 --request-rate 4 --random-input 1024 --random-output 1024 --output-file deepseek_v3_8xh200_online_output.jsonl
-python3 -m sglang.bench_serving --backend sglang --dataset-name random --random-range-ratio 1 --num-prompt 2400 --request-rate 8 --random-input 1024 --random-output 1024 --output-file deepseek_v3_8xh200_online_output.jsonl
-
-# Offline
-python3 -m sglang.bench_serving --backend sglang --dataset-name random --random-range-ratio 1 --num-prompt 300 --random-input 1024 --random-output 1024 --output-file deepseek_v3_8xh200_offline_output.jsonl
-python3 -m sglang.bench_serving --backend sglang --dataset-name random --random-range-ratio 1 --num-prompt 600 --random-input 1024 --random-output 1024 --output-file deepseek_v3_8xh200_offline_output.jsonl
-python3 -m sglang.bench_serving --backend sglang --dataset-name random --random-range-ratio 1 --num-prompt 1200 --random-input 1024 --random-output 1024 --output-file deepseek_v3_8xh200_offline_output.jsonl
-python3 -m sglang.bench_serving --backend sglang --dataset-name random --random-range-ratio 1 --num-prompt 2400 --random-input 1024 --random-output 1024 --output-file deepseek_v3_8xh200_offline_output.jsonl
-
-# 2x8xH200 BF16
-# Online
-python3 -m sglang.bench_serving --backend sglang --dataset-name random --random-range-ratio 1 --num-prompt 300 --request-rate 1 --random-input 1024 --random-output 1024 --host 0.0.0.0 --port 40000 --output-file deepseek_v3_2x8xh200_online_output.jsonl
-python3 -m sglang.bench_serving --backend sglang --dataset-name random --random-range-ratio 1 --num-prompt 600 --request-rate 2 --random-input 1024 --random-output 1024 --host 0.0.0.0 --port 40000 --output-file deepseek_v3_2x8xh200_online_output.jsonl
-python3 -m sglang.bench_serving --backend sglang --dataset-name random --random-range-ratio 1 --num-prompt 1200 --request-rate 4 --random-input 1024 --random-output 1024 --host 0.0.0.0 --port 40000 --output-file deepseek_v3_2x8xh200_online_output.jsonl
-python3 -m sglang.bench_serving --backend sglang --dataset-name random --random-range-ratio 1 --num-prompt 2400 --request-rate 8 --random-input 1024 --random-output 1024 --host 0.0.0.0 --port 40000 --output-file deepseek_v3_2x8xh200_online_output.jsonl
-
-# Offline
-python3 -m sglang.bench_serving --backend sglang --dataset-name random --random-range-ratio 1 --num-prompt 300 --random-input 1024 --random-output 1024 --host 0.0.0.0 --port 40000 --output-file deepseek_v3_2x8xh200_offline_output.jsonl
-python3 -m sglang.bench_serving --backend sglang --dataset-name random --random-range-ratio 1 --num-prompt 600 --random-input 1024 --random-output 1024 --host 0.0.0.0 --port 40000 --output-file deepseek_v3_2x8xh200_offline_output.jsonl
-python3 -m sglang.bench_serving --backend sglang --dataset-name random --random-range-ratio 1 --num-prompt 1200 --random-input 1024 --random-output 1024 --host 0.0.0.0 --port 40000 --output-file deepseek_v3_2x8xh200_offline_output.jsonl
-python3 -m sglang.bench_serving --backend sglang --dataset-name random --random-range-ratio 1 --num-prompt 2400 --random-input 1024 --random-output 1024 --host 0.0.0.0 --port 40000 --output-file deepseek_v3_2x8xh200_offline_output.jsonl
+python3 -m sglang.bench_serving --backend sglang --dataset-name random --random-range-ratio 1 --num-prompts 300 --request-rate 1 --random-input 1024 --random-output 1024 --host 0.0.0.0 --port 40000 --output-file deepseek_v3_2x8xh200_FP8_online_output.jsonl
+python3 -m sglang.bench_serving --backend sglang --dataset-name random --random-range-ratio 1 --num-prompts 600 --request-rate 2 --random-input 1024 --random-output 1024 --host 0.0.0.0 --port 40000 --output-file deepseek_v3_2x8xh200_FP8_online_output.jsonl
+python3 -m sglang.bench_serving --backend sglang --dataset-name random --random-range-ratio 1 --num-prompts 1200 --request-rate 4 --random-input 1024 --random-output 1024 --host 0.0.0.0 --port 40000 --output-file deepseek_v3_2x8xh200_FP8_online_output.jsonl
+python3 -m sglang.bench_serving --backend sglang --dataset-name random --random-range-ratio 1 --num-prompts 2400 --request-rate 8 --random-input 1024 --random-output 1024 --host 0.0.0.0 --port 40000 --output-file deepseek_v3_2x8xh200_FP8_online_output.jsonl
 
 
diff --git a/sglang/experiments/deepseek_v3_2x8xh200_BF16_online_output.jsonl b/sglang/experiments/deepseek_v3_2x8xh200_BF16_online_output.jsonl
new file mode 100644
index 0000000..c90715b
--- /dev/null
+++ b/sglang/experiments/deepseek_v3_2x8xh200_BF16_online_output.jsonl
@@ -0,0 +1,4 @@
+{"backend": "sglang", "dataset_name": "random", "request_rate": 1.0, "max_concurrency": null, "total_input_tokens": 307200, "total_output_tokens": 307200, "total_output_tokens_retokenized": 306053, "mean_e2e_latency_ms": 968448.8521837055, "median_e2e_latency_ms": 971353.9656687062, "median_ttft_ms": 53189.53575310297, "median_itl_ms": 638.6785819195211, "output_throughput": 275.05934570759786, "sharegpt_output_len": null, "random_input_len": 1024, "random_output_len": 1024, "random_range_ratio": 1.0, "duration": 1116.849890010897, "completed": 300}
+{"backend": "sglang", "dataset_name": "random", "request_rate": 2.0, "max_concurrency": null, "total_input_tokens": 614400, "total_output_tokens": 614400, "total_output_tokens_retokenized": 612299, "mean_e2e_latency_ms": 2003883.862575565, "median_e2e_latency_ms": 2010951.2275049929, "median_ttft_ms": 313373.927626526, "median_itl_ms": 1192.372274119407, "output_throughput": 256.4982169510671, "sharegpt_output_len": null, "random_input_len": 1024, "random_output_len": 1024, "random_range_ratio": 1.0, "duration": 2395.338288519997, "completed": 600}
+{"backend": "sglang", "dataset_name": "random", "request_rate": 4.0, "max_concurrency": null, "total_input_tokens": 1228800, "total_output_tokens": 1228800, "total_output_tokens_retokenized": 1224692, "mean_e2e_latency_ms": 3206867.3097752165, "median_e2e_latency_ms": 3881082.652960904, "median_ttft_ms": 774460.7280562632, "median_itl_ms": 1178.4203723073006, "output_throughput": 255.4465169783954, "sharegpt_output_len": null, "random_input_len": 1024, "random_output_len": 1024, "random_range_ratio": 1.0, "duration": 4810.4002925353125, "completed": 1200}
+{"backend": "sglang", "dataset_name": "random", "request_rate": 8.0, "max_concurrency": null, "total_input_tokens": 2457600, "total_output_tokens": 2457600, "total_output_tokens_retokenized": 2449303, "mean_e2e_latency_ms": 6004940.752673052, "median_e2e_latency_ms": 6819185.607663356, "median_ttft_ms": 4072706.7238641903, "median_itl_ms": 1205.5958840064704, "output_throughput": 250.07723760043686, "sharegpt_output_len": null, "random_input_len": 1024, "random_output_len": 1024, "random_range_ratio": 1.0, "duration": 9827.363831995986, "completed": 2400}
diff --git a/sglang/experiments/deepseek_v3_2x8xh200_FP8_online_output.jsonl b/sglang/experiments/deepseek_v3_2x8xh200_FP8_online_output.jsonl
new file mode 100644
index 0000000..3e942bf
--- /dev/null
+++ b/sglang/experiments/deepseek_v3_2x8xh200_FP8_online_output.jsonl
@@ -0,0 +1,4 @@
+{"backend": "sglang", "dataset_name": "random", "request_rate": 1.0, "max_concurrency": null, "total_input_tokens": 307200, "total_output_tokens": 307200, "total_output_tokens_retokenized": 306092, "mean_e2e_latency_ms": 982681.0571394442, "median_e2e_latency_ms": 985610.623908462, "median_ttft_ms": 56824.06605547294, "median_itl_ms": 662.3261536005884, "output_throughput": 271.5959636429426, "sharegpt_output_len": null, "random_input_len": 1024, "random_output_len": 1024, "random_range_ratio": 1.0, "duration": 1131.0919200694188, "completed": 300}
+{"backend": "sglang", "dataset_name": "random", "request_rate": 2.0, "max_concurrency": null, "total_input_tokens": 614400, "total_output_tokens": 614400, "total_output_tokens_retokenized": 612142, "mean_e2e_latency_ms": 1978002.6884525253, "median_e2e_latency_ms": 1975371.9891069923, "median_ttft_ms": 305318.36949149147, "median_itl_ms": 1219.141379930079, "output_throughput": 288.41458420567164, "sharegpt_output_len": null, "random_input_len": 1024, "random_output_len": 1024, "random_range_ratio": 1.0, "duration": 2130.2667536460795, "completed": 600}
+{"backend": "sglang", "dataset_name": "random", "request_rate": 4.0, "max_concurrency": null, "total_input_tokens": 1228800, "total_output_tokens": 1228800, "total_output_tokens_retokenized": 1224515, "mean_e2e_latency_ms": 3929702.0734317033, "median_e2e_latency_ms": 3901390.298462007, "median_ttft_ms": 767082.1364489384, "median_itl_ms": 2189.8306920193136, "output_throughput": 269.19030517752464, "sharegpt_output_len": null, "random_input_len": 1024, "random_output_len": 1024, "random_range_ratio": 1.0, "duration": 4564.800352633931, "completed": 1200}
+{"backend": "sglang", "dataset_name": "random", "request_rate": 8.0, "max_concurrency": null, "total_input_tokens": 2457600, "total_output_tokens": 2457600, "total_output_tokens_retokenized": 2448836, "mean_e2e_latency_ms": 6079389.87389776, "median_e2e_latency_ms": 7374173.1368335895, "median_ttft_ms": 1680440.4092754703, "median_itl_ms": 2007.022154983133, "output_throughput": 276.74194728875386, "sharegpt_output_len": null, "random_input_len": 1024, "random_output_len": 1024, "random_range_ratio": 1.0, "duration": 8880.475201093126, "completed": 2400}
diff --git a/sglang/experiments/deepseek_v3_8xh200_BF16_online_output.jsonl b/sglang/experiments/deepseek_v3_8xh200_BF16_online_output.jsonl
new file mode 100644
index 0000000..acf89b1
--- /dev/null
+++ b/sglang/experiments/deepseek_v3_8xh200_BF16_online_output.jsonl
@@ -0,0 +1,4 @@
+{"backend": "sglang", "dataset_name": "random", "request_rate": 1.0, "max_concurrency": null, "total_input_tokens": 307200, "total_output_tokens": 307200, "total_output_tokens_retokenized": 306052, "mean_e2e_latency_ms": 219910.4881566499, "median_e2e_latency_ms": 214924.09367999062, "median_ttft_ms": 587.1520687360317, "median_itl_ms": 159.6419473644346, "output_throughput": 639.9948784795965, "sharegpt_output_len": null, "random_input_len": 1024, "random_output_len": 1024, "random_range_ratio": 1.0, "duration": 480.00384117104113, "completed": 300}
+{"backend": "sglang", "dataset_name": "random", "request_rate": 2.0, "max_concurrency": null, "total_input_tokens": 614400, "total_output_tokens": 614400, "total_output_tokens_retokenized": 612253, "mean_e2e_latency_ms": 235341.5755853096, "median_e2e_latency_ms": 235524.69775360078, "median_ttft_ms": 598.7704854924232, "median_itl_ms": 162.9884666763246, "output_throughput": 1313.7380027330087, "sharegpt_output_len": null, "random_input_len": 1024, "random_output_len": 1024, "random_range_ratio": 1.0, "duration": 467.67315760208294, "completed": 600}
+{"backend": "sglang", "dataset_name": "random", "request_rate": 4.0, "max_concurrency": null, "total_input_tokens": 1228800, "total_output_tokens": 1228800, "total_output_tokens_retokenized": 1224646, "mean_e2e_latency_ms": 321625.8439514022, "median_e2e_latency_ms": 324438.43806162477, "median_ttft_ms": 766.6953965090215, "median_itl_ms": 237.9868463613093, "output_throughput": 2378.259950851199, "sharegpt_output_len": null, "random_input_len": 1024, "random_output_len": 1024, "random_range_ratio": 1.0, "duration": 516.6802727179602, "completed": 1200}
+{"backend": "sglang", "dataset_name": "random", "request_rate": 8.0, "max_concurrency": null, "total_input_tokens": 2457600, "total_output_tokens": 2457600, "total_output_tokens_retokenized": 2449187, "mean_e2e_latency_ms": 654511.2723356115, "median_e2e_latency_ms": 686261.5671905223, "median_ttft_ms": 1191.7396115604788, "median_itl_ms": 255.96281047910452, "output_throughput": 2249.0334558295294, "sharegpt_output_len": null, "random_input_len": 1024, "random_output_len": 1024, "random_range_ratio": 1.0, "duration": 1092.736078971997, "completed": 2400}
diff --git a/sglang/experiments/deepseek_v3_8xh200_FP8_online_output.jsonl b/sglang/experiments/deepseek_v3_8xh200_FP8_online_output.jsonl
new file mode 100644
index 0000000..9e0c87c
--- /dev/null
+++ b/sglang/experiments/deepseek_v3_8xh200_FP8_online_output.jsonl
@@ -0,0 +1,4 @@
+{"backend": "sglang", "dataset_name": "random", "request_rate": 1.0, "max_concurrency": null, "total_input_tokens": 307200, "total_output_tokens": 307200, "total_output_tokens_retokenized": 306153, "mean_e2e_latency_ms": 139395.84098776494, "median_e2e_latency_ms": 147735.42626714334, "median_ttft_ms": 563.4104600176215, "median_itl_ms": 101.78019991144538, "output_throughput": 773.1474288758621, "sharegpt_output_len": null, "random_input_len": 1024, "random_output_len": 1024, "random_range_ratio": 1.0, "duration": 397.33689659507945, "completed": 300}
+{"backend": "sglang", "dataset_name": "random", "request_rate": 2.0, "max_concurrency": null, "total_input_tokens": 614400, "total_output_tokens": 614400, "total_output_tokens_retokenized": 612306, "mean_e2e_latency_ms": 227131.108927244, "median_e2e_latency_ms": 234757.1316829417, "median_ttft_ms": 684.3277416191995, "median_itl_ms": 149.458127329126, "output_throughput": 1401.770501155294, "sharegpt_output_len": null, "random_input_len": 1024, "random_output_len": 1024, "random_range_ratio": 1.0, "duration": 438.3028459320776, "completed": 600}
+{"backend": "sglang", "dataset_name": "random", "request_rate": 4.0, "max_concurrency": null, "total_input_tokens": 1228800, "total_output_tokens": 1228800, "total_output_tokens_retokenized": 1224403, "mean_e2e_latency_ms": 370518.6826122479, "median_e2e_latency_ms": 376040.6724580098, "median_ttft_ms": 865.2611614670604, "median_itl_ms": 287.9461294505745, "output_throughput": 2214.7587909169233, "sharegpt_output_len": null, "random_input_len": 1024, "random_output_len": 1024, "random_range_ratio": 1.0, "duration": 554.8233988457359, "completed": 1200}
+{"backend": "sglang", "dataset_name": "random", "request_rate": 8.0, "max_concurrency": null, "total_input_tokens": 2457600, "total_output_tokens": 2457600, "total_output_tokens_retokenized": 2449246, "mean_e2e_latency_ms": 687402.8331566683, "median_e2e_latency_ms": 692710.8259119559, "median_ttft_ms": 1358.7704463861883, "median_itl_ms": 515.1780359447002, "output_throughput": 2864.307071093772, "sharegpt_output_len": null, "random_input_len": 1024, "random_output_len": 1024, "random_range_ratio": 1.0, "duration": 858.0085650738329, "completed": 2400}
diff --git a/sglang/experiments/deepseek_v3_bf16_2x8xh200_log_output.txt b/sglang/experiments/deepseek_v3_bf16_2x8xh200_log_output.txt
new file mode 100644
index 0000000..6d366da
--- /dev/null
+++ b/sglang/experiments/deepseek_v3_bf16_2x8xh200_log_output.txt
@@ -0,0 +1,152 @@
+Failed to fetch model from http://0.0.0.0:30000/v1/models. Error: HTTPConnectionPool(host='0.0.0.0', port=30000): Max retries exceeded with url: /v1/models (Caused by NewConnectionError(': Failed to establish a new connection: [Errno 111] Connection refused'))
+Please specify the correct host and port using `--host` and `--port`.
+Failed to fetch model from http://0.0.0.0:30000/v1/models. Error: HTTPConnectionPool(host='0.0.0.0', port=30000): Max retries exceeded with url: /v1/models (Caused by NewConnectionError(': Failed to establish a new connection: [Errno 111] Connection refused'))
+Please specify the correct host and port using `--host` and `--port`.
+Failed to fetch model from http://0.0.0.0:30000/v1/models. Error: HTTPConnectionPool(host='0.0.0.0', port=30000): Max retries exceeded with url: /v1/models (Caused by NewConnectionError(': Failed to establish a new connection: [Errno 111] Connection refused'))
+Please specify the correct host and port using `--host` and `--port`.
+Failed to fetch model from http://0.0.0.0:30000/v1/models. Error: HTTPConnectionPool(host='0.0.0.0', port=30000): Max retries exceeded with url: /v1/models (Caused by NewConnectionError(': Failed to establish a new connection: [Errno 111] Connection refused'))
+Please specify the correct host and port using `--host` and `--port`.
+Namespace(backend='sglang', base_url=None, host='0.0.0.0', port=40000, dataset_name='random', dataset_path='', model='deepseek-ai/DeepSeek-V3', tokenizer=None, num_prompts=300, sharegpt_output_len=None, random_input_len=1024, random_output_len=1024, random_range_ratio=1.0, request_rate=1.0, max_concurrency=None, seed=1, multi=False, request_rate_range='2,34,2', output_file='deepseek_v3_2xh200_BF16_online_output.jsonl', disable_tqdm=False, disable_stream=False, disable_ignore_eos=False, return_logprob=False, extra_request_body=None, gen_num_groups=64, gen_prompts_per_group=16, gen_system_prompt_len=2048, gen_question_len=128, gen_output_len=256, profile=False, lora_name=None)
+
+#Input tokens: 307200
+#Output tokens: 307200
+Starting initial single prompt test run...
+Initial test run completed. Starting main benchmark run...
+
+============ Serving Benchmark Result ============
+Backend: sglang
+Traffic request rate: 1.0
+Max reqeuest concurrency: not set
+Successful requests: 300
+Benchmark duration (s): 1116.85
+Total input tokens: 307200
+Total generated tokens: 307200
+Total generated tokens (retokenized): 306053
+Request throughput (req/s): 0.27
+Input token throughput (tok/s): 275.06
+Output token throughput (tok/s): 275.06
+Total token throughput (tok/s): 550.12
+----------------End-to-End Latency----------------
+Mean E2E Latency (ms): 968448.85
+Median E2E Latency (ms): 971353.97
+---------------Time to First Token----------------
+Mean TTFT (ms): 105080.04
+Median TTFT (ms): 53189.54
+P99 TTFT (ms): 251466.03
+-----Time per Output Token (excl. 1st token)------
+Mean TPOT (ms): 843.96
+Median TPOT (ms): 843.03
+P99 TPOT (ms): 1070.14
+---------------Inter-token Latency----------------
+Mean ITL (ms): 843.96
+Median ITL (ms): 638.68
+P99 ITL (ms): 708.01
+==================================================
+Namespace(backend='sglang', base_url=None, host='0.0.0.0', port=40000, dataset_name='random', dataset_path='', model='deepseek-ai/DeepSeek-V3', tokenizer=None, num_prompts=600, sharegpt_output_len=None, random_input_len=1024, random_output_len=1024, random_range_ratio=1.0, request_rate=2.0, max_concurrency=None, seed=1, multi=False, request_rate_range='2,34,2', output_file='deepseek_v3_2x8xh200_BF16_online_output.jsonl', disable_tqdm=False, disable_stream=False, disable_ignore_eos=False, return_logprob=False, extra_request_body=None, gen_num_groups=64, gen_prompts_per_group=16, gen_system_prompt_len=2048, gen_question_len=128, gen_output_len=256, profile=False, lora_name=None)
+
+#Input tokens: 614400
+#Output tokens: 614400
+Starting initial single prompt test run...
+Initial test run completed. Starting main benchmark run...
+
+============ Serving Benchmark Result ============
+Backend: sglang
+Traffic request rate: 2.0
+Max reqeuest concurrency: not set
+Successful requests: 600
+Benchmark duration (s): 2395.34
+Total input tokens: 614400
+Total generated tokens: 614400
+Total generated tokens (retokenized): 612299
+Request throughput (req/s): 0.25
+Input token throughput (tok/s): 256.50
+Output token throughput (tok/s): 256.50
+Total token throughput (tok/s): 513.00
+----------------End-to-End Latency----------------
+Mean E2E Latency (ms): 2003883.86
+Median E2E Latency (ms): 2010951.23
+---------------Time to First Token----------------
+Mean TTFT (ms): 317480.50
+Median TTFT (ms): 313373.93
+P99 TTFT (ms): 628073.04
+-----Time per Output Token (excl. 1st token)------
+Mean TPOT (ms): 1648.49
+Median TPOT (ms): 1622.07
+P99 TPOT (ms): 2054.30
+---------------Inter-token Latency----------------
+Mean ITL (ms): 1648.32
+Median ITL (ms): 1192.37
+P99 ITL (ms): 1525.58
+==================================================
+Namespace(backend='sglang', base_url=None, host='0.0.0.0', port=40000, dataset_name='random', dataset_path='', model='deepseek-ai/DeepSeek-V3', tokenizer=None, num_prompts=1200, sharegpt_output_len=None, random_input_len=1024, random_output_len=1024, random_range_ratio=1.0, request_rate=4.0, max_concurrency=None, seed=1, multi=False, request_rate_range='2,34,2', output_file='deepseek_v3_2x8xh200_BF16_online_output.jsonl', disable_tqdm=False, disable_stream=False, disable_ignore_eos=False, return_logprob=False, extra_request_body=None, gen_num_groups=64, gen_prompts_per_group=16, gen_system_prompt_len=2048, gen_question_len=128, gen_output_len=256, profile=False, lora_name=None)
+
+#Input tokens: 1228800
+#Output tokens: 1228800
+Starting initial single prompt test run...
+Initial test run completed. Starting main benchmark run...
+
+============ Serving Benchmark Result ============
+Backend: sglang
+Traffic request rate: 4.0
+Max reqeuest concurrency: not set
+Successful requests: 1200
+Benchmark duration (s): 4810.40
+Total input tokens: 1228800
+Total generated tokens: 1228800
+Total generated tokens (retokenized): 1224692
+Request throughput (req/s): 0.25
+Input token throughput (tok/s): 255.45
+Output token throughput (tok/s): 255.45
+Total token throughput (tok/s): 510.89
+----------------End-to-End Latency----------------
+Mean E2E Latency (ms): 3206867.31
+Median E2E Latency (ms): 3881082.65
+---------------Time to First Token----------------
+Mean TTFT (ms): 1426498.17
+Median TTFT (ms): 774460.73
+P99 TTFT (ms): 3980643.34
+-----Time per Output Token (excl. 1st token)------
+Mean TPOT (ms): 1740.34
+Median TPOT (ms): 1645.51
+P99 TPOT (ms): 3600.89
+---------------Inter-token Latency----------------
+Mean ITL (ms): 1740.23
+Median ITL (ms): 1178.42
+P99 ITL (ms): 1608.58
+==================================================
+Namespace(backend='sglang', base_url=None, host='0.0.0.0', port=40000, dataset_name='random', dataset_path='', model='deepseek-ai/DeepSeek-V3', tokenizer=None, num_prompts=2400, sharegpt_output_len=None, random_input_len=1024, random_output_len=1024, random_range_ratio=1.0, request_rate=8.0, max_concurrency=None, seed=1, multi=False, request_rate_range='2,34,2', output_file='deepseek_v3_2x8xh200_BF16_online_output.jsonl', disable_tqdm=False, disable_stream=False, disable_ignore_eos=False, return_logprob=False, extra_request_body=None, gen_num_groups=64, gen_prompts_per_group=16, gen_system_prompt_len=2048, gen_question_len=128, gen_output_len=256, profile=False, lora_name=None)
+
+#Input tokens: 2457600
+#Output tokens: 2457600
+Starting initial single prompt test run...
+Initial test run completed. Starting main benchmark run...
+
+============ Serving Benchmark Result ============
+Backend: sglang
+Traffic request rate: 8.0
+Max reqeuest concurrency: not set
+Successful requests: 2400
+Benchmark duration (s): 9827.36
+Total input tokens: 2457600
+Total generated tokens: 2457600
+Total generated tokens (retokenized): 2449303
+Request throughput (req/s): 0.24
+Input token throughput (tok/s): 250.08
+Output token throughput (tok/s): 250.08
+Total token throughput (tok/s): 500.15
+----------------End-to-End Latency----------------
+Mean E2E Latency (ms): 6004940.75
+Median E2E Latency (ms): 6819185.61
+---------------Time to First Token----------------
+Mean TTFT (ms): 3356919.45
+Median TTFT (ms): 4072706.72
+P99 TTFT (ms): 7107066.15
+-----Time per Output Token (excl. 1st token)------
+Mean TPOT (ms): 2588.49
+Median TPOT (ms): 2239.22
+P99 TPOT (ms): 7387.83
+---------------Inter-token Latency----------------
+Mean ITL (ms): 2587.96
+Median ITL (ms): 1205.60
+P99 ITL (ms): 8271.60
+==================================================
diff --git a/sglang/experiments/deepseek_v3_bf16_8xh200_log_output.txt b/sglang/experiments/deepseek_v3_bf16_8xh200_log_output.txt
new file mode 100644
index 0000000..8a9873a
--- /dev/null
+++ b/sglang/experiments/deepseek_v3_bf16_8xh200_log_output.txt
@@ -0,0 +1,144 @@
+Namespace(backend='sglang', base_url=None, host='0.0.0.0', port=30000, dataset_name='random', dataset_path='', model='deepseek-ai/DeepSeek-V3', tokenizer=None, num_prompts=300, sharegpt_output_len=None, random_input_len=1024, random_output_len=1024, random_range_ratio=1.0, request_rate=1.0, max_concurrency=None, seed=1, multi=False, request_rate_range='2,34,2', output_file='deepseek_v3_8xh200_BF16_online_output.jsonl', disable_tqdm=False, disable_stream=False, disable_ignore_eos=False, return_logprob=False, extra_request_body=None, gen_num_groups=64, gen_prompts_per_group=16, gen_system_prompt_len=2048, gen_question_len=128, gen_output_len=256, profile=False, lora_name=None)
+
+#Input tokens: 307200
+#Output tokens: 307200
+Starting initial single prompt test run...
+Initial test run completed. Starting main benchmark run...
+
+============ Serving Benchmark Result ============
+Backend: sglang
+Traffic request rate: 1.0
+Max reqeuest concurrency: not set
+Successful requests: 300
+Benchmark duration (s): 480.00
+Total input tokens: 307200
+Total generated tokens: 307200
+Total generated tokens (retokenized): 306052
+Request throughput (req/s): 0.62
+Input token throughput (tok/s): 639.99
+Output token throughput (tok/s): 639.99
+Total token throughput (tok/s): 1279.99
+----------------End-to-End Latency----------------
+Mean E2E Latency (ms): 219910.49
+Median E2E Latency (ms): 214924.09
+---------------Time to First Token----------------
+Mean TTFT (ms): 1484.08
+Median TTFT (ms): 587.15
+P99 TTFT (ms): 10167.11
+-----Time per Output Token (excl. 1st token)------
+Mean TPOT (ms): 213.52
+Median TPOT (ms): 209.48
+P99 TPOT (ms): 271.65
+---------------Inter-token Latency----------------
+Mean ITL (ms): 213.52
+Median ITL (ms): 159.64
+P99 ITL (ms): 907.22
+==================================================
+Namespace(backend='sglang', base_url=None, host='0.0.0.0', port=30000, dataset_name='random', dataset_path='', model='deepseek-ai/DeepSeek-V3', tokenizer=None, num_prompts=600, sharegpt_output_len=None, random_input_len=1024, random_output_len=1024, random_range_ratio=1.0, request_rate=2.0, max_concurrency=None, seed=1, multi=False, request_rate_range='2,34,2', output_file='deepseek_v3_8xh200_BF16_online_output.jsonl', disable_tqdm=False, disable_stream=False, disable_ignore_eos=False, return_logprob=False, extra_request_body=None, gen_num_groups=64, gen_prompts_per_group=16, gen_system_prompt_len=2048, gen_question_len=128, gen_output_len=256, profile=False, lora_name=None)
+
+#Input tokens: 614400
+#Output tokens: 614400
+Starting initial single prompt test run...
+Initial test run completed. Starting main benchmark run...
+
+============ Serving Benchmark Result ============
+Backend: sglang
+Traffic request rate: 2.0
+Max reqeuest concurrency: not set
+Successful requests: 600
+Benchmark duration (s): 467.67
+Total input tokens: 614400
+Total generated tokens: 614400
+Total generated tokens (retokenized): 612253
+Request throughput (req/s): 1.28
+Input token throughput (tok/s): 1313.74
+Output token throughput (tok/s): 1313.74
+Total token throughput (tok/s): 2627.48
+----------------End-to-End Latency----------------
+Mean E2E Latency (ms): 235341.58
+Median E2E Latency (ms): 235524.70
+---------------Time to First Token----------------
+Mean TTFT (ms): 652.11
+Median TTFT (ms): 598.77
+P99 TTFT (ms): 1338.42
+-----Time per Output Token (excl. 1st token)------
+Mean TPOT (ms): 229.41
+Median TPOT (ms): 229.30
+P99 TPOT (ms): 296.47
+---------------Inter-token Latency----------------
+Mean ITL (ms): 229.42
+Median ITL (ms): 162.99
+P99 ITL (ms): 922.06
+==================================================
+Namespace(backend='sglang', base_url=None, host='0.0.0.0', port=30000, dataset_name='random', dataset_path='', model='deepseek-ai/DeepSeek-V3', tokenizer=None, num_prompts=1200, sharegpt_output_len=None, random_input_len=1024, random_output_len=1024, random_range_ratio=1.0, request_rate=4.0, max_concurrency=None, seed=1, multi=False, request_rate_range='2,34,2', output_file='deepseek_v3_8xh200_BF16_online_output.jsonl', disable_tqdm=False, disable_stream=False, disable_ignore_eos=False, return_logprob=False, extra_request_body=None, gen_num_groups=64, gen_prompts_per_group=16, gen_system_prompt_len=2048, gen_question_len=128, gen_output_len=256, profile=False, lora_name=None)
+
+#Input tokens: 1228800
+#Output tokens: 1228800
+Starting initial single prompt test run...
+Initial test run completed. Starting main benchmark run...
+
+============ Serving Benchmark Result ============
+Backend: sglang
+Traffic request rate: 4.0
+Max reqeuest concurrency: not set
+Successful requests: 1200
+Benchmark duration (s): 516.68
+Total input tokens: 1228800
+Total generated tokens: 1228800
+Total generated tokens (retokenized): 1224646
+Request throughput (req/s): 2.32
+Input token throughput (tok/s): 2378.26
+Output token throughput (tok/s): 2378.26
+Total token throughput (tok/s): 4756.52
+----------------End-to-End Latency----------------
+Mean E2E Latency (ms): 321625.84
+Median E2E Latency (ms): 324438.44
+---------------Time to First Token----------------
+Mean TTFT (ms): 790.54
+Median TTFT (ms): 766.70
+P99 TTFT (ms): 1631.13
+-----Time per Output Token (excl. 1st token)------
+Mean TPOT (ms): 313.62
+Median TPOT (ms): 316.35
+P99 TPOT (ms): 404.28
+---------------Inter-token Latency----------------
+Mean ITL (ms): 313.63
+Median ITL (ms): 237.99
+P99 ITL (ms): 1125.06
+==================================================
+Namespace(backend='sglang', base_url=None, host='0.0.0.0', port=30000, dataset_name='random', dataset_path='', model='deepseek-ai/DeepSeek-V3', tokenizer=None, num_prompts=2400, sharegpt_output_len=None, random_input_len=1024, random_output_len=1024, random_range_ratio=1.0, request_rate=8.0, max_concurrency=None, seed=1, multi=False, request_rate_range='2,34,2', output_file='deepseek_v3_8xh200_BF16_online_output.jsonl', disable_tqdm=False, disable_stream=False, disable_ignore_eos=False, return_logprob=False, extra_request_body=None, gen_num_groups=64, gen_prompts_per_group=16, gen_system_prompt_len=2048, gen_question_len=128, gen_output_len=256, profile=False, lora_name=None)
+
+#Input tokens: 2457600
+#Output tokens: 2457600
+Starting initial single prompt test run...
+Initial test run completed. Starting main benchmark run...
+
+============ Serving Benchmark Result ============
+Backend: sglang
+Traffic request rate: 8.0
+Max reqeuest concurrency: not set
+Successful requests: 2400
+Benchmark duration (s): 1092.74
+Total input tokens: 2457600
+Total generated tokens: 2457600
+Total generated tokens (retokenized): 2449187
+Request throughput (req/s): 2.20
+Input token throughput (tok/s): 2249.03
+Output token throughput (tok/s): 2249.03
+Total token throughput (tok/s): 4498.07
+----------------End-to-End Latency----------------
+Mean E2E Latency (ms): 654511.27
+Median E2E Latency (ms): 686261.57
+---------------Time to First Token----------------
+Mean TTFT (ms): 96306.56
+Median TTFT (ms): 1191.74
+P99 TTFT (ms): 471552.20
+-----Time per Output Token (excl. 1st token)------
+Mean TPOT (ms): 545.65
+Median TPOT (ms): 516.67
+P99 TPOT (ms): 832.97
+---------------Inter-token Latency----------------
+Mean ITL (ms): 545.71
+Median ITL (ms): 255.96
+P99 ITL (ms): 4197.25
+==================================================
diff --git a/sglang/experiments/deepseek_v3_fp8_2x8xh200_log_output.txt b/sglang/experiments/deepseek_v3_fp8_2x8xh200_log_output.txt
new file mode 100644
index 0000000..c7d51e7
--- /dev/null
+++ b/sglang/experiments/deepseek_v3_fp8_2x8xh200_log_output.txt
@@ -0,0 +1,145 @@
+nohup: ignoring input
+Namespace(backend='sglang', base_url=None, host='0.0.0.0', port=40000, dataset_name='random', dataset_path='', model='deepseek-ai/DeepSeek-V3', tokenizer=None, num_prompts=300, sharegpt_output_len=None, random_input_len=1024, random_output_len=1024, random_range_ratio=1.0, request_rate=1.0, max_concurrency=None, seed=1, multi=False, request_rate_range='2,34,2', output_file='deepseek_v3_2x8h200_FP8_online_output.jsonl', disable_tqdm=False, disable_stream=False, disable_ignore_eos=False, return_logprob=False, extra_request_body=None, gen_num_groups=64, gen_prompts_per_group=16, gen_system_prompt_len=2048, gen_question_len=128, gen_output_len=256, profile=False, lora_name=None)
+
+#Input tokens: 307200
+#Output tokens: 307200
+Starting initial single prompt test run...
+Initial test run completed. Starting main benchmark run...
+
+============ Serving Benchmark Result ============
+Backend: sglang
+Traffic request rate: 1.0
+Max reqeuest concurrency: not set
+Successful requests: 300
+Benchmark duration (s): 1131.09
+Total input tokens: 307200
+Total generated tokens: 307200
+Total generated tokens (retokenized): 306092
+Request throughput (req/s): 0.27
+Input token throughput (tok/s): 271.60
+Output token throughput (tok/s): 271.60
+Total token throughput (tok/s): 543.19
+----------------End-to-End Latency----------------
+Mean E2E Latency (ms): 982681.06
+Median E2E Latency (ms): 985610.62
+---------------Time to First Token----------------
+Mean TTFT (ms): 99781.93
+Median TTFT (ms): 56824.07
+P99 TTFT (ms): 244007.03
+-----Time per Output Token (excl. 1st token)------
+Mean TPOT (ms): 863.05
+Median TPOT (ms): 862.84
+P99 TPOT (ms): 1084.94
+---------------Inter-token Latency----------------
+Mean ITL (ms): 863.05
+Median ITL (ms): 662.33
+P99 ITL (ms): 695.39
+==================================================
+Namespace(backend='sglang', base_url=None, host='0.0.0.0', port=40000, dataset_name='random', dataset_path='', model='deepseek-ai/DeepSeek-V3', tokenizer=None, num_prompts=600, sharegpt_output_len=None, random_input_len=1024, random_output_len=1024, random_range_ratio=1.0, request_rate=2.0, max_concurrency=None, seed=1, multi=False, request_rate_range='2,34,2', output_file='deepseek_v3_2x8xh200_FP8_online_output.jsonl', disable_tqdm=False, disable_stream=False, disable_ignore_eos=False, return_logprob=False, extra_request_body=None, gen_num_groups=64, gen_prompts_per_group=16, gen_system_prompt_len=2048, gen_question_len=128, gen_output_len=256, profile=False, lora_name=None)
+
+#Input tokens: 614400
+#Output tokens: 614400
+Starting initial single prompt test run...
+Initial test run completed. Starting main benchmark run...
+
+============ Serving Benchmark Result ============
+Backend: sglang
+Traffic request rate: 2.0
+Max reqeuest concurrency: not set
+Successful requests: 600
+Benchmark duration (s): 2130.27
+Total input tokens: 614400
+Total generated tokens: 614400
+Total generated tokens (retokenized): 612142
+Request throughput (req/s): 0.28
+Input token throughput (tok/s): 288.41
+Output token throughput (tok/s): 288.41
+Total token throughput (tok/s): 576.83
+----------------End-to-End Latency----------------
+Mean E2E Latency (ms): 1978002.69
+Median E2E Latency (ms): 1975371.99
+---------------Time to First Token----------------
+Mean TTFT (ms): 309169.92
+Median TTFT (ms): 305318.37
+P99 TTFT (ms): 609895.40
+-----Time per Output Token (excl. 1st token)------
+Mean TPOT (ms): 1631.31
+Median TPOT (ms): 1632.35
+P99 TPOT (ms): 2057.38
+---------------Inter-token Latency----------------
+Mean ITL (ms): 1631.34
+Median ITL (ms): 1219.14
+P99 ITL (ms): 1537.46
+==================================================
+Namespace(backend='sglang', base_url=None, host='0.0.0.0', port=40000, dataset_name='random', dataset_path='', model='deepseek-ai/DeepSeek-V3', tokenizer=None, num_prompts=1200, sharegpt_output_len=None, random_input_len=1024, random_output_len=1024, random_range_ratio=1.0, request_rate=4.0, max_concurrency=None, seed=1, multi=False, request_rate_range='2,34,2', output_file='deepseek_v3_2x8xh200_FP8_online_output.jsonl', disable_tqdm=False, disable_stream=False, disable_ignore_eos=False, return_logprob=False, extra_request_body=None, gen_num_groups=64, gen_prompts_per_group=16, gen_system_prompt_len=2048, gen_question_len=128, gen_output_len=256, profile=False, lora_name=None)
+
+#Input tokens: 1228800
+#Output tokens: 1228800
+Starting initial single prompt test run...
+Initial test run completed. Starting main benchmark run...
+
+============ Serving Benchmark Result ============
+Backend: sglang
+Traffic request rate: 4.0
+Max reqeuest concurrency: not set
+Successful requests: 1200
+Benchmark duration (s): 4564.80
+Total input tokens: 1228800
+Total generated tokens: 1228800
+Total generated tokens (retokenized): 1224515
+Request throughput (req/s): 0.26
+Input token throughput (tok/s): 269.19
+Output token throughput (tok/s): 269.19
+Total token throughput (tok/s): 538.38
+----------------End-to-End Latency----------------
+Mean E2E Latency (ms): 3929702.07
+Median E2E Latency (ms): 3901390.30
+---------------Time to First Token----------------
+Mean TTFT (ms): 767128.52
+Median TTFT (ms): 767082.14
+P99 TTFT (ms): 1504428.26
+-----Time per Output Token (excl. 1st token)------
+Mean TPOT (ms): 3091.47
+Median TPOT (ms): 3023.99
+P99 TPOT (ms): 3886.39
+---------------Inter-token Latency----------------
+Mean ITL (ms): 3091.12
+Median ITL (ms): 2189.83
+P99 ITL (ms): 2596.82
+==================================================
+Namespace(backend='sglang', base_url=None, host='0.0.0.0', port=40000, dataset_name='random', dataset_path='', model='deepseek-ai/DeepSeek-V3', tokenizer=None, num_prompts=2400, sharegpt_output_len=None, random_input_len=1024, random_output_len=1024, random_range_ratio=1.0, request_rate=8.0, max_concurrency=None, seed=1, multi=False, request_rate_range='2,34,2', output_file='deepseek_v3_2x8xh200_FP8_online_output.jsonl', disable_tqdm=False, disable_stream=False, disable_ignore_eos=False, return_logprob=False, extra_request_body=None, gen_num_groups=64, gen_prompts_per_group=16, gen_system_prompt_len=2048, gen_question_len=128, gen_output_len=256, profile=False, lora_name=None)
+
+#Input tokens: 2457600
+#Output tokens: 2457600
+Starting initial single prompt test run...
+Initial test run completed. Starting main benchmark run...
+
+============ Serving Benchmark Result ============
+Backend: sglang
+Traffic request rate: 8.0
+Max reqeuest concurrency: not set
+Successful requests: 2400
+Benchmark duration (s): 8880.48
+Total input tokens: 2457600
+Total generated tokens: 2457600
+Total generated tokens (retokenized): 2448836
+Request throughput (req/s): 0.27
+Input token throughput (tok/s): 276.74
+Output token throughput (tok/s): 276.74
+Total token throughput (tok/s): 553.48
+----------------End-to-End Latency----------------
+Mean E2E Latency (ms): 6079389.87
+Median E2E Latency (ms): 7374173.14
+---------------Time to First Token----------------
+Mean TTFT (ms): 2858184.95
+Median TTFT (ms): 1680440.41
+P99 TTFT (ms): 7511052.50
+-----Time per Output Token (excl. 1st token)------
+Mean TPOT (ms): 3148.78
+Median TPOT (ms): 2974.87
+P99 TPOT (ms): 6686.54
+---------------Inter-token Latency----------------
+Mean ITL (ms): 3148.57
+Median ITL (ms): 2007.02
+P99 ITL (ms): 2745.71
+==================================================
diff --git a/sglang/experiments/deepseek_v3_fp8_8xh200_log_output.txt b/sglang/experiments/deepseek_v3_fp8_8xh200_log_output.txt
new file mode 100644
index 0000000..a41e219
--- /dev/null
+++ b/sglang/experiments/deepseek_v3_fp8_8xh200_log_output.txt
@@ -0,0 +1,144 @@
+Namespace(backend='sglang', base_url=None, host='0.0.0.0', port=30000, dataset_name='random', dataset_path='', model='deepseek-ai/DeepSeek-V3', tokenizer=None, num_prompts=300, sharegpt_output_len=None, random_input_len=1024, random_output_len=1024, random_range_ratio=1.0, request_rate=1.0, max_concurrency=None, seed=1, multi=False, request_rate_range='2,34,2', output_file='deepseek_v3_8xh200_FP8_online_output.jsonl', disable_tqdm=False, disable_stream=False, disable_ignore_eos=False, return_logprob=False, extra_request_body=None, gen_num_groups=64, gen_prompts_per_group=16, gen_system_prompt_len=2048, gen_question_len=128, gen_output_len=256, profile=False, lora_name=None)
+
+#Input tokens: 307200
+#Output tokens: 307200
+Starting initial single prompt test run...
+Initial test run completed. Starting main benchmark run...
+
+============ Serving Benchmark Result ============
+Backend: sglang
+Traffic request rate: 1.0
+Max reqeuest concurrency: not set
+Successful requests: 300
+Benchmark duration (s): 397.34
+Total input tokens: 307200
+Total generated tokens: 307200
+Total generated tokens (retokenized): 306153
+Request throughput (req/s): 0.76
+Input token throughput (tok/s): 773.15
+Output token throughput (tok/s): 773.15
+Total token throughput (tok/s): 1546.29
+----------------End-to-End Latency----------------
+Mean E2E Latency (ms): 139395.84
+Median E2E Latency (ms): 147735.43
+---------------Time to First Token----------------
+Mean TTFT (ms): 629.85
+Median TTFT (ms): 563.41
+P99 TTFT (ms): 1184.81
+-----Time per Output Token (excl. 1st token)------
+Mean TPOT (ms): 135.65
+Median TPOT (ms): 143.71
+P99 TPOT (ms): 154.90
+---------------Inter-token Latency----------------
+Mean ITL (ms): 135.65
+Median ITL (ms): 101.78
+P99 ITL (ms): 588.61
+==================================================
+Namespace(backend='sglang', base_url=None, host='0.0.0.0', port=30000, dataset_name='random', dataset_path='', model='deepseek-ai/DeepSeek-V3', tokenizer=None, num_prompts=600, sharegpt_output_len=None, random_input_len=1024, random_output_len=1024, random_range_ratio=1.0, request_rate=2.0, max_concurrency=None, seed=1, multi=False, request_rate_range='2,34,2', output_file='deepseek_v3_8xh200_FP8_online_output.jsonl', disable_tqdm=False, disable_stream=False, disable_ignore_eos=False, return_logprob=False, extra_request_body=None, gen_num_groups=64, gen_prompts_per_group=16, gen_system_prompt_len=2048, gen_question_len=128, gen_output_len=256, profile=False, lora_name=None)
+
+#Input tokens: 614400
+#Output tokens: 614400
+Starting initial single prompt test run...
+Initial test run completed. Starting main benchmark run...
+
+============ Serving Benchmark Result ============
+Backend: sglang
+Traffic request rate: 2.0
+Max reqeuest concurrency: not set
+Successful requests: 600
+Benchmark duration (s): 438.30
+Total input tokens: 614400
+Total generated tokens: 614400
+Total generated tokens (retokenized): 612306
+Request throughput (req/s): 1.37
+Input token throughput (tok/s): 1401.77
+Output token throughput (tok/s): 1401.77
+Total token throughput (tok/s): 2803.54
+----------------End-to-End Latency----------------
+Mean E2E Latency (ms): 227131.11
+Median E2E Latency (ms): 234757.13
+---------------Time to First Token----------------
+Mean TTFT (ms): 742.35
+Median TTFT (ms): 684.33
+P99 TTFT (ms): 1576.72
+-----Time per Output Token (excl. 1st token)------
+Mean TPOT (ms): 221.30
+Median TPOT (ms): 228.78
+P99 TPOT (ms): 280.95
+---------------Inter-token Latency----------------
+Mean ITL (ms): 221.30
+Median ITL (ms): 149.46
+P99 ITL (ms): 1046.25
+==================================================
+Namespace(backend='sglang', base_url=None, host='0.0.0.0', port=30000, dataset_name='random', dataset_path='', model='deepseek-ai/DeepSeek-V3', tokenizer=None, num_prompts=1200, sharegpt_output_len=None, random_input_len=1024, random_output_len=1024, random_range_ratio=1.0, request_rate=4.0, max_concurrency=None, seed=1, multi=False, request_rate_range='2,34,2', output_file='deepseek_v3_8xh200_FP8_online_output.jsonl', disable_tqdm=False, disable_stream=False, disable_ignore_eos=False, return_logprob=False, extra_request_body=None, gen_num_groups=64, gen_prompts_per_group=16, gen_system_prompt_len=2048, gen_question_len=128, gen_output_len=256, profile=False, lora_name=None)
+
+#Input tokens: 1228800
+#Output tokens: 1228800
+Starting initial single prompt test run...
+Initial test run completed. Starting main benchmark run...
+
+============ Serving Benchmark Result ============
+Backend: sglang
+Traffic request rate: 4.0
+Max reqeuest concurrency: not set
+Successful requests: 1200
+Benchmark duration (s): 554.82
+Total input tokens: 1228800
+Total generated tokens: 1228800
+Total generated tokens (retokenized): 1224403
+Request throughput (req/s): 2.16
+Input token throughput (tok/s): 2214.76
+Output token throughput (tok/s): 2214.76
+Total token throughput (tok/s): 4429.52
+----------------End-to-End Latency----------------
+Mean E2E Latency (ms): 370518.68
+Median E2E Latency (ms): 376040.67
+---------------Time to First Token----------------
+Mean TTFT (ms): 881.28
+Median TTFT (ms): 865.26
+P99 TTFT (ms): 1518.00
+-----Time per Output Token (excl. 1st token)------
+Mean TPOT (ms): 361.33
+Median TPOT (ms): 366.48
+P99 TPOT (ms): 451.84
+---------------Inter-token Latency----------------
+Mean ITL (ms): 361.33
+Median ITL (ms): 287.95
+P99 ITL (ms): 1244.53
+==================================================
+Namespace(backend='sglang', base_url=None, host='0.0.0.0', port=30000, dataset_name='random', dataset_path='', model='deepseek-ai/DeepSeek-V3', tokenizer=None, num_prompts=2400, sharegpt_output_len=None, random_input_len=1024, random_output_len=1024, random_range_ratio=1.0, request_rate=8.0, max_concurrency=None, seed=1, multi=False, request_rate_range='2,34,2', output_file='deepseek_v3_8xh200_FP8_online_output.jsonl', disable_tqdm=False, disable_stream=False, disable_ignore_eos=False, return_logprob=False, extra_request_body=None, gen_num_groups=64, gen_prompts_per_group=16, gen_system_prompt_len=2048, gen_question_len=128, gen_output_len=256, profile=False, lora_name=None)
+
+#Input tokens: 2457600
+#Output tokens: 2457600
+Starting initial single prompt test run...
+Initial test run completed. Starting main benchmark run...
+
+============ Serving Benchmark Result ============
+Backend: sglang
+Traffic request rate: 8.0
+Max reqeuest concurrency: not set
+Successful requests: 2400
+Benchmark duration (s): 858.01
+Total input tokens: 2457600
+Total generated tokens: 2457600
+Total generated tokens (retokenized): 2449246
+Request throughput (req/s): 2.80
+Input token throughput (tok/s): 2864.31
+Output token throughput (tok/s): 2864.31
+Total token throughput (tok/s): 5728.61
+----------------End-to-End Latency----------------
+Mean E2E Latency (ms): 687402.83
+Median E2E Latency (ms): 692710.83
+---------------Time to First Token----------------
+Mean TTFT (ms): 1627.56
+Median TTFT (ms): 1358.77
+P99 TTFT (ms): 4392.08
+-----Time per Output Token (excl. 1st token)------
+Mean TPOT (ms): 670.36
+Median TPOT (ms): 675.95
+P99 TPOT (ms): 780.39
+---------------Inter-token Latency----------------
+Mean ITL (ms): 670.53
+Median ITL (ms): 515.18
+P99 ITL (ms): 4618.92
+==================================================
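
Appendix (not part of the patch above): every JSONL row committed here shares one flat schema, so the results can be tabulated with a few lines of Python. This is a minimal sketch, assuming only the keys visible in the files and the repo-relative paths from this diff; adjust the glob pattern to wherever the files actually live.

import glob
import json

# Summarize the sglang.bench_serving JSONL results committed in this patch.
# Assumed path: the sglang/experiments/ directory from the diff headers above.
for path in sorted(glob.glob("sglang/experiments/deepseek_v3_*_online_output.jsonl")):
    print(path)
    with open(path) as f:
        for line in f:
            r = json.loads(line)  # one JSON object per benchmark run
            # All keys referenced below appear verbatim in the committed files.
            print(
                f"  rate={r['request_rate']:.0f} req/s"
                f"  output={r['output_throughput']:.1f} tok/s"
                f"  median_e2e={r['median_e2e_latency_ms'] / 1000:.0f} s"
                f"  completed={r['completed']}"
            )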