Update default max_num_batch_tokens for chunked prefill to 2048 (vllm…
mgoin authored and weilong.yu committed Dec 13, 2024
1 parent 7d5171c commit 9f36f7b
Showing 1 changed file with 3 additions and 3 deletions.
vllm/config.py: 6 changes (3 additions, 3 deletions)
@@ -1133,9 +1133,9 @@ def __post_init__(self) -> None:
                     # max_num_batched_tokens.
                     self.max_num_batched_tokens = max(self.max_model_len, 2048)
                 else:
-                    # It is the values that have the best balance between ITL
-                    # and TTFT on A100. Note it is not optimized for throughput.
-                    self.max_num_batched_tokens = 512
+                    # This value is chosen to have a balance between ITL
+                    # and TTFT. Note it is not optimized for throughput.
+                    self.max_num_batched_tokens = 2048
             else:
                 # If max_model_len is too short, use 2048 as the default value
                 # for higher throughput.
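
For context, this is the default scheduler token budget applied when chunked prefill is enabled and max_num_batched_tokens is not set explicitly. A minimal usage sketch follows (not part of this diff; it assumes a vLLM build where LLM() forwards the enable_chunked_prefill and max_num_batched_tokens engine arguments, and the model name is only an example):

    # Sketch: set the chunked-prefill token budget explicitly instead of
    # relying on the 2048 default introduced by this commit.
    from vllm import LLM, SamplingParams

    llm = LLM(
        model="facebook/opt-125m",       # example model, not from the commit
        enable_chunked_prefill=True,     # enable chunked prefill scheduling
        max_num_batched_tokens=2048,     # matches the new default; raise for throughput
    )

    outputs = llm.generate(["Hello, my name is"], SamplingParams(max_tokens=16))
    print(outputs[0].outputs[0].text)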
