Fix the default chunked prefill size #2268

Merged · 1 commit · Nov 30, 2024
2 changes: 2 additions & 0 deletions python/sglang/srt/managers/scheduler.py
@@ -253,6 +253,8 @@ def __init__(

# Init chunked prefill
self.chunked_prefill_size = server_args.chunked_prefill_size
if self.chunked_prefill_size <= 0: # -1 means disable
self.chunked_prefill_size = None
self.being_chunked_req = None
self.is_mixed_chunk = (
self.chunked_prefill_size is not None and server_args.enable_mixed_chunk
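For context, a minimal sketch (not part of the diff) of the convention the scheduler now follows: any non-positive `chunked_prefill_size` (e.g. the `-1` sentinel) is treated as "disabled" and mapped to `None` internally.

```python
# Illustrative sketch only, assuming the convention in the hunk above:
# a non-positive chunked_prefill_size (e.g. -1) means "chunked prefill disabled".
from typing import Optional


def normalize_chunked_prefill_size(size: Optional[int]) -> Optional[int]:
    """Map the <= 0 sentinel to None, mirroring the scheduler's check."""
    if size is not None and size <= 0:
        return None  # disabled
    return size


assert normalize_chunked_prefill_size(-1) is None
assert normalize_chunked_prefill_size(8192) == 8192
```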
9 changes: 6 additions & 3 deletions python/sglang/srt/model_executor/model_runner.py
@@ -118,7 +118,7 @@ def __init__(
logger.info(
"Automatically turn off --chunked-prefill-size and adjust --mem-fraction-static for multimodal models."
)
server_args.chunked_prefill_size = None
server_args.chunked_prefill_size = -1
self.mem_fraction_static *= 0.95
# TODO: qwen2-vl does not support radix cache now, set disable_radix_cache=True automatically
if self.model_config.hf_config.architectures == [
@@ -148,19 +148,22 @@ def __init__(

set_cpu_offload_max_bytes(int(server_args.cpu_offload_gb * 1024**3))

# Init components
# Get memory before model loading
min_per_gpu_memory = self.init_torch_distributed()

# Load the model
self.sampler = Sampler()
self.load_model()

# Apply torch TP if model supports it
# Apply torch TP if the model supports it
supports_torch_tp = getattr(self.model, "supports_torch_tp", False)
if self.tp_size > 1 and supports_torch_tp:
self.apply_torch_tp()
self.torch_tp_applied = True
else:
self.torch_tp_applied = False

# Init memory pool and attention backends
if server_args.lora_paths is not None:
self.init_lora_manager()
self.init_memory_pool(
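A note on why the multimodal path now writes `-1` instead of `None`: after this PR, `None` means "unset, let `__post_init__` pick a GPU-memory-based default", so an explicit disable needs the `-1` sentinel. A hypothetical helper summarizing the two states (names are not from the PR):

```python
# Hypothetical helper, not part of the PR: the sentinel convention.
# None -> value was not set; a default is chosen later from GPU memory.
# -1   -> chunked prefill explicitly disabled (any value <= 0 counts).
from typing import Optional


def is_chunked_prefill_enabled(size: Optional[int]) -> bool:
    return size is not None and size > 0


assert not is_chunked_prefill_enabled(-1)   # multimodal models: turned off
assert is_chunked_prefill_enabled(8192)     # typical large-GPU default
```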
38 changes: 21 additions & 17 deletions python/sglang/srt/server_args.py
@@ -58,7 +58,7 @@ class ServerArgs:
mem_fraction_static: Optional[float] = None
max_running_requests: Optional[int] = None
max_total_tokens: Optional[int] = None
chunked_prefill_size: int = 8192
chunked_prefill_size: Optional[int] = None
max_prefill_tokens: int = 16384
schedule_policy: str = "lpm"
schedule_conservativeness: float = 1.0
@@ -128,7 +128,7 @@ class ServerArgs:
enable_dp_attention: bool = False
enable_torch_compile: bool = False
torch_compile_max_bs: int = 32
cuda_graph_max_bs: int = 160
cuda_graph_max_bs: Optional[int] = None
torchao_config: str = ""
enable_nan_detection: bool = False
enable_p2p_check: bool = False
@@ -144,14 +144,15 @@ def __post_init__(self):
if self.served_model_name is None:
self.served_model_name = self.model_path

if self.chunked_prefill_size is not None and self.chunked_prefill_size <= 0:
# Disable chunked prefill
self.chunked_prefill_size = None

if self.random_seed is None:
self.random_seed = random.randint(0, 1 << 30)

# Mem fraction depends on the tensor parallelism size
if is_hip():
gpu_mem = get_amdgpu_memory_capacity()
else:
gpu_mem = get_nvgpu_memory_capacity()

# Set mem fraction static, which depends on the tensor parallelism size
if self.mem_fraction_static is None:
if self.tp_size >= 16:
self.mem_fraction_static = 0.79
Expand All @@ -164,18 +165,21 @@ def __post_init__(self):
else:
self.mem_fraction_static = 0.88

# Adjust for GPUs with small memory capacities
if is_hip():
gpu_mem = get_amdgpu_memory_capacity()
else:
gpu_mem = get_nvgpu_memory_capacity()
# Set chunked prefill size, which depends on the gpu memory capacity
if self.chunked_prefill_size is None:
if gpu_mem < 25_000:
self.chunked_prefill_size = 2048
else:
self.chunked_prefill_size = 8192

if gpu_mem < 25000:
logger.warning(
"Your GPU has less than 25GB memory. You may want to set a smaller --chunked-prefill-size (e.g., 512) to improve performance."
)
# Set cuda graph max batch size
if self.cuda_graph_max_bs is None:
if gpu_mem < 25_000:
self.cuda_graph_max_bs = 8
else:
self.cuda_graph_max_bs = 160

# Choose kernel backends
# Set kernel backends
if not is_flashinfer_available():
self.attention_backend = "triton"
self.sampling_backend = "pytorch"
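Taken together, the `__post_init__` changes make both defaults depend on GPU memory. A standalone sketch of that selection logic (illustrative helper, not part of the PR; `gpu_mem_mb` is in MB, matching the existing `< 25_000` check against a roughly 25 GB threshold):

```python
# Illustrative restatement of the new defaulting logic in __post_init__.
from typing import Optional, Tuple


def pick_memory_based_defaults(
    gpu_mem_mb: float,
    chunked_prefill_size: Optional[int] = None,
    cuda_graph_max_bs: Optional[int] = None,
) -> Tuple[int, int]:
    # Smaller GPUs (< ~25 GB) get a smaller chunked prefill size ...
    if chunked_prefill_size is None:
        chunked_prefill_size = 2048 if gpu_mem_mb < 25_000 else 8192
    # ... and a smaller CUDA graph max batch size.
    if cuda_graph_max_bs is None:
        cuda_graph_max_bs = 8 if gpu_mem_mb < 25_000 else 160
    return chunked_prefill_size, cuda_graph_max_bs


# A 24 GB GPU gets conservative defaults; an 80 GB GPU keeps the old ones.
assert pick_memory_based_defaults(24_000) == (2048, 8)
assert pick_memory_based_defaults(81_000) == (8192, 160)
```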