From 6ce1ff12eb0069cb95b292899915f99311d96a6c Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Mon, 9 Dec 2024 04:55:32 -0800 Subject: [PATCH 1/4] Fix --- python/pyproject.toml | 3 ++- python/sglang/bench_offline_throughput.py | 2 +- python/sglang/srt/layers/attention/triton_backend.py | 1 - python/sglang/srt/layers/radix_attention.py | 9 ++++++++- python/sglang/srt/managers/schedule_batch.py | 2 +- python/sglang/srt/managers/tokenizer_manager.py | 2 +- .../sglang/srt/model_executor/cuda_graph_runner.py | 2 +- python/sglang/srt/model_executor/model_runner.py | 12 ++++++++---- python/sglang/srt/server.py | 2 +- python/sglang/srt/utils.py | 9 ++++++--- 10 files changed, 29 insertions(+), 15 deletions(-) diff --git a/python/pyproject.toml b/python/pyproject.toml index 8e935528e21..11810615add 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -33,7 +33,7 @@ srt_hip = ["sglang[runtime_common]", "torch", "vllm==0.6.3.dev13"] srt_xpu = ["sglang[runtime_common]"] #For Intel Gaudi(device : hpu) follow the installation guide #https://docs.vllm.ai/en/latest/getting_started/gaudi-installation.html -srt_hpu = ["sglang[runtime_common]"] +srt_hpu = ["sglang[runtime_common]"] openai = ["openai>=1.0", "tiktoken"] anthropic = ["anthropic>=0.20.0"] @@ -50,6 +50,7 @@ all = ["sglang[srt]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]"] all_hip = ["sglang[srt_hip]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]"] all_xpu = ["sglang[srt_xpu]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]"] all_hpu = ["sglang[srt_hpu]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]"] + dev = ["sglang[all]", "sglang[test]"] dev_hip = ["sglang[all_hip]", "sglang[test]"] dev_xpu = ["sglang[all_xpu]", "sglang[test]"] diff --git a/python/sglang/bench_offline_throughput.py b/python/sglang/bench_offline_throughput.py index 2e9eb1ad223..196049a65ff 100644 --- a/python/sglang/bench_offline_throughput.py +++ b/python/sglang/bench_offline_throughput.py @@ -285,7 +285,7 @@ def throughput_test( else: raise ValueError('Please set backend to either "engine" or "runtime"') - tokenizer_id = server_args.model_path + tokenizer_id = server_args.tokenizer_path or server_args.model_path tokenizer = get_tokenizer(tokenizer_id) # Set global environmnets diff --git a/python/sglang/srt/layers/attention/triton_backend.py b/python/sglang/srt/layers/attention/triton_backend.py index 1a539ebd75c..6c8bce20aa1 100644 --- a/python/sglang/srt/layers/attention/triton_backend.py +++ b/python/sglang/srt/layers/attention/triton_backend.py @@ -5,7 +5,6 @@ import torch from sglang.srt.layers.attention import AttentionBackend -from sglang.srt.managers.schedule_batch import global_server_args_dict from sglang.srt.model_executor.forward_batch_info import ForwardBatch if TYPE_CHECKING: diff --git a/python/sglang/srt/layers/radix_attention.py b/python/sglang/srt/layers/radix_attention.py index 1df29ec68a9..4b762c00ba5 100644 --- a/python/sglang/srt/layers/radix_attention.py +++ b/python/sglang/srt/layers/radix_attention.py @@ -48,7 +48,14 @@ def __init__( self.sliding_window_size = sliding_window_size or -1 self.is_cross_attention = is_cross_attention - def forward(self, q, k, v, forward_batch: ForwardBatch, save_kv_cache=True): + def forward( + self, + q, + k, + v, + forward_batch: ForwardBatch, + save_kv_cache: bool = True, + ): if k is not None: # For cross-layer sharing, kv can be None assert v is not None diff --git a/python/sglang/srt/managers/schedule_batch.py b/python/sglang/srt/managers/schedule_batch.py index bb9eb181611..89915da32d1 100644 --- a/python/sglang/srt/managers/schedule_batch.py +++ b/python/sglang/srt/managers/schedule_batch.py @@ -484,7 +484,7 @@ def __repr__(self): @dataclasses.dataclass class ScheduleBatch: - """Store all inforamtion of a batch on the scheduler.""" + """Store all information of a batch on the scheduler.""" # Request, memory pool, and cache reqs: List[Req] diff --git a/python/sglang/srt/managers/tokenizer_manager.py b/python/sglang/srt/managers/tokenizer_manager.py index 29b98df2efa..bd00959b8ab 100644 --- a/python/sglang/srt/managers/tokenizer_manager.py +++ b/python/sglang/srt/managers/tokenizer_manager.py @@ -22,7 +22,7 @@ import sys import time import uuid -from typing import Any, Dict, List, Optional, Tuple, Union +from typing import Any, Dict, List, Optional, Union import fastapi import uvloop diff --git a/python/sglang/srt/model_executor/cuda_graph_runner.py b/python/sglang/srt/model_executor/cuda_graph_runner.py index 77efba89212..93c3b250cd3 100644 --- a/python/sglang/srt/model_executor/cuda_graph_runner.py +++ b/python/sglang/srt/model_executor/cuda_graph_runner.py @@ -127,7 +127,7 @@ def __init__(self, model_runner: "ModelRunner"): # Batch sizes to capture if model_runner.server_args.disable_cuda_graph_padding: - self.capture_bs = list(range(1, 32)) + [64, 128] + self.capture_bs = list(range(1, 33)) + [64, 128] else: self.capture_bs = [1, 2, 4] + [i * 8 for i in range(1, 21)] diff --git a/python/sglang/srt/model_executor/model_runner.py b/python/sglang/srt/model_executor/model_runner.py index 3f0cbecac15..b65d0fdff69 100644 --- a/python/sglang/srt/model_executor/model_runner.py +++ b/python/sglang/srt/model_executor/model_runner.py @@ -242,20 +242,22 @@ def load_model(self): if torch.cuda.get_device_capability()[1] < 5: raise RuntimeError("SGLang only supports sm75 and above.") - # Prepare the vllm model config + # Prepare the model config self.load_config = LoadConfig( load_format=self.server_args.load_format, download_dir=self.server_args.download_dir, ) - if self.server_args.load_format == "gguf": monkey_patch_vllm_gguf_config() + + # Load the model self.model = get_model( model_config=self.model_config, load_config=self.load_config, device_config=DeviceConfig(self.device), ) + # Parse other args self.sliding_window_size = ( self.model.get_attention_sliding_window_size() if hasattr(self.model, "get_attention_sliding_window_size") @@ -270,8 +272,10 @@ def load_model(self): f"avail mem={get_available_gpu_memory(self.device, self.gpu_id):.2f} GB" ) - def update_weights_from_disk(self, model_path: str, load_format: str): - """Update engine weights online from disk.""" + def update_weights_from_disk( + self, model_path: str, load_format: str + ) -> tuple[bool, str]: + """Update engine weights in-place from the disk.""" from sglang.srt.model_loader.loader import ( DefaultModelLoader, device_loading_context, diff --git a/python/sglang/srt/server.py b/python/sglang/srt/server.py index 29bc44eb524..1d34a0a55fc 100644 --- a/python/sglang/srt/server.py +++ b/python/sglang/srt/server.py @@ -196,7 +196,7 @@ async def stop_profile_async(): @app.post("/update_weights_from_disk") @time_func_latency async def update_weights_from_disk(obj: UpdateWeightFromDiskReqInput, request: Request): - """Update the weights from disk inplace without re-launching the server.""" + """Update the weights from disk in-place without re-launching the server.""" success, message = await tokenizer_manager.update_weights_from_disk(obj, request) content = {"success": success, "message": message} if success: diff --git a/python/sglang/srt/utils.py b/python/sglang/srt/utils.py index 5c310136a21..dabc608fac0 100644 --- a/python/sglang/srt/utils.py +++ b/python/sglang/srt/utils.py @@ -169,7 +169,7 @@ def inner_func(*args, **kwargs): return wrapper -def get_available_gpu_memory(device, gpu_id, distributed=False): +def get_available_gpu_memory(device, gpu_id, distributed=False, empty_cache=True): """ Get available memory for cuda:gpu_id device. When distributed is True, the available memory is the minimum available memory of all GPUs. @@ -184,7 +184,8 @@ def get_available_gpu_memory(device, gpu_id, distributed=False): "which may cause useless memory allocation for torch CUDA context.", ) - torch.cuda.empty_cache() + if empty_cache: + torch.cuda.empty_cache() free_gpu_memory, _ = torch.cuda.mem_get_info(gpu_id) elif device == "xpu": @@ -196,7 +197,9 @@ def get_available_gpu_memory(device, gpu_id, distributed=False): f"WARNING: current device is not {gpu_id}, but {torch.xpu.current_device()}, ", "which may cause useless memory allocation for torch XPU context.", ) - torch.xpu.empty_cache() + + if empty_cache: + torch.xpu.empty_cache() used_memory = torch.xpu.memory_allocated() total_gpu_memory = torch.xpu.get_device_properties(gpu_id).total_memory free_gpu_memory = total_gpu_memory - used_memory From da45262e394ad6ea6ac3fa6a39f7675aac6a5eda Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Mon, 9 Dec 2024 05:02:13 -0800 Subject: [PATCH 2/4] fix --- python/sglang/srt/models/gemma2_reward.py | 1 - python/sglang/srt/models/llama_classification.py | 1 - python/sglang/srt/models/llama_reward.py | 2 -- 3 files changed, 4 deletions(-) diff --git a/python/sglang/srt/models/gemma2_reward.py b/python/sglang/srt/models/gemma2_reward.py index e5c2fc07aaf..1fe87c30aef 100644 --- a/python/sglang/srt/models/gemma2_reward.py +++ b/python/sglang/srt/models/gemma2_reward.py @@ -32,7 +32,6 @@ def __init__( ) -> None: super().__init__() self.config = config - self.torchao_config = None self.quant_config = quant_config self.num_labels = config.num_labels self.model = Gemma2Model(config, quant_config=quant_config) diff --git a/python/sglang/srt/models/llama_classification.py b/python/sglang/srt/models/llama_classification.py index c4ee76379b6..75e8af9af32 100644 --- a/python/sglang/srt/models/llama_classification.py +++ b/python/sglang/srt/models/llama_classification.py @@ -33,7 +33,6 @@ def __init__( ) -> None: super().__init__() self.config = config - self.torchao_config = None self.quant_config = quant_config self.model = LlamaModel(config, quant_config=quant_config) diff --git a/python/sglang/srt/models/llama_reward.py b/python/sglang/srt/models/llama_reward.py index dcde8b468ea..6550ee411a1 100644 --- a/python/sglang/srt/models/llama_reward.py +++ b/python/sglang/srt/models/llama_reward.py @@ -21,7 +21,6 @@ from sglang.srt.layers.pooler import EmbeddingPoolerOutput, Pooler, PoolingType from sglang.srt.layers.quantization.base_config import QuantizationConfig from sglang.srt.model_executor.forward_batch_info import ForwardBatch -from sglang.srt.model_loader.weight_utils import default_weight_loader from sglang.srt.models.llama import LlamaForCausalLM, LlamaModel @@ -33,7 +32,6 @@ def __init__( ) -> None: super().__init__() self.config = config - self.torchao_config = None self.quant_config = quant_config self.num_labels = config.num_labels self.model = LlamaModel(config, quant_config=quant_config) From e815ded268d5fac87aa10a7634c7ea8c4da2bf95 Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Mon, 9 Dec 2024 05:30:50 -0800 Subject: [PATCH 3/4] Update run_suite.py --- test/srt/run_suite.py | 1 - 1 file changed, 1 deletion(-) diff --git a/test/srt/run_suite.py b/test/srt/run_suite.py index cb6a60612dd..5035810f86a 100644 --- a/test/srt/run_suite.py +++ b/test/srt/run_suite.py @@ -15,7 +15,6 @@ "test_double_sparsity.py", "test_embedding_openai_server.py", "test_eval_accuracy_mini.py", - "test_fused_moe.py", "test_get_weights_by_name.py", "test_gguf.py", "test_input_embeddings.py", From bd230599d2fb13d3c31142c8e7be6913280d73d4 Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Mon, 9 Dec 2024 05:58:37 -0800 Subject: [PATCH 4/4] support any json schema --- python/sglang/srt/constrained/xgrammar_backend.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/python/sglang/srt/constrained/xgrammar_backend.py b/python/sglang/srt/constrained/xgrammar_backend.py index ee8e8eb07f4..91cd17c6f20 100644 --- a/python/sglang/srt/constrained/xgrammar_backend.py +++ b/python/sglang/srt/constrained/xgrammar_backend.py @@ -117,7 +117,10 @@ def init_value_impl(self, key: Tuple[str, str]) -> XGrammarGrammar: key_type, key_string = key if key_type == "json": try: - ctx = self.grammar_compiler.compile_json_schema(schema=key_string) + if key_string == "$$ANY$$": + ctx = self.grammar_compiler.compile_builtin_json_grammar() + else: + ctx = self.grammar_compiler.compile_json_schema(schema=key_string) except RuntimeError as e: logging.warning( f"Skip invalid json_schema: json_schema={key_string}, {e=}"