Merge branch 'main' into ravi/phi3v
ravi03071991 authored Dec 9, 2024
2 parents 3dd6218 + 8586b72 commit e57c541
Showing 52 changed files with 2,423 additions and 975 deletions.
30 changes: 30 additions & 0 deletions .github/workflows/experiment-runner.yml
@@ -0,0 +1,30 @@
name: Experiment Runner

on:
  workflow_dispatch:
    inputs:
      script:
        description: "Experiment Runner Script"
        default: "configs/sharegpt_config.yaml"

concurrency:
  group: experiment-runner-${{ github.ref }}
  cancel-in-progress: true

jobs:
  experiment-runner-1-gpu:
    if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
    runs-on: 1-gpu-runner
    steps:
      - name: Checkout code
        uses: actions/checkout@v3

      - name: Install dependencies
        run: |
          bash scripts/ci_install_dependency.sh
      - name: Test experiment runner
        timeout-minutes: 120
        run: |
          cd test/srt
          python3 experiment_runner.py --config ${{ inputs.script }}
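
Note: because the workflow is `workflow_dispatch`-triggered, it can be started from the Actions tab or programmatically. The sketch below is illustrative and not part of this commit: it dispatches the workflow through GitHub's REST API, passing the `script` input; the token, target ref, and config path are placeholders.

```python
# Illustrative dispatch of the Experiment Runner workflow via GitHub's REST API.
# GITHUB_TOKEN, the target ref, and the config path are placeholders.
import os

import requests

resp = requests.post(
    "https://api.github.com/repos/sgl-project/sglang/actions/workflows/"
    "experiment-runner.yml/dispatches",
    headers={
        "Accept": "application/vnd.github+json",
        "Authorization": f"Bearer {os.environ['GITHUB_TOKEN']}",
    },
    json={"ref": "main", "inputs": {"script": "configs/sharegpt_config.yaml"}},
)
resp.raise_for_status()  # GitHub returns 204 No Content on success
```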
8 changes: 6 additions & 2 deletions benchmark/kernels/fused_moe_triton/README.md
@@ -10,7 +10,7 @@ Example usage:
```bash
# Tune Qwen2-57B with FP8 and TP=4
python benchmark/kernels/fused_moe_triton/tuning_fused_moe_triton.py \
--model Qwen/Qwen2-57B-A14B-Instruct-FP8 \
--model Qwen/Qwen2-57B-A14B-Instruct \
--tp-size 4 \
--dtype fp8_w8a8 \
--tune
@@ -34,7 +34,7 @@ python benchmark/kernels/fused_moe_triton/benchmark_vllm_vs_sglang_fused_moe_tri

# Compare with FP8 mode for Qwen2-57B
python benchmark/kernels/fused_moe_triton/benchmark_vllm_vs_sglang_fused_moe_triton.py \
--model Qwen/Qwen2-57B-A14B-Instruct-FP8 \
--model Qwen/Qwen2-57B-A14B-Instruct \
--use-fp8

# Compare with custom TP size
@@ -43,3 +43,7 @@ python benchmark/kernels/fused_moe_triton/benchmark_vllm_vs_sglang_fused_moe_tri
```

The benchmark results will be saved as plots and data files in the specified output directory (default: `./configs/benchmark_ops/vllm_sglang_fused_moe/`).

- `benchmark_torch_compile_fused_moe.py`: A tool for benchmarking the fused MoE kernel compiled with `torch.compile` against the original (eager) fused MoE kernel.

  Usage is the same as for `benchmark_vllm_vs_sglang_fused_moe_triton.py`.
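
For readers who only want the gist of what `benchmark_torch_compile_fused_moe.py` measures, here is a self-contained sketch of the general pattern: time an eager function against its `torch.compile` counterpart. The toy `moe_like` workload and the timing setup are illustrative stand-ins, not the script's actual code.

```python
# Minimal eager vs. torch.compile timing sketch; "moe_like" is a toy stand-in,
# not the real fused MoE kernel.
import torch
from torch.utils import benchmark

def moe_like(x, w_gate, w_up):
    # Toy gating + projection, just to have something worth compiling.
    gates = torch.softmax(x @ w_gate, dim=-1)
    return (x @ w_up) * gates.sum(dim=-1, keepdim=True)

device = "cuda" if torch.cuda.is_available() else "cpu"
x = torch.randn(256, 1024, device=device)
w_gate = torch.randn(1024, 8, device=device)
w_up = torch.randn(1024, 1024, device=device)

compiled = torch.compile(moe_like)
compiled(x, w_gate, w_up)  # warm-up: triggers compilation

for name, fn in [("eager", moe_like), ("torch.compile", compiled)]:
    timer = benchmark.Timer(
        stmt="fn(x, w_gate, w_up)",
        globals={"fn": fn, "x": x, "w_gate": w_gate, "w_up": w_up},
    )
    print(name, timer.timeit(50))
```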
4 changes: 4 additions & 0 deletions docs/references/contributor_guide.md
@@ -1,5 +1,9 @@
# Contributor Guide

## Build SGLang

See the [Install SGLang, Method 2: From Source](../start/install.md) section.

## Format Your Code
Use these commands to format your code and pass CI linting tests.

7 changes: 4 additions & 3 deletions python/pyproject.toml
@@ -13,7 +13,7 @@ classifiers = [
"Programming Language :: Python :: 3",
"License :: OSI Approved :: Apache Software License",
]
dependencies = ["requests", "tqdm", "numpy", "IPython"]
dependencies = ["requests", "tqdm", "numpy", "IPython", "setproctitle"]

[project.optional-dependencies]
runtime_common = ["aiohttp", "decord", "fastapi",
@@ -22,7 +22,7 @@ runtime_common = ["aiohttp", "decord", "fastapi",
"packaging", "pillow", "prometheus-client>=0.20.0",
"psutil", "pydantic", "python-multipart",
"pyzmq>=25.1.2", "torchao", "uvicorn", "uvloop",
"xgrammar>=0.1.4"]
"xgrammar>=0.1.6"]
srt = ["sglang[runtime_common]", "torch", "vllm>=0.6.3.post1,<=0.6.4.post1", "cuda-python", "flashinfer>=0.1.6"]

# HIP (Heterogeneous-computing Interface for Portability) for AMD
@@ -33,7 +33,7 @@ srt_hip = ["sglang[runtime_common]", "torch", "vllm==0.6.3.dev13"]
srt_xpu = ["sglang[runtime_common]"]
#For Intel Gaudi(device : hpu) follow the installation guide
#https://docs.vllm.ai/en/latest/getting_started/gaudi-installation.html
srt_hpu = ["sglang[runtime_common]"]
srt_hpu = ["sglang[runtime_common]"]

openai = ["openai>=1.0", "tiktoken"]
anthropic = ["anthropic>=0.20.0"]
@@ -50,6 +50,7 @@ all = ["sglang[srt]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]"]
all_hip = ["sglang[srt_hip]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]"]
all_xpu = ["sglang[srt_xpu]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]"]
all_hpu = ["sglang[srt_hpu]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]"]

dev = ["sglang[all]", "sglang[test]"]
dev_hip = ["sglang[all_hip]", "sglang[test]"]
dev_xpu = ["sglang[all_xpu]", "sglang[test]"]
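
Two dependency notes. `setproctitle` is a small library for renaming the current OS process; a hedged guess at the motivation is that it lets SGLang label its scheduler/worker processes so they are identifiable in `ps`/`top` (the title below is purely illustrative, not necessarily what SGLang sets). The `xgrammar>=0.1.6` bump presumably provides the `compile_builtin_json_grammar` API used later in this diff.

```python
# Illustrative use of the newly added setproctitle dependency; the title
# string is an arbitrary example.
import setproctitle

setproctitle.setproctitle("sglang::scheduler")
print(setproctitle.getproctitle())  # the new name shows up in `ps` / `top`
```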
2 changes: 1 addition & 1 deletion python/sglang/bench_offline_throughput.py
@@ -285,7 +285,7 @@ def throughput_test(
else:
raise ValueError('Please set backend to either "engine" or "runtime"')

tokenizer_id = server_args.model_path
tokenizer_id = server_args.tokenizer_path or server_args.model_path
tokenizer = get_tokenizer(tokenizer_id)

# Set global environments
9 changes: 8 additions & 1 deletion python/sglang/bench_serving.py
@@ -321,6 +321,8 @@ async def async_request_sglang_generate(
},
"stream": not args.disable_stream,
"lora_path": request_func_input.lora_name,
"return_logprob": args.return_logprob,
"logprob_start_len": -1,
**request_func_input.extra_request_body,
}
headers = {}
@@ -911,7 +913,7 @@ async def limited_request_func(request_func_input, pbar):
prompt=test_prompt,
api_url=api_url,
prompt_len=test_prompt_len,
output_len=test_output_len,
output_len=min(test_output_len, 32),
lora_name=lora_name,
extra_request_body=extra_request_body,
)
@@ -1413,6 +1415,11 @@ def set_ulimit(target_soft_limit=65535):
action="store_true",
help="Disable ignoring EOS.",
)
parser.add_argument(
"--return-logprob",
action="store_true",
help="Return logprob.",
)
parser.add_argument(
"--extra-request-body",
metavar='{"key1": "value1", "key2": "value2"}',
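
The new `--return-logprob` flag and the two payload fields make the benchmark request logprobs from the server. As a rough illustration (assuming a locally running SGLang server on port 30000 and its native `/generate` endpoint; the prompt and sampling parameters are made up), a single request carrying these fields could look like:

```python
# Rough illustration of a /generate request with the new logprob fields.
# The server URL, prompt, and sampling parameters are assumptions, not from this diff.
import requests

payload = {
    "text": "The capital of France is",
    "sampling_params": {"temperature": 0.0, "max_new_tokens": 16},
    "stream": False,
    "return_logprob": True,    # what --return-logprob toggles in bench_serving.py
    "logprob_start_len": -1,   # the value bench_serving.py sends alongside it
}
resp = requests.post("http://127.0.0.1:30000/generate", json=payload)
print(resp.json())  # logprob details typically appear under "meta_info"
```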
5 changes: 4 additions & 1 deletion python/sglang/srt/constrained/xgrammar_backend.py
@@ -117,7 +117,10 @@ def init_value_impl(self, key: Tuple[str, str]) -> XGrammarGrammar:
key_type, key_string = key
if key_type == "json":
try:
ctx = self.grammar_compiler.compile_json_schema(schema=key_string)
if key_string == "$$ANY$$":
ctx = self.grammar_compiler.compile_builtin_json_grammar()
else:
ctx = self.grammar_compiler.compile_json_schema(schema=key_string)
except RuntimeError as e:
logging.warning(
f"Skip invalid json_schema: json_schema={key_string}, {e=}"
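
The `$$ANY$$` sentinel routes "any valid JSON" requests to xgrammar's prebuilt JSON grammar instead of compiling a schema, presumably so that unconstrained-JSON requests (where there is no schema to compile) still get grammar-guided decoding. A standalone mirror of that dispatch, with a stand-in `compiler` object since constructing a real grammar compiler needs a tokenizer, is:

```python
# Stand-alone mirror of the dispatch above; `compiler` stands in for the
# xgrammar grammar compiler held by the backend.
ANY_JSON = "$$ANY$$"  # sentinel meaning "any syntactically valid JSON"

def compile_json_grammar(compiler, key_string: str):
    if key_string == ANY_JSON:
        # Free-form JSON: reuse the prebuilt grammar, no schema compilation.
        return compiler.compile_builtin_json_grammar()
    # Otherwise treat key_string as a JSON Schema document.
    return compiler.compile_json_schema(schema=key_string)
```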
2 changes: 2 additions & 0 deletions python/sglang/srt/layers/attention/flashinfer_backend.py
@@ -678,6 +678,7 @@ def call_begin_forward(
self.num_qo_heads,
self.num_kv_heads,
self.head_dim,
q_data_type=self.q_data_type,
)

# cached part
@@ -691,6 +692,7 @@
self.num_kv_heads,
self.head_dim,
1,
q_data_type=self.q_data_type,
)


41 changes: 16 additions & 25 deletions python/sglang/srt/layers/attention/triton_backend.py
@@ -5,7 +5,6 @@
import torch

from sglang.srt.layers.attention import AttentionBackend
from sglang.srt.managers.schedule_batch import global_server_args_dict
from sglang.srt.model_executor.forward_batch_info import ForwardBatch

if TYPE_CHECKING:
@@ -35,10 +34,8 @@ def __init__(self, model_runner: ModelRunner):
model_runner.model_config.num_attention_heads // model_runner.tp_size
)

if global_server_args_dict.get("triton_attention_reduce_in_fp32", False):
self.reduce_dtype = torch.float32
else:
self.reduce_dtype = torch.float16
self.num_kv_splits = model_runner.server_args.triton_attention_num_kv_splits
self.v_head_dim = model_runner.token_to_kv_pool.get_value_buffer(0).shape[-1]

self.forward_metadata = None

@@ -50,23 +47,23 @@ def init_forward_metadata(self, forward_batch: ForwardBatch):
"""Init auxiliary variables for triton attention backend."""

if forward_batch.forward_mode.is_decode():
start_loc = torch.zeros_like(forward_batch.seq_lens, dtype=torch.int32)
start_loc[1:] = torch.cumsum(forward_batch.seq_lens[:-1], dim=0)

total_num_tokens = forward_batch.seq_lens_sum
attn_logits = torch.empty(
(self.num_head, total_num_tokens),
dtype=self.reduce_dtype,
(
forward_batch.batch_size,
self.num_head,
self.num_kv_splits,
self.v_head_dim + 1,
),
dtype=torch.float32,
device=self.device,
)

max_seq_len = torch.max(forward_batch.seq_lens).item()
max_extend_len = None
else:
start_loc = attn_logits = max_seq_len = None
attn_logits = None
max_extend_len = torch.max(forward_batch.extend_seq_lens).item()

self.forward_metadata = start_loc, attn_logits, max_seq_len, max_extend_len
self.forward_metadata = attn_logits, max_extend_len

def init_cuda_graph_state(self, max_bs: int):
self.cuda_graph_max_total_num_tokens = max_bs * self.cuda_graph_max_seq_len
@@ -75,11 +72,8 @@ def init_cuda_graph_state(self, max_bs: int):
(max_bs,), dtype=torch.int32, device=self.device
)
self.cuda_graph_attn_logits = torch.empty(
(
self.num_head,
self.cuda_graph_max_total_num_tokens,
),
dtype=self.reduce_dtype,
(max_bs, self.num_head, self.num_kv_splits, self.v_head_dim + 1),
dtype=torch.float32,
device="cuda",
)

@@ -92,9 +86,7 @@ def init_forward_metadata_capture_cuda_graph(
):
# NOTE: encoder_lens expected to be zeros or None
self.forward_metadata = (
self.cuda_graph_start_loc,
self.cuda_graph_attn_logits,
self.cuda_graph_max_seq_len,
None,
)

@@ -133,7 +125,7 @@ def forward_extend(
layer, forward_batch.out_cache_loc, k, v
)

start_loc, attn_logits, max_seq_len, max_extend_len = self.forward_metadata
_, max_extend_len = self.forward_metadata
self.extend_attention_fwd(
q.view(-1, layer.tp_q_head_num, layer.qk_head_dim),
k.contiguous(),
@@ -171,7 +163,7 @@ def forward_decode(
else:
o = torch.empty_like(q)

start_loc, attn_logits, max_seq_len, max_extend_len = self.forward_metadata
attn_logits, _ = self.forward_metadata

if save_kv_cache:
forward_batch.token_to_kv_pool.set_kv_buffer(
@@ -185,10 +177,9 @@
o.view(-1, layer.tp_q_head_num, layer.v_head_dim),
forward_batch.req_to_token_pool.req_to_token,
forward_batch.req_pool_indices,
start_loc,
forward_batch.seq_lens,
attn_logits,
max_seq_len,
self.num_kv_splits,
layer.scaling,
layer.logit_cap,
)
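
The Triton decode path now allocates a split-KV buffer of shape `(batch_size, num_head, num_kv_splits, v_head_dim + 1)` in fp32, replacing the old `(num_head, total_num_tokens)` logits buffer together with `start_loc` and `max_seq_len`. A hedged reading of the `+ 1`: each KV split stores a partial output vector plus that split's log-sum-exp, which a second pass combines (the usual flash-decoding recipe). The sketch below is an interpretation of that layout with a reference combine step, not the kernel's actual code.

```python
# Reference (non-Triton) view of the assumed attn_logits layout: each of the
# num_kv_splits slices holds [partial_output (v_head_dim), log-sum-exp (1)].
import torch

batch_size, num_head, num_kv_splits, v_head_dim = 2, 8, 8, 128
attn_logits = torch.randn(batch_size, num_head, num_kv_splits, v_head_dim + 1)

def combine_splits(buf: torch.Tensor) -> torch.Tensor:
    partial_out = buf[..., :-1]           # (bs, heads, splits, v_head_dim)
    lse = buf[..., -1]                    # (bs, heads, splits)
    weights = torch.softmax(lse, dim=-1)  # split i weighted by exp(lse_i - logsumexp(lse))
    return (weights.unsqueeze(-1) * partial_out).sum(dim=-2)  # (bs, heads, v_head_dim)

print(combine_splits(attn_logits).shape)  # torch.Size([2, 8, 128])
```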