Merge branch 'main' into ravi/phi3v
ravi03071991 authored Dec 9, 2024
2 parents 3dd6218 + 8586b72 commit e57c541
Showing 52 changed files with 2,423 additions and 975 deletions.
30 changes: 30 additions & 0 deletions .github/workflows/experiment-runner.yml
@@ -0,0 +1,30 @@
name: Experiment Runner

on:
  workflow_dispatch:
    inputs:
      script:
        description: "Experiment Runner Script"
        default: "configs/sharegpt_config.yaml"

concurrency:
  group: experiment-runner-${{ github.ref }}
  cancel-in-progress: true

jobs:
  experiment-runner-1-gpu:
    if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
    runs-on: 1-gpu-runner
    steps:
      - name: Checkout code
        uses: actions/checkout@v3

      - name: Install dependencies
        run: |
          bash scripts/ci_install_dependency.sh
      - name: Test experiment runner
        timeout-minutes: 120
        run: |
          cd test/srt
          python3 experiment_runner.py --config ${{ inputs.script }}
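
Note: because the workflow is `workflow_dispatch`-triggered, it can be started from the Actions tab or programmatically. The sketch below is illustrative and not part of this commit: it dispatches the workflow through GitHub's REST API, passing the `script` input; the token, target ref, and config path are placeholders.

```python
# Illustrative dispatch of the Experiment Runner workflow via GitHub's REST API.
# GITHUB_TOKEN, the target ref, and the config path are placeholders.
import os

import requests

resp = requests.post(
    "https://api.github.com/repos/sgl-project/sglang/actions/workflows/"
    "experiment-runner.yml/dispatches",
    headers={
        "Accept": "application/vnd.github+json",
        "Authorization": f"Bearer {os.environ['GITHUB_TOKEN']}",
    },
    json={"ref": "main", "inputs": {"script": "configs/sharegpt_config.yaml"}},
)
resp.raise_for_status()  # GitHub returns 204 No Content on success
```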
8 changes: 6 additions & 2 deletions benchmark/kernels/fused_moe_triton/README.md
@@ -10,7 +10,7 @@ Example usage:
```bash
# Tune Qwen2-57B with FP8 and TP=4
python benchmark/kernels/fused_moe_triton/tuning_fused_moe_triton.py \
--model Qwen/Qwen2-57B-A14B-Instruct-FP8 \
--model Qwen/Qwen2-57B-A14B-Instruct \
--tp-size 4 \
--dtype fp8_w8a8 \
--tune
@@ -34,7 +34,7 @@ python benchmark/kernels/fused_moe_triton/benchmark_vllm_vs_sglang_fused_moe_tri

# Compare with FP8 mode for Qwen2-57B
python benchmark/kernels/fused_moe_triton/benchmark_vllm_vs_sglang_fused_moe_triton.py \
--model Qwen/Qwen2-57B-A14B-Instruct-FP8 \
--model Qwen/Qwen2-57B-A14B-Instruct \
--use-fp8

# Compare with custom TP size
@@ -43,3 +43,7 @@ python benchmark/kernels/fused_moe_triton/benchmark_vllm_vs_sglang_fused_moe_tri
```

The benchmark results will be saved as plots and data files in the specified output directory (default: `./configs/benchmark_ops/vllm_sglang_fused_moe/`).

- `benchmark_torch_compile_fused_moe.py`: A tool for benchmarking the fused MoE kernel compiled with `torch.compile` against the original (eager) fused MoE kernel.

  Usage is the same as for `benchmark_vllm_vs_sglang_fused_moe_triton.py`.
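
For readers who only want the gist of what `benchmark_torch_compile_fused_moe.py` measures, here is a self-contained sketch of the general pattern: time an eager function against its `torch.compile` counterpart. The toy `moe_like` workload and the timing setup are illustrative stand-ins, not the script's actual code.

```python
# Minimal eager vs. torch.compile timing sketch; "moe_like" is a toy stand-in,
# not the real fused MoE kernel.
import torch
from torch.utils import benchmark

def moe_like(x, w_gate, w_up):
    # Toy gating + projection, just to have something worth compiling.
    gates = torch.softmax(x @ w_gate, dim=-1)
    return (x @ w_up) * gates.sum(dim=-1, keepdim=True)

device = "cuda" if torch.cuda.is_available() else "cpu"
x = torch.randn(256, 1024, device=device)
w_gate = torch.randn(1024, 8, device=device)
w_up = torch.randn(1024, 1024, device=device)

compiled = torch.compile(moe_like)
compiled(x, w_gate, w_up)  # warm-up: triggers compilation

for name, fn in [("eager", moe_like), ("torch.compile", compiled)]:
    timer = benchmark.Timer(
        stmt="fn(x, w_gate, w_up)",
        globals={"fn": fn, "x": x, "w_gate": w_gate, "w_up": w_up},
    )
    print(name, timer.timeit(50))
```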
4 changes: 4 additions & 0 deletions docs/references/contributor_guide.md
@@ -1,5 +1,9 @@
# Contributor Guide

## Build SGLang

See the [Install SGLang, Method 2: From Source](../start/install.md) section.

## Format Your Code
Use these commands to format your code and pass CI linting tests.

7 changes: 4 additions & 3 deletions python/pyproject.toml
@@ -13,7 +13,7 @@ classifiers = [
"Programming Language :: Python :: 3",
"License :: OSI Approved :: Apache Software License",
]
dependencies = ["requests", "tqdm", "numpy", "IPython"]
dependencies = ["requests", "tqdm", "numpy", "IPython", "setproctitle"]

[project.optional-dependencies]
runtime_common = ["aiohttp", "decord", "fastapi",
@@ -22,7 +22,7 @@ runtime_common = ["aiohttp", "decord", "fastapi",
"packaging", "pillow", "prometheus-client>=0.20.0",
"psutil", "pydantic", "python-multipart",
"pyzmq>=25.1.2", "torchao", "uvicorn", "uvloop",
"xgrammar>=0.1.4"]
"xgrammar>=0.1.6"]
srt = ["sglang[runtime_common]", "torch", "vllm>=0.6.3.post1,<=0.6.4.post1", "cuda-python", "flashinfer>=0.1.6"]

# HIP (Heterogeneous-computing Interface for Portability) for AMD
@@ -33,7 +33,7 @@ srt_hip = ["sglang[runtime_common]", "torch", "vllm==0.6.3.dev13"]
srt_xpu = ["sglang[runtime_common]"]
#For Intel Gaudi(device : hpu) follow the installation guide
#https://docs.vllm.ai/en/latest/getting_started/gaudi-installation.html
srt_hpu = ["sglang[runtime_common]"]
srt_hpu = ["sglang[runtime_common]"]

openai = ["openai>=1.0", "tiktoken"]
anthropic = ["anthropic>=0.20.0"]
@@ -50,6 +50,7 @@ all = ["sglang[srt]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]"]
all_hip = ["sglang[srt_hip]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]"]
all_xpu = ["sglang[srt_xpu]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]"]
all_hpu = ["sglang[srt_hpu]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]"]

dev = ["sglang[all]", "sglang[test]"]
dev_hip = ["sglang[all_hip]", "sglang[test]"]
dev_xpu = ["sglang[all_xpu]", "sglang[test]"]
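
Two dependency notes. `setproctitle` is a small library for renaming the current OS process; a hedged guess at the motivation is that it lets SGLang label its scheduler/worker processes so they are identifiable in `ps`/`top` (the title below is purely illustrative, not necessarily what SGLang sets). The `xgrammar>=0.1.6` bump presumably provides the `compile_builtin_json_grammar` API used later in this diff.

```python
# Illustrative use of the newly added setproctitle dependency; the title
# string is an arbitrary example.
import setproctitle

setproctitle.setproctitle("sglang::scheduler")
print(setproctitle.getproctitle())  # the new name shows up in `ps` / `top`
```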
2 changes: 1 addition & 1 deletion python/sglang/bench_offline_throughput.py
@@ -285,7 +285,7 @@ def throughput_test(
else:
raise ValueError('Please set backend to either "engine" or "runtime"')

tokenizer_id = server_args.model_path
tokenizer_id = server_args.tokenizer_path or server_args.model_path
tokenizer = get_tokenizer(tokenizer_id)

# Set global environments
9 changes: 8 additions & 1 deletion python/sglang/bench_serving.py
@@ -321,6 +321,8 @@ async def async_request_sglang_generate(
},
"stream": not args.disable_stream,
"lora_path": request_func_input.lora_name,
"return_logprob": args.return_logprob,
"logprob_start_len": -1,
**request_func_input.extra_request_body,
}
headers = {}
@@ -911,7 +913,7 @@ async def limited_request_func(request_func_input, pbar):
prompt=test_prompt,
api_url=api_url,
prompt_len=test_prompt_len,
output_len=test_output_len,
output_len=min(test_output_len, 32),
lora_name=lora_name,
extra_request_body=extra_request_body,
)
@@ -1413,6 +1415,11 @@ def set_ulimit(target_soft_limit=65535):
action="store_true",
help="Disable ignoring EOS.",
)
parser.add_argument(
"--return-logprob",
action="store_true",
help="Return logprob.",
)
parser.add_argument(
"--extra-request-body",
metavar='{"key1": "value1", "key2": "value2"}',
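
The new `--return-logprob` flag and the two payload fields make the benchmark request logprobs from the server. As a rough illustration (assuming a locally running SGLang server on port 30000 and its native `/generate` endpoint; the prompt and sampling parameters are made up), a single request carrying these fields could look like:

```python
# Rough illustration of a /generate request with the new logprob fields.
# The server URL, prompt, and sampling parameters are assumptions, not from this diff.
import requests

payload = {
    "text": "The capital of France is",
    "sampling_params": {"temperature": 0.0, "max_new_tokens": 16},
    "stream": False,
    "return_logprob": True,    # what --return-logprob toggles in bench_serving.py
    "logprob_start_len": -1,   # the value bench_serving.py sends alongside it
}
resp = requests.post("http://127.0.0.1:30000/generate", json=payload)
print(resp.json())  # logprob details typically appear under "meta_info"
```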
5 changes: 4 additions & 1 deletion python/sglang/srt/constrained/xgrammar_backend.py
@@ -117,7 +117,10 @@ def init_value_impl(self, key: Tuple[str, str]) -> XGrammarGrammar:
key_type, key_string = key
if key_type == "json":
try:
ctx = self.grammar_compiler.compile_json_schema(schema=key_string)
if key_string == "$$ANY$$":
ctx = self.grammar_compiler.compile_builtin_json_grammar()
else:
ctx = self.grammar_compiler.compile_json_schema(schema=key_string)
except RuntimeError as e:
logging.warning(
f"Skip invalid json_schema: json_schema={key_string}, {e=}"
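
The `$$ANY$$` sentinel routes "any valid JSON" requests to xgrammar's prebuilt JSON grammar instead of compiling a schema, presumably so that unconstrained-JSON requests (where there is no schema to compile) still get grammar-guided decoding. A standalone mirror of that dispatch, with a stand-in `compiler` object since constructing a real grammar compiler needs a tokenizer, is:

```python
# Stand-alone mirror of the dispatch above; `compiler` stands in for the
# xgrammar grammar compiler held by the backend.
ANY_JSON = "$$ANY$$"  # sentinel meaning "any syntactically valid JSON"

def compile_json_grammar(compiler, key_string: str):
    if key_string == ANY_JSON:
        # Free-form JSON: reuse the prebuilt grammar, no schema compilation.
        return compiler.compile_builtin_json_grammar()
    # Otherwise treat key_string as a JSON Schema document.
    return compiler.compile_json_schema(schema=key_string)
```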
2 changes: 2 additions & 0 deletions python/sglang/srt/layers/attention/flashinfer_backend.py
@@ -678,6 +678,7 @@ def call_begin_forward(
self.num_qo_heads,
self.num_kv_heads,
self.head_dim,
q_data_type=self.q_data_type,
)

# cached part
@@ -691,6 +692,7 @@
self.num_kv_heads,
self.head_dim,
1,
q_data_type=self.q_data_type,
)


41 changes: 16 additions & 25 deletions python/sglang/srt/layers/attention/triton_backend.py
@@ -5,7 +5,6 @@
import torch

from sglang.srt.layers.attention import AttentionBackend
from sglang.srt.managers.schedule_batch import global_server_args_dict
from sglang.srt.model_executor.forward_batch_info import ForwardBatch

if TYPE_CHECKING:
@@ -35,10 +34,8 @@ def __init__(self, model_runner: ModelRunner):
model_runner.model_config.num_attention_heads // model_runner.tp_size
)

if global_server_args_dict.get("triton_attention_reduce_in_fp32", False):
self.reduce_dtype = torch.float32
else:
self.reduce_dtype = torch.float16
self.num_kv_splits = model_runner.server_args.triton_attention_num_kv_splits
self.v_head_dim = model_runner.token_to_kv_pool.get_value_buffer(0).shape[-1]

self.forward_metadata = None

@@ -50,23 +47,23 @@ def init_forward_metadata(self, forward_batch: ForwardBatch):
"""Init auxiliary variables for triton attention backend."""

if forward_batch.forward_mode.is_decode():
start_loc = torch.zeros_like(forward_batch.seq_lens, dtype=torch.int32)
start_loc[1:] = torch.cumsum(forward_batch.seq_lens[:-1], dim=0)

total_num_tokens = forward_batch.seq_lens_sum
attn_logits = torch.empty(
(self.num_head, total_num_tokens),
dtype=self.reduce_dtype,
(
forward_batch.batch_size,
self.num_head,
self.num_kv_splits,
self.v_head_dim + 1,
),
dtype=torch.float32,
device=self.device,
)

max_seq_len = torch.max(forward_batch.seq_lens).item()
max_extend_len = None
else:
start_loc = attn_logits = max_seq_len = None
attn_logits = None
max_extend_len = torch.max(forward_batch.extend_seq_lens).item()

self.forward_metadata = start_loc, attn_logits, max_seq_len, max_extend_len
self.forward_metadata = attn_logits, max_extend_len

def init_cuda_graph_state(self, max_bs: int):
self.cuda_graph_max_total_num_tokens = max_bs * self.cuda_graph_max_seq_len
@@ -75,11 +72,8 @@ def init_cuda_graph_state(self, max_bs: int):
(max_bs,), dtype=torch.int32, device=self.device
)
self.cuda_graph_attn_logits = torch.empty(
(
self.num_head,
self.cuda_graph_max_total_num_tokens,
),
dtype=self.reduce_dtype,
(max_bs, self.num_head, self.num_kv_splits, self.v_head_dim + 1),
dtype=torch.float32,
device="cuda",
)

@@ -92,9 +86,7 @@ def init_forward_metadata_capture_cuda_graph(
):
# NOTE: encoder_lens expected to be zeros or None
self.forward_metadata = (
self.cuda_graph_start_loc,
self.cuda_graph_attn_logits,
self.cuda_graph_max_seq_len,
None,
)

@@ -133,7 +125,7 @@ def forward_extend(
layer, forward_batch.out_cache_loc, k, v
)

start_loc, attn_logits, max_seq_len, max_extend_len = self.forward_metadata
_, max_extend_len = self.forward_metadata
self.extend_attention_fwd(
q.view(-1, layer.tp_q_head_num, layer.qk_head_dim),
k.contiguous(),
@@ -171,7 +163,7 @@ def forward_decode(
else:
o = torch.empty_like(q)

start_loc, attn_logits, max_seq_len, max_extend_len = self.forward_metadata
attn_logits, _ = self.forward_metadata

if save_kv_cache:
forward_batch.token_to_kv_pool.set_kv_buffer(
@@ -185,10 +177,9 @@
o.view(-1, layer.tp_q_head_num, layer.v_head_dim),
forward_batch.req_to_token_pool.req_to_token,
forward_batch.req_pool_indices,
start_loc,
forward_batch.seq_lens,
attn_logits,
max_seq_len,
self.num_kv_splits,
layer.scaling,
layer.logit_cap,
)
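
The Triton decode path now allocates a split-KV buffer of shape `(batch_size, num_head, num_kv_splits, v_head_dim + 1)` in fp32, replacing the old `(num_head, total_num_tokens)` logits buffer together with `start_loc` and `max_seq_len`. A hedged reading of the `+ 1`: each KV split stores a partial output vector plus that split's log-sum-exp, which a second pass combines (the usual flash-decoding recipe). The sketch below is an interpretation of that layout with a reference combine step, not the kernel's actual code.

```python
# Reference (non-Triton) view of the assumed attn_logits layout: each of the
# num_kv_splits slices holds [partial_output (v_head_dim), log-sum-exp (1)].
import torch

batch_size, num_head, num_kv_splits, v_head_dim = 2, 8, 8, 128
attn_logits = torch.randn(batch_size, num_head, num_kv_splits, v_head_dim + 1)

def combine_splits(buf: torch.Tensor) -> torch.Tensor:
    partial_out = buf[..., :-1]           # (bs, heads, splits, v_head_dim)
    lse = buf[..., -1]                    # (bs, heads, splits)
    weights = torch.softmax(lse, dim=-1)  # split i weighted by exp(lse_i - logsumexp(lse))
    return (weights.unsqueeze(-1) * partial_out).sum(dim=-2)  # (bs, heads, v_head_dim)

print(combine_splits(attn_logits).shape)  # torch.Size([2, 8, 128])
```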