Fix model loader for more quantization formats (#2448)
merrymercy authored Dec 11, 2024
1 parent f677239 commit 959735f
Showing 3 changed files with 54 additions and 2 deletions.
22 changes: 22 additions & 0 deletions python/sglang/srt/models/llama.py
@@ -294,6 +294,28 @@ def forward(
 
 
 class LlamaForCausalLM(nn.Module):
+
+    # BitsAndBytes specific attributes
+    default_bitsandbytes_target_modules = [
+        ".gate_proj.",
+        ".down_proj.",
+        ".up_proj.",
+        ".q_proj.",
+        ".k_proj.",
+        ".v_proj.",
+        ".o_proj.",
+    ]
+    # In TP, these weights are partitioned along the column dimension (dim=-1)
+    column_parallel_weights_modules = [".down_proj.", ".o_proj."]
+    bitsandbytes_stacked_params_mapping = {
+        # shard_name, weight_name, index
+        "q_proj": ("qkv_proj", 0),
+        "k_proj": ("qkv_proj", 1),
+        "v_proj": ("qkv_proj", 2),
+        "gate_proj": ("gate_up_proj", 0),
+        "up_proj": ("gate_up_proj", 1),
+    }
+
     def __init__(
         self,
         config: LlamaConfig,
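For context on the mapping above: checkpoints store q_proj/k_proj/v_proj (and gate_proj/up_proj) as separate tensors, while the model allocates fused qkv_proj and gate_up_proj parameters, so the loader must route each checkpoint shard into the right slice of the fused weight. A minimal sketch of that routing, not sglang's actual loader code (resolve_stacked_param is a hypothetical helper name):

def resolve_stacked_param(ckpt_name, mapping):
    """Map a checkpoint weight name to (fused parameter name, shard index)."""
    for shard_name, (fused_name, shard_id) in mapping.items():
        if shard_name in ckpt_name:
            return ckpt_name.replace(shard_name, fused_name), shard_id
    return ckpt_name, None  # not part of a fused parameter

mapping = {
    "q_proj": ("qkv_proj", 0),
    "k_proj": ("qkv_proj", 1),
    "v_proj": ("qkv_proj", 2),
    "gate_proj": ("gate_up_proj", 0),
    "up_proj": ("gate_up_proj", 1),
}
print(resolve_stacked_param("model.layers.0.self_attn.k_proj.weight", mapping))
# -> ('model.layers.0.self_attn.qkv_proj.weight', 1)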
20 changes: 20 additions & 0 deletions python/sglang/srt/models/qwen2.py
@@ -267,6 +267,26 @@ def forward(
 
 
 class Qwen2ForCausalLM(nn.Module):
+
+    # BitsAndBytes specific attributes
+    default_bitsandbytes_target_modules = [
+        ".gate_proj.",
+        ".down_proj.",
+        ".up_proj.",
+        ".q_proj.",
+        ".k_proj.",
+        ".v_proj.",
+        ".o_proj.",
+    ]
+    bitsandbytes_stacked_params_mapping = {
+        # shard_name, weight_name, index
+        "q_proj": ("qkv_proj", 0),
+        "k_proj": ("qkv_proj", 1),
+        "v_proj": ("qkv_proj", 2),
+        "gate_proj": ("gate_up_proj", 0),
+        "up_proj": ("gate_up_proj", 1),
+    }
+
     def __init__(
         self,
         config: Qwen2Config,
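The Qwen2 attributes mirror the Llama ones (minus column_parallel_weights_modules). The target-module list drives a simple name match at load time; a minimal sketch under that assumption (is_bnb_target is a hypothetical helper, not sglang's API):

def is_bnb_target(param_name, target_modules):
    # Plain substring match; the surrounding dots in entries like ".q_proj."
    # prevent false hits inside longer names such as ".qkv_proj.".
    return any(module in param_name for module in target_modules)

targets = [".gate_proj.", ".down_proj.", ".up_proj.",
           ".q_proj.", ".k_proj.", ".v_proj.", ".o_proj."]
print(is_bnb_target("model.layers.0.mlp.down_proj.weight", targets))    # True
print(is_bnb_target("model.layers.0.input_layernorm.weight", targets))  # False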
14 changes: 12 additions & 2 deletions python/sglang/srt/server_args.py
@@ -283,7 +283,15 @@ def add_cli_args(parser: argparse.ArgumentParser):
             "--load-format",
             type=str,
             default=ServerArgs.load_format,
-            choices=["auto", "pt", "safetensors", "npcache", "dummy", "gguf"],
+            choices=[
+                "auto",
+                "pt",
+                "safetensors",
+                "npcache",
+                "dummy",
+                "gguf",
+                "bitsandbytes",
+            ],
             help="The format of the model weights to load. "
             '"auto" will try to load the weights in the safetensors format '
             "and fall back to the pytorch bin format if safetensors format "
@@ -294,7 +302,9 @@
             "a numpy cache to speed up the loading. "
             '"dummy" will initialize the weights with random values, '
             "which is mainly for profiling. "
-            '"gguf" will load the weights in the gguf format. ',
+            '"gguf" will load the weights in the gguf format. '
+            '"bitsandbytes" will load the weights using bitsandbytes '
+            "quantization.",
         )
         parser.add_argument(
             "--trust-remote-code",
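With the new choice in place, a server launch can request the bitsandbytes loader from the command line. A usage sketch (the model path is a placeholder, and depending on the checkpoint a matching quantization setting may also be required):

python -m sglang.launch_server --model-path <model-or-bnb-checkpoint> --load-format bitsandbytes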
