[Model] Support GGUF models newly added in transformers 4.46.0 #9685

Merged (19 commits) on Jan 13, 2025
22 changes: 8 additions & 14 deletions examples/gguf_inference.py
@@ -3,27 +3,20 @@
from vllm import LLM, SamplingParams


def run_gguf_inference(model_path):
PROMPT_TEMPLATE = "<|system|>\n{system_message}</s>\n<|user|>\n{prompt}</s>\n<|assistant|>\n" # noqa: E501
system_message = "You are a friendly chatbot who always responds in the style of a pirate." # noqa: E501
def run_gguf_inference(model_path, tokenizer):
# Sample prompts.
prompts = [
"How many helicopters can a human eat in one sitting?",
"What's the future of AI?",
]
prompts = [
PROMPT_TEMPLATE.format(system_message=system_message, prompt=prompt)
for prompt in prompts
]
prompts = [[{"role": "user", "content": prompt}] for prompt in prompts]
# Create a sampling params object.
sampling_params = SamplingParams(temperature=0, max_tokens=128)

# Create an LLM.
llm = LLM(model=model_path,
tokenizer="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
gpu_memory_utilization=0.95)
llm = LLM(model=model_path, tokenizer=tokenizer)

outputs = llm.generate(prompts, sampling_params)
outputs = llm.chat(prompts, sampling_params)
# Print the outputs.
for output in outputs:
prompt = output.prompt
@@ -32,7 +25,8 @@ def run_gguf_inference(model_path):


if __name__ == "__main__":
repo_id = "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF"
filename = "tinyllama-1.1b-chat-v1.0.Q4_0.gguf"
repo_id = "bartowski/Phi-3-medium-4k-instruct-GGUF"
filename = "Phi-3-medium-4k-instruct-IQ2_M.gguf"
tokenizer = "microsoft/Phi-3-medium-4k-instruct"
model = hf_hub_download(repo_id, filename=filename)
run_gguf_inference(model)
run_gguf_inference(model, tokenizer)
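For reference, the same pattern extends to the other GGUF architectures this PR enables. A minimal sketch, assuming the Phi-3.5-mini checkpoint and tokenizer named in the test configurations further down in this diff are available on the Hub:

```python
from huggingface_hub import hf_hub_download

from vllm import LLM, SamplingParams

# The GGUF file supplies the quantized weights; the tokenizer comes from the
# original (unquantized) repository, as in the example above.
gguf_path = hf_hub_download("bartowski/Phi-3.5-mini-instruct-GGUF",
                            filename="Phi-3.5-mini-instruct-IQ4_XS.gguf")
llm = LLM(model=gguf_path, tokenizer="microsoft/Phi-3.5-mini-instruct")

# llm.chat() applies the tokenizer's chat template before generating.
conversations = [[{"role": "user", "content": "What's the future of AI?"}]]
outputs = llm.chat(conversations, SamplingParams(temperature=0, max_tokens=128))
for output in outputs:
    print(output.outputs[0].text)
```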
111 changes: 82 additions & 29 deletions tests/models/decoder_only/language/test_gguf.py
@@ -4,33 +4,84 @@
"""

import os
from typing import List, NamedTuple, Type

import pytest
import transformers
from huggingface_hub import hf_hub_download
from packaging.version import parse
from transformers import AutoTokenizer

from tests.quantization.utils import is_quant_method_supported

from ....conftest import VllmRunner
from ...utils import check_logprobs_close

os.environ["TOKENIZERS_PARALLELISM"] = "true"

MAX_MODEL_LEN = 1024

# FIXME: Move this to conftest

class GGUFTestConfig(NamedTuple):
original_model: str
gguf_repo: str
gguf_filename: str
run_requirement: bool = True

@property
def gguf_model(self):
return hf_hub_download(self.gguf_repo, filename=self.gguf_filename)


TRANSFORMERS_REQUIREMENT = parse(transformers.__version__) >= parse("4.46.0")

LLAMA_CONFIG = GGUFTestConfig(
original_model="meta-llama/Llama-3.2-1B-Instruct",
gguf_repo="bartowski/Llama-3.2-1B-Instruct-GGUF",
gguf_filename="Llama-3.2-1B-Instruct-IQ4_XS.gguf",
)

QWEN2_CONFIG = GGUFTestConfig(
original_model="Qwen/Qwen2.5-1.5B-Instruct",
gguf_repo="Qwen/Qwen2.5-1.5B-Instruct-GGUF",
gguf_filename="qwen2.5-1.5b-instruct-q6_k.gguf",
)

PHI3_CONFIG = GGUFTestConfig(
original_model="microsoft/Phi-3.5-mini-instruct",
gguf_repo="bartowski/Phi-3.5-mini-instruct-GGUF",
gguf_filename="Phi-3.5-mini-instruct-IQ4_XS.gguf",
run_requirement=TRANSFORMERS_REQUIREMENT,
)

GPT2_CONFIG = GGUFTestConfig(
original_model="openai-community/gpt2-large",
gguf_repo="QuantFactory/gpt2-large-GGUF",
gguf_filename="gpt2-large.Q4_K_M.gguf",
run_requirement=TRANSFORMERS_REQUIREMENT,
)

STABLELM_CONFIG = GGUFTestConfig(
original_model="stabilityai/stablelm-3b-4e1t",
gguf_repo="afrideva/stablelm-3b-4e1t-GGUF",
gguf_filename="stablelm-3b-4e1t.q4_k_m.gguf",
run_requirement=TRANSFORMERS_REQUIREMENT,
)

STARCODER_CONFIG = GGUFTestConfig(
original_model="bigcode/starcoder2-3b",
gguf_repo="QuantFactory/starcoder2-3b-GGUF",
gguf_filename="starcoder2-3b.Q6_K.gguf",
run_requirement=TRANSFORMERS_REQUIREMENT,
)

MODELS = [
("meta-llama/Llama-3.2-1B-Instruct",
hf_hub_download("bartowski/Llama-3.2-1B-Instruct-GGUF",
filename="Llama-3.2-1B-Instruct-Q4_K_M.gguf")),
("meta-llama/Llama-3.2-1B-Instruct",
hf_hub_download("bartowski/Llama-3.2-1B-Instruct-GGUF",
filename="Llama-3.2-1B-Instruct-IQ4_XS.gguf")),
("Qwen/Qwen2-1.5B-Instruct",
hf_hub_download("Qwen/Qwen2-1.5B-Instruct-GGUF",
filename="qwen2-1_5b-instruct-q4_k_m.gguf")),
("Qwen/Qwen2-1.5B-Instruct",
hf_hub_download("legraphista/Qwen2-1.5B-Instruct-IMat-GGUF",
filename="Qwen2-1.5B-Instruct.IQ4_XS.gguf")),
LLAMA_CONFIG,
QWEN2_CONFIG,
PHI3_CONFIG,
GPT2_CONFIG,
STABLELM_CONFIG,
# STARCODER_CONFIG, # broken
]


@@ -42,10 +93,10 @@
@pytest.mark.parametrize("num_logprobs", [5])
@pytest.mark.parametrize("tp_size", [1, 2])
def test_models(
num_gpus_available,
vllm_runner,
example_prompts,
model,
num_gpus_available: int,
vllm_runner: Type[VllmRunner],
example_prompts: List[str],
model: GGUFTestConfig,
dtype: str,
max_tokens: int,
num_logprobs: int,
@@ -54,28 +105,30 @@ def test_models(
if num_gpus_available < tp_size:
pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}")

original_model, gguf_model = model
if not model.run_requirement:
pytest.skip(
f"Model not supported in transformers=={transformers.__version__}")

tokenizer = AutoTokenizer.from_pretrained(original_model)
messages = [[{
'role': 'user',
'content': prompt
}] for prompt in example_prompts]
example_prompts = tokenizer.apply_chat_template(messages,
tokenize=False,
add_generation_prompt=True)
tokenizer = AutoTokenizer.from_pretrained(model.original_model)
if tokenizer.chat_template is not None:
messages = [[{
'role': 'user',
'content': prompt
}] for prompt in example_prompts]
example_prompts = tokenizer.apply_chat_template(
messages, tokenize=False, add_generation_prompt=True)

# Run unquantized model.
with vllm_runner(model_name=original_model,
with vllm_runner(model_name=model.original_model,
dtype=dtype,
max_model_len=MAX_MODEL_LEN,
tensor_parallel_size=tp_size) as original_model:

original_outputs = original_model.generate_greedy_logprobs(
example_prompts[:-1], max_tokens, num_logprobs)

# Run gguf model.
with vllm_runner(model_name=gguf_model,
with vllm_runner(model_name=model.gguf_model,
tokenizer_name=model.original_model,
dtype=dtype,
max_model_len=MAX_MODEL_LEN,
tensor_parallel_size=tp_size) as gguf_model:
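Because each test case is now a self-describing `GGUFTestConfig`, covering another architecture is a one-entry change. A hypothetical sketch (the GGUF repo and filename below are placeholders, not part of this PR):

```python
# Hypothetical: register one more GGUF test case. The quantized repo and
# filename are placeholders and would need to point at a real checkpoint.
MISTRAL_CONFIG = GGUFTestConfig(
    original_model="mistralai/Mistral-7B-Instruct-v0.3",
    gguf_repo="some-user/Mistral-7B-Instruct-v0.3-GGUF",
    gguf_filename="Mistral-7B-Instruct-v0.3.Q4_K_M.gguf",
    # Gate on the transformers version only if GGUF support for the
    # architecture landed in 4.46.0.
    run_requirement=TRANSFORMERS_REQUIREMENT,
)
MODELS.append(MISTRAL_CONFIG)
```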
48 changes: 32 additions & 16 deletions vllm/model_executor/layers/linear.py
@@ -440,8 +440,11 @@ def weight_loader(self,
is_gguf_weight = getattr(param, "is_gguf_weight", False)
is_gguf_weight_type = getattr(param, "is_gguf_weight_type", False)
if is_gguf_weight_type:
param.data[loaded_shard_id].copy_(loaded_weight)
param.shard_weight_type[loaded_shard_id] = loaded_weight.item()
if loaded_shard_id is not None:
param.data[loaded_shard_id].copy_(loaded_weight)
param.shard_weight_type[loaded_shard_id] = loaded_weight.item()
else:
param.weight_type = loaded_weight.item()
return

if is_gguf_weight:
@@ -455,11 +458,16 @@
loaded_weight = loaded_weight.narrow(output_dim, start_idx,
shard_size)

param.shard_id.append(loaded_shard_id)
param.shard_id_map[loaded_shard_id] = len(param.data_container)
param.data_container.append(loaded_weight)
if len(param.data_container) == 2:
self.qweight = param.materialize_nested()
if loaded_shard_id is not None:
param.shard_id.append(loaded_shard_id)
param.shard_id_map[loaded_shard_id] = len(param.data_container)
param.data_container.append(loaded_weight)
if len(param.data_container) == 2:
self.qweight = param.materialize_nested()
else:
param.materialize(loaded_weight.shape,
dtype=loaded_weight.dtype)
param.data.copy_(loaded_weight)
return

param_data = param.data
@@ -775,10 +783,13 @@ def weight_loader(self,
# initialize GGUF param after we know the quantize type
is_gguf_weight = getattr(param, "is_gguf_weight", False)
is_gguf_weight_type = getattr(param, "is_gguf_weight_type", False)
if is_gguf_weight_type and loaded_shard_id is not None:
idx_map = {"q": 0, "k": 1, "v": 2}
param.data[idx_map[loaded_shard_id]].copy_(loaded_weight)
param.shard_weight_type[loaded_shard_id] = loaded_weight.item()
if is_gguf_weight_type:
if loaded_shard_id is not None:
idx_map = {"q": 0, "k": 1, "v": 2}
param.data[idx_map[loaded_shard_id]].copy_(loaded_weight)
param.shard_weight_type[loaded_shard_id] = loaded_weight.item()
else:
param.weight_type = loaded_weight.item()
return

if is_gguf_weight:
@@ -792,11 +803,16 @@ def weight_loader(self,
loaded_weight = loaded_weight.narrow(output_dim, start_idx,
shard_size)

param.shard_id.append(loaded_shard_id)
param.shard_id_map[loaded_shard_id] = len(param.data_container)
param.data_container.append(loaded_weight)
if len(param.data_container) == 3:
self.qweight = param.materialize_nested()
if loaded_shard_id is not None:
param.shard_id.append(loaded_shard_id)
param.shard_id_map[loaded_shard_id] = len(param.data_container)
param.data_container.append(loaded_weight)
if len(param.data_container) == 3:
self.qweight = param.materialize_nested()
else:
param.materialize(loaded_weight.shape,
dtype=loaded_weight.dtype)
param.data.copy_(loaded_weight)
return

param_data = param.data
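The new `loaded_shard_id is None` branches cover checkpoints that supply a merged projection (for example Phi-3's fused `qkv_proj` or `gate_up_proj`) as a single quantized tensor instead of one tensor per shard. A simplified, self-contained sketch of the dispatch, using a toy stand-in rather than vLLM's actual GGUF parameter class:

```python
import torch


class ToyGGUFParam:
    """Toy stand-in for vLLM's GGUF parameter, for illustration only."""

    def __init__(self):
        self.shard_id, self.shard_id_map, self.data_container = [], {}, []
        self.data = None

    def materialize(self, shape, dtype):
        self.data = torch.empty(shape, dtype=dtype)


def load_gguf_weight(param, loaded_weight, loaded_shard_id=None):
    if loaded_shard_id is not None:
        # Per-shard tensor: collect it and merge once every shard is present.
        param.shard_id.append(loaded_shard_id)
        param.shard_id_map[loaded_shard_id] = len(param.data_container)
        param.data_container.append(loaded_weight)
    else:
        # Whole fused tensor already present in the file: allocate the
        # parameter and copy it in directly.
        param.materialize(loaded_weight.shape, dtype=loaded_weight.dtype)
        param.data.copy_(loaded_weight)
```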
9 changes: 6 additions & 3 deletions vllm/model_executor/models/gpt2.py
@@ -199,7 +199,9 @@ def __init__(
assert not config.scale_attn_by_inverse_layer_idx
assert not config.reorder_and_upcast_attn
self.embed_dim = config.hidden_size
self.wte = VocabParallelEmbedding(config.vocab_size, self.embed_dim)
self.wte = VocabParallelEmbedding(config.vocab_size,
self.embed_dim,
quant_config=quant_config)
self.wpe = nn.Embedding(config.max_position_embeddings, self.embed_dim)
self.start_layer, self.end_layer, self.h = make_layers(
config.num_hidden_layers,
@@ -259,7 +261,8 @@ def __init__(
self.lm_head = self.transformer.wte
else:
self.lm_head = ParallelLMHead(self.config.vocab_size,
self.config.hidden_size)
self.config.hidden_size,
quant_config=quant_config)
self.logits_processor = LogitsProcessor(config.vocab_size)
self.sampler = Sampler()
self.make_empty_intermediate_tensors = (
@@ -297,7 +300,7 @@ def sample(
def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
params_dict = dict(self.named_parameters(remove_duplicate=False))
for name, loaded_weight in weights:
if "lm_head.weight" in name:
if name.startswith("lm_head"):
# GPT-2 ties the weights of the embedding layer and the final
# linear layer.
continue
3 changes: 2 additions & 1 deletion vllm/model_executor/models/llama.py
@@ -157,7 +157,8 @@ def __init__(
)

is_neox_style = True
if quant_config is not None and quant_config.get_name() == "gguf":
is_gguf = quant_config and quant_config.get_name() == "gguf"
if is_gguf and config.model_type == "llama":
is_neox_style = False

self.rotary_emb = get_rope(
4 changes: 3 additions & 1 deletion vllm/model_executor/models/stablelm.py
@@ -62,7 +62,8 @@ def __init__(self,
quant_config=quant_config)
self.down_proj = RowParallelLinear(config.intermediate_size,
config.hidden_size,
bias=False)
bias=False,
quant_config=quant_config)
self.act_fn = SiluAndMul()

def forward(self, x: torch.Tensor) -> torch.Tensor:
@@ -203,6 +204,7 @@ def __init__(self,
self.embed_tokens = VocabParallelEmbedding(
config.vocab_size,
config.hidden_size,
quant_config=quant_config,
)
self.start_layer, self.end_layer, self.layers = make_layers(
config.num_hidden_layers,
3 changes: 2 additions & 1 deletion vllm/model_executor/models/starcoder2.py
@@ -209,7 +209,8 @@ def __init__(self,

# TODO: consider padding_idx (currently removed)
self.embed_tokens = VocabParallelEmbedding(config.vocab_size,
config.hidden_size)
config.hidden_size,
quant_config=quant_config)
self.start_layer, self.end_layer, self.layers = make_layers(
config.num_hidden_layers,
lambda prefix: Starcoder2DecoderLayer(