From 0f3c74f02bc834e2112fb42515f22cdcc0c54394 Mon Sep 17 00:00:00 2001 From: Isotr0py <2037008807@qq.com> Date: Fri, 27 Dec 2024 14:54:58 +0800 Subject: [PATCH 01/12] trigger edge case on internvl intentionally Signed-off-by: Isotr0py <2037008807@qq.com> --- vllm/model_executor/models/intern_vit.py | 12 ++++++++++-- vllm/model_executor/models/internvl.py | 14 +++++++++++++- 2 files changed, 23 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/models/intern_vit.py b/vllm/model_executor/models/intern_vit.py index 7ff68bd60e8ad..508d3557db2fb 100644 --- a/vllm/model_executor/models/intern_vit.py +++ b/vllm/model_executor/models/intern_vit.py @@ -155,13 +155,21 @@ def __init__( self.tp_size) self.scale = self.head_dim**-0.5 - self.qkv = QKVParallelLinear( + # self.qkv = QKVParallelLinear( + # self.embed_dim, + # self.head_dim, + # num_dummy_heads + self.num_heads, + # bias=config.qkv_bias, + # quant_config=quant_config, + # prefix=f"{prefix}.qkv", + # ) + self.qkv_proj = QKVParallelLinear( self.embed_dim, self.head_dim, num_dummy_heads + self.num_heads, bias=config.qkv_bias, quant_config=quant_config, - prefix=f"{prefix}.qkv", + prefix=f"{prefix}.qkv_proj", ) self.qk_normalization = config.qk_normalization diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py index f4b7e4478c164..b4a91c05650e5 100644 --- a/vllm/model_executor/models/internvl.py +++ b/vllm/model_executor/models/internvl.py @@ -34,7 +34,7 @@ from .clip import (dummy_image_for_clip, dummy_seq_data_for_clip, get_clip_num_patches) from .interfaces import SupportsMultiModal, SupportsPP -from .utils import (AutoWeightsLoader, flatten_bn, init_vllm_registered_model, +from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn, init_vllm_registered_model, maybe_prefix, merge_multimodal_embeddings) IMG_START = '' @@ -473,6 +473,18 @@ def dummy_data( @INPUT_REGISTRY.register_input_processor(input_pipeline.input_processor) class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP): + # BitandBytes specific attributes + bitsandbytes_stacked_params_mapping = { + # shard_name, weight_name, index + "q_proj": ("qkv_proj", 0), + "k_proj": ("qkv_proj", 1), + "v_proj": ("qkv_proj", 2), + "gate_proj": ("gate_up_proj", 0), + "up_proj": ("gate_up_proj", 1), + } + + hf_to_vllm_mapper = WeightsMapper(orig_to_new_prefix={".qkv.": ".qkv_proj."}) + def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: super().__init__() From c5eac0816fb9c9ae88c1deec6d8fc5afdac78957 Mon Sep 17 00:00:00 2001 From: Isotr0py <2037008807@qq.com> Date: Fri, 27 Dec 2024 15:15:00 +0800 Subject: [PATCH 02/12] trigger edge case on internvl intentionally Signed-off-by: Isotr0py <2037008807@qq.com> --- vllm/model_executor/models/internvl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py index b4a91c05650e5..89bdff56e396b 100644 --- a/vllm/model_executor/models/internvl.py +++ b/vllm/model_executor/models/internvl.py @@ -483,7 +483,7 @@ class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP): "up_proj": ("gate_up_proj", 1), } - hf_to_vllm_mapper = WeightsMapper(orig_to_new_prefix={".qkv.": ".qkv_proj."}) + hf_to_vllm_mapper = WeightsMapper(orig_to_new_substr={".qkv.": ".qkv_proj."}) def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: super().__init__() From 4e8ed740570373c32cdc57815e59671dba2e4f06 Mon Sep 17 00:00:00 2001 From: Isotr0py <2037008807@qq.com> Date: Fri, 27 Dec 2024 16:01:53 +0800 Subject: [PATCH 03/12] trigger edge case on internvl intentionally Signed-off-by: Isotr0py <2037008807@qq.com> --- vllm/model_executor/models/internvl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py index 89bdff56e396b..42b50543fb85a 100644 --- a/vllm/model_executor/models/internvl.py +++ b/vllm/model_executor/models/internvl.py @@ -786,4 +786,4 @@ def sample( def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader(self) - return loader.load_weights(weights) + return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) From 1b0edd3e06f3e0fb3fa82af49e367e4bfb0e65ea Mon Sep 17 00:00:00 2001 From: Isotr0py <2037008807@qq.com> Date: Fri, 27 Dec 2024 16:12:42 +0800 Subject: [PATCH 04/12] trigger edge case on internvl intentionally Signed-off-by: Isotr0py <2037008807@qq.com> --- vllm/model_executor/models/intern_vit.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/models/intern_vit.py b/vllm/model_executor/models/intern_vit.py index 508d3557db2fb..e9cc3d7394e87 100644 --- a/vllm/model_executor/models/intern_vit.py +++ b/vllm/model_executor/models/intern_vit.py @@ -207,7 +207,8 @@ def _apply_qk_norm(self, q: torch.Tensor, k: torch.Tensor): def forward(self, x: torch.Tensor) -> torch.Tensor: B, N, _ = x.shape - qkv, _ = self.qkv(x) + # qkv, _ = self.qkv(x) + qkv, _ = self.qkv_proj(x) q, k, v = qkv.chunk(3, dim=-1) if self.qk_normalization: From 02f8a5d86112cb1aad751b57a6031b992799e3b1 Mon Sep 17 00:00:00 2001 From: Isotr0py <2037008807@qq.com> Date: Fri, 27 Dec 2024 16:50:24 +0800 Subject: [PATCH 05/12] handle target modules Signed-off-by: Isotr0py <2037008807@qq.com> --- vllm/model_executor/model_loader/loader.py | 3 +-- vllm/model_executor/models/phi3.py | 4 ---- 2 files changed, 1 insertion(+), 6 deletions(-) diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index f2d9293b31a83..272ca628a2a58 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -995,8 +995,7 @@ def _get_bnb_target_modules(self, model: nn.Module) -> None: for sub_name in sub_modules: self.target_modules.append( name.replace(last_name, sub_name)) - else: - self.target_modules.append(name) + self.target_modules.append(name) assert (self.target_modules ), "vllm currently does not support BNB quantization for" f" {type(model).__name__}" diff --git a/vllm/model_executor/models/phi3.py b/vllm/model_executor/models/phi3.py index 937858ee3b8c2..34141511ea791 100644 --- a/vllm/model_executor/models/phi3.py +++ b/vllm/model_executor/models/phi3.py @@ -14,7 +14,3 @@ class Phi3ForCausalLM(LlamaForCausalLM): "gate_up_proj", ], } - - # BitandBytes specific attributes - # Initialize an empty dict when there is no stacked parameter mapping. - bitsandbytes_stacked_params_mapping = {} From d2dee347a8e14e002ea66a9c67477a0950f80d02 Mon Sep 17 00:00:00 2001 From: Isotr0py <2037008807@qq.com> Date: Fri, 27 Dec 2024 16:58:12 +0800 Subject: [PATCH 06/12] revert phi3 Signed-off-by: Isotr0py <2037008807@qq.com> --- vllm/model_executor/models/phi3.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/vllm/model_executor/models/phi3.py b/vllm/model_executor/models/phi3.py index 34141511ea791..937858ee3b8c2 100644 --- a/vllm/model_executor/models/phi3.py +++ b/vllm/model_executor/models/phi3.py @@ -14,3 +14,7 @@ class Phi3ForCausalLM(LlamaForCausalLM): "gate_up_proj", ], } + + # BitandBytes specific attributes + # Initialize an empty dict when there is no stacked parameter mapping. + bitsandbytes_stacked_params_mapping = {} From d732bb0fed299eb57800941ebc5e25ff68582b17 Mon Sep 17 00:00:00 2001 From: Isotr0py <2037008807@qq.com> Date: Fri, 27 Dec 2024 17:07:05 +0800 Subject: [PATCH 07/12] add comments Signed-off-by: Isotr0py <2037008807@qq.com> --- vllm/model_executor/model_loader/loader.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index 272ca628a2a58..ee405a30e10fd 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -995,7 +995,11 @@ def _get_bnb_target_modules(self, model: nn.Module) -> None: for sub_name in sub_modules: self.target_modules.append( name.replace(last_name, sub_name)) + # we also add original module name in case that model has + # a mixture of disk-merged and disk-splitted weights with + # same last name. self.target_modules.append(name) + assert (self.target_modules ), "vllm currently does not support BNB quantization for" f" {type(model).__name__}" From 5f4934f3ab3112de7802bb51349cf719455aec18 Mon Sep 17 00:00:00 2001 From: Isotr0py <2037008807@qq.com> Date: Fri, 27 Dec 2024 17:09:48 +0800 Subject: [PATCH 08/12] revert internvl Signed-off-by: Isotr0py <2037008807@qq.com> --- vllm/model_executor/models/intern_vit.py | 17 ++++------------- vllm/model_executor/models/internvl.py | 16 ++-------------- 2 files changed, 6 insertions(+), 27 deletions(-) diff --git a/vllm/model_executor/models/intern_vit.py b/vllm/model_executor/models/intern_vit.py index e9cc3d7394e87..20cd180ea8a01 100644 --- a/vllm/model_executor/models/intern_vit.py +++ b/vllm/model_executor/models/intern_vit.py @@ -155,21 +155,13 @@ def __init__( self.tp_size) self.scale = self.head_dim**-0.5 - # self.qkv = QKVParallelLinear( - # self.embed_dim, - # self.head_dim, - # num_dummy_heads + self.num_heads, - # bias=config.qkv_bias, - # quant_config=quant_config, - # prefix=f"{prefix}.qkv", - # ) - self.qkv_proj = QKVParallelLinear( + self.qkv = QKVParallelLinear( self.embed_dim, self.head_dim, num_dummy_heads + self.num_heads, bias=config.qkv_bias, quant_config=quant_config, - prefix=f"{prefix}.qkv_proj", + prefix=f"{prefix}.qkv", ) self.qk_normalization = config.qk_normalization @@ -207,8 +199,7 @@ def _apply_qk_norm(self, q: torch.Tensor, k: torch.Tensor): def forward(self, x: torch.Tensor) -> torch.Tensor: B, N, _ = x.shape - # qkv, _ = self.qkv(x) - qkv, _ = self.qkv_proj(x) + qkv, _ = self.qkv(x) q, k, v = qkv.chunk(3, dim=-1) if self.qk_normalization: @@ -480,4 +471,4 @@ def load_weights(self, weights: Iterable[Tuple[str, default_weight_loader) weight_loader(param, loaded_weight) loaded_params.add(name) - return loaded_params + return loaded_params \ No newline at end of file diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py index 42b50543fb85a..d4d67a0bc4c33 100644 --- a/vllm/model_executor/models/internvl.py +++ b/vllm/model_executor/models/internvl.py @@ -34,7 +34,7 @@ from .clip import (dummy_image_for_clip, dummy_seq_data_for_clip, get_clip_num_patches) from .interfaces import SupportsMultiModal, SupportsPP -from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn, init_vllm_registered_model, +from .utils import (AutoWeightsLoader, flatten_bn, init_vllm_registered_model, maybe_prefix, merge_multimodal_embeddings) IMG_START = '' @@ -473,18 +473,6 @@ def dummy_data( @INPUT_REGISTRY.register_input_processor(input_pipeline.input_processor) class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP): - # BitandBytes specific attributes - bitsandbytes_stacked_params_mapping = { - # shard_name, weight_name, index - "q_proj": ("qkv_proj", 0), - "k_proj": ("qkv_proj", 1), - "v_proj": ("qkv_proj", 2), - "gate_proj": ("gate_up_proj", 0), - "up_proj": ("gate_up_proj", 1), - } - - hf_to_vllm_mapper = WeightsMapper(orig_to_new_substr={".qkv.": ".qkv_proj."}) - def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: super().__init__() @@ -786,4 +774,4 @@ def sample( def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader(self) - return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) + return loader.load_weights(weights) \ No newline at end of file From e2381087f2d0943609e7b9306e83a9d722f275d3 Mon Sep 17 00:00:00 2001 From: Isotr0py <2037008807@qq.com> Date: Fri, 27 Dec 2024 17:10:21 +0800 Subject: [PATCH 09/12] code format Signed-off-by: Isotr0py <2037008807@qq.com> --- vllm/model_executor/model_loader/loader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index ee405a30e10fd..c209926cfa7c8 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -995,7 +995,7 @@ def _get_bnb_target_modules(self, model: nn.Module) -> None: for sub_name in sub_modules: self.target_modules.append( name.replace(last_name, sub_name)) - # we also add original module name in case that model has + # we also add original module name in case that model has # a mixture of disk-merged and disk-splitted weights with # same last name. self.target_modules.append(name) From 0df51657c11818653d25adeadcb1e62fa097e9e8 Mon Sep 17 00:00:00 2001 From: Isotr0py <2037008807@qq.com> Date: Fri, 27 Dec 2024 17:11:15 +0800 Subject: [PATCH 10/12] revert internvl Signed-off-by: Isotr0py <2037008807@qq.com> --- vllm/model_executor/models/intern_vit.py | 2 +- vllm/model_executor/models/internvl.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/models/intern_vit.py b/vllm/model_executor/models/intern_vit.py index 20cd180ea8a01..7ff68bd60e8ad 100644 --- a/vllm/model_executor/models/intern_vit.py +++ b/vllm/model_executor/models/intern_vit.py @@ -471,4 +471,4 @@ def load_weights(self, weights: Iterable[Tuple[str, default_weight_loader) weight_loader(param, loaded_weight) loaded_params.add(name) - return loaded_params \ No newline at end of file + return loaded_params diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py index d4d67a0bc4c33..f4b7e4478c164 100644 --- a/vllm/model_executor/models/internvl.py +++ b/vllm/model_executor/models/internvl.py @@ -774,4 +774,4 @@ def sample( def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader(self) - return loader.load_weights(weights) \ No newline at end of file + return loader.load_weights(weights) From c372d3faba888bc611a37d08950f6fc29ee4ebdb Mon Sep 17 00:00:00 2001 From: Isotr0py <2037008807@qq.com> Date: Fri, 27 Dec 2024 21:55:55 +0800 Subject: [PATCH 11/12] update comments Signed-off-by: Isotr0py <2037008807@qq.com> --- vllm/model_executor/model_loader/loader.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index c209926cfa7c8..d0c77c841ede7 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -995,9 +995,9 @@ def _get_bnb_target_modules(self, model: nn.Module) -> None: for sub_name in sub_modules: self.target_modules.append( name.replace(last_name, sub_name)) - # we also add original module name in case that model has - # a mixture of disk-merged and disk-splitted weights with - # same last name. + # Add original module name even if the module has stacked map, + # in case model has a mixture of disk-merged and disk-splitted + # weights with same last name. self.target_modules.append(name) assert (self.target_modules From b89cee37a794cb1aa8019c803d667d75ea0d40bb Mon Sep 17 00:00:00 2001 From: Isotr0py <2037008807@qq.com> Date: Fri, 27 Dec 2024 22:01:15 +0800 Subject: [PATCH 12/12] code format Signed-off-by: Isotr0py <2037008807@qq.com> --- vllm/model_executor/model_loader/loader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index d0c77c841ede7..1fe887f3c40f8 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -996,7 +996,7 @@ def _get_bnb_target_modules(self, model: nn.Module) -> None: self.target_modules.append( name.replace(last_name, sub_name)) # Add original module name even if the module has stacked map, - # in case model has a mixture of disk-merged and disk-splitted + # in case model has a mixture of disk-merged and disk-splitted # weights with same last name. self.target_modules.append(name)