[Speculative Decoding] Medusa Implementation with Top-1 proposer (vll…

…m-project#4978)
xjpang · Jul 24, 2024 · b403a0a · b403a0a
1 parent 0e3026a
commit b403a0a
Show file tree

Hide file tree

Showing 9 changed files with 587 additions and 4 deletions.
diff --git a/tests/spec_decode/e2e/test_medusa_correctness.py b/tests/spec_decode/e2e/test_medusa_correctness.py
@@ -0,0 +1,226 @@
+"""This docstring details important information on the testing methodology.
+
+Most of the tests rely on "greedy equality", where we expect the output of
+speculative decoding on a sequence to exactly match the output of normal non-
+speculative decoding.
+
+Since speculative decoding with rejection sampling guarantees that the output
+distribution matches the target model's output distribution (up to hardware
+numerics, see https://arxiv.org/pdf/2302.01318.pdf), we can expect greedy
+equality.
+
+However, we still need to verify that the scenarios below pass:
+    * Batch size 1 greedy equality
+    * Batch size >1 greedy equality
+    * Test greedy equality under preemption
+    * Test greedy equality under various number of speculative tokens.
+
+With those tests, we can at least say that Medusa does not break the
+correctness of the target model outputs.
+"""
+
+import pytest
+
+from .conftest import run_greedy_equality_correctness_test
+
+# Main (target) model whose outputs speculative decoding must reproduce.
+# lmsys/vicuna-7b-v1.3 was to be used but it's causing
+# OOM in CI pipeline, so using a smaller model.
+MAIN_MODEL = "JackFram/llama-68m"
+
+# Speculative (draft) model: the Medusa speculator paired with MAIN_MODEL.
+# NOTE(review): the "-random" suffix suggests randomly initialised heads;
+# confirm against the model card.
+SPEC_MODEL = "abhigoyal/vllm-medusa-llama-68m-random"
+
+# Max. number of speculative tokens: this corresponds to
+# num_heads in the config.json of the speculator model.
+MAX_SPEC_TOKENS = 5
+
+# Precision: passed as "dtype" to every engine configured in this file.
+PRECISION = "float32"
+
+
+# Engine arguments shared by the baseline and the speculative run.
+@pytest.mark.parametrize(
+    "common_llm_kwargs",
+    [{
+        # Skip cuda graph recording for fast test.
+        "enforce_eager": True,
+
+        # Required for spec decode.
+        "use_v2_block_manager": True,
+
+        # Print spec metrics.
+        "disable_log_stats": False,
+
+        # Precision
+        "dtype": PRECISION,
+
+        # Main model
+        "model": MAIN_MODEL,
+    }])
+@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
+# The baseline adds nothing on top of the common arguments, i.e. it has no
+# speculative model configured.
+@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
+# The run under test adds the Medusa speculator with the maximum number of
+# speculative tokens.
+@pytest.mark.parametrize("test_llm_kwargs", [
+    {
+        "speculative_model": SPEC_MODEL,
+        "num_speculative_tokens": MAX_SPEC_TOKENS,
+    },
+])
+@pytest.mark.parametrize("output_len", [
+    128,
+])
+@pytest.mark.parametrize("batch_size", [1, 32])
+@pytest.mark.parametrize("seed", [1])
+# NOTE(review): the "mlp" in the test name looks carried over from the MLP
+# speculator tests; this test exercises the Medusa speculator (SPEC_MODEL).
+def test_mlp_e2e_greedy_correctness(baseline_llm_generator, test_llm_generator,
+                                    batch_size: int, output_len: int):
+    """Verify greedy equality between Medusa speculative decoding and the
+    baseline, for batch size 1 and for a batch size greater than 1.
+
+    The generator arguments are pytest fixtures; presumably they are built
+    in conftest from the parametrized ``*_llm_kwargs`` above -- confirm
+    against conftest.
+    """
+    run_greedy_equality_correctness_test(baseline_llm_generator,
+                                         test_llm_generator,
+                                         batch_size,
+                                         max_output_len=output_len,
+                                         force_output_len=True)
+
+
+# Engine arguments shared by the baseline and the speculative run. The KV
+# cache is deliberately tiny: 34 blocks of 8 tokens for a batch of 4
+# sequences of up to 128 generated tokens each.
+# NOTE(review): this is presumably what forces preemption -- confirm.
+@pytest.mark.parametrize(
+    "common_llm_kwargs",
+    [{
+        "block_size": 8,
+        # 2 for small prompt, 256//8 for generated.
+        "num_gpu_blocks_override": 2 + 256 // 8,
+        "max_model_len": (2 + 256 // 8) * 8,
+
+        # Skip cuda graph recording for fast test.
+        "enforce_eager": True,
+
+        # Required for spec decode.
+        "use_v2_block_manager": True,
+
+        # Precision
+        "dtype": PRECISION,
+
+        # Main model
+        "model": MAIN_MODEL,
+    }])
+@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
+# The baseline adds nothing on top of the common arguments, i.e. it has no
+# speculative model configured.
+@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
+@pytest.mark.parametrize("test_llm_kwargs", [
+    {
+        "speculative_model": SPEC_MODEL,
+        "num_speculative_tokens": MAX_SPEC_TOKENS,
+    },
+])
+@pytest.mark.parametrize(
+    "output_len",
+    [
+        # Use small output len for fast test.
+        128,
+    ])
+@pytest.mark.parametrize("batch_size", [4])
+@pytest.mark.parametrize("seed", [1])
+# NOTE(review): the "mlp" in the test name looks carried over from the MLP
+# speculator tests; this test exercises the Medusa speculator (SPEC_MODEL).
+def test_mlp_e2e_greedy_correctness_with_preemption(baseline_llm_generator,
+                                                    test_llm_generator,
+                                                    batch_size: int,
+                                                    output_len: int):
+    """Verify greedy equality between Medusa speculative decoding and the
+    baseline, even when some sequences are preempted mid-generation.
+    """
+    run_greedy_equality_correctness_test(baseline_llm_generator,
+                                         test_llm_generator,
+                                         batch_size,
+                                         max_output_len=output_len,
+                                         force_output_len=True)
+
+
+# Engine arguments shared by the baseline and the speculative run.
+@pytest.mark.parametrize(
+    "common_llm_kwargs",
+    [{
+        # Skip cuda graph recording for fast test.
+        "enforce_eager": True,
+
+        # Required for spec decode.
+        "use_v2_block_manager": True,
+
+        # Precision
+        "dtype": PRECISION,
+
+        # Main model
+        "model": MAIN_MODEL,
+    }])
+@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
+# The baseline adds nothing on top of the common arguments, i.e. it has no
+# speculative model configured.
+@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
+# One test case per k in 1..MAX_SPEC_TOKENS (inclusive).
+@pytest.mark.parametrize(
+    "test_llm_kwargs",
+    [
+        {
+            "speculative_model": SPEC_MODEL,
+            "num_speculative_tokens": k,
+        }
+        # Try a range of num. speculative tokens
+        for k in range(1, 1 + MAX_SPEC_TOKENS)
+    ])
+@pytest.mark.parametrize("batch_size", [2])
+@pytest.mark.parametrize(
+    "output_len",
+    [
+        # Use smaller output len for fast test.
+        32,
+    ])
+@pytest.mark.parametrize("seed", [1])
+# NOTE(review): the "mlp" in the test name looks carried over from the MLP
+# speculator tests; this test exercises the Medusa speculator (SPEC_MODEL).
+def test_mlp_different_k(baseline_llm_generator, test_llm_generator,
+                         batch_size: int, output_len: int):
+    """Verify that Medusa speculative decoding produces output exactly equal
+    to decoding without speculation, for every value of
+    num_speculative_tokens from 1 up to MAX_SPEC_TOKENS.
+    """
+    run_greedy_equality_correctness_test(baseline_llm_generator,
+                                         test_llm_generator,
+                                         batch_size,
+                                         max_output_len=output_len,
+                                         force_output_len=True)
+
+
+# Engine arguments shared by the baseline and the speculative run.
+@pytest.mark.parametrize(
+    "common_llm_kwargs",
+    [{
+        # Skip cuda graph recording for fast test.
+        "enforce_eager": True,
+
+        # Required for spec decode.
+        "use_v2_block_manager": True,
+
+        # Precision
+        "dtype": PRECISION,
+
+        # Main model
+        "model": MAIN_MODEL,
+    }])
+@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
+# The baseline adds nothing on top of the common arguments, i.e. it has no
+# speculative model configured.
+@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
+@pytest.mark.parametrize("test_llm_kwargs",
+                         [{
+                             "speculative_model": SPEC_MODEL,
+                             "num_speculative_tokens": MAX_SPEC_TOKENS,
+                             "speculative_disable_by_batch_size": 4
+                         }])
+# Batch sizes 1 and 5 sit on either side of the threshold of 4 set above.
+# NOTE(review): presumably speculation stays on for 1 and is switched off
+# for 5 -- confirm the exact threshold semantics.
+@pytest.mark.parametrize("batch_size", [1, 5])
+@pytest.mark.parametrize(
+    "output_len",
+    [
+        # Use smaller output len for fast test.
+        32,
+    ])
+@pytest.mark.parametrize("seed", [1])
+# NOTE(review): the "mlp" in the test name looks carried over from the MLP
+# speculator tests; this test exercises the Medusa speculator (SPEC_MODEL).
+def test_mlp_disable_queue(baseline_llm_generator, test_llm_generator,
+                           batch_size: int, output_len: int):
+    """Verify that Medusa speculative decoding produces output exactly equal
+    to decoding without speculation when speculation is disabled for large
+    batch sizes.
+    """
+    run_greedy_equality_correctness_test(baseline_llm_generator,
+                                         test_llm_generator,
+                                         batch_size,
+                                         max_output_len=output_len,
+                                         force_output_len=True)
+
+
+if __name__ == "__main__":
+    # pytest is already imported at the top of this module, so no local
+    # re-import is needed. Propagate pytest's exit status so that running
+    # this file directly exits non-zero when a test fails.
+    raise SystemExit(pytest.main([__file__]))
diff --git a/vllm/model_executor/models/__init__.py b/vllm/model_executor/models/__init__.py
@@ -64,6 +64,7 @@
     "ArcticForCausalLM": ("arctic", "ArcticForCausalLM"),
     "XverseForCausalLM": ("xverse", "XverseForCausalLM"),
     "Phi3SmallForCausalLM": ("phi3_small", "Phi3SmallForCausalLM"),
+    "MedusaModel": ("medusa", "Medusa"),
     "MLPSpeculatorPreTrainedModel": ("mlp_speculator", "MLPSpeculator"),
     "JambaForCausalLM": ("jamba", "JambaForCausalLM")
 }

diff --git a/vllm/model_executor/models/medusa.py b/vllm/model_executor/models/medusa.py
@@ -0,0 +1,159 @@
+from typing import Iterable, List, Optional, Tuple
+
+import torch
+import torch.nn as nn
+
+from vllm.model_executor.layers.logits_processor import LogitsProcessor
+from vllm.model_executor.layers.vocab_parallel_embedding import (
+    DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead)
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
+from vllm.model_executor.sampling_metadata import SamplingMetadata
+from vllm.sequence import SamplerOutput
+from vllm.transformers_utils.configs.medusa import MedusaConfig
+
+
+class ResidualBlock(nn.Module):
+    """Stack of bias-free square linear layers with residual SiLU updates.
+
+    Each of the ``num_layers`` layers maps ``hidden_size -> hidden_size``
+    and is applied as ``x = x + SiLU(layer(x))``.
+    """
+
+    def __init__(self, hidden_size: int, num_layers: int) -> None:
+        super().__init__()
+
+        linears = [
+            nn.Linear(hidden_size, hidden_size, bias=False)
+            for _ in range(num_layers)
+        ]
+        self.layers = nn.ModuleList(linears)
+        self.act = nn.SiLU()
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Run ``x`` through every residual layer in order."""
+        out = x
+        for linear in self.layers:
+            update = self.act(linear(out))
+            out = out + update
+        return out
+
+
+class Medusa(nn.Module):
+    """Medusa speculative-decoding heads (top-1 proposals only).
+
+    Holds ``config.num_heads`` independent heads. Each head is a
+    ``ResidualBlock`` followed by its own ``ParallelLMHead``; every head is
+    fed the same hidden states and produces its own logits.
+
+    When ``config.truncated_vocab_size`` is smaller than
+    ``config.vocab_size`` the LM heads cover only a subset of the
+    vocabulary, and ``token_map`` (read from the checkpoint by
+    ``load_weights``) gives the original-vocabulary index of every
+    truncated entry.
+    """
+
+    def __init__(self, config: MedusaConfig, **_) -> None:
+        """Build the per-head residual blocks and LM heads from ``config``.
+
+        Any extra keyword arguments are accepted and ignored.
+        """
+        super().__init__()
+        self.config = config
+        self.blocks = nn.ModuleList([
+            ResidualBlock(hidden_size=self.config.hidden_size,
+                          num_layers=self.config.num_hidden_layers)
+            for _ in range(self.config.num_heads)
+        ])
+        self.orig_vocab_size = config.vocab_size
+        self.truncated_vocab_size = config.truncated_vocab_size
+        self.unpadded_vocab_size = self.truncated_vocab_size
+
+        self.lm_heads = nn.ModuleList([
+            ParallelLMHead(
+                self.unpadded_vocab_size,
+                config.hidden_size,
+                org_num_embeddings=self.truncated_vocab_size,
+                padding_size=DEFAULT_VOCAB_PADDING_SIZE,
+            ) for _ in range(self.config.num_heads)
+        ])
+
+        logit_scale = getattr(config, "logit_scale", 1.0)
+        self.logits_processor = LogitsProcessor(self.unpadded_vocab_size,
+                                                self.truncated_vocab_size,
+                                                logit_scale)
+
+        # Set by load_weights() when the checkpoint carries a "token_map"
+        # entry and the vocabulary is truncated; stays None otherwise.
+        self.token_map = None
+
+    def forward(self, hidden_states: torch.Tensor) -> List[torch.Tensor]:
+        """Apply every head's residual block to the same ``hidden_states``.
+
+        Returns one tensor per head, in head order.
+        """
+        return [block(hidden_states) for block in self.blocks]
+
+    def compute_logits(
+            self, hidden_states: List[torch.Tensor],
+            sampling_metadata: SamplingMetadata) -> List[torch.Tensor]:
+        """Turn per-head hidden states into per-head logits.
+
+        ``hidden_states`` is paired with ``self.lm_heads`` position by
+        position. When a token map is present, each head's logits are
+        expanded to the original vocabulary size: positions not covered by
+        the map are filled with ``-inf``.
+        """
+        logits = []
+
+        for hs, lm_head in zip(hidden_states, self.lm_heads):
+            _logits = self.logits_processor(lm_head, hs, sampling_metadata)
+
+            if self.token_map is not None:
+                # Scatter the truncated-vocab logits into a full-vocab
+                # tensor; tokens outside the map can never be the argmax.
+                full_logits = torch.full(
+                    size=(*_logits.shape[:-1], self.orig_vocab_size),
+                    fill_value=-torch.inf,
+                    device=_logits.device,
+                    dtype=_logits.dtype)
+                full_logits[..., self.token_map] = _logits
+                _logits = full_logits
+
+            logits.append(_logits)
+
+        return logits
+
+    def sample(
+        self,
+        logits: List[torch.Tensor],
+        sampling_metadata: SamplingMetadata,
+    ) -> List[SamplerOutput]:
+        """Greedily pick one token per head and package it per seq group.
+
+        The per-head logits are stacked along a new leading (head) axis and
+        converted to float. For every entry of
+        ``sampling_metadata.seq_groups`` the columns selected by its
+        ``sample_indices`` are extracted and returned as one
+        ``SamplerOutput`` holding probabilities, log-probabilities and
+        argmax token ids (``outputs`` is left as ``None``).
+        """
+        logits = torch.stack(logits, dim=0).float()
+        logprobs = torch.log_softmax(logits, dim=-1)
+        token_ids = logits.argmax(-1)  # support only top-1 for now
+        probs = torch.softmax(logits, dim=-1)
+
+        outputs: List[Optional[SamplerOutput]] = []
+        for seq_group in sampling_metadata.seq_groups:
+            indices = seq_group.sample_indices
+            # squeeze(1) drops the indexed axis when the group selects a
+            # single position and is a no-op otherwise.
+            outputs.append(
+                SamplerOutput(
+                    outputs=None,
+                    sampled_token_probs=probs[:, indices].squeeze(1),
+                    logprobs=logprobs[:, indices].squeeze(1),
+                    sampled_token_ids=token_ids[:, indices].squeeze(1),
+                ))
+
+        return outputs
+
+    def generate_proposals(
+        self,
+        previous_hidden_states: torch.Tensor,
+        sampling_metadata: SamplingMetadata,
+    ) -> List[SamplerOutput]:
+        """Run forward -> compute_logits -> sample in one call."""
+        return self.sample(
+            logits=self.compute_logits(
+                hidden_states=self.forward(previous_hidden_states),
+                sampling_metadata=sampling_metadata,
+            ),
+            sampling_metadata=sampling_metadata,
+        )
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        """Load checkpoint tensors into the heads.
+
+        The ``"medusa_heads."`` substring is stripped from every name. A
+        ``"token_map"`` entry is kept only when the vocabulary is
+        truncated; entries whose name matches no parameter are ignored.
+
+        Raises:
+            AssertionError: if the vocabulary sizes differ but no token map
+                was loaded.
+        """
+        params_dict = dict(self.named_parameters())
+
+        # First pass: collect the weights and pick up the token map, so
+        # that the map is known before any LM-head weight is loaded,
+        # regardless of the order of entries in the checkpoint.
+        weights_map = {}
+
+        for name, loaded_weight in weights:
+            name = name.replace("medusa_heads.", "")
+
+            if name == "token_map":
+                if self.truncated_vocab_size < self.orig_vocab_size:
+                    self.token_map = nn.Parameter(loaded_weight,
+                                                  requires_grad=False)
+            elif name in params_dict:
+                weights_map[name] = loaded_weight
+
+        # Second pass: load the weights, cutting LM-head weights that have
+        # more rows than the token map down to the mapped rows.
+        for name, loaded_weight in weights_map.items():
+            if ("lm_head" in name and self.token_map is not None
+                    and loaded_weight.shape[0] > self.token_map.shape[0]):
+                loaded_weight = loaded_weight[self.token_map]
+
+            param = params_dict[name]
+            weight_loader = getattr(param, "weight_loader",
+                                    default_weight_loader)
+            weight_loader(param, loaded_weight)
+
+        if self.token_map is not None:
+            # Tensor.to() is not in-place: its result must be stored back,
+            # otherwise the map stays on the device it was loaded on.
+            self.token_map.data = self.token_map.data.to(
+                device=self.lm_heads[0].weight.device)
+
+        assert (self.truncated_vocab_size
+                == self.orig_vocab_size) or (self.token_map is not None), (
+                    "token_map must be loaded when truncated_vocab_size "
+                    "differs from vocab_size")