diff --git a/benchmarks/kernels/bench_fp8_gemm.py b/benchmarks/kernels/bench_fp8_gemm.py
index 36d03e40ef9a..c63f0dc6095b 100644
--- a/benchmarks/kernels/bench_fp8_gemm.py
+++ b/benchmarks/kernels/bench_fp8_gemm.py
@@ -4,11 +4,11 @@
 import itertools
 
 import torch
-import triton
 from weight_shapes import WEIGHT_SHAPES
 
 from vllm._custom_ops import cutlass_scaled_mm as vllm_scaled_mm
 from vllm._custom_ops import scaled_fp8_quant as vllm_scaled_fp8_quant
+from vllm.triton_utils import triton
 
 
 @triton.testing.perf_report(
diff --git a/tests/v1/e2e/test_spec_decode.py b/tests/v1/e2e/test_spec_decode.py
index 2fad37d6801b..343a0ae53ad3 100644
--- a/tests/v1/e2e/test_spec_decode.py
+++ b/tests/v1/e2e/test_spec_decode.py
@@ -4,9 +4,35 @@
 import random
 from typing import Any
 
+import numpy as np
 import pytest
 
 from vllm import LLM, SamplingParams
+from vllm.v1.metrics.reader import Metric
+
+
+def get_spec_acceptance_metrics(metrics: list[Metric], k: int):
+    num_drafts = 0
+    num_accepted = 0
+    acceptance_counts = [0] * k
+    for metric in metrics:
+        if metric.name == "vllm:spec_decode_num_drafts":
+            num_drafts += metric.value
+        elif metric.name == "vllm:spec_decode_num_accepted_tokens":
+            num_accepted += metric.value
+        elif metric.name == "vllm:spec_decode_num_accepted_tokens_per_pos":
+            for pos in range(len(metric.values)):
+                acceptance_counts[pos] += metric.values[pos]
+    acceptance_rate_per_pos = [
+        count / num_drafts for count in acceptance_counts
+    ]
+    mean_acceptance_length = 1 + (num_accepted / num_drafts)
+    return {
+        "num_drafts": num_drafts,
+        "num_accepted": num_accepted,
+        "acceptance_rate_per_pos": acceptance_rate_per_pos,
+        "mean_acceptance_length": mean_acceptance_length,
+    }
 
 
 @pytest.fixture
@@ -42,6 +68,34 @@ def test_prompts():
     return prompts
 
 
+@pytest.fixture
+def test_ngram_acceptance_rate_prompts():
+    prompts = []
+    words = ["test", "temp", "hello", "where"]
+    for i in range(len(words)):
+        word = words[i]
+        prompt = f"Please repeat the word '{word}' 50 times.\n"
+        prompt += "Here is an example of how it should look like: " + " ".join(
+            [word] * 10) + "...\n"
+        prompt += "Give no other output than the word at least "
+        prompt += "fifty times in a row in lowercase "
+        prompt += "with spaces between each word and without quotes."
+        prompts.append([{"role": "user", "content": prompt}])
+    return prompts
+
+
+@pytest.fixture
+def test_draft_acceptance_rate_prompts():
+    prompts = [
+        "Please write a short story about a cat that loves to chase mice.",
+        "What is the capital of France?",
+        "Explain the theory of relativity in simple terms.",
+        "Describe the process of photosynthesis in plants.",
+        "What are the main ingredients in a traditional pizza?",
+    ]
+    return [[{"role": "user", "content": prompt}] for prompt in prompts]
+
+
 @pytest.fixture
 def sampling_config():
     return SamplingParams(temperature=0, max_tokens=10, ignore_eos=False)
@@ -98,9 +152,9 @@ def test_ngram_correctness(
             print(f"ref_output: {ref_output.outputs[0].text}")
             print(f"spec_output: {spec_output.outputs[0].text}")
 
-        # Heuristic: expect at least 70% of the prompts to match exactly
+        # Heuristic: expect at least 65% of the prompts to match exactly
         # Upon failure, inspect the outputs to check for inaccuracy.
-        assert matches > int(0.7 * len(ref_outputs))
+        assert matches > int(0.65 * (matches + misses))
         del spec_llm
 
 
@@ -147,7 +201,94 @@ def test_eagle_correctness(
             print(f"ref_output: {ref_output.outputs[0].text}")
             print(f"spec_output: {spec_output.outputs[0].text}")
 
-        # Heuristic: expect at least 66% of the prompts to match exactly
+        # Heuristic: expect at least 65% of the prompts to match exactly
         # Upon failure, inspect the outputs to check for inaccuracy.
-        assert matches > int(0.66 * len(ref_outputs))
+        assert matches > int(0.65 * len(ref_outputs))
+        del spec_llm
+
+
+def test_ngram_acceptance_rate(
+    monkeypatch: pytest.MonkeyPatch,
+    test_ngram_acceptance_rate_prompts: list[list[dict[str, Any]]],
+    sampling_config: SamplingParams,
+    model_name: str,
+):
+    '''
+    Test the acceptance rate of speculative decoding using the ngram method.
+    The acceptance rate should be very high on the sample prompts,
+    as they are designed for 100% matches with the ngram method.
+    '''
+    with monkeypatch.context() as m:
+        m.setenv("VLLM_USE_V1", "1")
+
+        spec_llm = LLM(
+            model=model_name,
+            speculative_config={
+                "method": "ngram",
+                "prompt_lookup_max": 5,
+                "prompt_lookup_min": 3,
+                "num_speculative_tokens": 3,
+            },
+            max_model_len=1024,
+            disable_log_stats=False,
+        )
+        sampling_config.max_tokens = 50
+        spec_llm.chat(test_ngram_acceptance_rate_prompts, sampling_config)
+
+        metrics = get_spec_acceptance_metrics(spec_llm.get_metrics(), k=3)
+
+        # Expect nearly all (>90%) of the drafted tokens to be accepted
+        mean_acceptance_rate = np.mean(metrics["acceptance_rate_per_pos"])
+        assert mean_acceptance_rate > 0.90
+
+        # Expect the average acceptance length to be greater than 3
+        assert metrics["mean_acceptance_length"] > 3
+
+        del spec_llm
+
+
+@pytest.mark.parametrize("use_eagle3", [False, True], ids=["eagle", "eagle3"])
+def test_eagle_acceptance_rate(
+    monkeypatch: pytest.MonkeyPatch,
+    test_draft_acceptance_rate_prompts: list[list[dict[str, Any]]],
+    sampling_config: SamplingParams,
+    model_name: str,
+    use_eagle3: bool,
+):
+    '''
+    Test the acceptance rate of speculative decoding using EAGLE methods.
+    '''
+    with monkeypatch.context() as m:
+        m.setenv("VLLM_USE_V1", "1")
+
+        spec_model_name = eagle3_model_name(
+        ) if use_eagle3 else eagle_model_name()
+        spec_llm = LLM(
+            model=model_name,
+            trust_remote_code=True,
+            speculative_config={
+                "method": "eagle3" if use_eagle3 else "eagle",
+                "model": spec_model_name,
+                "num_speculative_tokens": 3,
+                "max_model_len": 2048,
+            },
+            max_model_len=2048,
+            disable_log_stats=False,
+        )
+        sampling_config.max_tokens = 50
+        spec_llm.chat(test_draft_acceptance_rate_prompts, sampling_config)
+
+        metrics = get_spec_acceptance_metrics(spec_llm.get_metrics(), k=3)
+
+        # Expect many of the drafted tokens to be accepted
+        if use_eagle3:
+            # EAGLE3 is more accurate, so we expect a higher acceptance rate
+            assert metrics["acceptance_rate_per_pos"][0] > 0.75
+            assert metrics["acceptance_rate_per_pos"][2] > 0.4
+            assert metrics["mean_acceptance_length"] > 2.75
+        else:
+            assert metrics["acceptance_rate_per_pos"][0] > 0.6
+            assert metrics["acceptance_rate_per_pos"][2] > 0.2
+            assert metrics["mean_acceptance_length"] > 2
+        del spec_llm
 
diff --git a/tests/v1/spec_decode/test_eagle.py b/tests/v1/spec_decode/test_eagle.py
index b49ac45f3129..07dfb640cff5 100644
--- a/tests/v1/spec_decode/test_eagle.py
+++ b/tests/v1/spec_decode/test_eagle.py
@@ -8,6 +8,7 @@
 from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, ModelConfig,
                          ParallelConfig, SchedulerConfig, SpeculativeConfig,
                          VllmConfig)
+from vllm.model_executor.models.llama import LlamaForCausalLM
 from vllm.v1.spec_decode.eagle import EagleProposer
 
 model_dir = "meta-llama/Llama-3.1-8B-Instruct"
@@ -112,21 +113,26 @@ def test_prepare_inputs():
     assert torch.equal(token_indices, expected_token_indices)
 
 
-@pytest.mark.parametrize(
-    "method,proposer_helper,draft_model_dir,target_attribute_path", [
-        ("eagle", lambda k: _create_proposer("eagle", k), eagle_dir,
-         ('lm_head', )),
-        ("eagle3", lambda k: _create_proposer("eagle3", k), eagle3_dir,
-         ('model', 'embed_tokens')),
-    ])
+@pytest.mark.parametrize("method,proposer_helper", [
+    ("eagle", lambda k: _create_proposer("eagle", k)),
+    ("eagle3", lambda k: _create_proposer("eagle3", k)),
+])
+@pytest.mark.parametrize("pp_size", [1, 2])
+@pytest.mark.parametrize("use_distinct_embed_tokens", [True, False])
 @mock.patch('vllm.v1.spec_decode.eagle.get_pp_group')
 @mock.patch('vllm.v1.spec_decode.eagle.get_layers_from_vllm_config')
 @mock.patch('vllm.v1.spec_decode.eagle.get_model')
 def test_load_model(mock_get_model, mock_get_layers, mock_get_pp_group, method,
-                    proposer_helper, draft_model_dir, target_attribute_path):
-
-    # Setup model mock
+                    proposer_helper, pp_size, use_distinct_embed_tokens):
+    # Setup draft model mock
     mock_model = mock.MagicMock()
+    if use_distinct_embed_tokens:
+        # Some models can have a different hidden size than the target model,
+        # so we test that their embed_tokens doesn't get overwritten
+        mock_model.model.embed_tokens.weight.shape = (131072, 2048)
+    else:
+        mock_model.model.embed_tokens.weight.shape = (131072, 4096)
+
     mock_get_model.return_value = mock_model
 
     # Setup mocks for attention layers
@@ -144,22 +150,24 @@ def test_load_model(mock_get_model, mock_get_layers, mock_get_pp_group, method,
 
     # Setup mock for pp group to return the appropriate value for world size
     mock_pp_group = mock.MagicMock()
-    mock_pp_group.world_size = 2 if method == "eagle" else 1
+    mock_pp_group.world_size = pp_size
     mock_get_pp_group.return_value = mock_pp_group
 
-    # Setup target model with the appropriate attributes
-    target_model = mock.MagicMock()
+    # Setup the target model mock with a custom class so that
+    # isinstance() checks match the expected type.
+    class _TargetModelStub(LlamaForCausalLM):
+        model: mock.MagicMock
+        lm_head: mock.MagicMock
 
-    # Create the necessary attributes on the target model
-    current_obj = target_model
-    for i, attr in enumerate(target_attribute_path):
-        if i == len(target_attribute_path) - 1:
-            # Set the last attribute in the path to a MagicMock
-            setattr(current_obj, attr, mock.MagicMock())
-        else:
-            # Create intermediate objects if needed
-            setattr(current_obj, attr, mock.MagicMock())
-            current_obj = getattr(current_obj, attr)
+    target_model = mock.create_autospec(_TargetModelStub, instance=True)
+    target_model.model = mock.MagicMock()
+    target_model.model.embed_tokens.weight.shape = (131072, 4096)
+
+    from vllm.model_executor.models import SupportsMultiModal
+    assert not isinstance(target_model, SupportsMultiModal)
+
+    if method == "eagle":
+        target_model.lm_head = mock.MagicMock()
 
     # Create proposer using the helper function
     proposer = proposer_helper(k=8)
@@ -170,10 +178,18 @@ def test_load_model(mock_get_model, mock_get_layers, mock_get_pp_group, method,
     # Verify common interactions
     mock_get_model.assert_called_once()
 
-    # Verify the specific attribute sharing based on the method
+    # Verify that EAGLE models gain the lm head from the target model
     if method == "eagle":
         assert proposer.model.lm_head == target_model.lm_head
+
+    # Verify that the embed tokens are set correctly
+    # If pp_size is > 1, the embed tokens should be distinct
+    if pp_size > 1 or use_distinct_embed_tokens:
+        assert proposer.model.model.embed_tokens != \
+            target_model.model.embed_tokens
     else:
+        # When pp_size is 1 and the draft and target models have
+        # embed_tokens of the same shape, they should be shared.
         assert proposer.model.model.embed_tokens == \
             target_model.model.embed_tokens
 
diff --git a/vllm/model_executor/models/llama_eagle.py b/vllm/model_executor/models/llama_eagle.py
index 172dc8b5ec06..e2581e3a903b 100644
--- a/vllm/model_executor/models/llama_eagle.py
+++ b/vllm/model_executor/models/llama_eagle.py
@@ -54,13 +54,11 @@ def __init__(
             speculative_config.draft_model_config.hf_config
         self.vocab_size = self.config.vocab_size
 
-        # if PP disabled then draft will share embed with target
-        if get_pp_group().world_size > 1:
-            self.embed_tokens = VocabParallelEmbedding(
-                self.config.vocab_size,
-                self.config.hidden_size,
-                prefix=maybe_prefix(prefix, "embed_tokens"),
-            )
+        self.embed_tokens = VocabParallelEmbedding(
+            self.config.vocab_size,
+            self.config.hidden_size,
+            prefix=maybe_prefix(prefix, "embed_tokens"),
+        )
 
         self.layers = nn.ModuleList([
             LlamaDecoderLayer(
@@ -163,4 +161,4 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
             if "lm_head" not in name:
                 name = "model." + name
             model_weights[name] = loaded_weight
-        return loader.load_weights(model_weights.items())
+        loader.load_weights(model_weights.items())
diff --git a/vllm/model_executor/models/llama_eagle3.py b/vllm/model_executor/models/llama_eagle3.py
index 1e40017fc792..789983ef6b01 100644
--- a/vllm/model_executor/models/llama_eagle3.py
+++ b/vllm/model_executor/models/llama_eagle3.py
@@ -9,7 +9,6 @@
 
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import VllmConfig
-from vllm.distributed.parallel_state import get_pp_group
 from vllm.logger import init_logger
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import QKVParallelLinear
@@ -94,13 +93,11 @@ def __init__(
             speculative_config.draft_model_config.hf_config
         self.vocab_size = self.config.vocab_size
 
-        # if PP disabled then draft will share embed with target
-        if get_pp_group().world_size > 1:
-            self.embed_tokens = VocabParallelEmbedding(
-                self.config.vocab_size,
-                self.config.hidden_size,
-                prefix=maybe_prefix(prefix, "embed_tokens"),
-            )
+        self.embed_tokens = VocabParallelEmbedding(
+            self.config.vocab_size,
+            self.config.hidden_size,
+            prefix=maybe_prefix(prefix, "embed_tokens"),
+        )
 
         self.layers = nn.ModuleList([
             LlamaDecoderLayer(
@@ -239,6 +236,7 @@ def combine_hidden_states(
     def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
         model_weights = {}
         includes_draft_id_mapping = False
+        includes_embed_tokens = False
         for name, loaded_weight in weights:
             if "t2d" in name:
                 continue
@@ -247,12 +245,18 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
                 includes_draft_id_mapping = True
             elif "lm_head" not in name:
                 name = "model." + name
+            if "embed_tokens" in name:
+                includes_embed_tokens = True
             model_weights[name] = loaded_weight
 
+        skip_substrs = []
+        if not includes_draft_id_mapping:
+            skip_substrs.append("draft_id_to_target_id")
+        if not includes_embed_tokens:
+            skip_substrs.append("embed_tokens")
         loader = AutoWeightsLoader(
             self,
             skip_prefixes=None,
-            skip_substrs=["draft_id_to_target_id"] \
-            if not includes_draft_id_mapping else None,
+            skip_substrs=skip_substrs,
         )
         loader.load_weights(model_weights.items())
diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py
index 9f833cbb587d..d27dd521aba8 100644
--- a/vllm/platforms/cuda.py
+++ b/vllm/platforms/cuda.py
@@ -157,6 +157,7 @@ def check_and_update_config(cls, vllm_config: "VllmConfig") -> None:
 
     def get_current_memory_usage(cls,
                                  device: Optional[torch.types.Device] = None
                                  ) -> float:
+        torch.cuda.empty_cache()
         torch.cuda.reset_peak_memory_stats(device)
         return torch.cuda.max_memory_allocated(device)
diff --git a/vllm/utils.py b/vllm/utils.py
index c879b38d065a..ddd7930de55c 100644
--- a/vllm/utils.py
+++ b/vllm/utils.py
@@ -898,6 +898,7 @@ def __init__(self, device: Optional[torch.types.Device] = None):
     def current_memory_usage(self) -> float:
         # Return the memory usage in bytes.
         from vllm.platforms import current_platform
+        gc.collect()
         return current_platform.get_current_memory_usage(self.device)
 
     def __enter__(self):
diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py
index 1ca856423165..4cdf80ea543b 100644
--- a/vllm/v1/spec_decode/eagle.py
+++ b/vllm/v1/spec_decode/eagle.py
@@ -329,16 +329,19 @@ def load_model(self, target_model: nn.Module) -> None:
         self.attn_layer_names = list(draft_attn_layer_names)
 
         # share embed_tokens with the target model if needed
-        if get_pp_group().world_size == 1:
+        if get_pp_group().world_size == 1 \
+            and self.model.model.embed_tokens.weight.shape \
+                == target_model.model.embed_tokens.weight.shape:
             logger.info(
-                "The EAGLE head shares the same vocab embedding" \
+                "Assuming the EAGLE head shares the same vocab embedding" \
                 " with the target model."
             )
+            del self.model.model.embed_tokens
             self.model.model.embed_tokens = target_model.model.embed_tokens
         else:
             logger.info(
-                "Since PP > 1, the EAGLE head loaded its own vocab embedding" \
-                " weights instead of sharing them with the target model."
+                "The EAGLE head's vocab embedding will be loaded separately" \
+                " from the target model."
            )
 
         # share lm_head with the target model if needed
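
Note (editorial illustration, not part of the patch): the bookkeeping added in tests/v1/e2e/test_spec_decode.py via get_spec_acceptance_metrics() reduces to simple counter arithmetic over the vllm:spec_decode_* metrics. A minimal standalone sketch with made-up counter values:

    # Made-up values standing in for the real Prometheus counters.
    num_drafts = 100                   # vllm:spec_decode_num_drafts
    num_accepted = 230                 # vllm:spec_decode_num_accepted_tokens
    acceptance_counts = [90, 80, 60]   # ..._num_accepted_tokens_per_pos, k = 3

    # Fraction of drafts whose speculative token at each position was accepted.
    acceptance_rate_per_pos = [c / num_drafts for c in acceptance_counts]

    # Every draft also yields one bonus token sampled from the target model on
    # top of the accepted speculative tokens, hence the leading 1.
    mean_acceptance_length = 1 + num_accepted / num_drafts

    print(acceptance_rate_per_pos)     # [0.9, 0.8, 0.6]
    print(mean_acceptance_length)      # 3.3

With k = 3 the mean acceptance length is bounded above by 1 + 3 = 4, which is why the ngram acceptance test treats "> 3" as nearly everything being accepted.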
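
Note (editorial illustration, not part of the patch): the revised condition in vllm/v1/spec_decode/eagle.py shares the target model's vocab embedding with the EAGLE head only when pipeline parallelism is disabled and the two embedding tables have identical shapes; otherwise the draft keeps the weights it loaded itself. A rough standalone sketch of that decision using plain nn.Embedding stand-ins (the sizes below are made up):

    import torch.nn as nn

    # Stand-ins for the target model's and the EAGLE head's embed_tokens.
    target_embed = nn.Embedding(1000, 64)
    draft_embed = nn.Embedding(1000, 32)  # e.g. a draft head with a smaller hidden size

    pp_world_size = 1  # pretend pipeline parallelism is disabled

    if (pp_world_size == 1
            and draft_embed.weight.shape == target_embed.weight.shape):
        # Shapes match: drop the draft's own table and alias the target's,
        # mirroring the `del` plus reassignment in load_model().
        draft_embed = target_embed
    else:
        # PP > 1 or mismatched shapes: the draft keeps its separately loaded
        # embedding, which is what the pp_size/use_distinct_embed_tokens cases
        # in tests/v1/spec_decode/test_eagle.py check.
        pass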
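
Note (editorial illustration, not part of the patch): the vllm/utils.py and vllm/platforms/cuda.py changes make the memory probe run garbage collection and flush the CUDA caching allocator before resetting the peak counter, so tensors kept alive only by stale Python references do not inflate the reading. Roughly, the combined effect is (the helper name below is made up for the sketch):

    import gc

    import torch

    def probe_cuda_memory(device=None) -> float:
        # Drop dead Python references first so their tensors are actually
        # freed, then release cached blocks and re-read the reset peak.
        gc.collect()
        torch.cuda.empty_cache()
        torch.cuda.reset_peak_memory_stats(device)
        return torch.cuda.max_memory_allocated(device)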