Commit cadf1f0

Merge branch 'main' into loguru
2 parents: 700f690 + 341e27c

82 files changed: +2102 lines, -9146 lines

examples/awq/qwen3_moe_example.py

Lines changed: 82 additions & 0 deletions
@@ -0,0 +1,82 @@
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor import oneshot
from llmcompressor.modifiers.awq import AWQModifier

# Select model and load it.
MODEL_ID = "Qwen/Qwen3-30B-A3B"

model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID, device_map="auto", torch_dtype="auto"
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)

# Select calibration dataset.
DATASET_ID = "mit-han-lab/pile-val-backup"
DATASET_SPLIT = "validation"

# Select number of samples. 256 samples is a good place to start.
# Increasing the number of samples can improve accuracy.
NUM_CALIBRATION_SAMPLES = 256
MAX_SEQUENCE_LENGTH = 512

# Load dataset and preprocess.
ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]")
ds = ds.shuffle(seed=42)


def preprocess(example):
    return {
        "text": tokenizer.apply_chat_template(
            [{"role": "user", "content": example["text"]}],
            tokenize=False,
        )
    }


ds = ds.map(preprocess)


# Tokenize inputs.
def tokenize(sample):
    return tokenizer(
        sample["text"],
        padding=False,
        max_length=MAX_SEQUENCE_LENGTH,
        truncation=True,
        add_special_tokens=False,
    )


# Configure the quantization algorithm to run.
# NOTE: vllm currently does not support asym MoE, using symmetric here
recipe = [
    AWQModifier(
        ignore=["lm_head", "re:.*mlp.gate$", "re:.*mlp.shared_expert_gate$"],
        scheme="W4A16",
        targets=["Linear"],
    ),
]

# Apply algorithms.
oneshot(
    model=model,
    dataset=ds,
    recipe=recipe,
    max_seq_length=MAX_SEQUENCE_LENGTH,
    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
)

# Confirm generations of the quantized model look sane.
print("\n\n")
print("========== SAMPLE GENERATION ==============")
input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda")
output = model.generate(input_ids, max_new_tokens=100)
print(tokenizer.decode(output[0]))
print("==========================================\n\n")

# Save to disk compressed.
SAVE_DIR = MODEL_ID.split("/")[-1] + "-awq-sym"
model.save_pretrained(SAVE_DIR, save_compressed=True)
tokenizer.save_pretrained(SAVE_DIR)

examples/multimodal_audio/README.md

Lines changed: 0 additions & 6 deletions
@@ -47,12 +47,6 @@ Sequential targets are the modules which determine the granularity of error prop
 
 Choosing sequential targets with higher granularity (for example "Linear" instead of "LlamaDecoderLayer") will result in fewer hessians being allocated at the same time, decreasing the memory requirements for compression. This may also increase the recovered accuracy of the model, as compression error is propagated at a higher granularity. However, using higher granularity sequential targets may also increase compression time, as more time is spent offloading and onloading activations.
 
-### Ignore ###
-If your model is not traceable for your desired dataset, first consider adding any problematic modules to the ignore list. Doing this prevents the model tracer from tracing the internals of those modules, thereby avoiding the untraceable operations.
-
-## Tracing Errors ##
-Because the architectures of audio-language models are often more complex than those of typical decoder-only text models, you may encounter `torch.fx.TraceError`s when attempting to quantize your model. For more information on `torch.fx.TraceError`s, why they occur, and how to resolve them, please see the [Model Tracing Guide](/src/llmcompressor/transformers/tracing/GUIDE.md).
-
 ## Adding Your Own Smoothquant Mappings ##
 For a guide on adding smoothquant mappings for your dataset, see the [SmoothQuant Guide](/src/llmcompressor/modifiers/smoothquant/README.md).
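The retained paragraph above refers to the `sequential_targets` argument accepted by modifiers such as GPTQModifier (used as `sequential_targets=["MistralDecoderLayer"]` in the Mistral3 example added by this commit). A minimal sketch of the two granularities it compares, using the class names from the README text purely for illustration:

from llmcompressor.modifiers.quantization import GPTQModifier

# Coarse granularity: error is propagated once per decoder layer, so hessians for
# every Linear inside a layer are held in memory at the same time.
recipe_coarse = GPTQModifier(
    targets="Linear",
    scheme="W4A16",
    sequential_targets=["LlamaDecoderLayer"],  # example layer class from the README
)

# Fine granularity: error is propagated per Linear module, so fewer hessians are
# allocated at once, at the cost of more activation offloading and onloading.
recipe_fine = GPTQModifier(
    targets="Linear",
    scheme="W4A16",
    sequential_targets=["Linear"],
)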

examples/multimodal_vision/README.md

Lines changed: 0 additions & 6 deletions
@@ -51,12 +51,6 @@ Sequential targets are the modules which determine the granularity of error prop
 
 Choosing sequential targets with higher granularity (for example "Linear" instead of "LlamaDecoderLayer") will result in fewer hessians being allocated at the same time, decreasing the memory requirements for compression. This may also increase the recovered accuracy of the model, as compression error is propagated at a higher granularity. However, using higher granularity sequential targets may also increase compression time, as more time is spent offloading and onloading activations.
 
-### Ignore ###
-If your model is not traceable for your desired dataset, first consider adding any problematic modules to the ignore list. Doing this prevents the model tracer from tracing the internals of those modules, thereby avoiding the untraceable operations.
-
-## Tracing Errors ##
-Because the architectures of vision-language models are often more complex than those of typical decoder-only text models, you may encounter `torch.fx.TraceError`s when attempting to quantize your model. For more information on `torch.fx.TraceError`s, why they occur, and how to resolve them, please see the [Model Tracing Guide](/src/llmcompressor/transformers/tracing/GUIDE.md).
-
 ## Adding Your Own Smoothquant Mappings ##
 For a guide on adding smoothquant mappings for your dataset, see the [SmoothQuant Guide](/src/llmcompressor/modifiers/smoothquant/README.md).
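Although the tracing-oriented section is removed, the `ignore` argument remains the way to exclude modules from quantization; the new Mistral3 example in this commit uses it to leave the vision tower and multimodal projector in full precision. A minimal sketch of that pattern (the regex patterns are model-specific and may need adjusting for other architectures):

from llmcompressor.modifiers.quantization import GPTQModifier

recipe = GPTQModifier(
    targets="Linear",
    scheme="W4A16",
    # Quantize only the text decoder; skip the LM head and all vision-side modules.
    ignore=["re:.*lm_head", "re:vision_tower.*", "re:multi_modal_projector.*"],
)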

examples/multimodal_vision/gemma3_example.py

Lines changed: 4 additions & 4 deletions
@@ -1,15 +1,14 @@
 import requests
 import torch
 from PIL import Image
-from transformers import AutoProcessor
+from transformers import AutoProcessor, Gemma3ForConditionalGeneration
 
 from llmcompressor import oneshot
 from llmcompressor.modifiers.quantization import GPTQModifier
-from llmcompressor.transformers.tracing import TraceableGemma3ForConditionalGeneration
 
 # Load model.
 model_id = "google/gemma-3-4b-it"
-model = TraceableGemma3ForConditionalGeneration.from_pretrained(
+model = Gemma3ForConditionalGeneration.from_pretrained(
     model_id, device_map="auto", torch_dtype="auto"
 )
 processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
@@ -64,8 +63,9 @@ def data_collator(batch):
 image_url = "http://images.cocodataset.org/train2017/000000231895.jpg"
 raw_image = Image.open(requests.get(image_url, stream=True).raw)
 
+# Note: compile is disabled: https://github.com/huggingface/transformers/issues/38333
 inputs = processor(images=raw_image, text=prompt, return_tensors="pt").to("cuda")
-output = model.generate(**inputs, max_new_tokens=100)
+output = model.generate(**inputs, max_new_tokens=100, disable_compile=True)
 print(processor.decode(output[0], skip_special_tokens=True))
 print("==========================================")

examples/multimodal_vision/idefics3_example.py

Lines changed: 2 additions & 3 deletions
@@ -2,15 +2,14 @@
 import torch
 from datasets import load_dataset
 from PIL import Image
-from transformers import AutoProcessor
+from transformers import AutoProcessor, Idefics3ForConditionalGeneration
 
 from llmcompressor import oneshot
 from llmcompressor.modifiers.quantization import GPTQModifier
-from llmcompressor.transformers.tracing import TraceableIdefics3ForConditionalGeneration
 
 # Load model.
 model_id = "HuggingFaceM4/Idefics3-8B-Llama3"  # or "HuggingFaceTB/SmolVLM-Instruct"
-model = TraceableIdefics3ForConditionalGeneration.from_pretrained(
+model = Idefics3ForConditionalGeneration.from_pretrained(
     model_id, device_map="auto", torch_dtype="auto"
 )
 processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)

examples/multimodal_vision/llava_example.py

Lines changed: 2 additions & 3 deletions
@@ -1,15 +1,14 @@
 import requests
 import torch
 from PIL import Image
-from transformers import AutoProcessor
+from transformers import AutoProcessor, LlavaForConditionalGeneration
 
 from llmcompressor import oneshot
 from llmcompressor.modifiers.quantization import GPTQModifier
-from llmcompressor.transformers.tracing import TraceableLlavaForConditionalGeneration
 
 # Load model.
 model_id = "llava-hf/llava-1.5-7b-hf"
-model = TraceableLlavaForConditionalGeneration.from_pretrained(
+model = LlavaForConditionalGeneration.from_pretrained(
     model_id, device_map="auto", torch_dtype="auto"
 )
 processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
examples/multimodal_vision/mistral3_chat_template.json

Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
{
"chat_template": "{%- set default_system_message = \"You are Mistral Small 3, a Large Language Model (LLM) created by Mistral AI\" %}\n\n{{- bos_token }}\n\n{%- if messages[0]['role'] == 'system' %}\n {%- if messages[0]['content'] is string %}\n {%- set system_message = messages[0]['content'] %}\n {%- else %}\n {%- set system_message = messages[0]['content'][0]['text'] %}\n {%- endif %}\n {%- set loop_messages = messages[1:] %}\n{%- else %}\n {%- set system_message = default_system_message %}\n {%- set loop_messages = messages %}\n{%- endif %}\n{{- '[SYSTEM_PROMPT]' + system_message + '[/SYSTEM_PROMPT]' }}\n\n{%- for message in loop_messages %}\n {%- if message['role'] == 'user' %}\n {%- if message['content'] is string %}\n {{- '[INST]' + message['content'] + '[/INST]' }}\n {%- else %}\n {{- '[INST]' }}\n {%- for block in message['content'] %}\n {%- if block['type'] == 'text' %}\n {{- block['text'] }}\n {%- elif block['type'] in ['image', 'image_url'] %}\n {{- '[IMG]' }}\n {%- else %}\n {{- raise_exception('Only text and image blocks are supported in message content!') }}\n {%- endif %}\n {%- endfor %}\n {{- '[/INST]' }}\n {%- endif %}\n {%- elif message['role'] == 'system' %}\n {%- if message['content'] is string %}\n {{- '[SYSTEM_PROMPT]' + message['content'] + '[/SYSTEM_PROMPT]' }}\n {%- else %}\n {{- '[SYSTEM_PROMPT]' + message['content'][0]['text'] + '[/SYSTEM_PROMPT]' }}\n {%- endif %}\n {%- elif message['role'] == 'assistant' %}\n {%- if message['content'] is string %}\n {{- message['content'] + eos_token }}\n {%- else %}\n {{- message['content'][0]['text'] + eos_token }}\n {%- endif %}\n {%- else %}\n {{- raise_exception('Only user, system and assistant roles are supported!') }}\n {%- endif %}\n{%- endfor %}"
}
examples/multimodal_vision/mistral3_example.py

Lines changed: 89 additions & 0 deletions
@@ -0,0 +1,89 @@
import json
import os

import requests
import torch
from PIL import Image
from transformers import AutoProcessor, Mistral3ForConditionalGeneration

from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import GPTQModifier

# Load model.
model_id = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"
model = Mistral3ForConditionalGeneration.from_pretrained(
    model_id, device_map="auto", torch_dtype="auto"
)
processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)

# Use a custom calibration chat template, rather than the overly-verbose default
file_path = os.path.join(os.path.dirname(__file__), "mistral3_chat_template.json")
with open(file_path, "r") as file:
    processor.chat_template = json.load(file)["chat_template"]

# Oneshot arguments
DATASET_ID = "flickr30k"
DATASET_SPLIT = "test"
NUM_CALIBRATION_SAMPLES = 512
MAX_SEQUENCE_LENGTH = 2048


# Define a oneshot data collator for multimodal inputs.
def data_collator(batch):
    assert len(batch) == 1
    return {
        key: torch.tensor(value)
        if key != "pixel_values"
        else torch.tensor(value, dtype=model.dtype)
        for key, value in batch[0].items()
    }


# Recipe
recipe = [
    GPTQModifier(
        targets="Linear",
        scheme="W4A16",
        sequential_targets=["MistralDecoderLayer"],
        ignore=["re:.*lm_head", "re:vision_tower.*", "re:multi_modal_projector.*"],
    ),
]

# Perform oneshot
oneshot(
    model=model,
    tokenizer=model_id,
    dataset=DATASET_ID,
    splits={"calibration": f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]"},
    recipe=recipe,
    max_seq_length=MAX_SEQUENCE_LENGTH,
    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
    trust_remote_code_model=True,
    data_collator=data_collator,
)

# Confirm generations of the quantized model look sane.
print("========== SAMPLE GENERATION ==============")
messages = [
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "Please describe the animal in this image\n"},
            {"type": "image"},
        ],
    },
]
prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
image_url = "http://images.cocodataset.org/train2017/000000231895.jpg"
raw_image = Image.open(requests.get(image_url, stream=True).raw)

inputs = processor(images=raw_image, text=prompt, return_tensors="pt").to("cuda")
inputs["pixel_values"] = inputs["pixel_values"].to(model.dtype)  # fix dtype
output = model.generate(**inputs, max_new_tokens=100)
print(processor.decode(output[0], skip_special_tokens=True))
print("==========================================")

# Save to disk compressed.
SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128"
model.save_pretrained(SAVE_DIR, save_compressed=True)
processor.save_pretrained(SAVE_DIR)

examples/multimodal_vision/mllama_example.py

Lines changed: 2 additions & 3 deletions
@@ -1,15 +1,14 @@
 import requests
 import torch
 from PIL import Image
-from transformers import AutoProcessor
+from transformers import AutoProcessor, MllamaForConditionalGeneration
 
 from llmcompressor import oneshot
 from llmcompressor.modifiers.quantization import GPTQModifier
-from llmcompressor.transformers.tracing import TraceableMllamaForConditionalGeneration
 
 # Load model.
 model_id = "meta-llama/Llama-3.2-11B-Vision-Instruct"
-model = TraceableMllamaForConditionalGeneration.from_pretrained(
+model = MllamaForConditionalGeneration.from_pretrained(
     model_id, device_map="auto", torch_dtype="auto"
 )
 processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)

examples/multimodal_vision/pixtral_example.py

Lines changed: 2 additions & 3 deletions
@@ -1,15 +1,14 @@
 import requests
 import torch
 from PIL import Image
-from transformers import AutoProcessor
+from transformers import AutoProcessor, LlavaForConditionalGeneration
 
 from llmcompressor import oneshot
 from llmcompressor.modifiers.quantization import GPTQModifier
-from llmcompressor.transformers.tracing import TraceableLlavaForConditionalGeneration
 
 # Load model.
 model_id = "mgoin/pixtral-12b"
-model = TraceableLlavaForConditionalGeneration.from_pretrained(
+model = LlavaForConditionalGeneration.from_pretrained(
     model_id, device_map="auto", torch_dtype="auto"
 )
 processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)

examples/multimodal_vision/qwen2_vl_example.py

Lines changed: 2 additions & 3 deletions
@@ -4,15 +4,14 @@
 import torch
 from datasets import load_dataset
 from qwen_vl_utils import process_vision_info
-from transformers import AutoProcessor
+from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
 
 from llmcompressor import oneshot
 from llmcompressor.modifiers.quantization import GPTQModifier
-from llmcompressor.transformers.tracing import TraceableQwen2VLForConditionalGeneration
 
 # Load model.
 model_id = "Qwen/Qwen2-VL-2B-Instruct"
-model = TraceableQwen2VLForConditionalGeneration.from_pretrained(
+model = Qwen2VLForConditionalGeneration.from_pretrained(
     model_id,
     device_map="auto",
     torch_dtype="auto",

examples/multimodal_vision/qwen_2_5_vl_example.py

Lines changed: 2 additions & 5 deletions
@@ -4,17 +4,14 @@
 import torch
 from datasets import load_dataset
 from qwen_vl_utils import process_vision_info
-from transformers import AutoProcessor
+from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration
 
 from llmcompressor.modifiers.quantization import GPTQModifier
 from llmcompressor.transformers import oneshot
-from llmcompressor.transformers.tracing import (
-    TraceableQwen2_5_VLForConditionalGeneration,
-)
 
 # Load model.
 model_id = "Qwen/Qwen2.5-VL-7B-Instruct"
-model = TraceableQwen2_5_VLForConditionalGeneration.from_pretrained(
+model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
     model_id,
     device_map="auto",
     torch_dtype="auto",

examples/quantization_2of4_sparse_w4a16/llama7b_sparse_w4a16.py

Lines changed: 0 additions & 2 deletions
@@ -68,15 +68,13 @@
     model=model,
     **oneshot_kwargs,
     stage="sparsity_stage",
-    output_dir=output_dir,
 )
 
 # Sparse finetune
 finetune_applied_model = train(
     model=oneshot_applied_model,
     **oneshot_kwargs,
     **training_kwargs,
-    output_dir=output_dir,
     stage="finetuning_stage",
 )

examples/quantization_w4a16_fp4/llama3_example.py

Lines changed: 8 additions & 0 deletions
@@ -19,6 +19,14 @@
 # Apply quantization.
 oneshot(model=model, recipe=recipe)
 
+print("\n\n")
+print("========== SAMPLE GENERATION ==============")
+input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda")
+output = model.generate(input_ids, max_new_tokens=100)
+print(tokenizer.decode(output[0]))
+print("==========================================\n\n")
+
+
 # Save to disk in compressed-tensors format.
 SAVE_DIR = MODEL_ID.split("/")[1] + "-NVFP4A16"
 model.save_pretrained(SAVE_DIR, save_compressed=True)
