Remove oneshot_device #1568

Merged · 6 commits · Jun 25, 2025
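
The removed `oneshot_device` argument was already deprecated and nonfunctional, so migrating past this change amounts to dropping the keyword and handling device placement when loading the model. A minimal before/after sketch, assuming `oneshot` is importable from the top-level `llmcompressor` package (the tests below import `train` that way); the model stub and dataset come from the tests in this diff, while "recipe.yaml" is a hypothetical placeholder:

import torch
from transformers import AutoModelForCausalLM
from llmcompressor import oneshot

# Pick a device up front instead of passing oneshot_device.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
model = AutoModelForCausalLM.from_pretrained("nm-testing/llama2.c-stories15M")
model = model.to(device)

# Before: oneshot(model=model, recipe="recipe.yaml", oneshot_device=device)
# After: the argument is gone; device placement happened above.
oneshot(
    model=model,
    dataset="open_platypus",
    recipe="recipe.yaml",  # hypothetical recipe path
    num_calibration_samples=64,
)
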
7 changes: 0 additions & 7 deletions src/llmcompressor/args/model_arguments.py
@@ -80,13 +80,6 @@ class ModelArguments:
         default=True,
         metadata={"help": "Whether to compress sparse models during save"},
     )
-    oneshot_device: Optional[str] = field(
-        default="cuda",
-        metadata={
-            "help": "This argument is deprecated and nonfunctional "
-            "and will be removed in future release"
-        },
-    )
     model_revision: str = field(
         default="main",
         metadata={
2 changes: 0 additions & 2 deletions src/llmcompressor/entrypoints/oneshot.py
@@ -208,7 +208,6 @@ def oneshot(
     tie_word_embeddings: bool = False,
     trust_remote_code_model: bool = False,
     save_compressed: bool = True,
-    oneshot_device: str = "cuda:0",
     model_revision: str = "main",
     # Recipe arguments
     recipe: Optional[Union[str, List[str]]] = None,
@@ -259,7 +258,6 @@
     :param trust_remote_code_model: Whether to allow for custom models to execute
         their own modeling files.
     :param save_compressed: Whether to compress sparse models during save.
-    :param oneshot_device: Device to run oneshot calibration on.
     :param model_revision: The specific model version to use (can be branch name,
         tag, or commit id).

@@ -40,7 +40,6 @@ def preprocessing_func(example):
             recipe=None,
             num_train_epochs=self.num_train_epochs,
             concatenate_data=concatenate_data,
-            oneshot_device=self.device,
             text_column="text",
             dataset_path=dataset_path,
             preprocessing_func=preprocessing_func,
@@ -24,7 +24,6 @@ def test_finetune_without_recipe(self):
         from llmcompressor import train

         recipe_str = None
-        device = "cuda:0"

         concatenate_data = False
         max_steps = 50
@@ -38,7 +37,6 @@ def test_finetune_without_recipe(self):
             max_steps=max_steps,
             concatenate_data=concatenate_data,
             splits=splits,
-            oneshot_device=device,
         )

     def tearDown(self):
@@ -31,7 +31,6 @@ def _test_oneshot_and_finetune(self):
             splits=splits,
             recipe=self.recipe,
             num_calibration_samples=64,
-            oneshot_device=self.device,
             dataset_config_name=self.dataset_config_name,
             concatenate_data=self.concat_txt,
             output_dir=self.output,
3 changes: 0 additions & 3 deletions tests/llmcompressor/transformers/finetune/test_safetensors.py
@@ -24,7 +24,6 @@ def setUp(self):
     def test_safetensors(self):
         from llmcompressor import train

-        device = "cuda:0"
         output_dir = self.output / "output1"
         max_steps = 10
         splits = {"train": "train[:10%]"}
@@ -35,7 +34,6 @@ def test_safetensors(self):
             output_dir=output_dir,
             max_steps=max_steps,
             splits=splits,
-            oneshot_device=device,
         )

         assert os.path.exists(output_dir / "model.safetensors")
@@ -49,7 +47,6 @@ def test_safetensors(self):
             output_dir=new_output_dir,
             max_steps=max_steps,
             splits=splits,
-            oneshot_device=device,
         )

     def tearDown(self):
1 change: 0 additions & 1 deletion tests/llmcompressor/transformers/gptq/test_oneshot.py
@@ -77,7 +77,6 @@ def test_oneshot_application(self):
             dataset=self.dataset,
             output_dir=self.output,
             recipe=self.recipe,
-            oneshot_device=self.device,
             num_calibration_samples=9,
         )
         model_loaded = AutoModelForCausalLM.from_pretrained(
@@ -39,7 +39,6 @@ def _test_consecutive_runs(
             num_calibration_samples=num_calibration_samples,
             recipe=self.first_recipe,
             output_dir=self.output_first,
-            oneshot_device=self.device,
         )

         first_model = AutoModelForCausalLM.from_pretrained(
@@ -68,7 +67,6 @@ def _test_consecutive_runs(
             num_calibration_samples=num_calibration_samples,
             recipe=self.second_recipe,
             output_dir=self.output_second,
-            oneshot_device=self.device,
         )

         second_model = AutoModelForCausalLM.from_pretrained(
@@ -60,7 +60,6 @@ def test_mask_structure_preserved(self):
             num_calibration_samples=num_calibration_samples,
             recipe=self.initial_pruning_only_recipe,
             output_dir=self.output_first,
-            oneshot_device=self.device,
             save_compressed=False,
         )
         targetted_layer = first_tiny_model.model.layers[0].self_attn.k_proj
@@ -82,7 +81,6 @@ def test_mask_structure_preserved(self):
             num_calibration_samples=num_calibration_samples,
             recipe=self.subsequent_prune_and_quant_recipe,
             output_dir=self.output_second,
-            oneshot_device=self.device,
             save_compressed=False,
         )

@@ -59,7 +59,6 @@ def _test_oneshot_completion(self, model_name: str = None):
             model=self.model,
             dataset=self.dataset,
             splits={"calibration": f"train[:{self.num_samples}]"},
-            oneshot_device=self.device,
             recipe=self.recipe,
             max_seq_length=512,
             num_calibration_samples=self.num_samples,
2 changes: 0 additions & 2 deletions tests/llmcompressor/transformers/obcq/test_obcq_sparsity.py
@@ -33,7 +33,6 @@ def test_sparsities(self):
         model = oneshot(
             model=self.model,
             dataset=self.dataset,
-            oneshot_device=self.device,
             recipe=self.recipe,
             max_seq_length=128,
             num_calibration_samples=64,
@@ -82,7 +81,6 @@ def test_sparsities_gpu(self):
         model = oneshot(
             model=self.model,
             dataset=self.dataset,
-            oneshot_device=self.device,
             recipe=self.recipe,
             max_seq_length=128,
             num_calibration_samples=64,
@@ -31,7 +31,6 @@ def test_oneshot_with_modifier_object(self):
             SparseGPTModifier(sparsity=0.5, targets=[r"re:model.layers.\d+$"])
         ]

-        device = "cuda:0"
         concatenate_data = False
         num_calibration_samples = 64
         output_dir = self.output / "oneshot_out"
@@ -45,7 +44,6 @@ def test_oneshot_with_modifier_object(self):
             recipe=recipe_str,
             concatenate_data=concatenate_data,
             splits=splits,
-            oneshot_device=device,
         )

     def tearDown(self):
@@ -47,9 +47,6 @@ def test_sparse_model_reload(compressed, config, dtype, tmp_path):
recipe_str = "tests/llmcompressor/transformers/obcq/recipes/test_tiny2.yaml"
expected_sparsity = 0.5
model_path = "nm-testing/llama2.c-stories15M"
device = "cuda:0"
if not torch.cuda.is_available():
device = "cpu"
dataset = "open_platypus"
concatenate_data = False
num_calibration_samples = 64
@@ -66,7 +63,6 @@ def test_sparse_model_reload(compressed, config, dtype, tmp_path):
         recipe=recipe_str,
         concatenate_data=concatenate_data,
         splits=splits,
-        oneshot_device=device,
         precision=dtype,
         clear_sparse_session=False,
     )
@@ -166,9 +162,7 @@ def test_quant_model_reload(format, dtype, tmp_path):
"tests/llmcompressor/transformers/compression/recipes/new_quant_simple.yaml"
)
model_path = "nm-testing/llama2.c-stories15M"
device = "cuda:0"
if not torch.cuda.is_available():
device = "cpu"
device = "cuda:0" if not torch.cuda.is_available() else "cpu"
dataset = "open_platypus"
concatenate_data = False
num_calibration_samples = 16
@@ -182,7 +176,6 @@ def test_quant_model_reload(format, dtype, tmp_path):
         recipe=recipe_str,
         concatenate_data=concatenate_data,
         splits=splits,
-        oneshot_device=device,
         precision=dtype,
         clear_sparse_session=False,
     )
@@ -362,9 +355,7 @@ def test_model_shared_tensors_gpu(
 def test_compressor_stacking(model_stub, recipe, sparse_format, quant_format, tmp_path):
     from llmcompressor.pytorch.model_load.helpers import get_session_model

-    device = "cuda"
-    if not torch.cuda.is_available():
-        device = "cpu"
+    device = "cuda:0" if torch.cuda.is_available() else "cpu"
     dataset = "open_platypus"
     concatenate_data = False
     num_calibration_samples = 64
@@ -378,7 +369,6 @@ def test_compressor_stacking(model_stub, recipe, sparse_format, quant_format, tmp_path):
         recipe=recipe,
         concatenate_data=concatenate_data,
         splits=splits,
-        oneshot_device=device,
         clear_sparse_session=False,
     )

@@ -446,9 +436,7 @@ def test_compressor_stacking(model_stub, recipe, sparse_format, quant_format, tmp_path):
     ],
 )
 def test_sparse_24_compressor_is_lossless(model_stub, recipe, sparse_format, tmp_path):
-    device = "cuda"
-    if not torch.cuda.is_available():
-        device = "cpu"
+    device = "cuda:0" if torch.cuda.is_available() else "cpu"
     dataset = "open_platypus"
     concatenate_data = False
     num_calibration_samples = 64
@@ -462,7 +450,6 @@ def test_sparse_24_compressor_is_lossless(model_stub, recipe, sparse_format, tmp_path):
         recipe=recipe,
         concatenate_data=concatenate_data,
         splits=splits,
-        oneshot_device=device,
         clear_sparse_session=False,
     )

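
Across the compression tests above, the three-line device fallback collapses into a single ternary, and device selection stays with the test code rather than being threaded through `oneshot_device`. A minimal sketch of the resulting idiom, reusing the model stub from this diff:

import torch
from transformers import AutoModelForCausalLM

# Prefer the first CUDA device; fall back to CPU on machines without a GPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
model = AutoModelForCausalLM.from_pretrained("nm-testing/llama2.c-stories15M").to(device)

This keeps behavior identical to the removed if-block while making clear that the `oneshot` entrypoint no longer owns device placement.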