pytorch · vkuzo · Aug 22, 2025 · Aug 21, 2025 · Aug 21, 2025 · Aug 22, 2025
diff --git a/test/prototype/mx_formats/test_kernels.py b/test/prototype/mx_formats/test_kernels.py
@@ -41,7 +41,7 @@
     triton_to_mxfp8_dim1_reference,
     unpack_uint4,
 )
-from torchao.prototype.mx_formats.mx_tensor import MXTensor, ScaleCalculationMode, to_mx
+from torchao.prototype.mx_formats.mx_tensor import ScaleCalculationMode, to_mx
 from torchao.prototype.mx_formats.utils import to_blocked
 from torchao.utils import (
     TORCH_VERSION_AT_LEAST_2_8,
@@ -326,28 +326,6 @@ def test_fp4_pack_unpack():
     assert torch.all(orig_vals_dq == orig_vals)
 
 
-# TODO(future PR): fix or delete this test
-@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
-@pytest.mark.skipif(not has_triton(), reason="unsupported without triton")
-@pytest.mark.skipif(is_sm_at_least_89(), reason="broken on CUDA capability 8.9+")
-def test_fp4_triton_scaled_cast():
-    size = (256,)
-    orig_vals = torch.randn(size, dtype=torch.float, device="cuda") * 100
-    mxtensor_ref = MXTensor.to_mx(
-        orig_vals, block_size=32, elem_dtype=torch.float4_e2m1fn_x2
-    )
-    mxtensor_triton = MXTensor.to_mx(
-        orig_vals,
-        block_size=32,
-        elem_dtype=torch.float4_e2m1fn_x2,
-        use_fp4_custom_triton_dequant_kernel=True,
-    )
-
-    f32_ref = mxtensor_ref.to_dtype(torch.float)
-    f32_triton = mxtensor_triton.to_dtype(torch.float)
-    assert torch.all(torch.eq(f32_ref, f32_triton))
-
-
 @pytest.mark.parametrize("dtype_name", (DTYPE_FP6_E2M3, DTYPE_FP6_E3M2))
 def test_fp6_values(dtype_name):
     """

diff --git a/test/prototype/mx_formats/test_mx_tensor.py b/test/prototype/mx_formats/test_mx_tensor.py
@@ -380,14 +380,12 @@ def test_exponent_nan_out(elem_dtype, pack_fp6):
     else:
         raise AssertionError("unsupported")
     block_size = 4
-    use_fp4_custom_triton_dequant_kernel = False
     tensor_mx = MXTensor(
         data_bits,
         scale_e8m0,
         elem_dtype,
         block_size,
         torch.float,
-        use_fp4_custom_triton_dequant_kernel,
         MXGemmKernelChoice.EMULATED,
         pack_fp6,
         None,
@@ -427,22 +425,17 @@ def test_block_sizes(elem_dtype, B):
 
 @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
 @pytest.mark.parametrize("elem_dtype", SUPPORTED_ELEM_DTYPES)
-@pytest.mark.parametrize("fp4_triton", [False, True])
-def test_transpose(elem_dtype, fp4_triton):
+def test_transpose(elem_dtype):
     """
     Verify that transposing an MX tensor works
     """
-    if elem_dtype != torch.float4_e2m1fn_x2 and fp4_triton:
-        pytest.skip("unsupported configuration")
-
     M, K = 128, 256
     block_size = 32
     tensor_hp = torch.randn(M, K, device="cuda", dtype=torch.bfloat16)
     tensor_mx = MXTensor.to_mx(
         tensor_hp,
         elem_dtype,
         block_size,
-        use_fp4_custom_triton_dequant_kernel=fp4_triton,
     )
     tensor_mx_dq_t = tensor_mx.to_dtype(tensor_hp.dtype).t()
 
@@ -510,15 +503,13 @@ def test_to_mx_from_mx_compile_numerics(elem_dtype, hp_dtype, all_zeros):
 
     to_dtype_c = torch.compile(to_dtype, fullgraph=True)
 
-    use_fp4_custom_triton_dequant_kernel = False
     pack_fp6 = False
     x_mx_dq = to_dtype(
         x_mx.qdata,
         x_mx._scale_e8m0,
         x_mx._elem_dtype,
         x_mx._block_size,
         hp_dtype,  # noqa: E501
-        use_fp4_custom_triton_dequant_kernel,
         pack_fp6,
     )
     x_mx_c_dq = to_dtype_c(
@@ -527,7 +518,6 @@ def test_to_mx_from_mx_compile_numerics(elem_dtype, hp_dtype, all_zeros):
         x_mx_c._elem_dtype,
         x_mx_c._block_size,
         hp_dtype,
-        use_fp4_custom_triton_dequant_kernel,
         pack_fp6,
     )
     torch.testing.assert_close(x_mx_dq, x_mx_c_dq, atol=0, rtol=0)

diff --git a/torchao/prototype/mx_formats/benchmarks/bench_qdq.py b/torchao/prototype/mx_formats/benchmarks/bench_qdq.py
diff --git a/torchao/prototype/mx_formats/config.py b/torchao/prototype/mx_formats/config.py
@@ -146,9 +146,6 @@ class MXLinearConfig(AOBaseConfig):
         MXFP8Dim1CastKernelChoice.TORCH
     )
 
-    # If True, uses a custom triton kernel for fp4 dequantize
-    use_fp4_custom_triton_dequant_kernel: bool = False
-
     scale_calculation_mode: ScaleCalculationMode = ScaleCalculationMode.FLOOR
 
     def __post_init__(self):
@@ -217,8 +214,6 @@ def short_str(self) -> str:
             s += f", lp_go_override={DTYPE_TO_SHORT_STR[self.elem_dtype_grad_output_override]}"
         s += f", kernel={self.gemm_kernel_choice.value}"
         s += f", mxfp8_cast_kernel_choice={self.mxfp8_cast_kernel_choice.value}"
-        if self.use_fp4_custom_triton_dequant_kernel:
-            s += ", use_fp4_custom_triton_dequant_kernel=True"
         if self.scale_calculation_mode != ScaleCalculationMode.FLOOR:
             s += f", scale_calculation_mode={self.scale_calculation_mode}"
         return s