[mxfp8 moe training] use dim1 cast cuda kernel in bwd (#2897)

danielvegamyhre · web-flow · commit 083d0c3e7f24 · 2025-08-29T13:02:00.000-07:00
diff --git a/torchao/prototype/moe_training/scaled_grouped_mm.py b/torchao/prototype/moe_training/scaled_grouped_mm.py
@@ -20,7 +20,13 @@
 from torchao.prototype.moe_training.utils import (
     _is_column_major,
 )
+from torchao.prototype.mx_formats.config import (
+    MXFP8Dim1CastKernelChoice,
+    MXGemmKernelChoice,
+    ScaleCalculationMode,
+)
 from torchao.prototype.mx_formats.mx_tensor import to_mx
+from torchao.prototype.mx_formats.utils import _to_mxfp8_dim1_kernel_wrapper
 
 logger: logging.Logger = logging.getLogger(__name__)
 
@@ -376,17 +382,18 @@ def backward(ctx, grad_out: torch.Tensor):
         # Transpose A so we can scale along the M dimension, then un-transpose.
         # A_t_data shape: (K, M)
         # A_t_scales shape: (K, M//block_size)
-        A_t_scales, A_t_data = to_mx(
-            A.transpose(-2, -1).contiguous(),
+        A_t_mx = _to_mxfp8_dim1_kernel_wrapper(
+            A,
+            block_size,
             elem_dtype=torch.float8_e4m3fn,
-            block_size=block_size,
-        )
-
-        # A_data shape = (M, K)
-        A_data = A_t_data.transpose(-2, -1)
-
-        # A_scales shape = (M//block_size, K)
-        A_scales = A_t_scales.transpose(-2, -1)
+            hp_dtype=A.dtype,
+            gemm_kernel_choice=MXGemmKernelChoice.CUTLASS,  # Not used
+            cast_kernel_choice=MXFP8Dim1CastKernelChoice.CUDA,
+            scale_calculation_mode=ScaleCalculationMode.FLOOR,
+        )
+        A_mx = A_t_mx.t()
+        A_data = A_mx.qdata
+        A_scales = A_mx._scale_e8m0.t()
 
         # grad_B_t = scaled grouped mm of (N,M) @ (M,K) = (E,N,K)
         grad_B = _emulated_mxfp8_scaled_grouped_mm_2d_2d(
diff --git a/torchao/prototype/mx_formats/mx_linear.py b/torchao/prototype/mx_formats/mx_linear.py
@@ -11,86 +11,20 @@
 from typing import Any, Optional
 
 import torch
-from torch.distributed._tensor import DTensor
 
 from torchao.prototype.mx_formats.config import (
     MXFP8Dim1CastKernelChoice,
     MXGemmKernelChoice,
     MXLinearConfig,
     ScaleCalculationMode,
 )
-from torchao.prototype.mx_formats.kernels import (
-    mxfp8_quantize_cuda,
-    triton_to_mxfp8_dim1,
-)
 from torchao.prototype.mx_formats.mx_tensor import MXTensor
+from torchao.prototype.mx_formats.utils import _to_mxfp8_dim1_kernel_wrapper
 from torchao.quantization.transform_module import (
     register_quantize_module_handler,
 )
 
 
-def _to_mxfp8_dim1_kernel_wrapper(
-    a,
-    block_size,
-    elem_dtype,
-    hp_dtype,
-    gemm_kernel_choice,
-    cast_kernel_choice,
-    scale_calculation_mode: ScaleCalculationMode,
-):
-    if cast_kernel_choice == MXFP8Dim1CastKernelChoice.TRITON:
-        assert scale_calculation_mode == ScaleCalculationMode.FLOOR
-        a_data, a_scale = triton_to_mxfp8_dim1(a, block_size)
-    elif cast_kernel_choice == MXFP8Dim1CastKernelChoice.CUDA:
-        assert scale_calculation_mode in (
-            ScaleCalculationMode.FLOOR,
-            ScaleCalculationMode.RCEIL,
-        )
-        _, a_data, _, a_scale = mxfp8_quantize_cuda(
-            a,
-            rowwise=False,
-            colwise=True,
-            scaling_mode=scale_calculation_mode.value,
-        )
-    else:
-        raise ValueError(f"must be one of [CUDA, TRITON], got {cast_kernel_choice}")
-
-    if isinstance(a_data, DTensor):
-        assert isinstance(a_scale, DTensor)
-        a_data_local = a_data.to_local()
-        a_scale_local = a_scale.to_local()
-        inner = MXTensor(
-            a_data_local.t(),
-            a_scale_local,
-            elem_dtype,
-            block_size,
-            hp_dtype,
-            gemm_kernel_choice,
-            False,
-            None,
-        )
-        mx_tensor = DTensor.from_local(
-            inner,
-            a_data.device_mesh,
-            a_data.placements,
-            run_check=False,
-            shape=a_data.t().size(),
-            stride=a_data.t().stride(),
-        )
-    else:
-        mx_tensor = MXTensor(
-            a_data.t(),
-            a_scale,
-            elem_dtype,
-            block_size,
-            hp_dtype,
-            gemm_kernel_choice,
-            False,
-            None,
-        )
-    return mx_tensor
-
-
 @torch._dynamo.allow_in_graph
 class mx_mm(torch.autograd.Function):
     # There are three gemms in a forward + backward of a Linear layer:
diff --git a/torchao/prototype/mx_formats/utils.py b/torchao/prototype/mx_formats/utils.py
@@ -5,8 +5,18 @@
 # LICENSE file in the root directory of this source tree.
 
 import torch
-
-from torchao.prototype.mx_formats.kernels import triton_mx_block_rearrange
+from torch.distributed._tensor import DTensor
+
+from torchao.prototype.mx_formats.config import (
+    MXFP8Dim1CastKernelChoice,
+    ScaleCalculationMode,
+)
+from torchao.prototype.mx_formats.kernels import (
+    mxfp8_quantize_cuda,
+    triton_mx_block_rearrange,
+    triton_to_mxfp8_dim1,
+)
+from torchao.prototype.mx_formats.mx_tensor import MXTensor
 
 Tensor = torch.Tensor
 
@@ -99,3 +109,65 @@ def _to_blocked_single(scales: Tensor) -> Tensor:
     assert scales.shape == (128, 4)
     scales_tiled = scales.view(4, 32, 4)  # view as 4 - (32, 4) tiles
     return scales_tiled.transpose(0, 1).reshape(32, 16)  # Interleave tiles
+
+
+def _to_mxfp8_dim1_kernel_wrapper(
+    a,
+    block_size,
+    elem_dtype,
+    hp_dtype,
+    gemm_kernel_choice,
+    cast_kernel_choice,
+    scale_calculation_mode: ScaleCalculationMode,
+):
+    """Cast ``a`` to MXFP8 with a dim1 kernel and wrap the result in an MXTensor.
+
+    Args:
+        a: input handed to the selected cast kernel.
+        block_size: given to the Triton kernel and to the MXTensor constructor.
+        elem_dtype: passed through to the MXTensor constructor.
+        hp_dtype: passed through to the MXTensor constructor.
+        gemm_kernel_choice: passed through to the MXTensor constructor.
+        cast_kernel_choice: ``MXFP8Dim1CastKernelChoice.TRITON`` or ``.CUDA``.
+        scale_calculation_mode: FLOOR for Triton; FLOOR or RCEIL for CUDA.
+
+    Returns:
+        An MXTensor over the ``.t()`` of the kernel's data output, re-wrapped
+        as a DTensor when that kernel output is itself a DTensor.
+
+    Raises:
+        ValueError: if ``cast_kernel_choice`` is neither TRITON nor CUDA.
+    """
+    if cast_kernel_choice == MXFP8Dim1CastKernelChoice.CUDA:
+        cuda_modes = (ScaleCalculationMode.FLOOR, ScaleCalculationMode.RCEIL)
+        assert scale_calculation_mode in cuda_modes
+        _, a_data, _, a_scale = mxfp8_quantize_cuda(
+            a,
+            rowwise=False,
+            colwise=True,
+            scaling_mode=scale_calculation_mode.value,
+        )
+    elif cast_kernel_choice == MXFP8Dim1CastKernelChoice.TRITON:
+        assert scale_calculation_mode == ScaleCalculationMode.FLOOR
+        a_data, a_scale = triton_to_mxfp8_dim1(a, block_size)
+    else:
+        raise ValueError(f"must be one of [CUDA, TRITON], got {cast_kernel_choice}")
+
+    # Constructor arguments shared by the plain-tensor and DTensor paths.
+    mx_args = (elem_dtype, block_size, hp_dtype, gemm_kernel_choice, False, None)
+    if not isinstance(a_data, DTensor):
+        return MXTensor(a_data.t(), a_scale, *mx_args)
+
+    # DTensor: wrap the local tensors, then rebuild on a_data's mesh/placements.
+    assert isinstance(a_scale, DTensor)
+    data_local = a_data.to_local()
+    scale_local = a_scale.to_local()
+    inner = MXTensor(data_local.t(), scale_local, *mx_args)
+    return DTensor.from_local(
+        inner,
+        a_data.device_mesh,
+        a_data.placements,
+        run_check=False,
+        shape=a_data.t().size(),
+        stride=a_data.t().stride(),
+    )