Commit 7927d66

[fp8 blockwise] wrap triton quantization kernels in custom ops for torch.compile compatibility
stack-info: PR: #2829, branch: danielvegamyhre/stack/47
1 parent c77a12e commit 7927d66

5 files changed (+64, -54 lines)

benchmarks/prototype/blockwise_fp8_training/bench_1x128_128x128_gemms.py

Lines changed: 4 additions & 4 deletions
@@ -15,8 +15,8 @@
 from triton.testing import do_bench

 from torchao.prototype.blockwise_fp8_training.kernels import (
-    fp8_blockwise_act_quant_lhs,
-    fp8_blockwise_weight_quant_transposed_rhs,
+    triton_fp8_blockwise_act_quant_lhs,
+    triton_fp8_blockwise_weight_quant_transposed_rhs,
     triton_fp8_gemm_1x128_128x128,
 )

@@ -78,8 +78,8 @@ def run_experiment(config: ExperimentConfig) -> ExperimentResult:
     M, N, K = config.m, config.n, config.k
     A = torch.randn(M, K, dtype=config.out_dtype, device="cuda")
     B = torch.randn(N, K, dtype=config.out_dtype, device="cuda")
-    A_q, A_s = fp8_blockwise_act_quant_lhs(A, dtype=torch.float8_e4m3fn)
-    B_t_q, B_t_s = fp8_blockwise_weight_quant_transposed_rhs(
+    A_q, A_s = triton_fp8_blockwise_act_quant_lhs(A, dtype=torch.float8_e4m3fn)
+    B_t_q, B_t_s = triton_fp8_blockwise_weight_quant_transposed_rhs(
         B, dtype=torch.float8_e4m3fn
     )

benchmarks/prototype/blockwise_fp8_training/bench_1x128_128x1_gemms.py

Lines changed: 6 additions & 4 deletions
@@ -15,8 +15,8 @@
 from triton.testing import do_bench

 from torchao.prototype.blockwise_fp8_training.kernels import (
-    fp8_blockwise_act_quant_rhs,
-    fp8_blockwise_act_quant_transposed_lhs,
+    triton_fp8_blockwise_act_quant_rhs,
+    triton_fp8_blockwise_act_quant_transposed_lhs,
     triton_fp8_gemm_1x128_128x1,
 )

@@ -78,8 +78,10 @@ def run_experiment(config: ExperimentConfig) -> ExperimentResult:
     M, N, K = config.m, config.n, config.k
     A = torch.randn(M, N, dtype=config.out_dtype, device="cuda")
     B = torch.randn(M, K, dtype=config.out_dtype, device="cuda")
-    A_t_q, A_t_s = fp8_blockwise_act_quant_transposed_lhs(A, dtype=torch.float8_e4m3fn)
-    B_q, B_s = fp8_blockwise_act_quant_rhs(B, dtype=torch.float8_e4m3fn)
+    A_t_q, A_t_s = triton_fp8_blockwise_act_quant_transposed_lhs(
+        A, dtype=torch.float8_e4m3fn
+    )
+    B_q, B_s = triton_fp8_blockwise_act_quant_rhs(B, dtype=torch.float8_e4m3fn)

     def warmup(func, *args, **kwargs):
         for _ in range(10):

test/prototype/blockwise_fp8_training/test_blockwise_kernels.py

Lines changed: 14 additions & 14 deletions
@@ -12,14 +12,14 @@
 from packaging import version
 from torchao.float8.float8_utils import compute_error
 from torchao.prototype.blockwise_fp8_training.kernels import (
-    fp8_blockwise_act_quant_lhs,
-    fp8_blockwise_act_quant_rhs,
-    fp8_blockwise_act_quant_transposed_lhs,
-    fp8_blockwise_weight_quant_rhs,
-    fp8_blockwise_weight_quant_transposed_rhs,
     torch_blockwise_scale_act_quant_lhs,
     torch_blockwise_scale_act_quant_rhs,
     torch_blockwise_scale_weight_quant,
+    triton_fp8_blockwise_act_quant_lhs,
+    triton_fp8_blockwise_act_quant_rhs,
+    triton_fp8_blockwise_act_quant_transposed_lhs,
+    triton_fp8_blockwise_weight_quant_rhs,
+    triton_fp8_blockwise_weight_quant_transposed_rhs,
     triton_fp8_gemm_1x128_128x1,
     triton_fp8_gemm_1x128_128x128,
 )
@@ -51,8 +51,8 @@ def test_triton_fp8_gemm_1x128_128x128(M, N, K, dtype):
     A = torch.randn(M, K, dtype=torch.bfloat16, device="cuda")
     B = torch.randn(N, K, dtype=torch.bfloat16, device="cuda")
     C = A @ B.T
-    A_q, A_s = fp8_blockwise_act_quant_lhs(A, dtype=dtype)
-    B_t_q, B_t_s = fp8_blockwise_weight_quant_transposed_rhs(B, dtype=dtype)
+    A_q, A_s = triton_fp8_blockwise_act_quant_lhs(A, dtype=dtype)
+    B_t_q, B_t_s = triton_fp8_blockwise_weight_quant_transposed_rhs(B, dtype=dtype)
     C_q = triton_fp8_gemm_1x128_128x128(
         A_q, B_t_q, A_s, B_t_s, out_dtype=torch.bfloat16
     )
@@ -76,8 +76,8 @@ def test_triton_fp8_gemm_1x128_128x1(M, N, K, dtype):
     A = torch.randn(K, M, dtype=torch.bfloat16, device="cuda")
     B = torch.randn(K, N, dtype=torch.bfloat16, device="cuda")
     C = A.T @ B
-    A_t_q, A_t_s = fp8_blockwise_act_quant_transposed_lhs(A, dtype=dtype)
-    B_q, B_s = fp8_blockwise_act_quant_rhs(B, dtype=dtype)
+    A_t_q, A_t_s = triton_fp8_blockwise_act_quant_transposed_lhs(A, dtype=dtype)
+    B_q, B_s = triton_fp8_blockwise_act_quant_rhs(B, dtype=dtype)
     C_q = triton_fp8_gemm_1x128_128x1(A_t_q, B_q, A_t_s, B_s, out_dtype=torch.bfloat16)

     assert not C_q.isnan().any(), "C_q must not contain NaNs"
@@ -102,7 +102,7 @@ def test_triton_quantize_fp8_act_quant_lhs(block_size):
     x[0, :block_size] = 0.0

     # Get the quantized tensor and reciprocal scales using triton implementation
-    triton_fp8, triton_scale = fp8_blockwise_act_quant_lhs(
+    triton_fp8, triton_scale = triton_fp8_blockwise_act_quant_lhs(
         x,
         block_size=block_size,
     )
@@ -149,7 +149,7 @@ def test_triton_quantize_fp8_act_quant_rhs(block_size: int):
     x[:block_size, :block_size] = 0.0

     # Get the quantized tensor and reciprocal scales using triton implementation
-    triton_fp8, triton_scale = fp8_blockwise_act_quant_rhs(
+    triton_fp8, triton_scale = triton_fp8_blockwise_act_quant_rhs(
         x,
         block_size=block_size,
     )
@@ -196,7 +196,7 @@ def test_triton_quantize_fp8_act_quant_transposed_lhs(M, K, block_size: int):
     x[0, :block_size] = 0.0

     # Get the quantized tensor and reciprocal scales using triton implementation
-    triton_fp8, triton_scale = fp8_blockwise_act_quant_transposed_lhs(
+    triton_fp8, triton_scale = triton_fp8_blockwise_act_quant_transposed_lhs(
         x,
         block_size=block_size,
     )
@@ -245,7 +245,7 @@ def test_triton_quantize_fp8_weight_quant_rhs(M, K, block_size: int):
     x[:block_size, :block_size] = 0.0

     # Get the quantized tensor and reciprocal scales using triton implementation
-    triton_fp8, triton_scale = fp8_blockwise_weight_quant_rhs(
+    triton_fp8, triton_scale = triton_fp8_blockwise_weight_quant_rhs(
         x,
         block_size=block_size,
     )
@@ -292,7 +292,7 @@ def test_triton_quantize_fp8_weight_quant_transposed_rhs(block_size: int):
     x[:block_size, :block_size] = 0.0

     # Get the quantized tensor and reciprocal scales using triton implementation
-    triton_fp8, triton_scale = fp8_blockwise_weight_quant_transposed_rhs(
+    triton_fp8, triton_scale = triton_fp8_blockwise_weight_quant_transposed_rhs(
         x,
         block_size=block_size,
     )

torchao/prototype/blockwise_fp8_training/kernels.py

Lines changed: 25 additions & 19 deletions
@@ -9,6 +9,7 @@
 import torch
 import triton
 import triton.language as tl
+from torch.library import triton_op, wrap_triton

 from torchao.prototype.moe_training.utils import (
     _is_column_major,
@@ -119,7 +120,7 @@ def triton_fp8_gemm_1x128_128x128(
         triton.cdiv(M, META["BLOCK_SIZE_M"]),
         triton.cdiv(N, META["BLOCK_SIZE_N"]),
     )
-    triton_fp8_gemm_1x128_128x128_kernel[grid](
+    wrap_triton(triton_fp8_gemm_1x128_128x128_kernel)[grid](
         a,
         a.stride(0),
         a.stride(1),
@@ -234,7 +235,7 @@ def triton_fp8_gemm_1x128_128x1(
         triton.cdiv(M, META["BLOCK_SIZE_M"]),
         triton.cdiv(N, META["BLOCK_SIZE_N"]),
     )
-    triton_fp8_gemm_1x128_128x1_kernel[grid](
+    wrap_triton(triton_fp8_gemm_1x128_128x1_kernel)[grid](
         a,
         a.stride(0),
         a.stride(1),
@@ -281,7 +282,7 @@

 @triton.autotune(configs=quant_kernel_configs_with_groups, key=["K"])
 @triton.jit
-def fp8_blockwise_act_quant_lhs_kernel(
+def triton_fp8_blockwise_act_quant_lhs_kernel(
     x_ptr,
     x_stride_dim_0,
     x_stride_dim_1,
@@ -327,7 +328,8 @@ def fp8_blockwise_act_quant_lhs_kernel(
     tl.store(s_ptr + scale_offs, tl.div_rn(1.0, scale))


-def fp8_blockwise_act_quant_lhs(
+@triton_op("torchao::triton_fp8_blockwise_act_quant_lhs", mutates_args={})
+def triton_fp8_blockwise_act_quant_lhs(
     x: torch.Tensor, block_size: int = 128, dtype: torch.dtype = torch.float8_e4m3fn
 ) -> Tuple[torch.Tensor, torch.Tensor]:
     """
@@ -352,7 +354,7 @@ def fp8_blockwise_act_quant_lhs(
         triton.cdiv(M, meta["NUM_GROUPS"]),
         triton.cdiv(K, meta["BLOCK_SIZE"]),
     )
-    fp8_blockwise_act_quant_lhs_kernel[grid](
+    wrap_triton(triton_fp8_blockwise_act_quant_lhs_kernel)[grid](
         x,
         x.stride(0),
         x.stride(1),
@@ -372,7 +374,7 @@

 @triton.autotune(configs=quant_kernel_configs_with_groups, key=["K"])
 @triton.jit
-def fp8_blockwise_act_quant_rhs_kernel(
+def triton_fp8_blockwise_act_quant_rhs_kernel(
     x_ptr,
     x_stride_dim_0,
     x_stride_dim_1,
@@ -420,7 +422,8 @@ def fp8_blockwise_act_quant_rhs_kernel(
     tl.store(s_ptr + scale_offs, tl.div_rn(1.0, scale))


-def fp8_blockwise_act_quant_rhs(
+@triton_op("torchao::triton_fp8_blockwise_act_quant_rhs", mutates_args={})
+def triton_fp8_blockwise_act_quant_rhs(
     x: torch.Tensor, block_size: int = 128, dtype: torch.dtype = torch.float8_e4m3fn
 ) -> Tuple[torch.Tensor, torch.Tensor]:
     """
@@ -444,7 +447,7 @@ def fp8_blockwise_act_quant_rhs(
         triton.cdiv(M, meta["BLOCK_SIZE"]),
         triton.cdiv(K, meta["NUM_GROUPS"]),
     )
-    fp8_blockwise_act_quant_rhs_kernel[grid](
+    wrap_triton(triton_fp8_blockwise_act_quant_rhs_kernel)[grid](
         x,
         x.stride(0),
         x.stride(1),
@@ -464,7 +467,7 @@

 @triton.autotune(configs=quant_kernel_configs_with_groups, key=["K"])
 @triton.jit
-def fp8_blockwise_act_quant_transposed_lhs_kernel(
+def triton_fp8_blockwise_act_quant_transposed_lhs_kernel(
     x_ptr,
     x_stride_dim_0,
     x_stride_dim_1,
@@ -524,7 +527,8 @@ def fp8_blockwise_act_quant_transposed_lhs_kernel(
     tl.store(s_ptr + scale_offs, tl.div_rn(1.0, scale), mask=scale_mask)


-def fp8_blockwise_act_quant_transposed_lhs(
+@triton_op("torchao::triton_fp8_blockwise_act_quant_transposed_lhs", mutates_args={})
+def triton_fp8_blockwise_act_quant_transposed_lhs(
     x: torch.Tensor, block_size: int = 128, dtype: torch.dtype = torch.float8_e4m3fn
 ) -> Tuple[torch.Tensor, torch.Tensor]:
     assert x.is_contiguous(), "Input tensor must be contiguous"
@@ -550,7 +554,7 @@ def fp8_blockwise_act_quant_transposed_lhs(
         triton.cdiv(K, meta["NUM_GROUPS"]),
     )

-    fp8_blockwise_act_quant_transposed_lhs_kernel[grid](
+    wrap_triton(triton_fp8_blockwise_act_quant_transposed_lhs_kernel)[grid](
         x,
         x.stride(0),
         x.stride(1),
@@ -570,7 +574,7 @@

 @triton.autotune(configs=quant_kernel_configs, key=["M", "N"])
 @triton.jit
-def fp8_blockwise_weight_quant_rhs_kernel(
+def triton_fp8_blockwise_weight_quant_rhs_kernel(
     x_ptr,
     x_stride_dim_0,
     x_stride_dim_1,
@@ -615,8 +619,9 @@ def fp8_blockwise_weight_quant_rhs_kernel(
     tl.store(s_ptr + scale_m_off + scale_n_off, tl.div_rn(1.0, scale))


-def fp8_blockwise_weight_quant_rhs(
-    x: torch.Tensor, block_size: int = 128, dtype=torch.float8_e4m3fn
+@triton_op("torchao::triton_fp8_blockwise_weight_quant_rhs", mutates_args={})
+def triton_fp8_blockwise_weight_quant_rhs(
+    x: torch.Tensor, block_size: int = 128, dtype: torch.dtype = torch.float8_e4m3fn
 ) -> Tuple[torch.Tensor, torch.Tensor]:
     assert x.is_contiguous(), "Input tensor must be contiguous"
     assert x.dim() == 2, "Input tensor must have 2 dimensions"
@@ -638,7 +643,7 @@ def fp8_blockwise_weight_quant_rhs(
         triton.cdiv(M, meta["BLOCK_SIZE"]),
         triton.cdiv(N, meta["BLOCK_SIZE"]),
     )
-    fp8_blockwise_weight_quant_rhs_kernel[grid](
+    wrap_triton(triton_fp8_blockwise_weight_quant_rhs_kernel)[grid](
         x,
         x.stride(0),
         x.stride(1),
@@ -658,7 +663,7 @@

 @triton.autotune(configs=quant_kernel_configs, key=["M", "N"])
 @triton.jit
-def fp8_blockwise_weight_quant_transposed_rhs_kernel(
+def triton_fp8_blockwise_weight_quant_transposed_rhs_kernel(
     x_ptr,
     x_stride_dim_0,
     x_stride_dim_1,
@@ -719,8 +724,9 @@ def fp8_blockwise_weight_quant_transposed_rhs_kernel(
     tl.store(s_ptr + scale_offs, tl.div_rn(1.0, scale), mask=scale_mask)


-def fp8_blockwise_weight_quant_transposed_rhs(
-    x: torch.Tensor, block_size: int = 128, dtype=torch.float8_e4m3fn
+@triton_op("torchao::triton_fp8_blockwise_weight_quant_transposed_rhs", mutates_args={})
+def triton_fp8_blockwise_weight_quant_transposed_rhs(
+    x: torch.Tensor, block_size: int = 128, dtype: torch.dtype = torch.float8_e4m3fn
 ) -> Tuple[torch.Tensor, torch.Tensor]:
     assert x.is_contiguous(), "Input tensor must be contiguous"
     assert x.dim() == 2, "Input tensor must have 2 dimensions"
@@ -742,7 +748,7 @@ def fp8_blockwise_weight_quant_transposed_rhs(
         triton.cdiv(M, meta["BLOCK_SIZE"]),
         triton.cdiv(N, meta["BLOCK_SIZE"]),
     )
-    fp8_blockwise_weight_quant_transposed_rhs_kernel[grid](
+    wrap_triton(triton_fp8_blockwise_weight_quant_transposed_rhs_kernel)[grid](
         x,
         x.stride(0),
         x.stride(1),
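Because each wrapper in kernels.py allocates and returns new tensors, mutates_args={} registers the ops as purely functional, which is what lets the compiler reason about them like any other custom op. A rough usage sketch of one of the registered ops under torch.compile follows; the shapes and the fullgraph=True expectation are assumptions for illustration, not something the commit itself asserts.

```python
import torch
from torchao.prototype.blockwise_fp8_training.kernels import (
    triton_fp8_blockwise_act_quant_lhs,
)


def quantize(x: torch.Tensor):
    # Returns the fp8 tensor and its reciprocal blockwise scales.
    return triton_fp8_blockwise_act_quant_lhs(
        x, block_size=128, dtype=torch.float8_e4m3fn
    )


# With the Triton launch wrapped in a custom op, the call should no longer
# force a graph break during compilation.
compiled_quantize = torch.compile(quantize, fullgraph=True)
x = torch.randn(256, 1024, dtype=torch.bfloat16, device="cuda")
x_fp8, x_scale = compiled_quantize(x)
```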

torchao/prototype/blockwise_fp8_training/linear.py

Lines changed: 15 additions & 13 deletions
@@ -9,11 +9,11 @@

 from torchao.core.config import AOBaseConfig
 from torchao.prototype.blockwise_fp8_training.kernels import (
-    fp8_blockwise_act_quant_lhs,
-    fp8_blockwise_act_quant_rhs,
-    fp8_blockwise_act_quant_transposed_lhs,
-    fp8_blockwise_weight_quant_rhs,
-    fp8_blockwise_weight_quant_transposed_rhs,
+    triton_fp8_blockwise_act_quant_lhs,
+    triton_fp8_blockwise_act_quant_rhs,
+    triton_fp8_blockwise_act_quant_transposed_lhs,
+    triton_fp8_blockwise_weight_quant_rhs,
+    triton_fp8_blockwise_weight_quant_transposed_rhs,
     triton_fp8_gemm_1x128_128x1,
     triton_fp8_gemm_1x128_128x128,
 )
@@ -33,10 +33,10 @@ def forward(ctx, x, weight, block_size, out_dtype=torch.bfloat16, use_triton=Fal
         x = x.reshape(-1, x_orig_shape[-1])

         # Cast inputs to fp8 blockwise using (1, block_size) scaling granularity in row major format.
-        x_fp8, x_scale = fp8_blockwise_act_quant_lhs(x, block_size)
+        x_fp8, x_scale = triton_fp8_blockwise_act_quant_lhs(x, block_size)

         # Cast weight to fp8 blockwise using (block_size, block_size) scaling granularity, with transposed dims in column major format.
-        weight_t_fp8, weight_t_scale = fp8_blockwise_weight_quant_transposed_rhs(
+        weight_t_fp8, weight_t_scale = triton_fp8_blockwise_weight_quant_transposed_rhs(
             weight,
             block_size=block_size,
         )
@@ -74,13 +74,13 @@ def backward(ctx, grad_output):
         assert grad_output.shape[1] % 128 == 0, "unsupported"

         # Cast grad_output to fp8 blockwise 1x128 since it is the grad of the output activation.
-        grad_output_fp8, grad_output_scale = fp8_blockwise_act_quant_lhs(
+        grad_output_fp8, grad_output_scale = triton_fp8_blockwise_act_quant_lhs(
             grad_output,
             block_size,
         )

         # Cast weight to fp8 blockwise to 128x128 in column major format.
-        weight_fp8, weight_scale = fp8_blockwise_weight_quant_rhs(
+        weight_fp8, weight_scale = triton_fp8_blockwise_weight_quant_rhs(
             weight,
             block_size=block_size,
         )
@@ -100,15 +100,17 @@ def backward(ctx, grad_output):
         # Cast grad_output_t to fp8 blockwise with (1 x block_size) scaling groups, since it is
         # the grad of the output activation.
         # Write directly with transposed dims in row major format, as needed for dW calc.
-        grad_output_t_fp8, grad_output_t_scale = fp8_blockwise_act_quant_transposed_lhs(
-            grad_output,
-            block_size,
+        grad_output_t_fp8, grad_output_t_scale = (
+            triton_fp8_blockwise_act_quant_transposed_lhs(
+                grad_output,
+                block_size,
+            )
         )

         # Cast x to fp8 blockwise with (block_size x 1) scaling groups, in column major format.
         # RHS should have groupwise scales calculated colwise, so scaling groups do not cross the
         # contracting (K) dim.
-        x_fp8, x_scale = fp8_blockwise_act_quant_rhs(x, block_size)
+        x_fp8, x_scale = triton_fp8_blockwise_act_quant_rhs(x, block_size)

         # grad_weight = grad_output.T @ x
         fp8_gemm_1x128_128x1 = (
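For orientation, the forward-pass dataflow described by the linear.py comments can be sketched directly with the renamed ops: the activation is quantized with (1, block_size) groups as the GEMM LHS, the weight with (block_size, block_size) groups and transposed dims as the RHS, and both feed the 1x128-by-128x128 GEMM. The shapes below are illustrative only.

```python
import torch
from torchao.prototype.blockwise_fp8_training.kernels import (
    triton_fp8_blockwise_act_quant_lhs,
    triton_fp8_blockwise_weight_quant_transposed_rhs,
    triton_fp8_gemm_1x128_128x128,
)

block_size = 128
x = torch.randn(512, 1024, dtype=torch.bfloat16, device="cuda")  # (M, K) activation
weight = torch.randn(2048, 1024, dtype=torch.bfloat16, device="cuda")  # (N, K) weight

# (1, block_size) scaling groups, row major, for the LHS operand.
x_fp8, x_scale = triton_fp8_blockwise_act_quant_lhs(x, block_size)

# (block_size, block_size) scaling groups, transposed dims, for the RHS operand.
w_t_fp8, w_t_scale = triton_fp8_blockwise_weight_quant_transposed_rhs(
    weight, block_size=block_size
)

# out ~= x @ weight.T in bf16, computed from the fp8 operands and their scales.
out = triton_fp8_gemm_1x128_128x128(
    x_fp8, w_t_fp8, x_scale, w_t_scale, out_dtype=torch.bfloat16
)
```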
