
Commit afd9cb6

[moe fp8 training] test and bench new faster method for per group rowwise scaling
1 parent: 253d65a

3 files changed (+340 −68 lines)


benchmarks/prototype/moe_training/benchmark_per_group_scaling_kernels.py renamed to benchmarks/prototype/moe_training/benchmark_per_group_colwise_scaling_kernels.py

Lines changed: 59 additions & 64 deletions
@@ -16,12 +16,10 @@
 
 from torchao.prototype.moe_training.kernels.jagged_float8_scales import (
     triton_fp8_per_group_colwise_scales,
-    triton_fp8_per_group_rowwise_scales,
 )
 from torchao.prototype.moe_training.utils import (
     generate_jagged_offs,
     torch_to_float8_per_group_colwise,
-    torch_to_float8_per_group_rowwise,
 )
 
 device = torch.device("cuda")
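
For reference, both the torch baseline and the Triton kernel retained above convert a high-precision tensor to float8 with one scale per column per group. A minimal sketch of the idea, as a hypothetical helper rather than the torchao implementation (the power-of-2 scale rounding used in the benchmark is omitted for brevity):

import torch

def per_group_colwise_fp8_sketch(x, offs, target_dtype=torch.float8_e4m3fn):
    # x: (Mg, K) high-precision tensor; offs: cumulative group-end offsets
    # along the Mg (row) dimension. One fp32 scale per (group, column).
    fp8_max = torch.finfo(target_dtype).max
    out = torch.empty_like(x, dtype=target_dtype)
    scales = []
    start = 0
    for end in offs.tolist():
        group = x[start:end]                     # (group_rows, K)
        amax = group.abs().amax(dim=0)           # per-column amax, shape (K,)
        scale = fp8_max / amax.clamp(min=1e-12)  # avoid division by zero
        out[start:end] = (group * scale).clamp(-fp8_max, fp8_max).to(target_dtype)
        scales.append(scale.float())
        start = end
    return out, torch.stack(scales)              # scales: (n_groups, K) fp32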
@@ -39,7 +37,7 @@ class ExperimentConfig:
 
 @dataclass(frozen=True)
 class ExperimentResult:
-    torch_time_us: float
+    torch_loop_time_us: float
     triton_time_us: float
     torch_mem_bw_gbps: float
     triton_mem_bw_gbps: float
@@ -53,7 +51,7 @@ class Experiment:
 
 def get_configs() -> List[ExperimentConfig]:
     input_shapes = [(16640, 5120)]  # (Mg, K)
-    n_groups_list = [1, 16, 128]
+    n_groups_list = [1, 16, 64]
     high_precision_dtypes = [torch.bfloat16]
     configs = []
     for input_shape, n_groups, high_precision_dtype in itertools.product(
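
The group sizes swept here (n_groups in {1, 16, 64}) are jagged rather than uniform: generate_jagged_offs, used in run_experiment below, returns the cumulative end offset of each group along Mg. A rough, hypothetical stand-in to show the shape of its output (the real torchao util may differ; this simplified version can even produce empty groups):

import torch

def jagged_offs_sketch(n_groups, Mg, multiple_of=16):
    # Pick n_groups - 1 random cut points in (0, Mg), snap each to a
    # multiple of 16 (a common alignment requirement for fp8 grouped GEMMs),
    # and append Mg so the final group ends at the last row.
    cuts = torch.randint(1, Mg // multiple_of, (n_groups - 1,)).sort().values
    return torch.cat([cuts * multiple_of, torch.tensor([Mg])])

# e.g. jagged_offs_sketch(4, 64) might return tensor([16, 32, 48, 64])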
@@ -70,85 +68,82 @@
 
 
 def run_experiment(config: ExperimentConfig) -> ExperimentResult:
-    # define test inputs
-    input_tensor = torch.randn(
-        *config.input_shape,
-        dtype=config.high_precision_dtype,
-        device=device,
+    # Define test inputs
+    Mg, K = config.input_shape
+
+    # Column major input tensor.
+    # Right operand in grad_weight = grad_output_t @ input
+    input_tensor = (
+        torch.randn(
+            Mg,
+            K,
+            dtype=config.high_precision_dtype,
+            device=device,
+        )
+        .transpose(-2, -1)
+        .contiguous()
+        .transpose(-2, -1)
     )
-    input_row_major = input_tensor.clone().detach()
-    input_col_major = input_tensor.clone().detach().t()
 
     # - configure input to be row-major with groups divided along the column dimension,
     # representing the left operand of grad_weight = grad_output_t @ input
     # that occurs in the backward pass of the differentiable scaled grouped mm.
     # - the transposed tensor in col-major format with groups along the row dimension,
     # which represents the right operand.
     n_groups = config.n_groups
-    Mg = input_row_major.shape[0]
     offs = generate_jagged_offs(n_groups, Mg, multiple_of=16)
 
     def warmup(func, *args, **kwargs):
         for _ in range(10):
             func(*args, **kwargs)
 
-    def run_torch(
-        input_row_major: torch.Tensor, input_col_major: torch.Tensor, offs: torch.Tensor
-    ):
-        _ = torch_to_float8_per_group_rowwise(
-            input_row_major,
-            offs,
-            target_dtype=torch.float8_e4m3fn,
-            round_scales_to_power_of_2=True,
-        )
-        _ = torch_to_float8_per_group_colwise(
-            input_col_major,
-            offs,
-            target_dtype=torch.float8_e4m3fn,
-            round_scales_to_power_of_2=True,
-        )
-
-    def run_triton(
-        input_row_major: torch.Tensor, input_col_major: torch.Tensor, offs: torch.Tensor
-    ):
-        _ = triton_fp8_per_group_rowwise_scales(
-            input_row_major,
-            offs,
-            output_dtype=torch.float8_e4m3fn,
-            round_scales_to_power_of_2=True,
-        )
-        _ = triton_fp8_per_group_colwise_scales(
-            input_col_major,
-            offs,
-            output_dtype=torch.float8_e4m3fn,
-            round_scales_to_power_of_2=True,
-        )
-
-    # bench torch
-    compiled_run_torch = torch.compile(run_torch)
-    warmup(compiled_run_torch, input_row_major, input_col_major, offs)
-    torch_time_us = benchmark_cuda_function_in_microseconds(
-        compiled_run_torch, input_row_major, input_col_major, offs
+    # Bench torch per group colwise
+    torch_to_float8_per_group_colwise_c = torch.compile(
+        torch_to_float8_per_group_colwise
+    )
+    warmup(
+        torch_to_float8_per_group_colwise_c,
+        input_tensor,
+        offs,
+        target_dtype=torch.float8_e4m3fn,
+    )
+    torch_loop_time_us = benchmark_cuda_function_in_microseconds(
+        torch_to_float8_per_group_colwise_c,
+        input_tensor,
+        offs,
+        target_dtype=torch.float8_e4m3fn,
     )
 
-    # bench triton
-    warmup(run_triton, input_row_major, input_col_major, offs)
+    # Bench triton per group colwise
+    warmup(
+        triton_fp8_per_group_colwise_scales,
+        input_tensor,
+        offs,
+        output_dtype=torch.float8_e4m3fn,
+        round_scales_to_power_of_2=True,
+    )
     triton_time_us = benchmark_cuda_function_in_microseconds(
-        run_triton, input_row_major, input_col_major, offs
+        triton_fp8_per_group_colwise_scales,
+        input_tensor,
+        offs,
+        output_dtype=torch.float8_e4m3fn,
+        round_scales_to_power_of_2=True,
     )
 
-    # mem bw calculations - excluding scales to simplify calculation
-    # but still get an accurate estimate.
+    # Mem bw calculations
     bytes_per_input_el = torch.finfo(config.high_precision_dtype).bits / 8
     num_elements = input_tensor.numel()
-    read_bytes = num_elements * bytes_per_input_el
-    write_bytes = num_elements  # 1 byte per element in float8_e4m3fn
+    # 2x read_bytes because we are reading the input tensor twice (once to compute scales, once to apply them)
+    read_bytes = 2 * num_elements * bytes_per_input_el
+    write_bytes = num_elements + 4 * (
+        n_groups * K
+    )  # 1 byte per element in float8_e4m3fn + 4 bytes per fp32 scale
     read_write_bytes = read_bytes + write_bytes
-    torch_mem_bw_gbps = (read_write_bytes) / (torch_time_us / 1e6) / 1e9
+    torch_mem_bw_gbps = (read_write_bytes) / (torch_loop_time_us / 1e6) / 1e9
     triton_mem_bw_gbps = (read_write_bytes) / (triton_time_us / 1e6) / 1e9
 
     return ExperimentResult(
-        torch_time_us=torch_time_us,
+        torch_loop_time_us=torch_loop_time_us,
         triton_time_us=triton_time_us,
         torch_mem_bw_gbps=torch_mem_bw_gbps,
         triton_mem_bw_gbps=triton_mem_bw_gbps,
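
To make the units in the bandwidth model concrete, here is the byte count for the default benchmark shape, evaluated by hand from the formulas in run_experiment above (the numbers in comments are plain arithmetic, not measured results):

# Worked example of the read/write byte model for (Mg, K) = (16640, 5120), bf16, 16 groups.
Mg, K, n_groups = 16640, 5120, 16
bytes_per_el = 2                               # bf16 = 2 bytes/element
num_elements = Mg * K                          # 85_196_800
read_bytes = 2 * num_elements * bytes_per_el   # 340_787_200 (two passes over the input)
write_bytes = num_elements + 4 * n_groups * K  # 85_524_480 (fp8 output + fp32 scales)
read_write_bytes = read_bytes + write_bytes    # 426_311_680 bytes, i.e. ~0.43 GB moved
# mem_bw_gbps = read_write_bytes / (time_us / 1e6) / 1e9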
@@ -157,10 +152,10 @@ def run_triton(
 
 def print_results(experiments: List[Experiment]):
     headers = [
-        "input_shape",
+        "Mg,K",
         "n_groups",
         "high_precision_dtype",
-        "torch_time_us",
+        "torch_loop_time_us",
         "triton_time_us",
         "torch_mem_bw_gbps",
         "triton_mem_bw_gbps",
@@ -176,18 +171,18 @@ def print_results(experiments: List[Experiment]):
                 input_shape,
                 experiment.config.n_groups,
                 experiment.config.high_precision_dtype,
-                experiment.result.torch_time_us,
+                experiment.result.torch_loop_time_us,
                 experiment.result.triton_time_us,
                 round(experiment.result.torch_mem_bw_gbps, 3),
                 round(experiment.result.triton_mem_bw_gbps, 3),
-                f"{experiment.result.torch_time_us / experiment.result.triton_time_us:.2f}x",
+                f"{experiment.result.torch_loop_time_us / experiment.result.triton_time_us:.2f}x",
             ]
         )
     print(tabulate(rows, headers=headers))
 
 
-def benchmark_cuda_function_in_microseconds(f, *args):
-    return do_bench(lambda: f(*args), return_mode="median") * 1e3
+def benchmark_cuda_function_in_microseconds(f, *args, **kwargs):
+    return do_bench(lambda: f(*args, **kwargs), return_mode="median") * 1e3
 
 
 def main():
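
One detail worth calling out from the diff: the .transpose(-2, -1).contiguous().transpose(-2, -1) chain in run_experiment builds a tensor that reports shape (Mg, K) but is laid out column-major in memory, matching the right operand of grad_weight = grad_output_t @ input. A quick illustration of what the chain does to strides:

import torch

# Transpose to (K, Mg), make that layout contiguous, then transpose the view
# back: the tensor reports shape (Mg, K) while its data stays column-major.
Mg, K = 8, 4
x = torch.randn(Mg, K).transpose(-2, -1).contiguous().transpose(-2, -1)
print(x.shape)            # torch.Size([8, 4])
print(x.stride())         # (1, 8) -> column-major: rows are adjacent in memory
print(x.is_contiguous())  # False (not row-major contiguous)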
