float16 constexpr type support #3044

lipracer · 2024-01-30T16:39:22Z

lipracer
Jan 30, 2024

While trying to solve issue #2680, I reduced it to the following test.
Let's run this test:

import torch

torch.manual_seed(0)
# `device` must be defined: get_tensor() binds it as a default argument and the
# calls below pass it explicitly, so leaving this assignment commented out
# raises NameError before anything runs.
# NOTE(review): presumably switched to 'cuda' to obtain the GPU eager result.
device = 'cpu'
dtype = torch.float16

# `dimension` is the bias shape and also the size of the input's last axis.
dimension = (43,)
shape = (1, 2, dimension[0])

def get_tensor(shape, fill_value=0.9805, requires_grad=False, device=device, dtype=dtype, rand=False):
    """Create a test tensor of the given shape.

    When ``rand`` is true the tensor comes from ``torch.rand``; otherwise it
    is produced by ``torch.full`` with every element set to ``fill_value``.
    ``requires_grad``, ``device`` and ``dtype`` are forwarded unchanged to
    whichever factory is used.
    """
    # Options shared by both factory calls.
    options = dict(requires_grad=requires_grad, device=device, dtype=dtype)
    if rand:
        return torch.rand(*shape, **options)
    return torch.full(shape, fill_value=fill_value, **options)

# Inputs on `device`, created with gradients enabled.
input_kwargs = dict(requires_grad=True, device=device, dtype=dtype)
a = get_tensor(shape, fill_value=0.9805, **input_kwargs)
a_bias = get_tensor(dimension, fill_value=0.2788, **input_kwargs)

# Independent CUDA copies of the same values, used as the second set of inputs.
b = a.detach().clone().cuda().requires_grad_(True)
b_bias = a_bias.detach().cuda().clone().requires_grad_(True)

# NOTE(review): bias_gelu_impl / fused_bias_gelu_triton are defined elsewhere;
# presumably the eager reference and the Triton kernel respectively.
output = bias_gelu_impl(a, a_bias)
target_output = fused_bias_gelu_triton(b, b_bias)

# Print flattened elements 42 and 43 of each result.
for label, result in (('output:', output), ('target_output:', target_output)):
    flat = result.view(-1)
    print(label, flat[42], flat[43])

# CPU device output: 1.1282
# GPU eager  output: 1.1289 (0011110010000100)
# GPU triton output: 1.1279 (0011110010000011)

Surprisingly, Triton has higher precision: its output (1.1279) is closer to the CPU output (1.1282) than the GPU eager output (1.1289) is.

// Triton IR for the fused bias + GELU forward kernel. All arithmetic between
// the two loads and the final store is carried out on f32 tensors; f16 appears
// only at the loads (followed by arith.extf) and at the store (after arith.truncf).
module {
  tt.func public @_fused_bias_gelu_fwd_kernel_0d1d2d34(%arg0: !tt.ptr<f16, 1> {tt.divisibility = 16 : i32} loc("/workspace/triton/issue_2680_full.py":42:0), %arg1: !tt.ptr<f16, 1> {tt.divisibility = 16 : i32} loc("/workspace/triton/issue_2680_full.py":42:0), %arg2: !tt.ptr<f16, 1> {tt.divisibility = 16 : i32} loc("/workspace/triton/issue_2680_full.py":42:0), %arg3: i32 loc("/workspace/triton/issue_2680_full.py":42:0), %arg4: i32 loc("/workspace/triton/issue_2680_full.py":42:0)) attributes {noinline = false} {
    // %2 = %arg1 + program_id * %arg3 and %4 = %arg2 + program_id * %arg3.
    %0 = tt.get_program_id x : i32 loc(#loc1)
    %1 = arith.muli %0, %arg3 : i32 loc(#loc2)
    %2 = tt.addptr %arg1, %1 : !tt.ptr<f16, 1>, i32 loc(#loc3)
    %3 = arith.muli %0, %arg3 : i32 loc(#loc4)
    %4 = tt.addptr %arg2, %3 : !tt.ptr<f16, 1>, i32 loc(#loc5)
    // Lane indices 0..1023 reused by every loop iteration.
    %5 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32> loc(#loc6)
    %c0_i32 = arith.constant 0 : i32 loc(#loc7)
    %c1024_i32 = arith.constant 1024 : i32 loc(#loc7)
    %6 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc7)
    %7 = arith.bitcast %arg4 : i32 to i32 loc(#loc7)
    %8 = arith.bitcast %c1024_i32 : i32 to i32 loc(#loc7)
    %9 = llvm.mlir.undef : i32 loc(#loc7)
    // Loop from 0 up to %arg4 in steps of 1024.
    scf.for %arg5 = %6 to %7 step %8  : i32 {
      // %11 = loop offset + lane index; %13 masks lanes with %11 < %arg4.
      %10 = tt.splat %arg5 : (i32) -> tensor<1024xi32> loc(#loc8)
      %11 = arith.addi %10, %5 : tensor<1024xi32> loc(#loc8)
      %12 = tt.splat %arg4 : (i32) -> tensor<1024xi32> loc(#loc9)
      %13 = arith.cmpi slt, %11, %12 : tensor<1024xi32> loc(#loc9)
      // Masked f16 load from %arg0 + %11 (masked-off lanes get 0), widened to f32.
      %14 = tt.splat %arg0 : (!tt.ptr<f16, 1>) -> tensor<1024x!tt.ptr<f16, 1>> loc(#loc10)
      %15 = tt.addptr %14, %11 : tensor<1024x!tt.ptr<f16, 1>>, tensor<1024xi32> loc(#loc10)
      %cst = arith.constant 0.000000e+00 : f32 loc(#loc11)
      %cst_0 = arith.constant dense<0.000000e+00> : tensor<1024xf32> loc(#loc11)
      %16 = arith.truncf %cst_0 : tensor<1024xf32> to tensor<1024xf16> loc(#loc11)
      %17 = tt.load %15, %13, %16 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1024xf16> loc(#loc11)
      %18 = arith.extf %17 : tensor<1024xf16> to tensor<1024xf32> loc(#loc12)
      // Masked f16 load from %2 + %11 (masked-off lanes get 0), widened to f32.
      %19 = tt.splat %2 : (!tt.ptr<f16, 1>) -> tensor<1024x!tt.ptr<f16, 1>> loc(#loc13)
      %20 = tt.addptr %19, %11 : tensor<1024x!tt.ptr<f16, 1>>, tensor<1024xi32> loc(#loc13)
      %cst_1 = arith.constant 0.000000e+00 : f32 loc(#loc14)
      %cst_2 = arith.constant dense<0.000000e+00> : tensor<1024xf32> loc(#loc14)
      %21 = arith.truncf %cst_2 : tensor<1024xf32> to tensor<1024xf16> loc(#loc14)
      %22 = tt.load %20, %13, %21 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1024xf16> loc(#loc14)
      %23 = arith.extf %22 : tensor<1024xf16> to tensor<1024xf32> loc(#loc15)
      // %24 = sum of the two loaded values, in f32.
      %24 = arith.addf %18, %23 : tensor<1024xf32> loc(#loc16)
      // %31 = tanhf(0.797884583 * %24 * (1 + 0.044715 * %24 * %24)), in f32.
      %cst_3 = arith.constant 0.797884583 : f32 loc(#loc17)
      %cst_4 = arith.constant dense<0.797884583> : tensor<1024xf32> loc(#loc17)
      %25 = arith.mulf %24, %cst_4 : tensor<1024xf32> loc(#loc17)
      %cst_5 = arith.constant 4.471500e-02 : f32 loc(#loc18)
      %cst_6 = arith.constant dense<4.471500e-02> : tensor<1024xf32> loc(#loc18)
      %26 = arith.mulf %24, %cst_6 : tensor<1024xf32> loc(#loc18)
      %27 = arith.mulf %26, %24 : tensor<1024xf32> loc(#loc19)
      %c1_i32 = arith.constant 1 : i32 loc(#loc20)
      %cst_7 = arith.constant dense<1> : tensor<1024xi32> loc(#loc20)
      %28 = arith.sitofp %cst_7 : tensor<1024xi32> to tensor<1024xf32> loc(#loc20)
      %29 = arith.addf %27, %28 : tensor<1024xf32> loc(#loc20)
      %30 = arith.mulf %25, %29 : tensor<1024xf32> loc(#loc21)
      %31 = tt.extern_elementwise %30 {libname = "", libpath = "", pure = true, symbol = "__nv_tanhf"} : (tensor<1024xf32>) -> tensor<1024xf32> loc(#loc22)
      // %34 = 0.5 * %24 * (%31 + 1), still in f32.
      %cst_8 = arith.constant 5.000000e-01 : f32 loc(#loc23)
      %cst_9 = arith.constant dense<5.000000e-01> : tensor<1024xf32> loc(#loc23)
      %32 = arith.mulf %24, %cst_9 : tensor<1024xf32> loc(#loc23)
      %cst_10 = arith.constant 1.000000e+00 : f32 loc(#loc24)
      %cst_11 = arith.constant dense<1.000000e+00> : tensor<1024xf32> loc(#loc24)
      %33 = arith.addf %31, %cst_11 : tensor<1024xf32> loc(#loc24)
      %34 = arith.mulf %32, %33 : tensor<1024xf32> loc(#loc25)
      // The only narrowing step: truncate the f32 result to f16, then masked store to %4 + %11.
      %35 = tt.splat %4 : (!tt.ptr<f16, 1>) -> tensor<1024x!tt.ptr<f16, 1>> loc(#loc26)
      %36 = tt.addptr %35, %11 : tensor<1024x!tt.ptr<f16, 1>>, tensor<1024xi32> loc(#loc26)
      %37 = arith.truncf %34 : tensor<1024xf32> to tensor<1024xf16> loc(#loc27)
      tt.store %36, %37, %13 {cache = 1 : i32, evict = 1 : i32} : tensor<1024xf16> loc(#loc27)
    } loc(#loc7)
    tt.return loc(#loc28)
  } loc(#loc)
} loc(#loc)

When I printed the MLIR, I found that Triton performs the computation in float32 and only converts the result to float16 at the end.
Is direct float16 computation being considered for Triton, or would a PR adding it be accepted? Could it be enabled through a compilation option?

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

float16 constexpr type support #3044

{{title}}

Replies: 0 comments

Select a reply

float16 constexpr type support #3044

lipracer Jan 30, 2024

Replies: 0 comments

lipracer
Jan 30, 2024