Fix CUDA kernel compiler error
zhongkaifu committed Aug 31, 2023
1 parent 9a85d73 commit c5ae2dd
Showing 1 changed file with 75 additions and 71 deletions.
146 changes: 75 additions & 71 deletions TensorSharp.CUDA/DeviceCode/AdvFuncKernels.cs
@@ -537,42 +537,6 @@ __global__ void RoPE(float* __restrict__ result, float* __restrict__ src, int ro
  }
}

__global__ void RoPEHalf(__half* __restrict__ result, __half* __restrict__ src, int rows, int cols, int seqLen)
{
  for(int bid = 0; bid < rows; bid += gridDim.x)
  {
    int j = bid + blockIdx.x;
    if(j < rows)
    {
      __half* resultRow = result + j * cols;
      __half* srcRow = src + j * cols;
      int m = j % seqLen;
      for(int tid = 0; tid < cols; tid += blockDim.x)
      {
        int id = tid + threadIdx.x;
        if(id < cols)
        {
          int i = id / 2;
          float theta = __powf(10000.0, -2.0 * i / cols);
          float theta_m = theta * m;
          float cos_theta_m = __cosf(theta_m);
          float sin_theta_m = __sinf(theta_m);
          if (id % 2 == 0)
          {
            resultRow[id] = __float2half(__half2float(srcRow[id]) * cos_theta_m - __half2float(srcRow[id + 1]) * sin_theta_m);
          }
          else
          {
            resultRow[id] = __float2half(__half2float(srcRow[id]) * cos_theta_m + __half2float(srcRow[id - 1]) * sin_theta_m);
          }
        }
      }
    }
  }
}

__global__ void RoPEGrad(float* __restrict__ grad, float* __restrict__ adj, int rows, int cols, int seqLen)
{
  for(int bid = 0; bid < rows; bid += gridDim.x)
@@ -609,41 +573,7 @@ __global__ void RoPEGrad(float* __restrict__ grad, float* __restrict__ adj, int
  }
}

__global__ void RoPEGradHalf(__half* __restrict__ grad, __half* __restrict__ adj, int rows, int cols, int seqLen)
{
  for(int bid = 0; bid < rows; bid += gridDim.x)
  {
    int j = bid + blockIdx.x;
    if(j < rows)
    {
      __half* gradRow = grad + j * cols;
      __half* adjRow = adj + j * cols;
      int m = j % seqLen;
      for(int tid = 0; tid < cols; tid += blockDim.x)
      {
        int id = tid + threadIdx.x;
        if(id < cols)
        {
          int i = id / 2;
          float theta = __powf(10000.0, -2.0 * i / cols);
          float theta_m = theta * m;
          float cos_theta_m = __cosf(theta_m);
          float sin_theta_m = __sinf(theta_m);
          if (id % 2 == 0)
          {
            gradRow[id] = __float2half(__half2float(gradRow[id]) + __half2float(adjRow[id]) * cos_theta_m + __half2float(adjRow[id + 1]) * sin_theta_m);
          }
          else
          {
            gradRow[id] = __float2half(__half2float(gradRow[id]) + __half2float(adjRow[id]) * cos_theta_m - __half2float(adjRow[id - 1]) * sin_theta_m);
          }
        }
      }
    }
  }
}

__global__ void BuildSrcTgtMask(float* __restrict__ result, float* __restrict__ srcOriginalLengths, float* __restrict__ tgtOriginalLengths, int rows, int cols, int tgtPaddedSeqLen, float value, float maskedValue)
{
@@ -1179,6 +1109,80 @@ __global__ void gLayerNormalizationGradHalf(__half* gradX,
  }
}

__global__ void RoPEGradHalf(__half* __restrict__ grad, __half* __restrict__ adj, int rows, int cols, int seqLen)
{
  for(int bid = 0; bid < rows; bid += gridDim.x)
  {
    int j = bid + blockIdx.x;
    if(j < rows)
    {
      __half* gradRow = grad + j * cols;
      __half* adjRow = adj + j * cols;
      int m = j % seqLen;
      for(int tid = 0; tid < cols; tid += blockDim.x)
      {
        int id = tid + threadIdx.x;
        if(id < cols)
        {
          int i = id / 2;
          float theta = __powf(10000.0, -2.0 * i / cols);
          float theta_m = theta * m;
          float cos_theta_m = __cosf(theta_m);
          float sin_theta_m = __sinf(theta_m);
          if (id % 2 == 0)
          {
            gradRow[id] = __float2half(__half2float(gradRow[id]) + __half2float(adjRow[id]) * cos_theta_m + __half2float(adjRow[id + 1]) * sin_theta_m);
          }
          else
          {
            gradRow[id] = __float2half(__half2float(gradRow[id]) + __half2float(adjRow[id]) * cos_theta_m - __half2float(adjRow[id - 1]) * sin_theta_m);
          }
        }
      }
    }
  }
}

__global__ void RoPEHalf(__half* __restrict__ result, __half* __restrict__ src, int rows, int cols, int seqLen)
{
  for(int bid = 0; bid < rows; bid += gridDim.x)
  {
    int j = bid + blockIdx.x;
    if(j < rows)
    {
      __half* resultRow = result + j * cols;
      __half* srcRow = src + j * cols;
      int m = j % seqLen;
      for(int tid = 0; tid < cols; tid += blockDim.x)
      {
        int id = tid + threadIdx.x;
        if(id < cols)
        {
          int i = id / 2;
          float theta = __powf(10000.0, -2.0 * i / cols);
          float theta_m = theta * m;
          float cos_theta_m = __cosf(theta_m);
          float sin_theta_m = __sinf(theta_m);
          if (id % 2 == 0)
          {
            resultRow[id] = __float2half(__half2float(srcRow[id]) * cos_theta_m - __half2float(srcRow[id + 1]) * sin_theta_m);
          }
          else
          {
            resultRow[id] = __float2half(__half2float(srcRow[id]) * cos_theta_m + __half2float(srcRow[id - 1]) * sin_theta_m);
          }
        }
      }
    }
  }
}

__global__ void AdamHalf(__half* __restrict__ w, __half* __restrict__ g, float* __restrict__ v, float* __restrict__ m, unsigned rows, unsigned cols, int batchSize, float step_size, float clipval, float regc, float decay_rate_v, float decay_rate_m, int iter, float eps)
{
  for(int bid = 0; bid < rows; bid += gridDim.x)
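For context, the float and half-precision kernels shown in this diff all implement the same rotary position embedding (RoPE): elements (2i, 2i+1) of each row are treated as a 2-D vector and rotated by the angle theta_i * m, where theta_i = 10000^(-2i/cols) and m = row % seqLen is the position within the sequence. The sketch below is a minimal CPU reference of that forward rotation, transcribed from the kernel code above; it is not part of the commit, the function name rope_forward_reference is made up for illustration, and it assumes cols is even, as the kernels do.

#include <math.h>

/* CPU reference for the forward RoPE rotation performed by the RoPE/RoPEHalf
   kernels above (illustrative only, not part of this commit). Layout matches
   the kernels: row-major, rows x cols, row j at sequence position j % seqLen;
   cols is assumed to be even. */
void rope_forward_reference(float* result, const float* src, int rows, int cols, int seqLen)
{
  for (int j = 0; j < rows; ++j)
  {
    const float* srcRow = src + (size_t)j * cols;
    float* resultRow = result + (size_t)j * cols;
    int m = j % seqLen;                              /* position in the sequence */

    for (int id = 0; id < cols; ++id)
    {
      int i = id / 2;                                /* index of the (2i, 2i+1) pair */
      float theta = powf(10000.0f, -2.0f * i / cols);
      float c = cosf(theta * m);
      float s = sinf(theta * m);

      /* Rotate the pair (src[2i], src[2i+1]) by the angle theta * m. */
      if (id % 2 == 0)
        resultRow[id] = srcRow[id] * c - srcRow[id + 1] * s;
      else
        resultRow[id] = srcRow[id] * c + srcRow[id - 1] * s;
    }
  }
}

The __half kernels follow the same arithmetic: operands are converted to float with __half2float, computed in float, and stored with __float2half, which is why their bodies mirror the float versions line for line.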
