forked from karpathy/llm.c
Commit: 4 changed files with 470 additions and 0 deletions.
layernorm.c (new file, 166 lines):
// must run `python layernorm.py` first to generate the reference data
// then compile for example as `gcc layernorm.c -o layernorm -lm`
// and then run as `./layernorm` to see the output
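//
// the reference file `ln.bin` (generated by `layernorm.py`) is read back in
// main() below as ten float32 arrays stored back to back, in this order:
//   x[B,T,C], w[C], b[C], out[B,T,C], mean[B,T], rstd[B,T],
//   dout[B,T,C], dx[B,T,C], dw[C], db[C]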

#include <stdio.h>
#include <stdlib.h>
#include <math.h>

void layernorm_forward(float* out, float* mean, float* rstd,
                       float* inp, float* weight, float* bias,
                       int B, int T, int C) {
    float eps = 1e-5f;
    for (int b = 0; b < B; b++) {
        for (int t = 0; t < T; t++) {
            // seek to the input position inp[b,t,:]
            float* x = inp + b * T * C + t * C;
            // calculate the mean
            float m = 0.0f;
            for (int i = 0; i < C; i++) {
                m += x[i];
            }
            m = m / C;
            // calculate the variance (without any bias correction)
            float v = 0.0f;
            for (int i = 0; i < C; i++) {
                float xshift = x[i] - m;
                v += xshift * xshift;
            }
            v = v / C;
            // calculate the rstd
            float s = 1.0f / sqrtf(v + eps);
            // seek to the output position in out[b,t,:]
            float* out_bt = out + b * T * C + t * C;
            for (int i = 0; i < C; i++) {
                float n = (s * (x[i] - m)); // normalized output
                float o = n * weight[i] + bias[i]; // scale and shift it
                out_bt[i] = o; // write
            }
            // cache the mean and rstd for the backward pass later
            mean[b * T + t] = m;
            rstd[b * T + t] = s;
        }
    }
}
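
// For reference, per position (b,t) the function above computes, over the
// C channels of x:
//   mu    = (1/C) * sum_i x_i
//   var   = (1/C) * sum_i (x_i - mu)^2      (no bias correction)
//   rstd  = 1 / sqrt(var + eps)
//   out_i = weight_i * (x_i - mu) * rstd + bias_i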

void layernorm_backward(float* dinp, float* dweight, float* dbias,
                        float* dout, float* inp, float* weight, float* mean, float* rstd,
                        int B, int T, int C) {
    for (int b = 0; b < B; b++) {
        for (int t = 0; t < T; t++) {
            float* dout_bt = dout + b * T * C + t * C;
            float* inp_bt = inp + b * T * C + t * C;
            float* dinp_bt = dinp + b * T * C + t * C;
            float mean_bt = mean[b * T + t];
            float rstd_bt = rstd[b * T + t];

            // first: two reduce operations
            float dnorm_mean = 0.0f;
            float dnorm_norm_mean = 0.0f;
            for (int i = 0; i < C; i++) {
                float norm_bti = (inp_bt[i] - mean_bt) * rstd_bt;
                float dnorm_i = weight[i] * dout_bt[i];
                dnorm_mean += dnorm_i;
                dnorm_norm_mean += dnorm_i * norm_bti;
            }
            dnorm_mean = dnorm_mean / C;
            dnorm_norm_mean = dnorm_norm_mean / C;

            // now iterate again and accumulate all the gradients
            for (int i = 0; i < C; i++) {
                float norm_bti = (inp_bt[i] - mean_bt) * rstd_bt;
                float dnorm_i = weight[i] * dout_bt[i];
                // gradient contribution to bias
                dbias[i] += dout_bt[i];
                // gradient contribution to weight
                dweight[i] += norm_bti * dout_bt[i];
                // gradient contribution to input
                float dval = 0.0f;
                dval += dnorm_i; // term 1
                dval -= dnorm_mean; // term 2
                dval -= norm_bti * dnorm_norm_mean; // term 3
                dval *= rstd_bt; // final scale
                dinp_bt[i] += dval;
            }
        }
    }
}
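
// For reference, with norm_i = (x_i - mu) * rstd and dnorm_i = weight_i * dout_i,
// the input gradient accumulated above is the standard layernorm backward:
//   dinp_i += rstd * ( dnorm_i                                // term 1
//                      - mean_j(dnorm_j)                      // term 2
//                      - norm_i * mean_j(dnorm_j * norm_j) )  // term 3
// while dweight_i and dbias_i accumulate norm_i * dout_i and dout_i
// respectively, summed over all (b,t) positions.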

// poor man's tensor checker
int check_tensor(float *a, float *b, int n, const char* label) {
    int ok = 1;
    printf("%s\n", label);
    for (int i = 0; i < n; i++) {
        // compare with a small absolute tolerance
        if (fabsf(a[i] - b[i]) <= 1e-5f) {
            printf("OK ");
        } else {
            printf("NOT OK ");
            ok = 0;
        }
        printf("%f %f\n", a[i], b[i]);
    }
    return ok;
}
int main() {

    int B = 2; // batch
    int T = 3; // time / sequence length
    int C = 4; // number of channels

    float* x = (float*) malloc(B * T * C * sizeof(float));
    float* w = (float*) malloc(C * sizeof(float));
    float* b = (float*) malloc(C * sizeof(float));
    float* out = (float*) malloc(B * T * C * sizeof(float));
    float* mean = (float*) malloc(B * T * sizeof(float));
    float* rstd = (float*) malloc(B * T * sizeof(float));
    float* dout = (float*) malloc(B * T * C * sizeof(float));
    float* dx = (float*) malloc(B * T * C * sizeof(float));
    float* dw = (float*) malloc(C * sizeof(float));
    float* db = (float*) malloc(C * sizeof(float));

    // read reference information from Python
    // (fread return values left unchecked to keep the example short)
    FILE *file = fopen("ln.bin", "rb");
    if (file == NULL) {
        printf("Error opening file\n");
        return 1;
    }
    fread(x, sizeof(float), B * T * C, file);
    fread(w, sizeof(float), C, file);
    fread(b, sizeof(float), C, file);
    fread(out, sizeof(float), B * T * C, file);
    fread(mean, sizeof(float), B * T, file);
    fread(rstd, sizeof(float), B * T, file);
    fread(dout, sizeof(float), B * T * C, file);
    fread(dx, sizeof(float), B * T * C, file);
    fread(dw, sizeof(float), C, file);
    fread(db, sizeof(float), C, file);
    fclose(file);

    // now let's calculate everything ourselves

    // forward pass
    float* c_out = (float*) malloc(B * T * C * sizeof(float));
    float* c_mean = (float*) malloc(B * T * sizeof(float));
    float* c_rstd = (float*) malloc(B * T * sizeof(float));
    layernorm_forward(c_out, c_mean, c_rstd, x, w, b, B, T, C);

    // check correctness of forward pass
    check_tensor(out, c_out, B*T*C, "out");
    check_tensor(mean, c_mean, B*T, "mean");
    check_tensor(rstd, c_rstd, B*T, "rstd");

    // backward pass (note calloc inits grads to zero)
    float* c_dx = (float*) calloc(B * T * C, sizeof(float));
    float* c_dw = (float*) calloc(C, sizeof(float)); // dweight has C elements, one per channel
    float* c_db = (float*) calloc(C, sizeof(float)); // dbias has C elements, one per channel
    layernorm_backward(c_dx, c_dw, c_db, dout, x, w, c_mean, c_rstd, B, T, C);

    // check correctness of backward pass
    check_tensor(c_dx, dx, B*T*C, "dx");
    check_tensor(c_dw, dw, C, "dw");
    check_tensor(c_db, db, C, "db");

    // free all allocated memory
    free(x); free(w); free(b); free(out); free(mean); free(rstd);
    free(dout); free(dx); free(dw); free(db);
    free(c_out); free(c_mean); free(c_rstd);
    free(c_dx); free(c_dw); free(c_db);
    return 0;
}