attach a simple tutorial
karpathy committed Apr 8, 2024
1 parent 3bcb9ba commit c39de59
Showing 4 changed files with 470 additions and 0 deletions.
4 changes: 4 additions & 0 deletions README.md
@@ -106,6 +106,10 @@ make test_gpt2

This now loads the `gpt2_124M_debug_state.bin` file, runs a forward pass, compares the logits and loss with the PyTorch reference implementation, and then does 10 iterations of training with Adam, making sure the losses match PyTorch.

## tutorial

I attached a small tutorial in [doc/layernorm/layernorm.md](doc/layernorm/layernorm.md). It is a simple, step-by-step guide to implementing a single layer of the GPT-2 model, the layernorm layer, and a good starting point for understanding how the layers are implemented in C.
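
Concretely, for each position in the batch and sequence, layernorm normalizes the C-dimensional activation vector $x$ and then applies a learned per-channel scale and shift:

$$
\mu = \frac{1}{C}\sum_{i=1}^{C} x_i, \qquad \sigma^2 = \frac{1}{C}\sum_{i=1}^{C} (x_i - \mu)^2, \qquad \text{out}_i = w_i \cdot \frac{x_i - \mu}{\sqrt{\sigma^2 + \epsilon}} + b_i
$$

with $\epsilon = 10^{-5}$ for numerical stability and the variance computed without bias correction (dividing by $C$), exactly as in the C code below.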

## license

MIT
166 changes: 166 additions & 0 deletions doc/layernorm/layernorm.c
@@ -0,0 +1,166 @@
// must run `python layernorm.py` first to generate the reference data
// then compile for example as `gcc layernorm.c -o layernorm -lm`
// and then run as `./layernorm` to see the output

#include <stdio.h>
#include <stdlib.h>
#include <math.h>

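// LayerNorm forward pass.
// inp is the (B,T,C) input tensor; weight and bias are the (C,) learned parameters.
// Writes the (B,T,C) result into out, and caches the per-position (B,T) mean and
// reciprocal standard deviation (rstd) for reuse in the backward pass.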
void layernorm_forward(float* out, float* mean, float* rstd,
                       float* inp, float* weight, float* bias,
                       int B, int T, int C) {
    float eps = 1e-5f;
    for (int b = 0; b < B; b++) {
        for (int t = 0; t < T; t++) {
            // seek to the input position inp[b,t,:]
            float* x = inp + b * T * C + t * C;
            // calculate the mean
            float m = 0.0f;
            for (int i = 0; i < C; i++) {
                m += x[i];
            }
            m = m/C;
            // calculate the variance (without any bias correction)
            float v = 0.0f;
            for (int i = 0; i < C; i++) {
                float xshift = x[i] - m;
                v += xshift * xshift;
            }
            v = v/C;
            // calculate the rstd
            float s = 1.0f / sqrtf(v + eps);
            // seek to the output position in out[b,t,:]
            float* out_bt = out + b * T * C + t * C;
            for (int i = 0; i < C; i++) {
                float n = (s * (x[i] - m)); // normalized output
                float o = n * weight[i] + bias[i]; // scale and shift it
                out_bt[i] = o; // write
            }
            // cache the mean and rstd for the backward pass later
            mean[b * T + t] = m;
            rstd[b * T + t] = s;
        }
    }
}

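// LayerNorm backward pass.
// dout is the (B,T,C) gradient of the loss w.r.t. the output; gradients are
// accumulated (+=) into dinp, dweight, dbias. With norm_i = (x_i - mean) * rstd
// and dnorm_i = weight[i] * dout_i, the input gradient at each position is
//   dinp_i = rstd * (dnorm_i - mean_j(dnorm_j) - norm_i * mean_j(dnorm_j * norm_j)),
// so the first loop computes the two means and the second accumulates the three terms.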
void layernorm_backward(float* dinp, float* dweight, float* dbias,
                        float* dout, float* inp, float* weight, float* mean, float* rstd,
                        int B, int T, int C) {
    for (int b = 0; b < B; b++) {
        for (int t = 0; t < T; t++) {
            float* dout_bt = dout + b * T * C + t * C;
            float* inp_bt = inp + b * T * C + t * C;
            float* dinp_bt = dinp + b * T * C + t * C;
            float mean_bt = mean[b * T + t];
            float rstd_bt = rstd[b * T + t];

            // first: two reduce operations
            float dnorm_mean = 0.0f;
            float dnorm_norm_mean = 0.0f;
            for (int i = 0; i < C; i++) {
                float norm_bti = (inp_bt[i] - mean_bt) * rstd_bt;
                float dnorm_i = weight[i] * dout_bt[i];
                dnorm_mean += dnorm_i;
                dnorm_norm_mean += dnorm_i * norm_bti;
            }
            dnorm_mean = dnorm_mean / C;
            dnorm_norm_mean = dnorm_norm_mean / C;

            // now iterate again and accumulate all the gradients
            for (int i = 0; i < C; i++) {
                float norm_bti = (inp_bt[i] - mean_bt) * rstd_bt;
                float dnorm_i = weight[i] * dout_bt[i];
                // gradient contribution to bias
                dbias[i] += dout_bt[i];
                // gradient contribution to weight
                dweight[i] += norm_bti * dout_bt[i];
                // gradient contribution to input
                float dval = 0.0f;
                dval += dnorm_i; // term 1
                dval -= dnorm_mean; // term 2
                dval -= norm_bti * dnorm_norm_mean; // term 3
                dval *= rstd_bt; // final scale
                dinp_bt[i] += dval;
            }
        }
    }
}

// poor man's tensor checker
int check_tensor(float *a, float *b, int n, char* label) {
    int ok = 1;
    printf("%s\n", label);
    for (int i = 0; i < n; i++) {
        if (fabs(a[i] - b[i]) <= 1e-5) {
            printf("OK ");
        } else {
            printf("NOT OK ");
            ok = 0;
        }
        printf("%f %f\n", a[i], b[i]);
    }
    return ok;
}

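// test harness: loads the reference tensors that layernorm.py saved into ln.bin,
// recomputes the forward and backward pass with the C kernels above, and
// compares every element against the PyTorch reference values.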
int main() {

    int B = 2; // batch
    int T = 3; // time / sequence length
    int C = 4; // number of channels

    float* x = (float*) malloc(B * T * C * sizeof(float));
    float* w = (float*) malloc(C * sizeof(float));
    float* b = (float*) malloc(C * sizeof(float));
    float* out = (float*) malloc(B * T * C * sizeof(float));
    float* mean = (float*) malloc(B * T * sizeof(float));
    float* rstd = (float*) malloc(B * T * sizeof(float));
    float* dout = (float*) malloc(B * T * C * sizeof(float));
    float* dx = (float*) malloc(B * T * C * sizeof(float));
    float* dw = (float*) malloc(C * sizeof(float));
    float* db = (float*) malloc(C * sizeof(float));

    // read reference information from Python
    FILE *file = fopen("ln.bin", "rb");
    if (file == NULL) {
        printf("Error opening file\n");
        return 1;
    }
    fread(x, sizeof(float), B * T * C, file);
    fread(w, sizeof(float), C, file);
    fread(b, sizeof(float), C, file);
    fread(out, sizeof(float), B * T * C, file);
    fread(mean, sizeof(float), B * T, file);
    fread(rstd, sizeof(float), B * T, file);
    fread(dout, sizeof(float), B * T * C, file);
    fread(dx, sizeof(float), B * T * C, file);
    fread(dw, sizeof(float), C, file);
    fread(db, sizeof(float), C, file);
    fclose(file);

    // now let's calculate everything ourselves

    // forward pass
    float* c_out = (float*) malloc(B * T * C * sizeof(float));
    float* c_mean = (float*) malloc(B * T * sizeof(float));
    float* c_rstd = (float*) malloc(B * T * sizeof(float));
    layernorm_forward(c_out, c_mean, c_rstd, x, w, b, B, T, C);

    // check correctness of forward pass
    check_tensor(out, c_out, B*T*C, "out");
    check_tensor(mean, c_mean, B*T, "mean");
    check_tensor(rstd, c_rstd, B*T, "rstd");

    // backward pass (note calloc inits grads to zero)
    float* c_dx = (float*) calloc(B * T * C, sizeof(float));
    float* c_dw = (float*) calloc(C, sizeof(float)); // dweight has C elements
    float* c_db = (float*) calloc(C, sizeof(float)); // dbias has C elements
    layernorm_backward(c_dx, c_dw, c_db, dout, x, w, c_mean, c_rstd, B, T, C);

    // check correctness of backward pass
    check_tensor(c_dx, dx, B*T*C, "dx");
    check_tensor(c_dw, dw, C, "dw");
    check_tensor(c_db, db, C, "db");

    // clean up
    free(x); free(w); free(b); free(out); free(mean); free(rstd);
    free(dout); free(dx); free(dw); free(db);
    free(c_out); free(c_mean); free(c_rstd);
    free(c_dx); free(c_dw); free(c_db);
    return 0;
}
