attach a simple tutorial
karpathy committed Apr 8, 2024
1 parent 3bcb9ba commit c39de59
Showing 4 changed files with 470 additions and 0 deletions.
4 changes: 4 additions & 0 deletions README.md
@@ -106,6 +106,10 @@ make test_gpt2

This now loads the `gpt2_124M_debug_state.bin` file, runs a forward pass, compares the logits and loss with the PyTorch reference implementation, and then does 10 iterations of training with Adam, making sure the losses match PyTorch.

## tutorial

I attached a small tutorial in [doc/layernorm/layernorm.md](doc/layernorm/layernorm.md). It is a simple, step-by-step guide to implementing a single layer of the GPT-2 model, the layernorm layer, and a good starting point for understanding how the layers are implemented in C.
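
Concretely, for each position in the batch and sequence, layernorm normalizes the C-dimensional activation vector $x$ and then applies a learned per-channel scale and shift:

$$
\mu = \frac{1}{C}\sum_{i=1}^{C} x_i, \qquad \sigma^2 = \frac{1}{C}\sum_{i=1}^{C} (x_i - \mu)^2, \qquad \text{out}_i = w_i \cdot \frac{x_i - \mu}{\sqrt{\sigma^2 + \epsilon}} + b_i
$$

with $\epsilon = 10^{-5}$ for numerical stability and the variance computed without bias correction (dividing by $C$), exactly as in the C code below.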

## license

MIT
166 changes: 166 additions & 0 deletions doc/layernorm/layernorm.c
@@ -0,0 +1,166 @@
// must run `python layernorm.py` first to generate the reference data
// then compile for example as `gcc layernorm.c -o layernorm -lm`
// and then run as `./layernorm` to see the output

#include <stdio.h>
#include <stdlib.h>
#include <math.h>

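// LayerNorm forward pass.
// inp is the (B,T,C) input tensor; weight and bias are the (C,) learned parameters.
// Writes the (B,T,C) result into out, and caches the per-position (B,T) mean and
// reciprocal standard deviation (rstd) for reuse in the backward pass.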
void layernorm_forward(float* out, float* mean, float* rstd,
                       float* inp, float* weight, float* bias,
                       int B, int T, int C) {
    float eps = 1e-5f;
    for (int b = 0; b < B; b++) {
        for (int t = 0; t < T; t++) {
            // seek to the input position inp[b,t,:]
            float* x = inp + b * T * C + t * C;
            // calculate the mean
            float m = 0.0f;
            for (int i = 0; i < C; i++) {
                m += x[i];
            }
            m = m/C;
            // calculate the variance (without any bias correction)
            float v = 0.0f;
            for (int i = 0; i < C; i++) {
                float xshift = x[i] - m;
                v += xshift * xshift;
            }
            v = v/C;
            // calculate the rstd
            float s = 1.0f / sqrtf(v + eps);
            // seek to the output position in out[b,t,:]
            float* out_bt = out + b * T * C + t * C;
            for (int i = 0; i < C; i++) {
                float n = (s * (x[i] - m)); // normalized output
                float o = n * weight[i] + bias[i]; // scale and shift it
                out_bt[i] = o; // write
            }
            // cache the mean and rstd for the backward pass later
            mean[b * T + t] = m;
            rstd[b * T + t] = s;
        }
    }
}

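// LayerNorm backward pass.
// dout is the (B,T,C) gradient of the loss w.r.t. the output; gradients are
// accumulated (+=) into dinp, dweight, dbias. With norm_i = (x_i - mean) * rstd
// and dnorm_i = weight[i] * dout_i, the input gradient at each position is
//   dinp_i = rstd * (dnorm_i - mean_j(dnorm_j) - norm_i * mean_j(dnorm_j * norm_j)),
// so the first loop computes the two means and the second accumulates the three terms.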
void layernorm_backward(float* dinp, float* dweight, float* dbias,
                        float* dout, float* inp, float* weight, float* mean, float* rstd,
                        int B, int T, int C) {
    for (int b = 0; b < B; b++) {
        for (int t = 0; t < T; t++) {
            float* dout_bt = dout + b * T * C + t * C;
            float* inp_bt = inp + b * T * C + t * C;
            float* dinp_bt = dinp + b * T * C + t * C;
            float mean_bt = mean[b * T + t];
            float rstd_bt = rstd[b * T + t];

            // first: two reduce operations
            float dnorm_mean = 0.0f;
            float dnorm_norm_mean = 0.0f;
            for (int i = 0; i < C; i++) {
                float norm_bti = (inp_bt[i] - mean_bt) * rstd_bt;
                float dnorm_i = weight[i] * dout_bt[i];
                dnorm_mean += dnorm_i;
                dnorm_norm_mean += dnorm_i * norm_bti;
            }
            dnorm_mean = dnorm_mean / C;
            dnorm_norm_mean = dnorm_norm_mean / C;

            // now iterate again and accumulate all the gradients
            for (int i = 0; i < C; i++) {
                float norm_bti = (inp_bt[i] - mean_bt) * rstd_bt;
                float dnorm_i = weight[i] * dout_bt[i];
                // gradient contribution to bias
                dbias[i] += dout_bt[i];
                // gradient contribution to weight
                dweight[i] += norm_bti * dout_bt[i];
                // gradient contribution to input
                float dval = 0.0f;
                dval += dnorm_i; // term 1
                dval -= dnorm_mean; // term 2
                dval -= norm_bti * dnorm_norm_mean; // term 3
                dval *= rstd_bt; // final scale
                dinp_bt[i] += dval;
            }
        }
    }
}

// poor man's tensor checker
int check_tensor(float *a, float *b, int n, char* label) {
    int ok = 1;
    printf("%s\n", label);
    for (int i = 0; i < n; i++) {
        if (fabs(a[i] - b[i]) <= 1e-5) {
            printf("OK ");
        } else {
            printf("NOT OK ");
            ok = 0;
        }
        printf("%f %f\n", a[i], b[i]);
    }
    return ok;
}

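// test harness: loads the reference tensors that layernorm.py saved into ln.bin,
// recomputes the forward and backward pass with the C kernels above, and
// compares every element against the PyTorch reference values.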
int main() {

    int B = 2; // batch
    int T = 3; // time / sequence length
    int C = 4; // number of channels

    float* x = (float*) malloc(B * T * C * sizeof(float));
    float* w = (float*) malloc(C * sizeof(float));
    float* b = (float*) malloc(C * sizeof(float));
    float* out = (float*) malloc(B * T * C * sizeof(float));
    float* mean = (float*) malloc(B * T * sizeof(float));
    float* rstd = (float*) malloc(B * T * sizeof(float));
    float* dout = (float*) malloc(B * T * C * sizeof(float));
    float* dx = (float*) malloc(B * T * C * sizeof(float));
    float* dw = (float*) malloc(C * sizeof(float));
    float* db = (float*) malloc(C * sizeof(float));

    // read reference information from Python
    FILE *file = fopen("ln.bin", "rb");
    if (file == NULL) {
        printf("Error opening file\n");
        return 1;
    }
    fread(x, sizeof(float), B * T * C, file);
    fread(w, sizeof(float), C, file);
    fread(b, sizeof(float), C, file);
    fread(out, sizeof(float), B * T * C, file);
    fread(mean, sizeof(float), B * T, file);
    fread(rstd, sizeof(float), B * T, file);
    fread(dout, sizeof(float), B * T * C, file);
    fread(dx, sizeof(float), B * T * C, file);
    fread(dw, sizeof(float), C, file);
    fread(db, sizeof(float), C, file);
    fclose(file);

    // now let's calculate everything ourselves

    // forward pass
    float* c_out = (float*) malloc(B * T * C * sizeof(float));
    float* c_mean = (float*) malloc(B * T * sizeof(float));
    float* c_rstd = (float*) malloc(B * T * sizeof(float));
    layernorm_forward(c_out, c_mean, c_rstd, x, w, b, B, T, C);

    // check correctness of forward pass
    check_tensor(out, c_out, B*T*C, "out");
    check_tensor(mean, c_mean, B*T, "mean");
    check_tensor(rstd, c_rstd, B*T, "rstd");

    // backward pass (note calloc inits grads to zero)
    float* c_dx = (float*) calloc(B * T * C, sizeof(float));
    float* c_dw = (float*) calloc(C, sizeof(float)); // dweight has C elements
    float* c_db = (float*) calloc(C, sizeof(float)); // dbias has C elements
    layernorm_backward(c_dx, c_dw, c_db, dout, x, w, c_mean, c_rstd, B, T, C);

    // check correctness of backward pass
    check_tensor(c_dx, dx, B*T*C, "dx");
    check_tensor(c_dw, dw, C, "dw");
    check_tensor(c_db, db, C, "db");

    // clean up
    free(x); free(w); free(b); free(out); free(mean); free(rstd);
    free(dout); free(dx); free(dw); free(db);
    free(c_out); free(c_mean); free(c_rstd);
    free(c_dx); free(c_dw); free(c_db);
    return 0;
}
