commit e7e2ce551cfbb10d71554fd4ed45a96138a20372
Author: Yaossg <yaossg@outlook.com>
Date:   Thu Jul 18 21:38:25 2024 +0800

    first commit

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..c3d8f61
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+gpt
+gpt2_124M.bin
\ No newline at end of file
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..98af70c
--- /dev/null
+++ b/README.md
@@ -0,0 +1,44 @@
+# GPT2
+
+原作者：Andrej Karpathy @ https://github.com/karpathy/llm.c
+
+## 背景
+
+GPT 很酷，能不能在我自己的电脑上跑一个呢？当然可以！
+
+![](show.gif)
+
+现在给你提供 GPT2 的预训练模型：[点击这里](https://alist.yaossg.com/share/model/gpt2_124M.bin)，请把该模型放在本仓库代码的同目录下，按照下面的指示即可运行该程序。
+
+## 依赖
+
+需要安装 GCC 和 Python3 以及下面的 Python 包
+
+```bash
+pip3 install tiktoken
+```
+
+## 编译
+
+```bash
+bash build.sh
+```
+
+## 运行
+
+```bash
+python3 chat.py
+```
+
+## 目标
+
+你可能已经发现了，你的程序可能并没有我演示的跑的那么快（~~神机请忽略~~）。
+
+你的目标就是优化该程序的性能，在保证结果不变的情况下更快的完成文本的补全。
+
+我会使用一些测试点来评测你的程序的正确性和执行时间。期待更高的效率和更多样的优化方案。
+
+此外，请在 wp 中回答下面的问题：
+
+- 什么是阿姆达尔定律？根据阿姆达尔定律，我们应该把优化的重点放在哪里？
+- 你的优化方案和思路是什么？优化的效果受到哪些因素影响？
diff --git a/build.sh b/build.sh
new file mode 100644
index 0000000..fd20f0d
--- /dev/null
+++ b/build.sh
@@ -0,0 +1 @@
+gcc gpt.c -lm -O3 -std=gnu11 -ggdb -Wall -Werror -Wno-unused-result -Wno-unused-value -Wno-unused-variable -o gpt
diff --git a/chat.py b/chat.py
new file mode 100644
index 0000000..c608c8d
--- /dev/null
+++ b/chat.py
@@ -0,0 +1,30 @@
+import tiktoken
+import subprocess
+import time
+
+length = input("Completion length: ")
+length = str(int(length)) # ensure input a valid integer
+
+text = input("Text to complete: ")
+enc = tiktoken.get_encoding("gpt2")
+
+tokens = [
+    str(tok) for tok in enc.encode(text)
+]
+
+start = time.time()
+
+proc = subprocess.Popen(
+    ["./gpt", length, *tokens],
+    stdout=subprocess.PIPE,
+    text=True
+)
+
+while (line := proc.stdout.readline()):
+    token = int(line)
+    print(enc.decode([token]), end='', flush=True)
+
+print()
+
+end = time.time()
+print(f"It took {end - start:.2f}s to complete the text.")
\ No newline at end of file
diff --git a/gpt.c b/gpt.c
new file mode 100644
index 0000000..f2fd27e
--- /dev/null
+++ b/gpt.c
@@ -0,0 +1,600 @@
+// Original Author: Andrej Karpathy
+// https://github.com/karpathy/llm.c
+
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <time.h>
+#include <string.h>
+#include <unistd.h>
+
+// ----------------------------------------------------------------------------
+// all the individual layers' forward passes
+// B = batch_size, T = sequence_length, C = channels, V = vocab_size
+
+void encoder_forward(float* out,
+                   int* inp, float* wte, float* wpe,
+                   int B, int T, int C) {
+    // out is (B,T,C). At each position (b,t), a C-dimensional vector summarizing token & position
+    // inp is (B,T) of integers, holding the token ids at each (b,t) position
+    // wte is (V,C) of token embeddings, short for "weight token embeddings"
+    // wpe is (maxT,C) of position embeddings, short for "weight positional embedding"
+    for (int b = 0; b < B; b++) {
+        for (int t = 0; t < T; t++) {
+            // seek to the output position in out[b,t,:]
+            float* out_bt = out + b * T * C + t * C;
+            // get the index of the token at inp[b, t]
+            int ix = inp[b * T + t];
+            // seek to the position in wte corresponding to the token
+            float* wte_ix = wte + ix * C;
+            // seek to the position in wpe corresponding to the position
+            float* wpe_t = wpe + t * C;
+            // add the two vectors and store the result in out[b,t,:]
+            for (int i = 0; i < C; i++) {
+                out_bt[i] = wte_ix[i] + wpe_t[i];
+            }
+        }
+    }
+}
+
+void layernorm_forward(float* out, float* mean, float* rstd,
+                       float* inp, float* weight, float* bias,
+                       int B, int T, int C) {
+    // reference: https://pytorch.org/docs/stable/generated/torch.nn.LayerNorm.html
+    // both inp and out are (B,T,C) of the activations
+    // mean and rstd are (B,T) buffers, to be used later in backward pass
+    // at each position (b,t) of the input, the C-dimensional vector
+    // of activations gets normalized, then scaled and shifted
+    float eps = 1e-5f;
+    for (int b = 0; b < B; b++) {
+        for (int t = 0; t < T; t++) {
+            // seek to the input position inp[b,t,:]
+            float* x = inp + b * T * C + t * C;
+            // calculate the mean
+            float m = 0.0f;
+            for (int i = 0; i < C; i++) {
+                m += x[i];
+            }
+            m = m/C;
+            // calculate the variance (without any bias correction)
+            float v = 0.0f;
+            for (int i = 0; i < C; i++) {
+                float xshift = x[i] - m;
+                v += xshift * xshift;
+            }
+            v = v/C;
+            // calculate the rstd (reciprocal standard deviation)
+            float s = 1.0f / sqrtf(v + eps);
+            // seek to the output position in out[b,t,:]
+            float* out_bt = out + b * T * C + t * C;
+            for (int i = 0; i < C; i++) {
+                float n = (s * (x[i] - m)); // normalize
+                float o = n * weight[i] + bias[i]; // scale and shift
+                out_bt[i] = o; // write
+            }
+            // cache the mean and rstd for the backward pass later
+            mean[b * T + t] = m;
+            rstd[b * T + t] = s;
+        }
+    }
+}
+
+void matmul_forward(float* out,
+                    float* inp, float* weight, float* bias,
+                    int B, int T, int C, int OC) {
+    // most of the running time is spent here and in matmul_backward
+    // OC is short for "output channels"
+    // inp is (B,T,C), weight is (OC, C), bias is (OC)
+    // out will be (B,T,OC)
+    for (int b = 0; b < B; b++) {
+        for (int t = 0; t < T; t++) {
+            float* out_bt = out + b * T * OC + t * OC;
+            float* inp_bt = inp + b * T * C + t * C;
+            for (int o = 0; o < OC; o++) {
+                float val = (bias != NULL) ? bias[o] : 0.0f;
+                float* wrow = weight + o*C;
+                for (int i = 0; i < C; i++) {
+                    val += inp_bt[i] * wrow[i];
+                }
+                out_bt[o] = val;
+            }
+        }
+    }
+}
+
+void attention_forward(float* out, float* preatt, float* att,
+                       float* inp,
+                       int B, int T, int C, int NH) {
+    // input is (B, T, 3C) holding the query, key, value (Q, K, V) vectors
+    // preatt, att are (B, NH, T, T). NH = number of heads, T = sequence length
+    // that holds the pre-attention and post-attention scores (used in backward)
+    // output is (B, T, C)
+    // attention is the only layer that mixes information across time
+    // every other operation is applied at every (b,t) position independently
+    // (and of course, no layer mixes information across batch)
+    int C3 = C*3;
+    int hs = C / NH; // head size
+    float scale = 1.0 / sqrtf(hs);
+
+    for (int b = 0; b < B; b++) {
+        for (int t = 0; t < T; t++) {
+            for (int h = 0; h < NH; h++) {
+                float* query_t = inp + b * T * C3 + t * C3 + h * hs;
+                float* preatt_bth = preatt + b*NH*T*T + h*T*T + t*T;
+                float* att_bth = att + b*NH*T*T + h*T*T + t*T;
+
+                // pass 1: calculate query dot key and maxval
+                float maxval = -10000.0f; // TODO something better
+                for (int t2 = 0; t2 <= t; t2++) {
+                    float* key_t2 = inp + b * T * C3 + t2 * C3 + h * hs + C; // +C because it's key
+
+                    // (query_t) dot (key_t2)
+                    float val = 0.0f;
+                    for (int i = 0; i < hs; i++) {
+                        val += query_t[i] * key_t2[i];
+                    }
+                    val *= scale;
+                    if (val > maxval) {
+                        maxval = val;
+                    }
+
+                    preatt_bth[t2] = val;
+                }
+
+                // pass 2: calculate the exp and keep track of sum
+                // maxval is being calculated and subtracted only for numerical stability
+                float expsum = 0.0f;
+                for (int t2 = 0; t2 <= t; t2++) {
+                    float expv = expf(preatt_bth[t2] - maxval);
+                    expsum += expv;
+                    att_bth[t2] = expv;
+                }
+                float expsum_inv = expsum == 0.0f ? 0.0f : 1.0f / expsum;
+
+                // pass 3: normalize to get the softmax
+                for (int t2 = 0; t2 < T; t2++) {
+                    if (t2 <= t) {
+                        att_bth[t2] *= expsum_inv;
+                    } else {
+                        // causal attention mask. not strictly necessary to set to zero here
+                        // only doing this explicitly for debugging and checking to PyTorch
+                        att_bth[t2] = 0.0f;
+                    }
+                }
+
+                // pass 4: accumulate weighted values into the output of attention
+                float* out_bth = out + b * T * C + t * C + h * hs;
+                for (int i = 0; i < hs; i++) { out_bth[i] = 0.0f; }
+                for (int t2 = 0; t2 <= t; t2++) {
+                    float* value_t2 = inp + b * T * C3 + t2 * C3 + h * hs + C*2; // +C*2 because it's value
+                    float att_btht2 = att_bth[t2];
+                    for (int i = 0; i < hs; i++) {
+                        out_bth[i] += att_btht2 * value_t2[i];
+                    }
+                }
+            }
+        }
+    }
+}
+
+#define GELU_SCALING_FACTOR sqrtf(2.0f / M_PI)
+void gelu_forward(float* out, float* inp, int N) {
+    // (approximate) GeLU elementwise non-linearity in the MLP block of Transformer
+    for (int i = 0; i < N; i++) {
+        float x = inp[i];
+        float cube = 0.044715f * x * x * x;
+        out[i] = 0.5f * x * (1.0f + tanhf(GELU_SCALING_FACTOR * (x + cube)));
+    }
+}
+
+void residual_forward(float* out, float* inp1, float* inp2, int N) {
+    for (int i = 0; i < N; i++) {
+        out[i] = inp1[i] + inp2[i];
+    }
+}
+
+void softmax_forward(float* probs, float* logits, int B, int T, int V) {
+    // output: probs are (B,T,V) of the probabilities (sums to 1.0 in each b,t position)
+    // input: logits is (B,T,V) of the unnormalized log probabilities
+    for (int b = 0; b < B; b++) {
+        for (int t = 0; t < T; t++) {
+            // probs <- softmax(logits)
+            float* logits_bt = logits + b * T * V + t * V;
+            float* probs_bt = probs + b * T * V + t * V;
+
+            // maxval is only calculated and subtracted for numerical stability
+            float maxval = -10000.0f; // TODO something better
+            for (int i = 0; i < V; i++) {
+                if (logits_bt[i] > maxval) {
+                    maxval = logits_bt[i];
+                }
+            }
+            float sum = 0.0f;
+            for (int i = 0; i < V; i++) {
+                probs_bt[i] = expf(logits_bt[i] - maxval);
+                sum += probs_bt[i];
+            }
+            for (int i = 0; i < V; i++) {
+                probs_bt[i] /= sum;
+            }
+        }
+    }
+}
+
+// ----------------------------------------------------------------------------
+// GPT-2 model definition
+
+// the parameters of the model
+#define NUM_PARAMETER_TENSORS 16
+typedef struct {
+    float* wte; // (V, C)
+    float* wpe; // (maxT, C)
+    float* ln1w; // (L, C)
+    float* ln1b; // (L, C)
+    float* qkvw; // (L, 3*C, C)
+    float* qkvb; // (L, 3*C)
+    float* attprojw; // (L, C, C)
+    float* attprojb; // (L, C)
+    float* ln2w; // (L, C)
+    float* ln2b; // (L, C)
+    float* fcw; // (L, 4*C, C)
+    float* fcb; // (L, 4*C)
+    float* fcprojw; // (L, C, 4*C)
+    float* fcprojb; // (L, C)
+    float* lnfw; // (C)
+    float* lnfb; // (C)
+} ParameterTensors;
+
+// allocate memory for the parameters and point the individual tensors to the right places
+float* malloc_and_point_parameters(ParameterTensors* params, size_t* param_sizes) {
+    size_t num_parameters = 0;
+    for (size_t i = 0; i < NUM_PARAMETER_TENSORS; i++) {
+        num_parameters += param_sizes[i];
+    }
+    // malloc all parameters all at once
+    float* params_memory = (float*)malloc(num_parameters * sizeof(float));
+    // assign all the tensors
+    float** ptrs[] = {
+        &params->wte, &params->wpe, &params->ln1w, &params->ln1b, &params->qkvw, &params->qkvb,
+        &params->attprojw, &params->attprojb, &params->ln2w, &params->ln2b, &params->fcw, &params->fcb,
+        &params->fcprojw, &params->fcprojb, &params->lnfw, &params->lnfb
+    };
+    float* params_memory_iterator = params_memory;
+    for (size_t i = 0; i < NUM_PARAMETER_TENSORS; i++) {
+        *(ptrs[i]) = params_memory_iterator;
+        params_memory_iterator += param_sizes[i];
+    }
+    return params_memory;
+}
+
+#define NUM_ACTIVATION_TENSORS 23
+typedef struct {
+    float* encoded; // (B, T, C)
+    float* ln1; // (L, B, T, C)
+    float* ln1_mean; // (L, B, T)
+    float* ln1_rstd; // (L, B, T)
+    float* qkv; // (L, B, T, 3*C)
+    float* atty; // (L, B, T, C)
+    float* preatt; // (L, B, NH, T, T)
+    float* att; // (L, B, NH, T, T)
+    float* attproj; // (L, B, T, C)
+    float* residual2; // (L, B, T, C)
+    float* ln2; // (L, B, T, C)
+    float* ln2_mean; // (L, B, T)
+    float* ln2_rstd; // (L, B, T)
+    float* fch; // (L, B, T, 4*C)
+    float* fch_gelu; // (L, B, T, 4*C)
+    float* fcproj; // (L, B, T, C)
+    float* residual3; // (L, B, T, C)
+    float* lnf; // (B, T, C)
+    float* lnf_mean; // (B, T)
+    float* lnf_rstd; // (B, T)
+    float* logits; // (B, T, V)
+    float* probs; // (B, T, V)
+    float* losses; // (B, T)
+} ActivationTensors;
+
+float* malloc_and_point_activations(ActivationTensors* acts, size_t* act_sizes) {
+    size_t num_activations = 0;
+    for (size_t i = 0; i < NUM_ACTIVATION_TENSORS; i++) {
+        num_activations += act_sizes[i];
+    }
+    float* acts_memory = (float*)malloc(num_activations * sizeof(float));
+    float** ptrs[] = {
+        &acts->encoded, &acts->ln1, &acts->ln1_mean, &acts->ln1_rstd, &acts->qkv, &acts->atty,
+        &acts->preatt, &acts->att, &acts->attproj, &acts->residual2, &acts->ln2, &acts->ln2_mean,
+        &acts->ln2_rstd, &acts->fch, &acts->fch_gelu, &acts->fcproj, &acts->residual3, &acts->lnf,
+        &acts->lnf_mean, &acts->lnf_rstd, &acts->logits, &acts->probs, &acts->losses
+    };
+    float* acts_memory_iterator = acts_memory;
+    for (size_t i = 0; i < NUM_ACTIVATION_TENSORS; i++) {
+        *(ptrs[i]) = acts_memory_iterator;
+        acts_memory_iterator += act_sizes[i];
+    }
+    return acts_memory;
+}
+
+typedef struct {
+    int max_seq_len; // max sequence length, e.g. 1024
+    int vocab_size; // vocab size, e.g. 50257
+    int num_layers; // number of layers, e.g. 12
+    int num_heads; // number of heads in attention, e.g. 12
+    int channels; // number of channels, e.g. 768
+} GPT2Config;
+
+typedef struct {
+    GPT2Config config;
+    // the weights (parameters) of the model, and their sizes
+    ParameterTensors params;
+    size_t param_sizes[NUM_PARAMETER_TENSORS];
+    float* params_memory;
+    int num_parameters;
+    // gradients of the weights
+    ParameterTensors grads;
+    float* grads_memory;
+    // buffers for the AdamW optimizer
+    float* m_memory;
+    float* v_memory;
+    // the activations of the model, and their sizes
+    ActivationTensors acts;
+    size_t act_sizes[NUM_ACTIVATION_TENSORS];
+    float* acts_memory;
+    int num_activations;
+    // gradients of the activations
+    ActivationTensors grads_acts;
+    float* grads_acts_memory;
+    // other run state configuration
+    int batch_size; // the batch size (B) of current forward pass
+    int seq_len; // the sequence length (T) of current forward pass
+    int* inputs; // the input tokens for the current forward pass
+    int* targets; // the target tokens for the current forward pass
+    float mean_loss; // after a forward pass with targets, will be populated with the mean loss
+} GPT2;
+
+void gpt2_build_from_checkpoint(GPT2 *model, char* checkpoint_path) {
+
+    // read in model from a checkpoint file
+    FILE *model_file = fopen(checkpoint_path, "rb");
+    if (model_file == NULL) { printf("Error opening model file\n"); exit(1); }
+    int model_header[256];
+    fread(model_header, sizeof(int), 256, model_file);
+    if (model_header[0] != 20240326) { printf("Bad magic model file"); exit(1); }
+    if (model_header[1] != 1) { printf("Bad version in model file"); exit(1); }
+
+    // read in hyperparameters
+    int maxT, V, L, NH, C;
+    model->config.max_seq_len = maxT = model_header[2];
+    model->config.vocab_size = V = model_header[3];
+    model->config.num_layers = L = model_header[4];
+    model->config.num_heads = NH = model_header[5];
+    model->config.channels = C = model_header[6];
+
+    // allocate space for all the parameters and read them in
+    model->param_sizes[0] = V * C; // wte
+    model->param_sizes[1] = maxT * C; // wpe
+    model->param_sizes[2] = L * C; // ln1w
+    model->param_sizes[3] = L * C; // ln1b
+    model->param_sizes[4] = L * (3 * C) * C; // qkvw
+    model->param_sizes[5] = L * (3 * C); // qkvb
+    model->param_sizes[6] = L * C * C; // attprojw
+    model->param_sizes[7] = L * C; // attprojb
+    model->param_sizes[8] = L * C; // ln2w
+    model->param_sizes[9] = L * C; // ln2b
+    model->param_sizes[10] = L * (4 * C) * C; // fcw
+    model->param_sizes[11] = L * (4 * C); // fcb
+    model->param_sizes[12] = L * C * (4 * C); // fcprojw
+    model->param_sizes[13] = L * C; // fcprojb
+    model->param_sizes[14] = C; // lnfw
+    model->param_sizes[15] = C; // lnfb
+
+    // cound the number of paramaters
+    size_t num_parameters = 0;
+    for (size_t i = 0; i < NUM_PARAMETER_TENSORS; i++) {
+        num_parameters += model->param_sizes[i];
+    }
+    model->num_parameters = num_parameters;
+
+    // read in all the parameters from file
+    model->params_memory = malloc_and_point_parameters(&model->params, model->param_sizes);
+    fread(model->params_memory, sizeof(float), num_parameters, model_file);
+    fclose(model_file);
+
+    // other inits
+    model->acts_memory = NULL;
+    model->grads_memory = NULL;
+    model->m_memory = NULL;
+    model->v_memory = NULL;
+    model->grads_acts_memory = NULL;
+    model->inputs = NULL;
+    model->targets = NULL;
+    model->batch_size = 0;
+    model->seq_len = 0;
+    model->mean_loss = -1.0f; // -1.0f will designate no loss
+}
+
+void gpt2_forward(GPT2 *model, int* inputs, int B, int T) {
+    // convenience parameters
+    int V = model->config.vocab_size;
+    int L = model->config.num_layers;
+    int NH = model->config.num_heads;
+    int C = model->config.channels;
+
+    // record the current B,T as well
+    model->batch_size = B;
+    model->seq_len = T;
+    // and now allocate the space
+    model->act_sizes[0] = B * T * C; // encoded
+    model->act_sizes[1] = L * B * T * C; // ln1
+    model->act_sizes[2] = L * B * T;  // ln1_mean
+    model->act_sizes[3] = L * B * T;  // ln1_rstd
+    model->act_sizes[4] = L * B * T * 3*C; // qkv
+    model->act_sizes[5] = L * B * T * C;  // atty
+    model->act_sizes[6] = L * B * NH * T * T;  // preatt
+    model->act_sizes[7] = L * B * NH * T * T;  // att
+    model->act_sizes[8] = L * B * T * C; // attproj
+    model->act_sizes[9] = L * B * T * C; // residual2
+    model->act_sizes[10] = L * B * T * C; // ln2
+    model->act_sizes[11] = L * B * T; // ln2_mean
+    model->act_sizes[12] = L * B * T; // ln2_rstd
+    model->act_sizes[13] = L * B * T * 4*C; // fch
+    model->act_sizes[14] = L * B * T * 4*C; // fch_gelu
+    model->act_sizes[15] = L * B * T * C; // fcproj
+    model->act_sizes[16] = L * B * T * C; // residual3
+    model->act_sizes[17] = B * T * C; // lnf
+    model->act_sizes[18] = B * T; // lnf_mean
+    model->act_sizes[19] = B * T; // lnf_rstd
+    model->act_sizes[20] = B * T * V; // logits
+    model->act_sizes[21] = B * T * V; // probs
+    model->act_sizes[22] = B * T; // losses
+    size_t num_activations = 0;
+    for (size_t i = 0; i < NUM_ACTIVATION_TENSORS; i++) {
+        num_activations += model->act_sizes[i];
+    }
+    model->num_activations = num_activations;
+
+    if (model->acts_memory) {
+        free(model->acts_memory);
+        model->acts_memory = NULL;
+    }
+    model->acts_memory = malloc_and_point_activations(&model->acts, model->act_sizes);
+
+    // also create memory for caching inputs and targets
+    if (model->inputs) {
+        free(model->inputs);
+    }
+    model->inputs = (int*)malloc(B * T * sizeof(int));
+
+    // cache the inputs/targets
+    memcpy(model->inputs, inputs, B * T * sizeof(int));
+
+    // forward pass
+    ParameterTensors params = model->params; // for brevity
+    ActivationTensors acts = model->acts;
+    float* residual;
+    encoder_forward(acts.encoded, inputs, params.wte, params.wpe, B, T, C); // encoding goes into residual[0]
+    for (int l = 0; l < L; l++) {
+
+        residual = l == 0 ? acts.encoded : acts.residual3 + (l-1) * B * T * C;
+
+        // get the pointers of the weights for this layer
+        float* l_ln1w = params.ln1w + l * C;
+        float* l_ln1b = params.ln1b + l * C;
+        float* l_qkvw = params.qkvw + l * 3*C * C;
+        float* l_qkvb = params.qkvb + l * 3*C;
+        float* l_attprojw = params.attprojw + l * C * C;
+        float* l_attprojb = params.attprojb + l * C;
+        float* l_ln2w = params.ln2w + l * C;
+        float* l_ln2b = params.ln2b + l * C;
+        float* l_fcw = params.fcw + l * 4*C * C;
+        float* l_fcb = params.fcb + l * 4*C;
+        float* l_fcprojw = params.fcprojw + l * C * 4*C;
+        float* l_fcprojb = params.fcprojb + l * C;
+
+        // get the pointers of the activations for this layer
+        float* l_ln1 = acts.ln1 + l * B * T * C;
+        float* l_ln1_mean = acts.ln1_mean + l * B * T;
+        float* l_ln1_rstd = acts.ln1_rstd + l * B * T;
+        float* l_qkv = acts.qkv + l * B * T * 3*C;
+        float* l_atty = acts.atty + l * B * T * C;
+        float* l_preatt = acts.preatt + l * B * NH * T * T;
+        float* l_att = acts.att + l * B * NH * T * T;
+        float* l_attproj = acts.attproj + l * B * T * C;
+        float* l_residual2 = acts.residual2 + l * B * T * C;
+        float* l_ln2 = acts.ln2 + l * B * T * C;
+        float* l_ln2_mean = acts.ln2_mean + l * B * T;
+        float* l_ln2_rstd = acts.ln2_rstd + l * B * T;
+        float* l_fch = acts.fch + l * B * T * 4*C;
+        float* l_fch_gelu = acts.fch_gelu + l * B * T * 4*C;
+        float* l_fcproj = acts.fcproj + l * B * T * C;
+        float* l_residual3 = acts.residual3 + l * B * T * C;
+
+        // now do the forward pass
+        layernorm_forward(l_ln1, l_ln1_mean, l_ln1_rstd, residual, l_ln1w, l_ln1b, B, T, C);
+        matmul_forward(l_qkv, l_ln1, l_qkvw, l_qkvb, B, T, C, 3*C);
+        attention_forward(l_atty, l_preatt, l_att, l_qkv, B, T, C, NH);
+        matmul_forward(l_attproj, l_atty, l_attprojw, l_attprojb, B, T, C, C);
+        residual_forward(l_residual2, residual, l_attproj, B*T*C);
+        layernorm_forward(l_ln2, l_ln2_mean, l_ln2_rstd, l_residual2, l_ln2w, l_ln2b, B, T, C);
+        matmul_forward(l_fch, l_ln2, l_fcw, l_fcb, B, T, C, 4*C);
+        gelu_forward(l_fch_gelu, l_fch, B*T*4*C);
+        matmul_forward(l_fcproj, l_fch_gelu, l_fcprojw, l_fcprojb, B, T, 4*C, C);
+        residual_forward(l_residual3, l_residual2, l_fcproj, B*T*C);
+    }
+    residual = acts.residual3 + (L-1) * B * T * C; // last residual is in residual3
+    layernorm_forward(acts.lnf, acts.lnf_mean, acts.lnf_rstd, residual, params.lnfw, params.lnfb, B, T, C);
+    matmul_forward(acts.logits, acts.lnf, params.wte, NULL, B, T, C, V);
+    softmax_forward(acts.probs, acts.logits, B, T, V);
+}
+
+void gpt2_zero_grad(GPT2 *model) {
+    if(model->grads_memory != NULL) { memset(model->grads_memory, 0, model->num_parameters * sizeof(float)); }
+    if(model->grads_acts_memory != NULL) { memset(model->grads_acts_memory, 0, model->num_activations * sizeof(float)); }
+}
+
+void gpt2_free(GPT2 *model) {
+    free(model->params_memory);
+    free(model->grads_memory);
+    free(model->m_memory);
+    free(model->v_memory);
+    free(model->acts_memory);
+    free(model->grads_acts_memory);
+    free(model->inputs);
+    free(model->targets);
+}
+
+int sample_mult(float* probabilities, int n) {
+    // sample index from probabilities (they must sum to 1!)
+    // coin can be a random number in [0, 1), usually from random_f32()
+    float cdf = 0.0f, coin = 0.5f;
+    for (int i = 0; i < n; i++) {
+        cdf += probabilities[i];
+        if (coin < cdf) {
+            return i;
+        }
+    }
+    return n - 1; // in case of rounding errors
+}
+
+// the GPT-2 end-of-text token id
+#define GPT2_EOT 50256
+
+int main(int argc, char* argv[]) {
+    GPT2 model;
+    gpt2_build_from_checkpoint(&model, "gpt2_124M.bin");
+
+    if (argc < 3) {
+        printf("Provide completion length and at least one token.\n");
+        exit(1);
+    }
+
+    const int input_offset = 2;
+    int completion_length = atoi(argv[1]);
+    int input_length = argc - input_offset;
+    if (input_length > completion_length) {
+        printf("Tow many tokens.\n");
+        exit(1);
+    }
+
+    int tokens[completion_length];
+
+    for (int i = 0; i < completion_length; i++) {
+        if (i < input_length) {
+            tokens[i] = atoi(argv[input_offset + i]);
+        } else {
+            tokens[i] = GPT2_EOT;
+        }
+    }
+
+    for (int t = input_length; t < completion_length; t++) {
+        gpt2_forward(&model, tokens, 1, t);
+        float* probs = model.acts.probs + (t-1) * model.config.vocab_size;
+        int next_token = sample_mult(probs, model.config.vocab_size);
+        tokens[t] = next_token;
+
+        printf("%d\n", tokens[t]);
+        fflush(stdout);
+    }
+
+    gpt2_free(&model);
+}
diff --git a/show.gif b/show.gif
new file mode 100644
index 0000000..ff800f3
Binary files /dev/null and b/show.gif differ