From 5d124d0cb4ebf834aa136aade847092777078c35 Mon Sep 17 00:00:00 2001
From: xaedes
Date: Thu, 15 Jun 2023 20:34:56 +0200
Subject: [PATCH 001/100] fix track_max_mem in forward_batch_wo_cache_flash_attn_train

---
 .../train-text-from-scratch.cpp              | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp
index 54dc2beed0080..828a2a9b76bda 100644
--- a/examples/train-text-from-scratch/train-text-from-scratch.cpp
+++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp
@@ -1445,17 +1445,22 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train(
     const int n_ff = get_n_ff(&hparams);
     const int rope_mode = 0;
 
+    bool track_max_mem = true;
+
     int last_buf = -1;
     size_t buf_offs[2] = { 0, 0 };
     size_t buf_size[2] = { size_buf_0, size_buf_1 };
     void * buf_data[2] = { compute_buf_0, compute_buf_1 };
-    auto use_buf = [ctx0, &last_buf, &buf_offs, &buf_size, &buf_data] (int buf) {
+    size_t buf_maxs[2] = { 0, 0 };
+
+    auto use_buf = [ctx0, &last_buf, &buf_offs, &buf_size, &buf_data, &buf_maxs] (int buf) {
         size_t last_offs = 0;
         last_offs = ggml_set_scratch(ctx0, { 0, 0, nullptr, });
         if (last_buf >= 0) {
             buf_offs[last_buf] = last_offs;
+            buf_maxs[last_buf] = std::max(buf_maxs[last_buf], buf_offs[last_buf]);
         }
         if (buf >= 0) {
             size_t offs = buf_offs[buf];
@@ -1466,8 +1471,6 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train(
         last_buf = buf;
     };
 
-    bool track_max_mem = false;
-    size_t buf_maxs[2] = { 0, 0 };
 
     auto clr_buf = [ctx0, &last_buf, &buf_offs, &buf_size, &buf_data, &buf_maxs, track_max_mem] (int buf) {
         if (buf < 0) return;
@@ -1903,6 +1906,9 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train(
 
     *logits = t35;
 
+    clr_buf(0);
+    clr_buf(1);
+
     if (track_max_mem) {
         printf("%s: max size compute buf0: %zu\n", __func__, buf_maxs[0]);
         printf("%s: max size compute buf1: %zu\n", __func__, buf_maxs[1]);

From d39c8e686375b4e2dedbf98e2e11b12b1aef2526 Mon Sep 17 00:00:00 2001
From: xaedes
Date: Thu, 15 Jun 2023 21:07:56 +0200
Subject: [PATCH 002/100] remove unnecessary Adam(W) optimizer tensors.

Reduces optimizer memory overhead from 7*modelsize to 2*modelsize.

Additionally allows optimizing models with more than 2^31 parameters
by replacing int with int64_t.

Bumps the training checkpoint file version, but old checkpoints can
still be read; the new version with fewer tensors is saved.
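
For reference, the fused per-element AdamW update in ggml_opt_adam() (copied
from the ggml.c hunk of this patch, with explanatory comments added) shows why
only the first and second moments m[] and v[] need to persist across
iterations; the bias-corrected values mh and vh are now computed on the fly
per element instead of being stored in their own tensors:

    // np parameter tensors ps[0..np-1]; i is the flat element index into m[] and v[]
    int64_t i = 0;
    for (int p = 0; p < np; ++p) {
        const int64_t ne = ggml_nelements(ps[p]);
        for (int64_t j = 0; j < ne; ++j) {
            float x = ggml_get_f32_1d(ps[p], j);                    // parameter value
            float g = ggml_get_f32_1d(ps[p]->grad, j);              // gradient value
            m[i] = m[i]*beta1 + g*(1.0f - beta1);                   // first moment
            v[i] = v[i]*beta2 + g*g*(1.0f - beta2);                 // second moment
            float mh = m[i]*alpha/(1.0f - powf(beta1, opt->iter));  // bias-corrected, scaled by alpha
            float vh = v[i]*1.0f /(1.0f - powf(beta2, opt->iter));  // bias-corrected
            vh = sqrtf(vh) + eps;
            x = x*(1.0f - decay) - mh/vh;                           // decoupled weight decay + Adam step
            ggml_set_f32_1d(ps[p], j, x);
            ++i;
        }
    }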
--- .../train-text-from-scratch.cpp | 105 +++++++++++++++--- ggml.c | 96 ++++++++-------- ggml.h | 5 - 3 files changed, 136 insertions(+), 70 deletions(-) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index 828a2a9b76bda..60d2b57838e65 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -2406,8 +2406,27 @@ void read_tensor(struct llama_file * file, struct ggml_tensor * tensor) { file->read_raw(tensor->data, ggml_nbytes(tensor)); } +void skip_tensor(struct llama_file * file) { + int32_t nd = file->read_u32(); + + uint32_t name_len = file->read_u32(); + enum ggml_type type = (enum ggml_type) file->read_u32(); + + uint32_t ne[4] = { 1, 1, 1, 1 }; + + file->read_raw(ne, sizeof(ne[0]) * nd); + + std::string name = file->read_string(name_len); + + file->seek(-file->tell() & 31, SEEK_CUR); + + size_t nelements = ne[0]*ne[1]*ne[2]*ne[3]; + size_t nbytes = nelements*ggml_type_size(type)/ggml_blck_size(type); + file->seek(nbytes, SEEK_CUR); +} + void write_opt_context(struct llama_file * file, struct ggml_opt_context * opt) { - const uint32_t version = 0; + const uint32_t version = 1; GGML_ASSERT(opt->nx >= 0); GGML_ASSERT(opt->iter >= 0); file->write_u32(version); @@ -2418,14 +2437,10 @@ void write_opt_context(struct llama_file * file, struct ggml_opt_context * opt) switch (opt->params.type) { case GGML_OPT_ADAM: { - GGML_ASSERT(opt->adam.x != NULL); - write_tensor(file, opt->adam.x); - write_tensor(file, opt->adam.g1); - write_tensor(file, opt->adam.g2); + GGML_ASSERT(opt->adam.m != NULL); + GGML_ASSERT(opt->adam.v != NULL); write_tensor(file, opt->adam.m); write_tensor(file, opt->adam.v); - write_tensor(file, opt->adam.mh); - write_tensor(file, opt->adam.vh); write_tensor(file, opt->adam.pf); file->write_raw(&opt->adam.fx_best, sizeof(opt->adam.fx_best)); file->write_raw(&opt->adam.fx_prev, sizeof(opt->adam.fx_prev)); @@ -2433,7 +2448,7 @@ void write_opt_context(struct llama_file * file, struct ggml_opt_context * opt) } break; case GGML_OPT_LBFGS: { - GGML_ASSERT(opt->adam.x != NULL); + GGML_ASSERT(opt->lbfgs.x != NULL); write_tensor(file, opt->lbfgs.x); write_tensor(file, opt->lbfgs.xp); write_tensor(file, opt->lbfgs.g); @@ -2454,10 +2469,53 @@ void write_opt_context(struct llama_file * file, struct ggml_opt_context * opt) } } -void read_opt_context(struct llama_file * file, struct ggml_context * ctx, struct ggml_opt_context * opt) { - uint32_t version = file->read_u32(); - GGML_ASSERT(version == 0); +void read_opt_context_v0(struct llama_file * file, struct ggml_context * ctx, struct ggml_opt_context * opt) { + file->read_raw(&opt->params, sizeof(opt->params)); + file->read_raw(&opt->nx, sizeof(opt->nx)); + ggml_opt_init(ctx, opt, opt->params, opt->nx); + + file->read_raw(&opt->iter, sizeof(opt->iter)); + opt->just_initialized = (bool) file->read_u32(); + switch (opt->params.type) { + case GGML_OPT_ADAM: + { + skip_tensor(file); + skip_tensor(file); + skip_tensor(file); + read_tensor(file, opt->adam.m); + read_tensor(file, opt->adam.v); + skip_tensor(file); + skip_tensor(file); + if (opt->adam.pf) { read_tensor(file, opt->adam.pf); } + file->read_raw(&opt->adam.fx_best, sizeof(opt->adam.fx_best)); + file->read_raw(&opt->adam.fx_prev, sizeof(opt->adam.fx_prev)); + file->read_raw(&opt->adam.n_no_improvement, sizeof(opt->adam.n_no_improvement)); + } break; + case GGML_OPT_LBFGS: + { + GGML_ASSERT(opt->lbfgs.x != 
NULL); + read_tensor(file, opt->lbfgs.x); + read_tensor(file, opt->lbfgs.xp); + read_tensor(file, opt->lbfgs.g); + read_tensor(file, opt->lbfgs.gp); + read_tensor(file, opt->lbfgs.d); + if (opt->lbfgs.pf) { read_tensor(file, opt->lbfgs.pf); } + read_tensor(file, opt->lbfgs.lmal); + read_tensor(file, opt->lbfgs.lmys); + read_tensor(file, opt->lbfgs.lms); + read_tensor(file, opt->lbfgs.lmy); + file->read_raw(&opt->lbfgs.fx_best, sizeof(opt->lbfgs.fx_best)); + file->read_raw(&opt->lbfgs.step, sizeof(opt->lbfgs.step)); + file->read_raw(&opt->lbfgs.j, sizeof(opt->lbfgs.j)); + file->read_raw(&opt->lbfgs.k, sizeof(opt->lbfgs.k)); + file->read_raw(&opt->lbfgs.end, sizeof(opt->lbfgs.end)); + file->read_raw(&opt->lbfgs.n_no_improvement, sizeof(opt->lbfgs.n_no_improvement)); + } break; + } +} + +void read_opt_context_v1(struct llama_file * file, struct ggml_context * ctx, struct ggml_opt_context * opt) { file->read_raw(&opt->params, sizeof(opt->params)); file->read_raw(&opt->nx, sizeof(opt->nx)); ggml_opt_init(ctx, opt, opt->params, opt->nx); @@ -2468,13 +2526,8 @@ void read_opt_context(struct llama_file * file, struct ggml_context * ctx, struc switch (opt->params.type) { case GGML_OPT_ADAM: { - read_tensor(file, opt->adam.x); - read_tensor(file, opt->adam.g1); - read_tensor(file, opt->adam.g2); read_tensor(file, opt->adam.m); read_tensor(file, opt->adam.v); - read_tensor(file, opt->adam.mh); - read_tensor(file, opt->adam.vh); if (opt->adam.pf) { read_tensor(file, opt->adam.pf); } file->read_raw(&opt->adam.fx_best, sizeof(opt->adam.fx_best)); file->read_raw(&opt->adam.fx_prev, sizeof(opt->adam.fx_prev)); @@ -2482,7 +2535,7 @@ void read_opt_context(struct llama_file * file, struct ggml_context * ctx, struc } break; case GGML_OPT_LBFGS: { - GGML_ASSERT(opt->adam.x != NULL); + GGML_ASSERT(opt->lbfgs.x != NULL); read_tensor(file, opt->lbfgs.x); read_tensor(file, opt->lbfgs.xp); read_tensor(file, opt->lbfgs.g); @@ -2503,6 +2556,24 @@ void read_opt_context(struct llama_file * file, struct ggml_context * ctx, struc } } +void read_opt_context(struct llama_file * file, struct ggml_context * ctx, struct ggml_opt_context * opt) { + uint32_t version = file->read_u32(); + switch (version) { + case 0: + { + read_opt_context_v0(file, ctx, opt); + } break; + case 1: + { + read_opt_context_v1(file, ctx, opt); + } break; + default: + { + fprintf(stderr, "%s: unknown version %ud\n", __func__, version); + } + } +} + void save_checkpoint(struct my_llama_model * model, struct ggml_opt_context * opt, const char * filename) { struct llama_file file(filename, "wb"); if (file.fp == NULL) { diff --git a/ggml.c b/ggml.c index b77f9926754ed..143f88d4a657c 100644 --- a/ggml.c +++ b/ggml.c @@ -17329,7 +17329,7 @@ static enum ggml_opt_result ggml_opt_adam( struct ggml_tensor * ps[GGML_MAX_PARAMS]; int np = 0; - int nx = 0; + int64_t nx = 0; for (int i = 0; i < gf->n_nodes; ++i) { if (gf->nodes[i]->is_param) { GGML_PRINT_DEBUG("found param %d: grad->op = %d\n", np, gf->nodes[i]->grad->op); @@ -17355,19 +17355,11 @@ static enum ggml_opt_result ggml_opt_adam( const float beta2 = params.adam.beta2; const float eps = params.adam.eps; - float * x = opt->adam.x->data; // view of the parameters - float * g1 = opt->adam.g1->data; // gradient - float * g2 = opt->adam.g2->data; // gradient squared float * m = opt->adam.m->data; // first moment float * v = opt->adam.v->data; // second moment - float * mh = opt->adam.mh->data; // first moment hat - float * vh = opt->adam.vh->data; // second moment hat float * pf = params.past > 0 ? 
opt->adam.pf->data : NULL; // past function values - // update view - ggml_opt_get_params(np, ps, x); - // compute the function value ggml_graph_reset (gf); ggml_set_f32 (f->grad, 1.0f); @@ -17412,43 +17404,61 @@ static enum ggml_opt_result ggml_opt_adam( UNUSED(t_start_cpu); { - // update the gradient - ggml_opt_get_grad(np, ps, g1); + int64_t i = 0; + for (int p = 0; p < np; ++p) { + const int64_t ne = ggml_nelements(ps[p]) ; + for (int64_t j = 0; j < ne; ++j) { + float x = ggml_get_f32_1d(ps[p], j); + float g = ggml_get_f32_1d(ps[p]->grad, j); + m[i] = m[i]*beta1 + g*(1.0f - beta1); + v[i] = v[i]*beta2 + g*g*(1.0f - beta2); + float mh = m[i]*alpha/(1.0f - powf(beta1, opt->iter)); + float vh = v[i]*1.0f /(1.0f - powf(beta2, opt->iter)); + vh = sqrtf(vh) + eps; + x = x*(1.0f - decay) - mh/vh; + ggml_set_f32_1d(ps[p], j, x); + ++i; + } + } + } + // { + // // update the gradient + // ggml_opt_get_grad(np, ps, g1); - // m_t = beta1*m_t-1 + (1 - beta1)*g_t - ggml_vec_scale_f32(nx, m, beta1); - ggml_vec_mad_f32 (nx, m, g1, 1.0f - beta1); + // // m_t = beta1*m_t-1 + (1 - beta1)*g_t + // ggml_vec_scale_f32(nx, m, beta1); + // ggml_vec_mad_f32 (nx, m, g1, 1.0f - beta1); - // g2 = g1^2 - ggml_vec_sqr_f32 (nx, g2, g1); + // // g2 = g1^2 + // ggml_vec_sqr_f32 (nx, g2, g1); - // v_t = beta2*v_t-1 + (1 - beta2)*g_t^2 - ggml_vec_scale_f32(nx, v, beta2); - ggml_vec_mad_f32 (nx, v, g2, 1.0f - beta2); + // // v_t = beta2*v_t-1 + (1 - beta2)*g_t^2 + // ggml_vec_scale_f32(nx, v, beta2); + // ggml_vec_mad_f32 (nx, v, g2, 1.0f - beta2); - // m^hat = m_t / (1 - beta1^t) - // v^hat = v_t / (1 - beta2^t) - // x_t = x_t-1 - sched*(alpha*m^hat/(sqrt(v^hat) + eps) + decay*x_t-1) - // x_t = x_t-1 - sched*alpha*m^hat/(sqrt(v^hat) + eps) - sched*decay*x_t-1 - // x_t = x_t-1*(1-sched*decay) - sched*alpha*m^hat/(sqrt(v^hat) + eps) - // x_t = x_t-1*(1-sched*decay) + sched*decay*(-alpha/decay)*m^hat/(sqrt(v^hat) + eps) - // x_t = mix(x_t-1, (-alpha/decay)*m^hat/(sqrt(v^hat) + eps), sched*decay) - ggml_vec_cpy_f32 (nx, mh, m); - ggml_vec_cpy_f32 (nx, vh, v); + // // m^hat = m_t / (1 - beta1^t) + // // v^hat = v_t / (1 - beta2^t) + // // x_t = x_t-1 - sched*(alpha*m^hat/(sqrt(v^hat) + eps) + decay*x_t-1) + // // x_t = x_t-1 - sched*alpha*m^hat/(sqrt(v^hat) + eps) - sched*decay*x_t-1 + // // x_t = x_t-1*(1-sched*decay) - sched*alpha*m^hat/(sqrt(v^hat) + eps) + // // x_t = x_t-1*(1-sched*decay) + sched*decay*(-alpha/decay)*m^hat/(sqrt(v^hat) + eps) + // // x_t = mix(x_t-1, (-alpha/decay)*m^hat/(sqrt(v^hat) + eps), sched*decay) + // ggml_vec_cpy_f32 (nx, mh, m); + // ggml_vec_cpy_f32 (nx, vh, v); - ggml_vec_scale_f32(nx, mh, alpha/(1.0f - powf(beta1, opt->iter))); - ggml_vec_scale_f32(nx, vh, 1.0f/(1.0f - powf(beta2, opt->iter))); + // ggml_vec_scale_f32(nx, mh, alpha/(1.0f - powf(beta1, opt->iter))); + // ggml_vec_scale_f32(nx, vh, 1.0f/(1.0f - powf(beta2, opt->iter))); - ggml_vec_sqrt_f32 (nx, vh, vh); - ggml_vec_acc1_f32 (nx, vh, eps); + // ggml_vec_sqrt_f32 (nx, vh, vh); + // ggml_vec_acc1_f32 (nx, vh, eps); - ggml_vec_div_f32 (nx, mh, mh, vh); - ggml_vec_scale_f32(nx, x, 1.0f - decay); - ggml_vec_sub_f32 (nx, x, x, mh); + // ggml_vec_div_f32 (nx, mh, mh, vh); + // ggml_vec_scale_f32(nx, x, 1.0f - decay); + // ggml_vec_sub_f32 (nx, x, x, mh); - // update the parameters - ggml_opt_set_params(np, ps, x); - } + // // update the parameters + // ggml_opt_set_params(np, ps, x); + // } ggml_graph_reset (gf); ggml_set_f32 (f->grad, 1.0f); @@ -17941,23 +17951,13 @@ GGML_API void ggml_opt_init( switch (opt->params.type) { case 
GGML_OPT_ADAM: { - opt->adam.x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx); - opt->adam.g1 = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx); - opt->adam.g2 = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx); opt->adam.m = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx); opt->adam.v = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx); - opt->adam.mh = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx); - opt->adam.vh = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx); opt->adam.pf = params.past > 0 ? ggml_new_tensor_1d(ctx, GGML_TYPE_F32, params.past) : NULL; - ggml_set_zero(opt->adam.x); - ggml_set_zero(opt->adam.g1); - ggml_set_zero(opt->adam.g2); ggml_set_zero(opt->adam.m); ggml_set_zero(opt->adam.v); - ggml_set_zero(opt->adam.mh); - ggml_set_zero(opt->adam.vh); if (opt->adam.pf) { ggml_set_zero(opt->adam.pf); } diff --git a/ggml.h b/ggml.h index 9919cce7c263f..531b6cb07d81d 100644 --- a/ggml.h +++ b/ggml.h @@ -1537,13 +1537,8 @@ extern "C" { bool just_initialized; struct { - struct ggml_tensor * x; // view of the parameters - struct ggml_tensor * g1; // gradient - struct ggml_tensor * g2; // gradient squared struct ggml_tensor * m; // first moment struct ggml_tensor * v; // second moment - struct ggml_tensor * mh; // first moment hat - struct ggml_tensor * vh; // second moment hat struct ggml_tensor * pf; // past function values float fx_best; float fx_prev; From d395b19c8c400bc2f9197b95bdbae5122010370f Mon Sep 17 00:00:00 2001 From: xaedes Date: Thu, 15 Jun 2023 23:48:46 +0200 Subject: [PATCH 003/100] add gradient clipping to AdamW --- .../train-text-from-scratch.cpp | 78 +++++++++++++++++-- ggml.c | 28 ++++++- ggml.h | 1 + 3 files changed, 98 insertions(+), 9 deletions(-) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index 60d2b57838e65..a4a6b05b184b0 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -2430,7 +2430,8 @@ void write_opt_context(struct llama_file * file, struct ggml_opt_context * opt) GGML_ASSERT(opt->nx >= 0); GGML_ASSERT(opt->iter >= 0); file->write_u32(version); - file->write_raw(&opt->params, sizeof(opt->params)); + file->write_u32(opt->params.past); + file->write_u32(opt->params.lbfgs.m); file->write_raw(&opt->nx, sizeof(opt->nx)); file->write_raw(&opt->iter, sizeof(opt->iter)); file->write_u32((uint32_t) opt->just_initialized); @@ -2469,9 +2470,44 @@ void write_opt_context(struct llama_file * file, struct ggml_opt_context * opt) } } +struct ggml_opt_params_v0 { + enum ggml_opt_type type; + int n_threads; + int past; + float delta; + int max_no_improvement; + bool print_forward_graph; + bool print_backward_graph; + struct { + int n_iter; + float sched; + float decay; + float alpha; + float beta1; + float beta2; + float eps; + float eps_f; + float eps_g; + } adam; + struct { + int m; + int n_iter; + int max_linesearch; + float eps; + float ftol; + float wolfe; + float min_step; + float max_step; + enum ggml_linesearch linesearch; + } lbfgs; +}; + void read_opt_context_v0(struct llama_file * file, struct ggml_context * ctx, struct ggml_opt_context * opt) { - file->read_raw(&opt->params, sizeof(opt->params)); - file->read_raw(&opt->nx, sizeof(opt->nx)); + ggml_opt_params_v0 pv0; + file->read_raw(&pv0, sizeof(pv0)); + opt->params.past = pv0.past; + opt->params.lbfgs.m = pv0.lbfgs.m; + file->read_raw(&opt->nx, sizeof(opt->nx)); ggml_opt_init(ctx, opt, opt->params, opt->nx); file->read_raw(&opt->iter, sizeof(opt->iter)); @@ 
-2516,7 +2552,8 @@ void read_opt_context_v0(struct llama_file * file, struct ggml_context * ctx, st } void read_opt_context_v1(struct llama_file * file, struct ggml_context * ctx, struct ggml_opt_context * opt) { - file->read_raw(&opt->params, sizeof(opt->params)); + opt->params.past = (int) file->read_u32(); + opt->params.lbfgs.m = (int) file->read_u32(); file->read_raw(&opt->nx, sizeof(opt->nx)); ggml_opt_init(ctx, opt, opt->params, opt->nx); @@ -2558,6 +2595,7 @@ void read_opt_context_v1(struct llama_file * file, struct ggml_context * ctx, st void read_opt_context(struct llama_file * file, struct ggml_context * ctx, struct ggml_opt_context * opt) { uint32_t version = file->read_u32(); + printf("%s: opt context version %u\n", __func__, version); switch (version) { case 0: { @@ -2569,7 +2607,7 @@ void read_opt_context(struct llama_file * file, struct ggml_context * ctx, struc } break; default: { - fprintf(stderr, "%s: unknown version %ud\n", __func__, version); + fprintf(stderr, "%s: unknown version %u\n", __func__, version); } } } @@ -2783,6 +2821,9 @@ struct train_params { int adam_n_iter; float adam_alpha; float adam_decay; + float adam_beta1; + float adam_beta2; + float adam_gclip; int mem_model_gb; int mem_compute_gb; @@ -2830,6 +2871,9 @@ struct train_params get_default_train_params() { params.adam_n_iter = 16; params.adam_alpha = 1e-3f; params.adam_decay = 1e-3f; + params.adam_beta1 = 0.9f; + params.adam_beta2 = 0.999f; + params.adam_gclip = 1.0f; params.mem_model_gb = 2; params.mem_compute_gb = 24; @@ -2877,6 +2921,9 @@ void train_print_usage(int /*argc*/, char ** argv, const struct train_params * p fprintf(stderr, " --adam-iter N Maximum number of Adam optimization iterations for each batch (default %d)\n", params->adam_n_iter); fprintf(stderr, " --adam-alpha N Adam learning rate alpha (default %f)\n", params->adam_alpha); fprintf(stderr, " --adam-decay N AdamW weight decay. Values greater zero enable AdamW instead of regular Adam. (default %f)\n", params->adam_decay); + fprintf(stderr, " --adam-beta1 N AdamW beta1 in interval [0,1). How much to smooth the first moment of gradients. (default %f)\n", params->adam_beta1); + fprintf(stderr, " --adam-beta2 N AdamW beta2 in interval [0,1). How much to smooth the second moment of gradients. (default %f)\n", params->adam_beta2); + fprintf(stderr, " --adam-gclip N AdamW gradient clipping. Disabled when zero. (default %f)\n", params->adam_gclip); fprintf(stderr, " --mem-model N Memory to allocate for model and cache in gigabytes. (default %d)\n", params->mem_model_gb); fprintf(stderr, " --mem-compute N Memory to allocate for compute in gigabytes. (default %d)\n", params->mem_compute_gb); fprintf(stderr, " --mem-compute0 N Memory to allocate for compute in gigabytes. 
(default %d)\n", params->mem_compute0_gb); @@ -3066,6 +3113,24 @@ bool train_params_parse(int argc, char ** argv, struct train_params * params) { break; } params->adam_decay = std::stof(argv[i]); + } else if (arg == "--adam-beta1") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->adam_beta1 = std::stof(argv[i]); + } else if (arg == "--adam-beta2") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->adam_beta2 = std::stof(argv[i]); + } else if (arg == "--adam-gclip") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->adam_gclip = std::stof(argv[i]); } else if (arg == "--mem-model") { if (++i >= argc) { invalid_param = true; @@ -3212,6 +3277,9 @@ int main(int argc, char ** argv) { opt_params_adam.adam.sched = 1.0f; opt_params_adam.adam.alpha = params.adam_alpha; opt_params_adam.adam.decay = params.adam_decay; + opt_params_adam.adam.beta1 = params.adam_beta1; + opt_params_adam.adam.beta2 = params.adam_beta2; + opt_params_adam.adam.gclip = params.adam_gclip; opt_params_lbfgs.print_forward_graph = false; opt_params_lbfgs.print_backward_graph = false; diff --git a/ggml.c b/ggml.c index 143f88d4a657c..19a194beb2542 100644 --- a/ggml.c +++ b/ggml.c @@ -17354,6 +17354,7 @@ static enum ggml_opt_result ggml_opt_adam( const float beta1 = params.adam.beta1; const float beta2 = params.adam.beta2; const float eps = params.adam.eps; + const float gclip = params.adam.gclip; float * m = opt->adam.m->data; // first moment float * v = opt->adam.v->data; // second moment @@ -17404,16 +17405,34 @@ static enum ggml_opt_result ggml_opt_adam( UNUSED(t_start_cpu); { + float gnorm = 1.0f; + if (gclip > 0.0f) { + // gradient clipping + ggml_float sum = 0.0; + for (int p = 0; p < np; ++p) { + const int64_t ne = ggml_nelements(ps[p]); + for (int64_t j = 0; j < ne; ++j) { + float g = ggml_get_f32_1d(ps[p]->grad, j); + sum += g*g; + } + } + ggml_float norm = sqrt(sum); + if (norm > (ggml_float) gclip) { + gnorm = (float) ((ggml_float) gclip / norm); + } + } + const float beta1h = alpha/(1.0f - powf(beta1, opt->iter)); + const float beta2h = 1.0f/(1.0f - powf(beta2, opt->iter)); int64_t i = 0; for (int p = 0; p < np; ++p) { - const int64_t ne = ggml_nelements(ps[p]) ; + const int64_t ne = ggml_nelements(ps[p]); for (int64_t j = 0; j < ne; ++j) { float x = ggml_get_f32_1d(ps[p], j); - float g = ggml_get_f32_1d(ps[p]->grad, j); + float g = ggml_get_f32_1d(ps[p]->grad, j)*gnorm; m[i] = m[i]*beta1 + g*(1.0f - beta1); v[i] = v[i]*beta2 + g*g*(1.0f - beta2); - float mh = m[i]*alpha/(1.0f - powf(beta1, opt->iter)); - float vh = v[i]*1.0f /(1.0f - powf(beta2, opt->iter)); + float mh = m[i]*beta1h; + float vh = v[i]*beta2h; vh = sqrtf(vh) + eps; x = x*(1.0f - decay) - mh/vh; ggml_set_f32_1d(ps[p], j, x); @@ -17902,6 +17921,7 @@ struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type) { .eps = 1e-8f, .eps_f = 1e-5f, .eps_g = 1e-3f, + .gclip = 0.0f, }, }; } break; diff --git a/ggml.h b/ggml.h index 531b6cb07d81d..460976468a056 100644 --- a/ggml.h +++ b/ggml.h @@ -1509,6 +1509,7 @@ extern "C" { float eps; // epsilon for numerical stability float eps_f; // epsilon for convergence test float eps_g; // epsilon for convergence test + float gclip; // gradient clipping } adam; // LBFGS parameters From d7003a98cceda5fe5926baf3dcad666a311dbe40 Mon Sep 17 00:00:00 2001 From: xaedes Date: Sat, 17 Jun 2023 18:56:27 +0200 Subject: [PATCH 004/100] Fix reset of unused g->nodes and g->grads to NULL --- examples/train-text-from-scratch/train-text-from-scratch.cpp | 4 ++-- 1 
file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index a4a6b05b184b0..c76a80c757ba0 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -1403,8 +1403,8 @@ void graph_set_leafs_grads(struct ggml_cgraph * g) { } } for (int i=n_nodes; i < g->n_nodes; ++i) { - g->nodes[n_nodes] = NULL; - g->grads[n_nodes] = NULL; + g->nodes[i] = NULL; + g->grads[i] = NULL; } g->n_nodes = n_nodes; } From 6e3f95bf06ea102ac71e7b5bae5ddfaae7c89bc4 Mon Sep 17 00:00:00 2001 From: xaedes Date: Fri, 28 Jul 2023 23:06:05 +0200 Subject: [PATCH 005/100] implement gradient checkpointing for training reduces memory overhead from O(n_layer) to O(sqrt(n_layer)) as explained in readme of https://github.com/cybertronai/gradient-checkpointing --- .../train-text-from-scratch.cpp | 607 +++++++++++++++++- 1 file changed, 597 insertions(+), 10 deletions(-) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index c76a80c757ba0..faa60ec8bf320 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -1921,6 +1921,556 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train( return t36; } +struct ggml_tensor * forward_batch_wo_cache_flash_attn_train_grad_checkpointing( + struct my_llama_model * model, + struct ggml_context * ctx0, + struct ggml_cgraph * gf, + struct ggml_cgraph * gb, + struct ggml_tensor * * logits, + struct ggml_tensor * tokens_input, + struct ggml_tensor * targets, + void * compute_buf_0, + void * compute_buf_1, + void * compute_buf_2, + void * compute_buf_3, + size_t size_buf_0, + size_t size_buf_1, + size_t size_buf_2, + size_t size_buf_3, + const int n_tokens, + const int n_batch) { + + // implements gradient-checkpointing as explained in readme of https://github.com/cybertronai/gradient-checkpointing + + ggml_set_scratch(ctx0, { 0, 0, nullptr, }); + + const int n_past = 0; + const int N = n_tokens; + + gf->n_nodes = 0; + gf->n_leafs = 0; + gf->perf_runs = 0; + gf->perf_cycles = 0; + gf->perf_time_us = 0; + + const auto & hparams = model->hparams; + //const int n_ctx = hparams.n_ctx; + const int n_vocab = hparams.n_vocab; + const int n_embd = hparams.n_embd; + const int n_layer = hparams.n_layer; + const int n_head = hparams.n_head; + const int n_rot = hparams.n_rot; + const int n_ff = get_n_ff(&hparams); + const int rope_mode = 0; + + bool track_max_mem = true; + + int last_buf = -1; + size_t buf_offs[4] = { 0, 0, 0, 0 }; + size_t buf_size[4] = { size_buf_0, + size_buf_1, + size_buf_2, + size_buf_3 }; + void * buf_data[4] = { compute_buf_0, + compute_buf_1, + compute_buf_2, + compute_buf_3 }; + size_t buf_maxs[4] = { 0, 0, 0, 0 }; + + auto use_buf = [ctx0, &last_buf, &buf_offs, &buf_size, &buf_data, &buf_maxs] (int buf) { + size_t last_offs = 0; + last_offs = ggml_set_scratch(ctx0, { 0, 0, nullptr, }); + if (last_buf >= 0) { + buf_offs[last_buf] = last_offs; + buf_maxs[last_buf] = std::max(buf_maxs[last_buf], buf_offs[last_buf]); + } + if (buf >= 0) { + size_t offs = buf_offs[buf]; + size_t size = buf_size[buf]; + void * data = buf_data[buf]; + ggml_set_scratch(ctx0, { offs, size, data, }); + } + last_buf = buf; + }; + + + auto clr_buf = [ctx0, &last_buf, &buf_offs, &buf_size, &buf_data, &buf_maxs, track_max_mem] 
(int buf) { + if (buf < 0) return; + if (track_max_mem) { + size_t last_offs = 0; + last_offs = ggml_set_scratch(ctx0, { 0, 0, nullptr, }); + if (last_buf >= 0) { + buf_offs[last_buf] = last_offs; + buf_maxs[last_buf] = std::max(buf_maxs[last_buf], buf_offs[last_buf]); + } + } + buf_offs[buf] = 0; + if (track_max_mem && last_buf >= 0) { + size_t offs = buf_offs[last_buf]; + size_t size = buf_size[last_buf]; + void * data = buf_data[last_buf]; + ggml_set_scratch(ctx0, { offs, size, data, }); + } + }; + + + auto view__q = [ctx0, n_embd, n_head, N, n_batch] (struct ggml_tensor * t) -> struct ggml_tensor * { + int64_t ne0 = n_embd/n_head; + int64_t ne1 = N; + int64_t ne2 = n_head; + int64_t ne3 = n_batch; + size_t nb0 = ggml_element_size(t); + size_t nb1 = nb0*ne0; + size_t nb2 = nb1*ne1; + size_t nb3 = nb2*ne2; + size_t offset = 0; + return ggml_view_4d(ctx0, t, ne0, ne1, ne2, ne3, nb1, nb2, nb3, offset); + }; + + auto view__k = [ctx0, n_embd, n_head, N, n_batch] (struct ggml_tensor * t) -> struct ggml_tensor * { + int64_t ne0 = n_embd/n_head; + int64_t ne1 = N; + int64_t ne2 = n_head; + int64_t ne3 = n_batch; + size_t nb0 = ggml_element_size(t); + size_t nb1 = nb0*ne0; + size_t nb2 = nb1*ne1; + size_t nb3 = nb2*ne2; + size_t offset = nb3*ne3; + return ggml_view_4d(ctx0, t, ne0, ne1, ne2, ne3, nb1, nb2, nb3, offset); + }; + + auto view__v = [ctx0, n_embd, n_head, N, n_batch] (struct ggml_tensor * t) -> struct ggml_tensor * { + int64_t ne0 = N; + int64_t ne1 = n_embd/n_head; + int64_t ne2 = n_head; + int64_t ne3 = n_batch; + size_t nb0 = ggml_element_size(t); + size_t nb1 = nb0*ne0; + size_t nb2 = nb1*ne1; + size_t nb3 = nb2*ne2; + size_t offset = 2*nb3*ne3; + return ggml_view_4d(ctx0, t, ne0, ne1, ne2, ne3, nb1, nb2, nb3, offset); + }; + + auto add_or_set = [ctx0] (struct ggml_tensor * a, struct ggml_tensor * b) -> struct ggml_tensor * { + if (a == NULL) { + return b; + } else { + return ggml_add_inplace(ctx0, a, b); + } + }; + + use_buf(-1); + + model->tok_embeddings->grad = NULL; + model->norm->grad = NULL; + model->output->grad = NULL; + + for (int il = 0; il < n_layer; ++il) { + struct my_llama_layer & layer = model->layers[il]; + layer.attention_norm->grad = NULL; + layer.wq->grad = NULL; + layer.wk->grad = NULL; + layer.wv->grad = NULL; + layer.wo->grad = NULL; + layer.ffn_norm->grad = NULL; + layer.w1->grad = NULL; + layer.w2->grad = NULL; + layer.w3->grad = NULL; + } + + clr_buf(0); + clr_buf(1); + clr_buf(2); + clr_buf(3); + + use_buf(-1); + + struct ggml_tensor * t00 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N*n_batch); assert_shape_1d(t00, N*n_batch); + memcpy(t00->data, tokens_input->data, ggml_element_size(t00)*N*n_batch); + + use_buf(-1); + + struct ggml_tensor * t01 = expand(gf, ggml_get_rows(ctx0, model->tok_embeddings, t00)); assert_shape_2d(t01, n_embd, N*n_batch); + + + std::vector checkpoints; + // for (int il = 0; il < n_layer; ++il) { + // checkpoints.push_back(il); + // } + // n_check: number of layers between checkpoints + int n_check = (int)(sqrtf(n_layer) + 0.5f); + printf("%s: n_check = %d\n", __func__, n_check); + for (int chk = n_check-1; chk+1 < n_layer; chk += n_check) { + checkpoints.push_back(chk); + } + + for (int i = 0; i < checkpoints.size(); ++i) { + printf("%s: checkpoint #%d = %d\n", __func__, i, checkpoints[i]); + } + + // example for 16 layers: + // inp ~ implicit zeroth checkpoint == input + // L00 f 4b + // L01 f 4b + // L02 f 4b + // L03 fc4b first checkpoint + // L04 f 3b + // L05 f 3b + // L06 f 3b + // L07 fc3b second checkpoint + // L08 f 2b + 
// L09 f 2b + // L10 f 2b + // L11 fc2b third checkpoint + // L12 f 1b + // L13 f 1b + // L14 f 1b + // L15 f 1b + + // need to remember these for the backward pass + std::vector t02L; t02L.resize(n_layer, NULL); + std::vector t03L; t03L.resize(n_layer, NULL); + std::vector t04L; t04L.resize(n_layer, NULL); + std::vector t05L; t05L.resize(n_layer, NULL); + std::vector t06L; t06L.resize(n_layer, NULL); + std::vector t07L; t07L.resize(n_layer, NULL); + std::vector t08L; t08L.resize(n_layer, NULL); + std::vector t09L; t09L.resize(n_layer, NULL); + std::vector t10L; t10L.resize(n_layer, NULL); + std::vector t11L; t11L.resize(n_layer, NULL); + std::vector t12L; t12L.resize(n_layer, NULL); + std::vector t13L; t13L.resize(n_layer, NULL); + std::vector t14L; t14L.resize(n_layer, NULL); + std::vector t15L; t15L.resize(n_layer, NULL); + std::vector t16L; t16L.resize(n_layer, NULL); + std::vector t17L; t17L.resize(n_layer, NULL); + std::vector t18L; t18L.resize(n_layer, NULL); + std::vector t19L; t19L.resize(n_layer, NULL); + std::vector t20L; t20L.resize(n_layer, NULL); + std::vector t21L; t21L.resize(n_layer, NULL); + std::vector t22L; t22L.resize(n_layer, NULL); + std::vector t23L; t23L.resize(n_layer, NULL); + std::vector t24L; t24L.resize(n_layer, NULL); + std::vector t25L; t25L.resize(n_layer, NULL); + std::vector t26L; t26L.resize(n_layer, NULL); + std::vector t27L; t27L.resize(n_layer, NULL); + std::vector t28L; t28L.resize(n_layer, NULL); + std::vector t29L; t29L.resize(n_layer, NULL); + std::vector t30L; t30L.resize(n_layer, NULL); + + struct ggml_tensor * cur = t01; + + + int chk_idx = 0; + for (int il = 0; il < n_layer; ++il) { + struct my_llama_layer & layer = model->layers[il]; + // tensors with values necessary for backward pass are in persistent buf(-1) + // other tensors with buf(0), buf(1), etc are only temporary needed, and their memory reused + bool is_checkpoint = (chk_idx < checkpoints.size() && il == checkpoints[chk_idx]); + if (is_checkpoint) { + printf("%s: layer %d is_checkpoint\n", __func__, il); + chk_idx += 1; + } + const int prs = 0; // in first forward pass even persistent tensors are only temporary + const int tmp = 0; // temporary + // nxt is required to compute next layer. + // for checkpoints we need to remember this for usage in backward pass, + // otherwise temporary until next of this kind + const int nxt = is_checkpoint ? 
-1 : 1; + clr_buf(0); + use_buf(prs); struct ggml_tensor * t02 = expand(gf, ggml_rms_norm (ctx0, cur, rms_norm_eps)); assert_shape_2d(t02, n_embd, N*n_batch); + use_buf(tmp); struct ggml_tensor * t03 = expand(gf, ggml_repeat (ctx0, layer.attention_norm, t02)); assert_shape_2d(t03, n_embd, N*n_batch); + use_buf(prs); struct ggml_tensor * t04 = expand(gf, ggml_mul (ctx0, t02, t03)); assert_shape_2d(t04, n_embd, N*n_batch); + use_buf(prs); struct ggml_tensor * t05 = expand(gf, ggml_mul_mat (ctx0, layer.wq, t04)); assert_shape_2d(t05, n_embd, N*n_batch); + use_buf(prs); struct ggml_tensor * t06 = expand(gf, ggml_reshape_4d (ctx0, t05, n_embd/n_head, n_head, N, n_batch)); assert_shape_4d(t06, n_embd/n_head, n_head, N, n_batch); + use_buf(prs); struct ggml_tensor * t07 = expand(gf, ggml_rope_inplace (ctx0, t06, n_past, n_rot, rope_mode)); assert_shape_4d(t07, n_embd/n_head, n_head, N, n_batch); + use_buf(prs); struct ggml_tensor * t08 = expand(gf, ggml_mul_mat (ctx0, layer.wk, t04)); assert_shape_2d(t08, n_embd, N*n_batch); + use_buf(prs); struct ggml_tensor * t09 = expand(gf, ggml_reshape_4d (ctx0, t08, n_embd/n_head, n_head, N, n_batch)); assert_shape_4d(t09, n_embd/n_head, n_head, N, n_batch); + use_buf(prs); struct ggml_tensor * t10 = expand(gf, ggml_rope_inplace (ctx0, t09, n_past, n_rot, rope_mode)); assert_shape_4d(t10, n_embd/n_head, n_head, N, n_batch); + use_buf(prs); struct ggml_tensor * t11 = expand(gf, ggml_mul_mat (ctx0, t04, layer.wv)); assert_shape_2d(t11, N*n_batch, n_embd); + use_buf(prs); struct ggml_tensor * t12 = expand(gf, ggml_reshape_4d (ctx0, t11, N, n_batch, n_embd/n_head, n_head)); assert_shape_4d(t12, N, n_batch, n_embd/n_head, n_head); + use_buf(prs); struct ggml_tensor * t13 = expand(gf, ggml_permute (ctx0, t07, 0, 2, 1, 3)); assert_shape_4d(t13, n_embd/n_head, N, n_head, n_batch); + use_buf(prs); struct ggml_tensor * t14 = expand(gf, ggml_permute (ctx0, t10, 0, 2, 1, 3)); assert_shape_4d(t14, n_embd/n_head, N, n_head, n_batch); + use_buf(prs); struct ggml_tensor * t15 = expand(gf, ggml_permute (ctx0, t12, 0, 3, 1, 2)); assert_shape_4d(t15, N, n_embd/n_head, n_head, n_batch); + use_buf(prs); struct ggml_tensor * t16 = expand(gf, ggml_flash_attn (ctx0, t13, t14, t15, true)); assert_shape_4d(t16, n_embd/n_head, N, n_head, n_batch); + use_buf(tmp); struct ggml_tensor * t17 = expand(gf, ggml_permute (ctx0, t16, 0, 2, 1, 3)); assert_shape_4d(t17, n_embd/n_head, n_head, N, n_batch); + use_buf(prs); struct ggml_tensor * t18 = expand(gf, ggml_cont (ctx0, t17)); assert_shape_4d(t18, n_embd/n_head, n_head, N, n_batch); + use_buf(prs); struct ggml_tensor * t19 = expand(gf, ggml_reshape_2d (ctx0, t18, n_embd, N*n_batch)); assert_shape_2d(t19, n_embd, N*n_batch); + use_buf(tmp); struct ggml_tensor * t20 = expand(gf, ggml_mul_mat (ctx0, layer.wo, t19)); assert_shape_2d(t20, n_embd, N*n_batch); + use_buf(prs); struct ggml_tensor * t21 = expand(gf, ggml_add (ctx0, t20, cur)); assert_shape_2d(t21, n_embd, N*n_batch); + use_buf(prs); struct ggml_tensor * t22 = expand(gf, ggml_rms_norm (ctx0, t21, rms_norm_eps)); assert_shape_2d(t22, n_embd, N*n_batch); + use_buf(tmp); struct ggml_tensor * t23 = expand(gf, ggml_repeat (ctx0, layer.ffn_norm, t22)); assert_shape_2d(t23, n_embd, N*n_batch); + use_buf(prs); struct ggml_tensor * t24 = expand(gf, ggml_mul (ctx0, t23, t22)); assert_shape_2d(t24, n_embd, N*n_batch); + use_buf(prs); struct ggml_tensor * t25 = expand(gf, ggml_mul_mat (ctx0, layer.w3, t24)); assert_shape_2d(t25, n_ff, N*n_batch); + use_buf(prs); struct ggml_tensor * t26 = 
expand(gf, ggml_mul_mat (ctx0, layer.w1, t24)); assert_shape_2d(t26, n_ff, N*n_batch); + use_buf(prs); struct ggml_tensor * t27 = expand(gf, ggml_silu (ctx0, t26)); assert_shape_2d(t27, n_ff, N*n_batch); + use_buf(prs); struct ggml_tensor * t28 = expand(gf, ggml_mul (ctx0, t27, t25)); assert_shape_2d(t28, n_ff, N*n_batch); + use_buf(tmp); struct ggml_tensor * t29 = expand(gf, ggml_mul_mat (ctx0, layer.w2, t28)); assert_shape_2d(t29, n_embd, N*n_batch); + clr_buf( 1); + use_buf(nxt); struct ggml_tensor * t30 = expand(gf, ggml_add (ctx0, t21, t29)); assert_shape_2d(t30, n_embd, N*n_batch); + + // only t30L is remembered for checkpointing in first forward pass + if (is_checkpoint) { + t30L[il] = t30; + } + cur = t30; + } + clr_buf(0); + use_buf(0); + struct ggml_tensor * t31 = expand(gf, ggml_rms_norm (ctx0, cur, rms_norm_eps)); assert_shape_2d(t31, n_embd, N*n_batch); + struct ggml_tensor * t32 = expand(gf, ggml_repeat (ctx0, model->norm, t31)); assert_shape_2d(t32, n_embd, N*n_batch); + struct ggml_tensor * t33 = expand(gf, ggml_mul (ctx0, t32, t31)); assert_shape_2d(t33, n_embd, N*n_batch); + use_buf(-1); + struct ggml_tensor * t34 = expand(gf, ggml_mul_mat (ctx0, model->output, t33)); assert_shape_2d(t34, n_vocab, N*n_batch); + struct ggml_tensor * t35 = expand(gf, ggml_reshape_3d(ctx0, t34, n_vocab, N, n_batch)); assert_shape_3d(t35, n_vocab, N, n_batch); + struct ggml_tensor * t36 = expand(gf, ggml_cross_entropy_loss(ctx0, t35, targets)); assert_shape_1d(t36, 1); + + *gb = *gf; + + // t36->grad gets set to one by optimizer, so we need the tensor. + // initialize it with 1.0f to make sure. + use_buf(-1); + t36->grad = expand(gb, ggml_new_f32(ctx0, 1.0f)); + + use_buf(0); + t35->grad = expand(gb, ggml_cross_entropy_loss_back(ctx0, t35, targets, t36->grad)); assert_shape_3d(t35->grad, n_vocab, N, n_batch); + t34->grad = expand(gb, ggml_reshape_2d (ctx0, t35->grad, n_vocab, N*n_batch)); assert_shape_2d(t34->grad, n_vocab, N*n_batch); + t33->grad = expand(gb, ggml_out_prod (ctx0, model->output, ggml_transpose(ctx0, t34->grad))); assert_shape_2d(t33->grad, n_embd, N*n_batch); + t32->grad = expand(gb, ggml_mul (ctx0, t33->grad, t31)); assert_shape_2d(t32->grad, n_embd, N*n_batch); + + use_buf(-1); + + model->norm->grad = expand(gb, add_or_set(model->norm->grad, ggml_repeat_back(ctx0, t32->grad, model->norm))); assert_shape_1d(model->norm->grad, n_embd); + model->output->grad = expand(gb, add_or_set(model->output->grad, ggml_out_prod(ctx0, t33, t34->grad))); assert_shape_2d(model->output->grad, n_embd, n_vocab); + + clr_buf(1); + use_buf(1); + t31->grad = expand(gb, ggml_mul(ctx0, t33->grad, t32)); assert_shape_2d(t31->grad, n_embd, N*n_batch); + + struct ggml_tensor * back_layer_inp = t31; + struct ggml_tensor * grad_layer_inp = NULL; + + printf("%s: checkpoints.size() = %zu\n", __func__, checkpoints.size()); + chk_idx = checkpoints.size()-1; + int avail_begin = n_layer; + int avail_end = n_layer; + printf("%s: chk_idx=%d avail_begin=%d avail_end=%d\n", __func__, chk_idx, avail_begin, avail_end); + for (int k = 0; k < n_layer; ++k) { + // second forward pass for checkpointing + int il = n_layer-1-k; + if (il < avail_begin) { + // make sure, that txxL[il] is available + // forward pass from last checkpoint + GGML_ASSERT(chk_idx >= -1); + int begin = (chk_idx == -1) + ? 0 + : checkpoints[chk_idx] + 1; // checkpoint[chk_idx] contains t30 for computing following layers -> +1 + int end = (chk_idx+1 < checkpoints.size()) + ? 
(checkpoints[chk_idx+1] + 1) + : n_layer; + GGML_ASSERT(begin <= il); + GGML_ASSERT(il < end); + cur = (chk_idx == -1) ? t01 : t30L[checkpoints[chk_idx]]; + clr_buf(2); + printf("%s: second forward pass chk_idx=%d begin=%d end=%d\n", __func__, chk_idx, begin, end); + for (int i = begin; i < end; ++i) { + struct my_llama_layer & layer = model->layers[i]; + const int prs = 2; // persistent until next checkpoint + const int tmp = 0; // temporary for this layer + const bool is_checkpoint = (i == end-1); + clr_buf(0); + use_buf(prs); struct ggml_tensor * t02 = expand(gb, ggml_rms_norm (ctx0, cur, rms_norm_eps)); assert_shape_2d(t02, n_embd, N*n_batch); + use_buf(tmp); struct ggml_tensor * t03 = expand(gb, ggml_repeat (ctx0, layer.attention_norm, t02)); assert_shape_2d(t03, n_embd, N*n_batch); + use_buf(prs); struct ggml_tensor * t04 = expand(gb, ggml_mul (ctx0, t02, t03)); assert_shape_2d(t04, n_embd, N*n_batch); + use_buf(prs); struct ggml_tensor * t05 = expand(gb, ggml_mul_mat (ctx0, layer.wq, t04)); assert_shape_2d(t05, n_embd, N*n_batch); + use_buf(prs); struct ggml_tensor * t06 = expand(gb, ggml_reshape_4d (ctx0, t05, n_embd/n_head, n_head, N, n_batch)); assert_shape_4d(t06, n_embd/n_head, n_head, N, n_batch); + use_buf(prs); struct ggml_tensor * t07 = expand(gb, ggml_rope_inplace (ctx0, t06, n_past, n_rot, rope_mode)); assert_shape_4d(t07, n_embd/n_head, n_head, N, n_batch); + use_buf(prs); struct ggml_tensor * t08 = expand(gb, ggml_mul_mat (ctx0, layer.wk, t04)); assert_shape_2d(t08, n_embd, N*n_batch); + use_buf(prs); struct ggml_tensor * t09 = expand(gb, ggml_reshape_4d (ctx0, t08, n_embd/n_head, n_head, N, n_batch)); assert_shape_4d(t09, n_embd/n_head, n_head, N, n_batch); + use_buf(prs); struct ggml_tensor * t10 = expand(gb, ggml_rope_inplace (ctx0, t09, n_past, n_rot, rope_mode)); assert_shape_4d(t10, n_embd/n_head, n_head, N, n_batch); + use_buf(prs); struct ggml_tensor * t11 = expand(gb, ggml_mul_mat (ctx0, t04, layer.wv)); assert_shape_2d(t11, N*n_batch, n_embd); + use_buf(prs); struct ggml_tensor * t12 = expand(gb, ggml_reshape_4d (ctx0, t11, N, n_batch, n_embd/n_head, n_head)); assert_shape_4d(t12, N, n_batch, n_embd/n_head, n_head); + use_buf(prs); struct ggml_tensor * t13 = expand(gb, ggml_permute (ctx0, t07, 0, 2, 1, 3)); assert_shape_4d(t13, n_embd/n_head, N, n_head, n_batch); + use_buf(prs); struct ggml_tensor * t14 = expand(gb, ggml_permute (ctx0, t10, 0, 2, 1, 3)); assert_shape_4d(t14, n_embd/n_head, N, n_head, n_batch); + use_buf(prs); struct ggml_tensor * t15 = expand(gb, ggml_permute (ctx0, t12, 0, 3, 1, 2)); assert_shape_4d(t15, N, n_embd/n_head, n_head, n_batch); + use_buf(prs); struct ggml_tensor * t16 = expand(gb, ggml_flash_attn (ctx0, t13, t14, t15, true)); assert_shape_4d(t16, n_embd/n_head, N, n_head, n_batch); + use_buf(tmp); struct ggml_tensor * t17 = expand(gb, ggml_permute (ctx0, t16, 0, 2, 1, 3)); assert_shape_4d(t17, n_embd/n_head, n_head, N, n_batch); + use_buf(prs); struct ggml_tensor * t18 = expand(gb, ggml_cont (ctx0, t17)); assert_shape_4d(t18, n_embd/n_head, n_head, N, n_batch); + use_buf(prs); struct ggml_tensor * t19 = expand(gb, ggml_reshape_2d (ctx0, t18, n_embd, N*n_batch)); assert_shape_2d(t19, n_embd, N*n_batch); + use_buf(tmp); struct ggml_tensor * t20 = expand(gb, ggml_mul_mat (ctx0, layer.wo, t19)); assert_shape_2d(t20, n_embd, N*n_batch); + use_buf(prs); struct ggml_tensor * t21 = expand(gb, ggml_add (ctx0, t20, cur)); assert_shape_2d(t21, n_embd, N*n_batch); + use_buf(prs); struct ggml_tensor * t22 = expand(gb, ggml_rms_norm (ctx0, t21, 
rms_norm_eps)); assert_shape_2d(t22, n_embd, N*n_batch); + use_buf(tmp); struct ggml_tensor * t23 = expand(gb, ggml_repeat (ctx0, layer.ffn_norm, t22)); assert_shape_2d(t23, n_embd, N*n_batch); + use_buf(prs); struct ggml_tensor * t24 = expand(gb, ggml_mul (ctx0, t23, t22)); assert_shape_2d(t24, n_embd, N*n_batch); + use_buf(prs); struct ggml_tensor * t25 = expand(gb, ggml_mul_mat (ctx0, layer.w3, t24)); assert_shape_2d(t25, n_ff, N*n_batch); + use_buf(prs); struct ggml_tensor * t26 = expand(gb, ggml_mul_mat (ctx0, layer.w1, t24)); assert_shape_2d(t26, n_ff, N*n_batch); + use_buf(prs); struct ggml_tensor * t27 = expand(gb, ggml_silu (ctx0, t26)); assert_shape_2d(t27, n_ff, N*n_batch); + use_buf(prs); struct ggml_tensor * t28 = expand(gb, ggml_mul (ctx0, t27, t25)); assert_shape_2d(t28, n_ff, N*n_batch); + use_buf(tmp); struct ggml_tensor * t29 = expand(gb, ggml_mul_mat (ctx0, layer.w2, t28)); assert_shape_2d(t29, n_embd, N*n_batch); + if (t30L[i] == NULL) { + use_buf(prs); struct ggml_tensor * t30 = expand(gb, ggml_add (ctx0, t21, t29)); assert_shape_2d(t30, n_embd, N*n_batch); + t30L[i] = t30; + cur = t30; + } + t02L[i] = t02; + t03L[i] = t03; + t04L[i] = t04; + t05L[i] = t05; + t06L[i] = t06; + t07L[i] = t07; + t08L[i] = t08; + t09L[i] = t09; + t10L[i] = t10; + t11L[i] = t11; + t12L[i] = t12; + t13L[i] = t13; + t14L[i] = t14; + t15L[i] = t15; + t16L[i] = t16; + t17L[i] = t17; + t18L[i] = t18; + t19L[i] = t19; + t20L[i] = t20; + t21L[i] = t21; + t22L[i] = t22; + t23L[i] = t23; + t24L[i] = t24; + t25L[i] = t25; + t26L[i] = t26; + t27L[i] = t27; + t28L[i] = t28; + t29L[i] = t29; + } + --chk_idx; + avail_begin = begin; + avail_end = end; + printf("%s: chk_idx=%d avail_begin=%d avail_end=%d\n", __func__, chk_idx, avail_begin, avail_end); + } + printf("%s: backward pass il=%d\n", __func__, il); + + struct my_llama_layer & layer = model->layers[il]; + + struct ggml_tensor * t02 = t02L[il]; + struct ggml_tensor * t03 = t03L[il]; + struct ggml_tensor * t04 = t04L[il]; + struct ggml_tensor * t05 = t05L[il]; + struct ggml_tensor * t06 = t06L[il]; + struct ggml_tensor * t07 = t07L[il]; + struct ggml_tensor * t08 = t08L[il]; + struct ggml_tensor * t09 = t09L[il]; + struct ggml_tensor * t10 = t10L[il]; + struct ggml_tensor * t11 = t11L[il]; + struct ggml_tensor * t12 = t12L[il]; + struct ggml_tensor * t13 = t13L[il]; + struct ggml_tensor * t14 = t14L[il]; + struct ggml_tensor * t15 = t15L[il]; + struct ggml_tensor * t16 = t16L[il]; + struct ggml_tensor * t17 = t17L[il]; + struct ggml_tensor * t18 = t18L[il]; + struct ggml_tensor * t19 = t19L[il]; + struct ggml_tensor * t20 = t20L[il]; + struct ggml_tensor * t21 = t21L[il]; + struct ggml_tensor * t22 = t22L[il]; + struct ggml_tensor * t23 = t23L[il]; + struct ggml_tensor * t24 = t24L[il]; + struct ggml_tensor * t25 = t25L[il]; + struct ggml_tensor * t26 = t26L[il]; + struct ggml_tensor * t27 = t27L[il]; + struct ggml_tensor * t28 = t28L[il]; + struct ggml_tensor * t29 = t29L[il]; + struct ggml_tensor * t30 = t30L[il]; + + clr_buf(0); + use_buf(0); + t30->grad = expand(gb, ggml_rms_norm_back(ctx0, t30, back_layer_inp->grad)); assert_shape_2d(t30->grad, n_embd, N*n_batch); + if (grad_layer_inp) { + t30->grad = expand(gb, ggml_add(ctx0, t30->grad, grad_layer_inp->grad)); assert_shape_2d(t30->grad, n_embd, N*n_batch); + } + clr_buf(1); + t29->grad = t30->grad; assert_shape_2d(t29->grad, n_embd, N*n_batch); + t28->grad = expand(gb, ggml_out_prod(ctx0, layer.w2, ggml_transpose(ctx0, t29->grad))); assert_shape_2d(t28->grad, n_ff, N*n_batch); + t27->grad = 
expand(gb, ggml_mul(ctx0, t28->grad, t25)); assert_shape_2d(t27->grad, n_ff, N*n_batch); + t26->grad = expand(gb, ggml_silu_back(ctx0, t26, t27->grad)); assert_shape_2d(t26->grad, n_ff, N*n_batch); + t25->grad = expand(gb, ggml_mul(ctx0, t28->grad, t27)); assert_shape_2d(t25->grad, n_ff, N*n_batch); + t24->grad = expand(gb, ggml_add_inplace(ctx0, + ggml_out_prod(ctx0, layer.w1, ggml_transpose(ctx0, t26->grad)), + ggml_out_prod(ctx0, layer.w3, ggml_transpose(ctx0, t25->grad)))); assert_shape_2d(t24->grad, n_embd, N*n_batch); + t23->grad = expand(gb, ggml_mul(ctx0, t24->grad, t22)); assert_shape_2d(t23->grad, n_embd, N*n_batch); + t22->grad = expand(gb, ggml_mul(ctx0, t24->grad, ggml_repeat(ctx0, layer.ffn_norm, t24->grad))); assert_shape_2d(t22->grad, n_embd, N*n_batch); + use_buf(1); + t21->grad = expand(gb, ggml_add(ctx0, t30->grad, ggml_rms_norm_back(ctx0, t21, t22->grad))); assert_shape_2d(t21->grad, n_embd, N*n_batch); + grad_layer_inp = t21; + use_buf(0); + t20->grad = t21->grad; assert_shape_2d(t20->grad, n_embd, N*n_batch); + t19->grad = expand(gb, ggml_out_prod(ctx0, layer.wo, ggml_transpose(ctx0, t20->grad))); assert_shape_2d(t19->grad, n_embd, N*n_batch); + t18->grad = expand(gb, ggml_reshape_4d(ctx0, t19->grad, n_embd/n_head, n_head, N, n_batch)); assert_shape_4d(t18->grad, n_embd/n_head, n_head, N, n_batch); + t17->grad = t18->grad; assert_shape_4d(t17->grad, n_embd/n_head, n_head, N, n_batch); + t16->grad = expand(gb, ggml_permute(ctx0, t17->grad, 0, 2, 1, 3)); assert_shape_4d(t16->grad, n_embd/n_head, N, n_head, n_batch); + struct ggml_tensor * flash_attn = expand(gb, ggml_flash_attn_back(ctx0, t13, t14, t15, t16->grad, true)); assert_shape_4d(flash_attn, n_embd/n_head, N*3, n_head, n_batch); + t15->grad = expand(gb, view__v(flash_attn)); assert_shape_4d(t15->grad, N, n_embd/n_head, n_head, n_batch); + t14->grad = expand(gb, view__k(flash_attn)); assert_shape_4d(t14->grad, n_embd/n_head, N, n_head, n_batch); + t13->grad = expand(gb, view__q(flash_attn)); assert_shape_4d(t13->grad, n_embd/n_head, N, n_head, n_batch); + t12->grad = expand(gb, ggml_permute(ctx0, t15->grad, 0, 2, 3, 1)); assert_shape_4d(t12->grad, N, n_batch, n_embd/n_head, n_head); + t11->grad = expand(gb, ggml_reshape_2d(ctx0, ggml_cont(ctx0, t12->grad), N*n_batch, n_embd)); assert_shape_2d(t11->grad, N*n_batch, n_embd); + t10->grad = expand(gb, ggml_permute(ctx0, t14->grad, 0, 2, 1, 3)); assert_shape_4d(t10->grad, n_embd/n_head, n_head, N, n_batch); + t09->grad = expand(gb, ggml_rope_back(ctx0, t10->grad, n_past, n_rot, rope_mode)); assert_shape_4d(t09->grad, n_embd/n_head, n_head, N, n_batch); + t08->grad = expand(gb, ggml_reshape_2d(ctx0, t09->grad, n_embd, N*n_batch)); assert_shape_2d(t08->grad, n_embd, N*n_batch); + t07->grad = expand(gb, ggml_permute(ctx0, t13->grad, 0, 2, 1, 3)); assert_shape_4d(t07->grad, n_embd/n_head, n_head, N, n_batch); + t06->grad = expand(gb, ggml_rope_back(ctx0, t07->grad, n_past, n_rot, rope_mode)); assert_shape_4d(t06->grad, n_embd/n_head, n_head, N, n_batch); + t05->grad = expand(gb, ggml_reshape_2d(ctx0, t06->grad, n_embd, N*n_batch)); assert_shape_2d(t05->grad, n_embd, N*n_batch); + t04->grad = expand(gb, ggml_add_inplace(ctx0, + ggml_add_inplace(ctx0, + ggml_out_prod(ctx0, layer.wv, t11->grad), + ggml_out_prod(ctx0, layer.wk, ggml_transpose(ctx0, t08->grad))), + ggml_out_prod(ctx0, layer.wq, ggml_transpose(ctx0, t05->grad)))); assert_shape_2d(t04->grad, n_embd, N*n_batch); + t03->grad = expand(gb, ggml_mul(ctx0, t04->grad, t02)); assert_shape_2d(t04->grad, n_embd, 
N*n_batch); + use_buf(1); + t02->grad = expand(gb, ggml_mul(ctx0, t04->grad, ggml_repeat(ctx0, layer.attention_norm, t02))); assert_shape_2d(t02->grad, n_embd, N*n_batch); + back_layer_inp = t02; + + use_buf(-1); + layer.attention_norm->grad = expand(gb, add_or_set(layer.attention_norm->grad, ggml_repeat_back(ctx0, t03->grad, layer.attention_norm))); assert_shape_1d(layer.attention_norm->grad, n_embd); + layer.wq->grad = expand(gb, add_or_set(layer.wq->grad, ggml_out_prod(ctx0, t04, t05->grad))); assert_shape_2d(layer.wq->grad, n_embd, n_embd); + layer.wk->grad = expand(gb, add_or_set(layer.wk->grad, ggml_out_prod(ctx0, t04, t08->grad))); assert_shape_2d(layer.wk->grad, n_embd, n_embd); + layer.wv->grad = expand(gb, add_or_set(layer.wv->grad, ggml_out_prod(ctx0, t04, ggml_transpose(ctx0, t11->grad)))); assert_shape_2d(layer.wv->grad, n_embd, n_embd); + layer.wo->grad = expand(gb, add_or_set(layer.wo->grad, ggml_out_prod(ctx0, t19, t20->grad))); assert_shape_2d(layer.wo->grad, n_embd, n_embd); + layer.ffn_norm->grad = expand(gb, add_or_set(layer.ffn_norm->grad, ggml_repeat_back(ctx0, t23->grad, layer.ffn_norm))); assert_shape_1d(layer.ffn_norm->grad, n_embd); + layer.w1->grad = expand(gb, add_or_set(layer.w1->grad, ggml_out_prod(ctx0, t24, t26->grad))); assert_shape_2d(layer.w1->grad, n_embd, n_ff); + layer.w2->grad = expand(gb, add_or_set(layer.w2->grad, ggml_out_prod(ctx0, t28, t29->grad))); assert_shape_2d(layer.w2->grad, n_ff, n_embd); + layer.w3->grad = expand(gb, add_or_set(layer.w3->grad, ggml_out_prod(ctx0, t24, t25->grad))); assert_shape_2d(layer.w3->grad, n_embd, n_ff); + } + printf("%s: chk_idx=%d avail_begin=%d avail_end=%d\n", __func__, chk_idx, avail_begin, avail_end); + GGML_ASSERT(chk_idx == -2); + GGML_ASSERT(avail_begin == 0); + clr_buf(0); + use_buf(0); + t01->grad = expand(gb, ggml_add_inplace(ctx0, grad_layer_inp->grad, ggml_rms_norm_back(ctx0, t01, back_layer_inp->grad))); assert_shape_2d(t01->grad, n_embd, N*n_batch); + use_buf(-1); + model->tok_embeddings->grad = expand(gb, ggml_get_rows_back(ctx0, t01->grad, t00, model->tok_embeddings)); assert_shape_2d(model->tok_embeddings->grad, n_embd, n_vocab); + + *logits = t35; + + clr_buf(0); + clr_buf(1); + clr_buf(2); + clr_buf(3); + + if (track_max_mem) { + printf("%s: max size compute buf0: %zu\n", __func__, buf_maxs[0]); + printf("%s: max size compute buf1: %zu\n", __func__, buf_maxs[1]); + printf("%s: max size compute buf2: %zu\n", __func__, buf_maxs[2]); + printf("%s: max size compute buf3: %zu\n", __func__, buf_maxs[3]); + } + + // now that all grads are created, set the graph leafs and grads + graph_set_leafs_grads(gf); + graph_set_leafs_grads(gb); + + return t36; +} + void set_f32_3d(struct ggml_tensor * tensor, int64_t i0, int64_t i1, int64_t i2, float value) { float * ptr = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2]); *ptr = value; @@ -2810,6 +3360,7 @@ struct train_params { bool use_adam; bool use_flash; bool use_scratch; + bool use_checkpointing; // only adam int warmup; @@ -2829,6 +3380,8 @@ struct train_params { int mem_compute_gb; int mem_compute0_gb; int mem_compute1_gb; + int mem_compute2_gb; + int mem_compute3_gb; }; struct train_params get_default_train_params() { @@ -2860,6 +3413,7 @@ struct train_params get_default_train_params() { params.use_adam = true; params.use_flash = true; params.use_scratch = true; + params.use_checkpointing = true; // only adam params.warmup = 100; @@ -2878,8 +3432,9 @@ struct train_params get_default_train_params() { 
params.mem_model_gb = 2; params.mem_compute_gb = 24; params.mem_compute0_gb = 8; - params.mem_compute1_gb = 2; - + params.mem_compute1_gb = 1; + params.mem_compute2_gb = 2; + params.mem_compute3_gb = 1; return params; } @@ -2909,14 +3464,16 @@ void train_print_usage(int /*argc*/, char ** argv, const struct train_params * p fprintf(stderr, " --samples-after-nl Training samples start after newlines. (default %s)\n", params->samples_start_after_nl ? "on" : "off"); fprintf(stderr, " --use-lbfgs Use LBFGS optimizer instead of default Adam\n"); fprintf(stderr, " --use-adam Use Adam optimizer (default)\n"); - fprintf(stderr, " --no-flash Don't use flash attention.\n"); + fprintf(stderr, " --no-flash Don't use flash attention. Implies no-scratch and no-checkpointing.\n"); fprintf(stderr, " --use-flash Use flash attention (default)\n"); - fprintf(stderr, " --no-scratch Don't use scratch buffers\n"); - fprintf(stderr, " --use-scratch Use scratch buffers (default)\n"); - fprintf(stderr, " --warmup N Number of warmup steps (default %d)\n", params->warmup); - fprintf(stderr, " --cos-decay-steps N Number of cosine decay steps (default %d)\n", params->cos_decay_steps); - fprintf(stderr, " --cos-decay-restart N Increase of cosine decay steps after restart (default %f)\n", params->cos_decay_restart); - fprintf(stderr, " --cos-decay-alpha N Cosine decay alpha (default %f)\n", params->cos_decay_alpha); + fprintf(stderr, " --no-scratch Don't use scratch buffers. Implies no-checkpointing.\n"); + fprintf(stderr, " --use-scratch Use scratch buffers. Implies use-flash. (default)\n"); + fprintf(stderr, " --no-checkpointing Don't use gradient checkpointing\n"); + fprintf(stderr, " --use-checkpointing Use gradient checkpointing. Implies use-scratch and use-flash. (default)\n"); + fprintf(stderr, " --warmup N Only for Adam optimizer. Number of warmup steps (default %d)\n", params->warmup); + fprintf(stderr, " --cos-decay-steps N Only for Adam optimizer. Number of cosine decay steps (default %d)\n", params->cos_decay_steps); + fprintf(stderr, " --cos-decay-restart N Only for Adam optimizer. Increase of cosine decay steps after restart (default %f)\n", params->cos_decay_restart); + fprintf(stderr, " --cos-decay-alpha N Only for Adam optimizer. Cosine decay alpha (default %f)\n", params->cos_decay_alpha); fprintf(stderr, " --lbfgs-iter N Maximum number of LBFGS optimization iterations for each batch (default %d)\n", params->lbfgs_n_iter); fprintf(stderr, " --adam-iter N Maximum number of Adam optimization iterations for each batch (default %d)\n", params->adam_n_iter); fprintf(stderr, " --adam-alpha N Adam learning rate alpha (default %f)\n", params->adam_alpha); @@ -2928,6 +3485,8 @@ void train_print_usage(int /*argc*/, char ** argv, const struct train_params * p fprintf(stderr, " --mem-compute N Memory to allocate for compute in gigabytes. (default %d)\n", params->mem_compute_gb); fprintf(stderr, " --mem-compute0 N Memory to allocate for compute in gigabytes. (default %d)\n", params->mem_compute0_gb); fprintf(stderr, " --mem-compute1 N Memory to allocate for compute in gigabytes. (default %d)\n", params->mem_compute1_gb); + fprintf(stderr, " --mem-compute2 N Memory to allocate for compute in gigabytes. (default %d)\n", params->mem_compute2_gb); + fprintf(stderr, " --mem-compute3 N Memory to allocate for compute in gigabytes. 
(default %d)\n", params->mem_compute3_gb); fprintf(stderr, "\n"); } @@ -3065,6 +3624,10 @@ bool train_params_parse(int argc, char ** argv, struct train_params * params) { params->use_scratch = false; } else if (arg == "--use-scratch") { params->use_scratch = true; + } else if (arg == "--no-checkpointing") { + params->use_checkpointing = false; + } else if (arg == "--use-checkpointing") { + params->use_checkpointing = true; } else if (arg == "--warmup") { if (++i >= argc) { invalid_param = true; @@ -3155,6 +3718,18 @@ bool train_params_parse(int argc, char ** argv, struct train_params * params) { break; } params->mem_compute1_gb = std::stoi(argv[i]); + } else if (arg == "--mem-compute2") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->mem_compute2_gb = std::stoi(argv[i]); + } else if (arg == "--mem-compute3") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->mem_compute3_gb = std::stoi(argv[i]); } else if (arg == "-h" || arg == "--help") { train_print_usage(argc, argv, &default_params); exit(0); @@ -3316,8 +3891,12 @@ int main(int argc, char ** argv) { size_t size_buf_0 = 1024ll*1024ll*1024ll*((size_t) params.mem_compute0_gb); size_t size_buf_1 = 1024ll*1024ll*1024ll*((size_t) params.mem_compute1_gb); + size_t size_buf_2 = 1024ll*1024ll*1024ll*((size_t) params.mem_compute2_gb); + size_t size_buf_3 = 1024ll*1024ll*1024ll*((size_t) params.mem_compute3_gb); uint8_t * compute_buf_0 = new uint8_t[size_buf_0]; uint8_t * compute_buf_1 = new uint8_t[size_buf_1]; + uint8_t * compute_buf_2 = new uint8_t[size_buf_2]; + uint8_t * compute_buf_3 = new uint8_t[size_buf_3]; GGML_ASSERT(n_tokens < (int) train_tokens.size()); std::vector train_samples; @@ -3376,7 +3955,15 @@ int main(int argc, char ** argv) { struct ggml_tensor * loss = NULL; struct ggml_tensor * logits = NULL; - if (params.use_scratch) { + if (params.use_checkpointing) { + loss = forward_batch_wo_cache_flash_attn_train_grad_checkpointing( + &model, ctx0, + gf, gb, + &logits, tokens_input, target_probs, + compute_buf_0, compute_buf_1, compute_buf_2, compute_buf_3, + size_buf_0, size_buf_1, size_buf_2, size_buf_3, + n_tokens, n_batch); + } else if (params.use_scratch) { loss = forward_batch_wo_cache_flash_attn_train( &model, ctx0, gf, gb, From e05e4414ac2a66cdde0efa7799b3a3eac92863db Mon Sep 17 00:00:00 2001 From: xaedes Date: Tue, 27 Jun 2023 17:43:00 +0200 Subject: [PATCH 006/100] remove unused compute buffer 3 --- .../train-text-from-scratch.cpp | 74 +++++++------------ 1 file changed, 27 insertions(+), 47 deletions(-) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index faa60ec8bf320..075e0307f4f64 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -1932,11 +1932,9 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train_grad_checkpointing( void * compute_buf_0, void * compute_buf_1, void * compute_buf_2, - void * compute_buf_3, size_t size_buf_0, size_t size_buf_1, size_t size_buf_2, - size_t size_buf_3, const int n_tokens, const int n_batch) { @@ -1966,16 +1964,14 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train_grad_checkpointing( bool track_max_mem = true; int last_buf = -1; - size_t buf_offs[4] = { 0, 0, 0, 0 }; - size_t buf_size[4] = { size_buf_0, + size_t buf_offs[3] = { 0, 0, 0 }; + size_t buf_size[3] = { size_buf_0, size_buf_1, - size_buf_2, - size_buf_3 }; - void * buf_data[4] = { 
compute_buf_0, + size_buf_2 }; + void * buf_data[3] = { compute_buf_0, compute_buf_1, - compute_buf_2, - compute_buf_3 }; - size_t buf_maxs[4] = { 0, 0, 0, 0 }; + compute_buf_2 }; + size_t buf_maxs[3] = { 0, 0, 0 }; auto use_buf = [ctx0, &last_buf, &buf_offs, &buf_size, &buf_data, &buf_maxs] (int buf) { size_t last_offs = 0; @@ -2083,7 +2079,6 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train_grad_checkpointing( clr_buf(0); clr_buf(1); clr_buf(2); - clr_buf(3); use_buf(-1); @@ -2112,22 +2107,22 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train_grad_checkpointing( // example for 16 layers: // inp ~ implicit zeroth checkpoint == input - // L00 f 4b - // L01 f 4b + // L00 f 4b [ + // L01 f 4b 4th second forward pass // L02 f 4b - // L03 fc4b first checkpoint - // L04 f 3b - // L05 f 3b + // L03 fc4b ] first checkpoint + // L04 f 3b [ + // L05 f 3b 3rd second forward pass // L06 f 3b - // L07 fc3b second checkpoint - // L08 f 2b - // L09 f 2b + // L07 fc3b ] second checkpoint + // L08 f 2b [ + // L09 f 2b 2nd second forward pass // L10 f 2b - // L11 fc2b third checkpoint - // L12 f 1b - // L13 f 1b + // L11 fc2b ] third checkpoint + // L12 f 1b [ + // L13 f 1b 1st second forward pass // L14 f 1b - // L15 f 1b + // L15 f 1b ] // need to remember these for the backward pass std::vector t02L; t02L.resize(n_layer, NULL); @@ -2162,7 +2157,6 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train_grad_checkpointing( struct ggml_tensor * cur = t01; - int chk_idx = 0; for (int il = 0; il < n_layer; ++il) { struct my_llama_layer & layer = model->layers[il]; @@ -2455,13 +2449,11 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train_grad_checkpointing( clr_buf(0); clr_buf(1); clr_buf(2); - clr_buf(3); if (track_max_mem) { printf("%s: max size compute buf0: %zu\n", __func__, buf_maxs[0]); printf("%s: max size compute buf1: %zu\n", __func__, buf_maxs[1]); printf("%s: max size compute buf2: %zu\n", __func__, buf_maxs[2]); - printf("%s: max size compute buf3: %zu\n", __func__, buf_maxs[3]); } // now that all grads are created, set the graph leafs and grads @@ -3434,7 +3426,6 @@ struct train_params get_default_train_params() { params.mem_compute0_gb = 8; params.mem_compute1_gb = 1; params.mem_compute2_gb = 2; - params.mem_compute3_gb = 1; return params; } @@ -3486,7 +3477,6 @@ void train_print_usage(int /*argc*/, char ** argv, const struct train_params * p fprintf(stderr, " --mem-compute0 N Memory to allocate for compute in gigabytes. (default %d)\n", params->mem_compute0_gb); fprintf(stderr, " --mem-compute1 N Memory to allocate for compute in gigabytes. (default %d)\n", params->mem_compute1_gb); fprintf(stderr, " --mem-compute2 N Memory to allocate for compute in gigabytes. (default %d)\n", params->mem_compute2_gb); - fprintf(stderr, " --mem-compute3 N Memory to allocate for compute in gigabytes. 
(default %d)\n", params->mem_compute3_gb); fprintf(stderr, "\n"); } @@ -3724,12 +3714,6 @@ bool train_params_parse(int argc, char ** argv, struct train_params * params) { break; } params->mem_compute2_gb = std::stoi(argv[i]); - } else if (arg == "--mem-compute3") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->mem_compute3_gb = std::stoi(argv[i]); } else if (arg == "-h" || arg == "--help") { train_print_usage(argc, argv, &default_params); exit(0); @@ -3892,11 +3876,9 @@ int main(int argc, char ** argv) { size_t size_buf_0 = 1024ll*1024ll*1024ll*((size_t) params.mem_compute0_gb); size_t size_buf_1 = 1024ll*1024ll*1024ll*((size_t) params.mem_compute1_gb); size_t size_buf_2 = 1024ll*1024ll*1024ll*((size_t) params.mem_compute2_gb); - size_t size_buf_3 = 1024ll*1024ll*1024ll*((size_t) params.mem_compute3_gb); uint8_t * compute_buf_0 = new uint8_t[size_buf_0]; uint8_t * compute_buf_1 = new uint8_t[size_buf_1]; uint8_t * compute_buf_2 = new uint8_t[size_buf_2]; - uint8_t * compute_buf_3 = new uint8_t[size_buf_3]; GGML_ASSERT(n_tokens < (int) train_tokens.size()); std::vector train_samples; @@ -3924,9 +3906,9 @@ int main(int argc, char ** argv) { } struct ggml_init_params cparams = { - /*.mem_size =*/ compute_size, - /*.mem_buffer =*/ compute_addr, - /*.no_alloc =*/ false, + compute_size, // mem_size + compute_addr, // mem_buffer + false, // no_alloc }; struct ggml_context * ctx0 = ggml_init(cparams); @@ -3960,8 +3942,8 @@ int main(int argc, char ** argv) { &model, ctx0, gf, gb, &logits, tokens_input, target_probs, - compute_buf_0, compute_buf_1, compute_buf_2, compute_buf_3, - size_buf_0, size_buf_1, size_buf_2, size_buf_3, + compute_buf_0, compute_buf_1, compute_buf_2, + size_buf_0, size_buf_1, size_buf_2, n_tokens, n_batch); } else if (params.use_scratch) { loss = forward_batch_wo_cache_flash_attn_train( @@ -4082,9 +4064,9 @@ int main(int argc, char ** argv) { printf("---\n"); for (int i=0; i Date: Fri, 28 Jul 2023 23:08:11 +0200 Subject: [PATCH 007/100] add and use function ggml_build_backward_expand to avoid stack overflows with large maximum number of nodes GGML_API void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool keep); --- .../train-text-from-scratch.cpp | 16 ++++++++++------ ggml.c | 10 ++++++---- ggml.h | 3 ++- 3 files changed, 18 insertions(+), 11 deletions(-) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index 075e0307f4f64..61def445ecdc6 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -3957,12 +3957,14 @@ int main(int argc, char ** argv) { logits = forward_batch_wo_cache_flash_attn(&model, ctx0, gf, tokens_input, n_tokens, n_batch); loss = cross_entropy_loss(ctx0, logits, target_probs); ggml_build_forward_expand(gf, loss); - *gb = ggml_build_backward(ctx0, gf, true); + *gb = *gf; + ggml_build_backward_expand(ctx0, gf, gb, true); } else { logits = forward_batch_wo_cache(&model, ctx0, gf, tokens_input, n_tokens, n_batch); loss = cross_entropy_loss(ctx0, logits, target_probs); ggml_build_forward_expand(gf, loss); - *gb = ggml_build_backward(ctx0, gf, true); + *gb = *gf; + ggml_build_backward_expand(ctx0, gf, gb, true); } ggml_graph_compute_helper(work_buffer, gf, params.n_threads); @@ -4070,13 +4072,15 @@ int main(int argc, char ** argv) { }; struct ggml_context * ctx0 = ggml_init(cparams); - ggml_cgraph gf = {}; + 
struct ggml_tensor * gfbuf = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, sizeof(struct ggml_cgraph) / ggml_type_size(GGML_TYPE_I32) + (sizeof(struct ggml_cgraph) % ggml_type_size(GGML_TYPE_I32) ? 1 : 0)); + memset(gfbuf->data, 0, ggml_nbytes(gfbuf)); + struct ggml_cgraph * gf = (struct ggml_cgraph *) gfbuf->data; int n_past = 0; - struct ggml_tensor * logits = forward(&model, &kv_self, ctx0, &gf, tokens_input, sample_ctx, n_past); + struct ggml_tensor * logits = forward(&model, &kv_self, ctx0, gf, tokens_input, sample_ctx, n_past); - ggml_build_forward_expand(&gf, logits); - ggml_graph_compute_helper(work_buffer, &gf, params.n_threads); + ggml_build_forward_expand(gf, logits); + ggml_graph_compute_helper(work_buffer, gf, params.n_threads); //struct ggml_tensor * best_samples = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, sample_ctx); //struct ggml_tensor * probs = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_vocab, sample_ctx); diff --git a/ggml.c b/ggml.c index 19a194beb2542..92717f0aac7af 100644 --- a/ggml.c +++ b/ggml.c @@ -15787,9 +15787,7 @@ struct ggml_cgraph ggml_build_forward(struct ggml_tensor * tensor) { return result; } -struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep) { - struct ggml_cgraph result = *gf; - +void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool keep) { GGML_ASSERT(gf->n_nodes > 0); // if we are keeping the gradient graph, we have to detach the gradient nodes from the original graph @@ -15818,10 +15816,14 @@ struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cg if (node->is_param) { GGML_PRINT_DEBUG("%s: found root node %p\n", __func__, (void *) node); - ggml_build_forward_expand(&result, node->grad); + ggml_build_forward_expand(gb, node->grad); } } +} +struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep) { + struct ggml_cgraph result = *gf; + ggml_build_backward_expand(ctx, gf, &result, keep); return result; } diff --git a/ggml.h b/ggml.h index 460976468a056..8f51f5d222099 100644 --- a/ggml.h +++ b/ggml.h @@ -1403,7 +1403,8 @@ extern "C" { struct ggml_tensor * tensor); - GGML_API void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor); + GGML_API void ggml_build_forward_expand (struct ggml_cgraph * cgraph, struct ggml_tensor * tensor); + GGML_API void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool keep); GGML_API struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor); GGML_API struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep); From a80f184e6d386b4a6d74902ddae61bf4740fd9a1 Mon Sep 17 00:00:00 2001 From: xaedes Date: Thu, 29 Jun 2023 21:31:25 +0200 Subject: [PATCH 008/100] change AdamW decay parameter to work like the torch AdamW decay parameter It is now relative to Adam learning rate `alpha*sched`. Before that it was relative to `sched` only. 
`alpha` being the maximum learning rate and `sched` being a scaling parameter in [0..1] --- ggml.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml.c b/ggml.c index 92717f0aac7af..451c765f97ad4 100644 --- a/ggml.c +++ b/ggml.c @@ -17351,8 +17351,8 @@ static enum ggml_opt_result ggml_opt_adam( // constants const float sched = params.adam.sched; - const float decay = params.adam.decay * sched; const float alpha = params.adam.alpha * sched; + const float decay = params.adam.decay * alpha; const float beta1 = params.adam.beta1; const float beta2 = params.adam.beta2; const float eps = params.adam.eps; From f175ead6efc451bf60fd543ede756383b32b75b1 Mon Sep 17 00:00:00 2001 From: xaedes Date: Thu, 29 Jun 2023 21:33:39 +0200 Subject: [PATCH 009/100] change default AdamW weight decay parameter used in training to 0.1 as used in nanoGPT --- examples/train-text-from-scratch/train-text-from-scratch.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index 61def445ecdc6..9ee255f4e05c2 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -3416,7 +3416,7 @@ struct train_params get_default_train_params() { params.lbfgs_n_iter = 16; params.adam_n_iter = 16; params.adam_alpha = 1e-3f; - params.adam_decay = 1e-3f; + params.adam_decay = 1e-1f; params.adam_beta1 = 0.9f; params.adam_beta2 = 0.999f; params.adam_gclip = 1.0f; From 97964a4cc964b099748ef7ca595b59606458c80f Mon Sep 17 00:00:00 2001 From: xaedes Date: Thu, 29 Jun 2023 21:36:28 +0200 Subject: [PATCH 010/100] change default AdamW weight decay parameter defined in ggml to 0.0, making Adam default instead of AdamW btw: the default weight decay parameter for torch.optim.AdamW is 0.01 --- ggml.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml.c b/ggml.c index 451c765f97ad4..229ddb2de6dd4 100644 --- a/ggml.c +++ b/ggml.c @@ -17916,7 +17916,7 @@ struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type) { .adam = { .n_iter = 10000, .sched = 1.000f, - .decay = 0.001f, + .decay = 0.0f, .alpha = 0.001f, .beta1 = 0.9f, .beta2 = 0.999f, From 2c6985f79e70c175767ef76925bac99fb0107c18 Mon Sep 17 00:00:00 2001 From: xaedes Date: Sun, 2 Jul 2023 20:55:54 +0200 Subject: [PATCH 011/100] bug fixes for cross entropy loss ggml_cross_entropy_loss: sums were not correctly added in workload of each thread ggml_cross_entropy_loss_back: simplify backward process, reducing numerical issues guard usage of exp f16 lookup in cross entropy by #define GGML_CROSS_ENTROPY_EXP_FP16 cross entropy loss is only used once during training, but it is quite sensitive to numerical errors introduced by exp-f16-lookup. so exp-f16-lookup for cross entropy loss is disabled by default, trading a tiny amount of runtime performance for better gradients.
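(For reference, a minimal sketch of the identity the simplified backward pass relies on; the function below is illustrative and not part of the patch. For one row of logits x and a target distribution p that sums to 1, the gradient of -sum_k p[k]*log(softmax(x)[k]) with respect to x is softmax(x) - p, scaled by the incoming loss gradient d. Computing the exponentials in plain float, as below, is effectively what leaving GGML_CROSS_ENTROPY_EXP_FP16 undefined does.)

```c
// illustrative sketch, not part of the patch:
// dx[i] = (softmax(x)[i] - p[i]) * d for one row of length n
#include <math.h>

static void cross_entropy_grad_row(const float * x, const float * p, float d, int n, float * dx) {
    float max = x[0];
    for (int i = 1; i < n; ++i) { if (x[i] > max) { max = x[i]; } }   // subtract max for numerical stability
    float sum = 0.0f;
    for (int i = 0; i < n; ++i) { dx[i] = expf(x[i] - max); sum += dx[i]; }
    for (int i = 0; i < n; ++i) { dx[i] = (dx[i]/sum - p[i]) * d; }   // (softmax(x) - p) * dloss
}
```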
--- ggml.c | 103 +++++++++++++++++---------------------------------------- 1 file changed, 30 insertions(+), 73 deletions(-) diff --git a/ggml.c b/ggml.c index 229ddb2de6dd4..d718de33b044f 100644 --- a/ggml.c +++ b/ggml.c @@ -123,6 +123,7 @@ typedef void * thread_ret_t; #define GGML_GELU_FP16 #define GGML_GELU_QUICK_FP16 #define GGML_SILU_FP16 +// #define GGML_CROSS_ENTROPY_EXP_FP16 #define GGML_SOFT_MAX_UNROLL 4 #define GGML_VEC_DOT_UNROLL 2 @@ -11486,6 +11487,7 @@ static void ggml_compute_forward_soft_max_back_f32( // dx = J * dy // dxk = sum_i(Jki * dyi) // dxk = sum_i(-yk*yi * dyi) - (-yk*yk)*dyk + (yk - yk*yk)*dyk + // dxk = sum_i(-yk*yi * dyi) + yk*yk*dyk + yk*dyk - yk*yk*dyk // dxk = sum_i(-yk*yi * dyi) + yk*dyk // dxk = -yk * sum_i(yi * dyi) + yk*dyk // dxk = -yk * dot(y, dy) + yk*dyk @@ -13109,6 +13111,7 @@ static void ggml_compute_forward_flash_attn_f32( if (SS[j] == -INFINITY) { SS[j] = 0.0f; } else { + // const float val = expf(SS[j] - max); ggml_fp16_t s = GGML_FP32_TO_FP16(SS[j] - max); memcpy(&scvt[j], &s, sizeof(uint16_t)); const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt[j]]); @@ -13700,6 +13703,7 @@ static void ggml_compute_forward_flash_attn_back_f32( if (SR[j] == -INFINITY) { SW[j] = 0.0f; } else { + // const float val = expf(SR[j] - max); ggml_fp16_t s = GGML_FP32_TO_FP16(SR[j] - max); memcpy(&scvt[j], &s, sizeof(uint16_t)); const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt[j]]); @@ -14317,6 +14321,8 @@ static void ggml_compute_forward_cross_entropy_loss_f32( const int nc = src0->ne[0]; const int nr = ggml_nrows(src0); + GGML_ASSERT(params->wsize >= sizeof(float) * (nth + nth * nc)); + if (params->type == GGML_TASK_INIT) { if (ith == 0) { memset(sums, 0, sizeof(float) * (nth + nth * nc)); @@ -14345,7 +14351,7 @@ static void ggml_compute_forward_cross_entropy_loss_f32( for (int i1 = ir0; i1 < ir1; i1++) { float * s0 = (float *)((char *) src0->data + i1*src0->nb[1]); float * s1 = (float *)((char *) src1->data + i1*src1->nb[1]); - float * st = (float *) params->wdata + nth + ith*nc; + float * st = ((float *) params->wdata) + nth + ith*nc; #ifndef NDEBUG for (int i = 0; i < nc; ++i) { @@ -14365,10 +14371,14 @@ static void ggml_compute_forward_cross_entropy_loss_f32( if (s0[i] == -INFINITY) { st[i] = 0.0f; } else { - // const float val = (s0[i] == -INFINITY) ? 
0.0 : exp(s0[i] - max); +#ifndef GGML_CROSS_ENTROPY_EXP_FP16 + const float s = s0[i] - max; + const float val = expf(s); +#else ggml_fp16_t s = GGML_FP32_TO_FP16(s0[i] - max); memcpy(&scvt, &s, sizeof(scvt)); const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt]); +#endif sum += (ggml_float)val; st[i] = val; } @@ -14384,7 +14394,9 @@ static void ggml_compute_forward_cross_entropy_loss_f32( ggml_vec_log_f32(nc, st, st); ggml_vec_mul_f32(nc, st, st, s1); - ggml_vec_sum_f32(nc, sums + ith, st); + float st_sum = 0; + ggml_vec_sum_f32(nc, &st_sum, st); + sums[ith] += st_sum; #ifndef NDEBUG for (int i = 0; i < nc; ++i) { @@ -14434,7 +14446,7 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32( return; } - const float eps = 1e-9f; + const double eps = 1e-9f; // TODO: handle transposed/permuted matrices const int64_t nc = src0->ne[0]; @@ -14453,7 +14465,6 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32( float * ds0 = (float *)((char *) dst->data + i1*dst->nb[1]); float * s0 = (float *)((char *) src0->data + i1*src0->nb[1]); float * s1 = (float *)((char *) src1->data + i1*src1->nb[1]); - float * sm = (float *) params->wdata + ith*nc; #ifndef NDEBUG for (int i = 0; i < nc; ++i) { @@ -14462,54 +14473,6 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32( assert(!isnan(s1[i])); } #endif - // step by step explanation: - { - //float * sums = (float *) params->wdata; - - // forward pass with annotated gradients from backward pass - // (built by going in reverse operation order, adding to gradients of current operation args) - // st0 = exp(s0-max(s0)) grad[st0] = grad[st1]*(1.0 - eps)/sum - // from softmax_back: grad[s0] = st1_k * (grad[st1]_k - dot(st1, grad[st1])) - // ggml_vec_scale_f32(nc, st, sum); // st1 = st0*/sum = softmax(s0) grad[st1] = grad[st2]*(1.0 - eps) - // ggml_vec_scale_f32(nc, st, (1.0f - eps)); // st2 = st1*(1.0 - eps) grad[st2] = grad[st3] - // ggml_vec_add1_f32(nc, st, st, eps); // st3 = st2 + eps grad[st3] = grad[st4]/st3 - // ggml_vec_log_f32(nc, st, st); // st4 = log(st3) grad[st4] = grad[st5] * s1 - // ggml_vec_mul_f32(nc, st, st, s1); // st5 = st4 * s1 grad[st5] = grad[sums[ith]] - // ggml_vec_sum_f32(nc, sums + ith, st); // sums[ith] = st5 grad[sums[ith]] = grad[cross_entropy_loss] = -grad[cel] - - // substitute into grad[st1], because we can reuse softmax_back from this point on - // grad[st1] = -grad[cel]*s1*(1.0 - eps)/(eps + softmax(s0)*(1.0 - eps)) - // postorder: - // grad[st1] := softmax(s0) - // grad[st1] := grad[st1]*(1.0 - eps) - // grad[st1] := grad[st1] + eps - // grad[st1] := s1 / grad[st1] - // grad[st1] := grad[st1]*(1.0-eps)*-grad[cel] - - // src0 gradients by going through softmax_back - // grad[s0] = st1_k * (grad[st1]_k - dot(st1, grad[st1])) - // from softmax_back: - // dxk = yk * (dyk - dot(y, dy)) - // dot_y_dy := dot(y, dy) - // dx := dy - // dx := dx - dot_y_dy - // dx := dx * y - // postorder: - // dot_st1_dst1 := dot(st1, grad[st1]) - // grad[s0] := grad[st1] - // grad[s0] := grad[s0] - dot_st1_dst1 - // grad[s0] := grad[s0] * st1 - - // prepend postorder from grad[st1] directly using grad[s0] as memory location, as we will grad[s0] := grad[st1] - // sm := softmax(s0) - // grad[s0] := sm*(1.0 - eps) - // grad[s0] := grad[s0] + eps - // grad[s0] := s1 / grad[s0] - // grad[s0] := grad[s0]*(1.0-eps)*-grad[cel] - // dot_st1_dst1 := dot(sm, grad[s0]) - // grad[s0] := grad[s0] - dot_st1_dst1 - // grad[s0] := grad[s0] * sm - } // soft_max ggml_float sum = 0.0; @@ -14520,36 +14483,34 @@ static void 
ggml_compute_forward_cross_entropy_loss_back_f32( uint16_t scvt; for (int i = 0; i < nc; i++) { if (s0[i] == -INFINITY) { - sm[i] = 0.0f; + ds0[i] = 0.0f; } else { - // const float val = (s0[i] == -INFINITY) ? 0.0 : exp(s0[i] - max); +#ifndef GGML_CROSS_ENTROPY_EXP_FP16 + const float s = s0[i] - max; + const float val = expf(s); +#else ggml_fp16_t s = GGML_FP32_TO_FP16(s0[i] - max); memcpy(&scvt, &s, sizeof(scvt)); const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt]); +#endif sum += (ggml_float)val; - sm[i] = val; + ds0[i] = val; } } assert(sum > 0.0); - sum = 1.0/sum; + sum = (1.0 - eps)/sum; } - float dot_st1_dst1 = 0; - ggml_vec_scale_f32(nc, sm, sum); - ggml_vec_cpy_f32 (nc, ds0, sm); - ggml_vec_scale_f32(nc, ds0, (1.0f - eps)); - ggml_vec_add1_f32 (nc, ds0, ds0, eps); - ggml_vec_div_f32 (nc, ds0, s1, ds0); - ggml_vec_scale_f32(nc, ds0, -(1.0f - eps)*d[0]); - ggml_vec_dot_f32 (nc, &dot_st1_dst1, sm, ds0); - ggml_vec_acc1_f32 (nc, ds0, -dot_st1_dst1); - ggml_vec_mul_f32 (nc, ds0, ds0, sm); + // grad(src0) = (softmax(src0) - src1) * grad(cross_entropy_loss(src0, src1)) / nr + ggml_vec_scale_f32(nc, ds0, sum); + ggml_vec_add1_f32(nc, ds0, ds0, eps); + ggml_vec_sub_f32(nc, ds0, ds0, s1); + ggml_vec_scale_f32(nc, ds0, d[0]); + #ifndef NDEBUG for (int i = 0; i < nc; ++i) { - assert(!isnan(sm[i])); - assert(!isinf(sm[i])); assert(!isnan(ds0[i])); assert(!isinf(ds0[i])); } @@ -16445,10 +16406,6 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) { case GGML_OP_CROSS_ENTROPY_LOSS_BACK: { n_tasks = n_threads; - - size_t cur = ggml_type_size(node->type)*node->src[0]->ne[0]*n_tasks; - - work_size = MAX(work_size, cur); } break; case GGML_OP_NONE: { From 2d1e6e06753a84b44a323995b68c52eabec1ba7a Mon Sep 17 00:00:00 2001 From: xaedes Date: Sun, 2 Jul 2023 20:57:58 +0200 Subject: [PATCH 012/100] fix test-grad0 for cross_entropy_loss the second argument to cross_entropy_loss must sum up to 1 for each row --- tests/test-grad0.c | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/tests/test-grad0.c b/tests/test-grad0.c index 6d312216d58af..dc19c1ad273a4 100644 --- a/tests/test-grad0.c +++ b/tests/test-grad0.c @@ -1358,15 +1358,26 @@ int main(int argc, const char ** argv) { int64_t ne2[4]; get_random_dims(ne2, 4); - for (int ndims = 1; ndims <= 3; ++ndims) { - x[0] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f); + for (int ndims = 1; ndims <= 4; ++ndims) { + x[0] = get_random_tensor_f32(ctx0, ndims, ne2, -0.1f, 0.1f); x[1] = get_random_tensor_f32(ctx0, ndims, ne2, 0.0f, 1.0f); + // the second argument to cross_entropy_loss must sum up to 1 for each row + int nr = ggml_nrows(x[1]); + int nc = ggml_nelements(x[1]) / nr; + for (int ir = 0; ir < nr; ++ir) { + float sum = 0; + for (int ic = 0; ic < nc; ++ic) { + sum += ((float *) x[1]->data)[ic + ir*nc]; + } + for (int ic = 0; ic < nc; ++ic) { + ((float *) x[1]->data)[ic + ir*nc] /= sum; + } + } ggml_set_param(ctx0, x[0]); - struct ggml_tensor * f = ggml_sum(ctx0, ggml_cross_entropy_loss(ctx0, x[0], x[1])); + struct ggml_tensor * f = ggml_cross_entropy_loss(ctx0, x[0], x[1]); - check_gradient("cross_entropy_loss", ctx0, x, f, ndims, nargs, 1e-1f, 1e-2f, INFINITY); - // finite differences regularly fails! 
+ check_gradient("cross_entropy_loss", ctx0, x, f, ndims, nargs, 1e-4f, 1e-1f, INFINITY); } } From 864e7e3aa1b08c4b2cd8cc2f17e0722fd019ffca Mon Sep 17 00:00:00 2001 From: xaedes Date: Sun, 2 Jul 2023 20:58:52 +0200 Subject: [PATCH 013/100] fix test-grad0 for soft_max dont use only sum as aggregation, because sum of softmax is always 1 -> finite differences should not work instead use sum(log(soft_max()*(1-eps)+eps)); use eps to avoid log(0) --- tests/test-grad0.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/tests/test-grad0.c b/tests/test-grad0.c index dc19c1ad273a4..edc7e2834c7f8 100644 --- a/tests/test-grad0.c +++ b/tests/test-grad0.c @@ -1345,9 +1345,18 @@ int main(int argc, const char ** argv) { x[0] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f); ggml_set_param(ctx0, x[0]); - struct ggml_tensor * f = ggml_sum(ctx0, ggml_soft_max(ctx0, x[0])); - - check_gradient("softmax", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY); + float eps = 1e-6f; + // dont use only sum as aggregation, because sum of softmax is always 1 -> finite differences should not work + // instead use sum(log(soft_max()*(1-eps)+eps)); use eps to avoid log(0) + struct ggml_tensor * f = ggml_sum(ctx0, + ggml_log(ctx0, + ggml_add1(ctx0, + ggml_scale(ctx0, + ggml_soft_max(ctx0, x[0]), + ggml_new_f32(ctx0, 1.0f - eps)), + ggml_new_f32(ctx0, eps)))); + + check_gradient("softmax", ctx0, x, f, ndims, nargs, 1e-3f, 2e-1f, INFINITY); } } From 87febeec91b9da387bf668dc10f83915d4bd19de Mon Sep 17 00:00:00 2001 From: xaedes Date: Sun, 2 Jul 2023 20:59:36 +0200 Subject: [PATCH 014/100] improve finite differences of test-grad0 by using double instead of float --- tests/test-grad0.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/test-grad0.c b/tests/test-grad0.c index edc7e2834c7f8..fe2ca212f82a0 100644 --- a/tests/test-grad0.c +++ b/tests/test-grad0.c @@ -275,14 +275,14 @@ bool check_gradient( ggml_graph_compute_with_ctx(ctx0, &gf, n_threads); - const float f0 = ggml_get_f32_1d(f, 0); + const double f0 = ggml_get_f32_1d(f, 0); ggml_set_f32_1d(x[i], k, xm); ggml_graph_compute_with_ctx(ctx0, &gf, n_threads); - const float f1 = ggml_get_f32_1d(f, 0); - const float g0 = (f0 - f1)/(2.0f*eps); + const double f1 = ggml_get_f32_1d(f, 0); + const double g0 = (f0 - f1)/(2.0*(double) eps); ggml_set_f32_1d(x[i], k, x0); @@ -292,10 +292,10 @@ bool check_gradient( ggml_graph_compute_with_ctx(ctx0, &gb, n_threads); - const float g1 = ggml_get_f32_1d(x[i]->grad, k); + const double g1 = ggml_get_f32_1d(x[i]->grad, k); - const float error_abs = fabsf(g0 - g1); - const float error_rel = g0 != 0 ? fabsf(g0 - g1)/fabsf(g0) : 0; + const double error_abs = fabs(g0 - g1); + const double error_rel = g0 != 0 ? 
fabs(g0 - g1)/fabs(g0) : 0; if (error_abs > max_error_abs || error_rel > max_error_rel) { printf("%s: ndims=%d, i=%d, k=%d, x0=%f, xm=%f, xp=%f, f0=%f, f1=%f, g0=%f, g1=%f, eps=%f, error_abs=%f, error_rel=%f\n", From 51dc77092fa0aaaf832dbfda46058a413521b8a9 Mon Sep 17 00:00:00 2001 From: xaedes Date: Sun, 2 Jul 2023 21:05:12 +0200 Subject: [PATCH 015/100] change cross_entropy_loss to output average over all rows this helps keep the loss and gradients in a sane range --- ggml.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ggml.c b/ggml.c index d718de33b044f..07d100bf070d3 100644 --- a/ggml.c +++ b/ggml.c @@ -14334,7 +14334,7 @@ static void ggml_compute_forward_cross_entropy_loss_f32( if (ith == 0) { float * dp = (float *) dst->data; ggml_vec_sum_f32(nth, dp, sums); - dp[0] *= -1.0f; + dp[0] *= -1.0f / (float) nr; } return; } @@ -14506,7 +14506,7 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32( ggml_vec_scale_f32(nc, ds0, sum); ggml_vec_add1_f32(nc, ds0, ds0, eps); ggml_vec_sub_f32(nc, ds0, ds0, s1); - ggml_vec_scale_f32(nc, ds0, d[0]); + ggml_vec_scale_f32(nc, ds0, d[0] / (float) nr); #ifndef NDEBUG From 3744a9be74b27c758b06ea2bdf8ee97046e2b196 Mon Sep 17 00:00:00 2001 From: xaedes Date: Sun, 2 Jul 2023 21:11:11 +0200 Subject: [PATCH 016/100] improve gradient checkpointing sqrt(n_layers) is only the best checkpoint step when mem size of checkpoints and mem size of layers are equal. since layers require more memory than the single-tensor-checkpoint we use, the optimal values are computed differently: ``` given: n, u, v objective: minimize(a*u+b*v) where a*b=n, a>0, b>0 b=n/a minimize(a*u+v*n/a) diff(a*u+v*n/a, a) = u - (v*n/a)/a diff(a*u+v*n/a, a) == 0 u - (v*n/a)/a == 0 u == v*n/(a*a) u*a*a = v*n a*a = v*n/u a = sqrt(n*v/u) ``` this change results in more checkpoints, requiring fewer layers to be stored between checkpoints, overall improving memory usage.
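(A small sketch of the checkpoint step this derivation yields; the function name and the example shapes are illustrative, not from the patch. The diff below plugs in memcost_checkpoint = n_embd and memcost_snd_fwd_pass = 14*n_embd + 4*n_ff, both in units of N*n_batch floats.)

```c
// illustrative sketch, not part of the patch:
// a = sqrt(n*v/u), with u = memory to recompute one layer in the second forward pass,
//                       v = memory to store one checkpoint tensor, n = number of layers
#include <math.h>

static int checkpoint_step(int n_layer, float v_checkpoint, float u_recompute) {
    int step = (int)(sqrtf(n_layer * v_checkpoint / u_recompute) + 0.5f);
    return step < 1 ? 1 : step;
}

// e.g. for hypothetical shapes n_embd=4096, n_ff=11008, n_layer=32:
//   checkpoint_step(32, 4096.0f, 14*4096.0f + 4*11008.0f) == 1
// i.e. a checkpoint after every layer, whereas sqrt(n_layer) would give a step of 6.
```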
--- .../train-text-from-scratch.cpp | 39 +++++++++++++------ 1 file changed, 28 insertions(+), 11 deletions(-) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index 9ee255f4e05c2..ae3f79c63bb95 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -2090,22 +2090,39 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train_grad_checkpointing( struct ggml_tensor * t01 = expand(gf, ggml_get_rows(ctx0, model->tok_embeddings, t00)); assert_shape_2d(t01, n_embd, N*n_batch); + { + // given: n, u, v + // objective: minimize(a*u+b*v) where a*b=n, a>0, b>0 + // b=n/a + // minimize(a*u+v*n/a) + // diff(a*u+v*n/a, a) = u - (v*n/a)/a + // diff(a*u+v*n/a, a) == 0 + // u - (v*n/a)/a == 0 + // u == v*n/(a*a) + // u*a*a = v*n + // a*a = v*n/u + // a = sqrt(n*v/u) + } + + float memcost_checkpoint = n_embd; // (..)*N*n_batch + float memcost_snd_fwd_pass = 14*n_embd+4*n_ff; // (..)*N*n_batch + + int n_checkstep = (int)(sqrtf(n_layer*memcost_checkpoint/memcost_snd_fwd_pass) + 0.5f); + if (n_checkstep < 1) { + n_checkstep = 1; + } std::vector checkpoints; - // for (int il = 0; il < n_layer; ++il) { - // checkpoints.push_back(il); - // } - // n_check: number of layers between checkpoints - int n_check = (int)(sqrtf(n_layer) + 0.5f); - printf("%s: n_check = %d\n", __func__, n_check); - for (int chk = n_check-1; chk+1 < n_layer; chk += n_check) { + for (int chk = n_checkstep-1; chk+1 < n_layer; chk += n_checkstep) { checkpoints.push_back(chk); } + int n_check = checkpoints.size(); + // printf("%s: n_check = %d n_checkstep = %d\n", __func__, n_check, n_checkstep); - for (int i = 0; i < checkpoints.size(); ++i) { - printf("%s: checkpoint #%d = %d\n", __func__, i, checkpoints[i]); - } + // for (int i = 0; i < n_check; ++i) { + // printf("%s: checkpoint #%d = %d\n", __func__, i, checkpoints[i]); + // } - // example for 16 layers: + // example for 16 layers and memcost_checkpoint=memcost_snd_fwd_pass: // inp ~ implicit zeroth checkpoint == input // L00 f 4b [ // L01 f 4b 4th second forward pass From fc379a2de36d45f5fcc12410dcc4eb468e294f8e Mon Sep 17 00:00:00 2001 From: xaedes Date: Sun, 2 Jul 2023 21:12:25 +0200 Subject: [PATCH 017/100] disable gradient checkpointing debug output --- .../train-text-from-scratch.cpp | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index ae3f79c63bb95..08821d4129ef1 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -2179,9 +2179,9 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train_grad_checkpointing( struct my_llama_layer & layer = model->layers[il]; // tensors with values necessary for backward pass are in persistent buf(-1) // other tensors with buf(0), buf(1), etc are only temporary needed, and their memory reused - bool is_checkpoint = (chk_idx < checkpoints.size() && il == checkpoints[chk_idx]); + bool is_checkpoint = (chk_idx < n_check && il == checkpoints[chk_idx]); if (is_checkpoint) { - printf("%s: layer %d is_checkpoint\n", __func__, il); + // printf("%s: layer %d is_checkpoint\n", __func__, il); chk_idx += 1; } const int prs = 0; // in first forward pass even persistent tensors are only temporary @@ -2263,11 +2263,11 @@ struct 
ggml_tensor * forward_batch_wo_cache_flash_attn_train_grad_checkpointing( struct ggml_tensor * back_layer_inp = t31; struct ggml_tensor * grad_layer_inp = NULL; - printf("%s: checkpoints.size() = %zu\n", __func__, checkpoints.size()); - chk_idx = checkpoints.size()-1; + // printf("%s: n_check = %u\n", __func__, n_check); + chk_idx = n_check-1; int avail_begin = n_layer; int avail_end = n_layer; - printf("%s: chk_idx=%d avail_begin=%d avail_end=%d\n", __func__, chk_idx, avail_begin, avail_end); + // printf("%s: chk_idx=%d avail_begin=%d avail_end=%d\n", __func__, chk_idx, avail_begin, avail_end); for (int k = 0; k < n_layer; ++k) { // second forward pass for checkpointing int il = n_layer-1-k; @@ -2278,14 +2278,14 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train_grad_checkpointing( int begin = (chk_idx == -1) ? 0 : checkpoints[chk_idx] + 1; // checkpoint[chk_idx] contains t30 for computing following layers -> +1 - int end = (chk_idx+1 < checkpoints.size()) + int end = (chk_idx+1 < n_check) ? (checkpoints[chk_idx+1] + 1) : n_layer; GGML_ASSERT(begin <= il); GGML_ASSERT(il < end); cur = (chk_idx == -1) ? t01 : t30L[checkpoints[chk_idx]]; clr_buf(2); - printf("%s: second forward pass chk_idx=%d begin=%d end=%d\n", __func__, chk_idx, begin, end); + // printf("%s: second forward pass chk_idx=%d begin=%d end=%d\n", __func__, chk_idx, begin, end); for (int i = begin; i < end; ++i) { struct my_llama_layer & layer = model->layers[i]; const int prs = 2; // persistent until next checkpoint @@ -2357,9 +2357,9 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train_grad_checkpointing( --chk_idx; avail_begin = begin; avail_end = end; - printf("%s: chk_idx=%d avail_begin=%d avail_end=%d\n", __func__, chk_idx, avail_begin, avail_end); + // printf("%s: chk_idx=%d avail_begin=%d avail_end=%d\n", __func__, chk_idx, avail_begin, avail_end); } - printf("%s: backward pass il=%d\n", __func__, il); + // printf("%s: backward pass il=%d\n", __func__, il); struct my_llama_layer & layer = model->layers[il]; @@ -2452,7 +2452,7 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train_grad_checkpointing( layer.w2->grad = expand(gb, add_or_set(layer.w2->grad, ggml_out_prod(ctx0, t28, t29->grad))); assert_shape_2d(layer.w2->grad, n_ff, n_embd); layer.w3->grad = expand(gb, add_or_set(layer.w3->grad, ggml_out_prod(ctx0, t24, t25->grad))); assert_shape_2d(layer.w3->grad, n_embd, n_ff); } - printf("%s: chk_idx=%d avail_begin=%d avail_end=%d\n", __func__, chk_idx, avail_begin, avail_end); + // printf("%s: chk_idx=%d avail_begin=%d avail_end=%d\n", __func__, chk_idx, avail_begin, avail_end); GGML_ASSERT(chk_idx == -2); GGML_ASSERT(avail_begin == 0); clr_buf(0); From d0fbb7d328d16f11a7ee229d08db81a46ca92bf0 Mon Sep 17 00:00:00 2001 From: xaedes Date: Fri, 28 Jul 2023 23:05:02 +0200 Subject: [PATCH 018/100] llama : fix rope usage in train-text-from-scratch after ChatGLM change --- .../train-text-from-scratch.cpp | 38 +++++++++---------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index 08821d4129ef1..b597fc82979d3 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -456,8 +456,8 @@ struct ggml_tensor * forward( // wk shape [n_embd, n_embd, 1, 1] // Qcur shape [n_embd/n_head, n_head, N, 1] // Kcur shape [n_embd/n_head, n_head, N, 1] - struct ggml_tensor * Qcur = 
ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0, 0); - struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0, 0); + struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0, n_ctx); + struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0, n_ctx); // store key and value to memory { @@ -713,8 +713,8 @@ struct ggml_tensor * forward_batch( // wk shape [n_embd, n_embd, 1, 1] // Qcur shape [n_embd/n_head, n_head, N, n_batch] // Kcur shape [n_embd/n_head, n_head, N, n_batch] - struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0, 0); - struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0, 0); + struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0, n_ctx); + struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0, n_ctx); assert_shape_4d(Qcur, n_embd/n_head, n_head, N, n_batch); assert_shape_4d(Kcur, n_embd/n_head, n_head, N, n_batch); @@ -957,7 +957,7 @@ struct ggml_tensor * forward_batch_wo_cache( const int N = n_tokens; const auto & hparams = model->hparams; - //const int n_ctx = hparams.n_ctx; + const int n_ctx = hparams.n_ctx; const int n_vocab = hparams.n_vocab; const int n_embd = hparams.n_embd; const int n_layer = hparams.n_layer; @@ -998,8 +998,8 @@ struct ggml_tensor * forward_batch_wo_cache( // wk shape [n_embd, n_embd, 1, 1] // Qcur shape [n_embd/n_head, n_head, N, n_batch] // Kcur shape [n_embd/n_head, n_head, N, n_batch] - struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0, 0); - struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0, 0); + struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0, n_ctx); + struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0, n_ctx); assert_shape_4d(Qcur, n_embd/n_head, n_head, N, n_batch); assert_shape_4d(Kcur, n_embd/n_head, n_head, N, n_batch); @@ -1185,7 +1185,7 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn( const int N = n_tokens; const auto & hparams = model->hparams; - //const int n_ctx = hparams.n_ctx; + const int n_ctx = hparams.n_ctx; const int n_vocab = hparams.n_vocab; const int n_embd = hparams.n_embd; const int n_layer = hparams.n_layer; @@ -1220,8 +1220,8 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn( // compute Q and K and RoPE them // wq shape [n_embd, 
n_embd, 1, 1] // wk shape [n_embd, n_embd, 1, 1] - struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0, 0); - struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0, 0); + struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0, n_ctx); + struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0, n_ctx); assert_shape_4d(Qcur, n_embd/n_head, n_head, N, n_batch); assert_shape_4d(Kcur, n_embd/n_head, n_head, N, n_batch); @@ -1613,10 +1613,10 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train( use_buf(-1); struct ggml_tensor * t04 = expand(gf, ggml_mul (ctx0, t02, t03)); assert_shape_2d(t04, n_embd, N*n_batch); use_buf(-1); struct ggml_tensor * t05 = expand(gf, ggml_mul_mat (ctx0, layer.wq, t04)); assert_shape_2d(t05, n_embd, N*n_batch); use_buf(-1); struct ggml_tensor * t06 = expand(gf, ggml_reshape_4d (ctx0, t05, n_embd/n_head, n_head, N, n_batch)); assert_shape_4d(t06, n_embd/n_head, n_head, N, n_batch); - use_buf(-1); struct ggml_tensor * t07 = expand(gf, ggml_rope_inplace (ctx0, t06, n_past, n_rot, rope_mode, 0)); assert_shape_4d(t07, n_embd/n_head, n_head, N, n_batch); + use_buf(-1); struct ggml_tensor * t07 = expand(gf, ggml_rope_inplace (ctx0, t06, n_past, n_rot, rope_mode, n_ctx)); assert_shape_4d(t07, n_embd/n_head, n_head, N, n_batch); use_buf(-1); struct ggml_tensor * t08 = expand(gf, ggml_mul_mat (ctx0, layer.wk, t04)); assert_shape_2d(t08, n_embd, N*n_batch); use_buf(-1); struct ggml_tensor * t09 = expand(gf, ggml_reshape_4d (ctx0, t08, n_embd/n_head, n_head, N, n_batch)); assert_shape_4d(t09, n_embd/n_head, n_head, N, n_batch); - use_buf(-1); struct ggml_tensor * t10 = expand(gf, ggml_rope_inplace (ctx0, t09, n_past, n_rot, rope_mode, 0)); assert_shape_4d(t10, n_embd/n_head, n_head, N, n_batch); + use_buf(-1); struct ggml_tensor * t10 = expand(gf, ggml_rope_inplace (ctx0, t09, n_past, n_rot, rope_mode, n_ctx)); assert_shape_4d(t10, n_embd/n_head, n_head, N, n_batch); use_buf(-1); struct ggml_tensor * t11 = expand(gf, ggml_mul_mat (ctx0, t04, layer.wv)); assert_shape_2d(t11, N*n_batch, n_embd); use_buf(-1); struct ggml_tensor * t12 = expand(gf, ggml_reshape_4d (ctx0, t11, N, n_batch, n_embd/n_head, n_head)); assert_shape_4d(t12, N, n_batch, n_embd/n_head, n_head); use_buf(-1); struct ggml_tensor * t13 = expand(gf, ggml_permute (ctx0, t07, 0, 2, 1, 3)); assert_shape_4d(t13, n_embd/n_head, N, n_head, n_batch); @@ -1952,7 +1952,7 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train_grad_checkpointing( gf->perf_time_us = 0; const auto & hparams = model->hparams; - //const int n_ctx = hparams.n_ctx; + const int n_ctx = hparams.n_ctx; const int n_vocab = hparams.n_vocab; const int n_embd = hparams.n_embd; const int n_layer = hparams.n_layer; @@ -2196,10 +2196,10 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train_grad_checkpointing( use_buf(prs); struct ggml_tensor * t04 = expand(gf, ggml_mul (ctx0, t02, t03)); assert_shape_2d(t04, n_embd, N*n_batch); use_buf(prs); struct ggml_tensor * t05 = expand(gf, ggml_mul_mat (ctx0, layer.wq, t04)); assert_shape_2d(t05, n_embd, N*n_batch); 
use_buf(prs); struct ggml_tensor * t06 = expand(gf, ggml_reshape_4d (ctx0, t05, n_embd/n_head, n_head, N, n_batch)); assert_shape_4d(t06, n_embd/n_head, n_head, N, n_batch); - use_buf(prs); struct ggml_tensor * t07 = expand(gf, ggml_rope_inplace (ctx0, t06, n_past, n_rot, rope_mode)); assert_shape_4d(t07, n_embd/n_head, n_head, N, n_batch); + use_buf(prs); struct ggml_tensor * t07 = expand(gf, ggml_rope_inplace (ctx0, t06, n_past, n_rot, rope_mode, n_ctx)); assert_shape_4d(t07, n_embd/n_head, n_head, N, n_batch); use_buf(prs); struct ggml_tensor * t08 = expand(gf, ggml_mul_mat (ctx0, layer.wk, t04)); assert_shape_2d(t08, n_embd, N*n_batch); use_buf(prs); struct ggml_tensor * t09 = expand(gf, ggml_reshape_4d (ctx0, t08, n_embd/n_head, n_head, N, n_batch)); assert_shape_4d(t09, n_embd/n_head, n_head, N, n_batch); - use_buf(prs); struct ggml_tensor * t10 = expand(gf, ggml_rope_inplace (ctx0, t09, n_past, n_rot, rope_mode)); assert_shape_4d(t10, n_embd/n_head, n_head, N, n_batch); + use_buf(prs); struct ggml_tensor * t10 = expand(gf, ggml_rope_inplace (ctx0, t09, n_past, n_rot, rope_mode, n_ctx)); assert_shape_4d(t10, n_embd/n_head, n_head, N, n_batch); use_buf(prs); struct ggml_tensor * t11 = expand(gf, ggml_mul_mat (ctx0, t04, layer.wv)); assert_shape_2d(t11, N*n_batch, n_embd); use_buf(prs); struct ggml_tensor * t12 = expand(gf, ggml_reshape_4d (ctx0, t11, N, n_batch, n_embd/n_head, n_head)); assert_shape_4d(t12, N, n_batch, n_embd/n_head, n_head); use_buf(prs); struct ggml_tensor * t13 = expand(gf, ggml_permute (ctx0, t07, 0, 2, 1, 3)); assert_shape_4d(t13, n_embd/n_head, N, n_head, n_batch); @@ -2297,10 +2297,10 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train_grad_checkpointing( use_buf(prs); struct ggml_tensor * t04 = expand(gb, ggml_mul (ctx0, t02, t03)); assert_shape_2d(t04, n_embd, N*n_batch); use_buf(prs); struct ggml_tensor * t05 = expand(gb, ggml_mul_mat (ctx0, layer.wq, t04)); assert_shape_2d(t05, n_embd, N*n_batch); use_buf(prs); struct ggml_tensor * t06 = expand(gb, ggml_reshape_4d (ctx0, t05, n_embd/n_head, n_head, N, n_batch)); assert_shape_4d(t06, n_embd/n_head, n_head, N, n_batch); - use_buf(prs); struct ggml_tensor * t07 = expand(gb, ggml_rope_inplace (ctx0, t06, n_past, n_rot, rope_mode)); assert_shape_4d(t07, n_embd/n_head, n_head, N, n_batch); + use_buf(prs); struct ggml_tensor * t07 = expand(gb, ggml_rope_inplace (ctx0, t06, n_past, n_rot, rope_mode, n_ctx)); assert_shape_4d(t07, n_embd/n_head, n_head, N, n_batch); use_buf(prs); struct ggml_tensor * t08 = expand(gb, ggml_mul_mat (ctx0, layer.wk, t04)); assert_shape_2d(t08, n_embd, N*n_batch); use_buf(prs); struct ggml_tensor * t09 = expand(gb, ggml_reshape_4d (ctx0, t08, n_embd/n_head, n_head, N, n_batch)); assert_shape_4d(t09, n_embd/n_head, n_head, N, n_batch); - use_buf(prs); struct ggml_tensor * t10 = expand(gb, ggml_rope_inplace (ctx0, t09, n_past, n_rot, rope_mode)); assert_shape_4d(t10, n_embd/n_head, n_head, N, n_batch); + use_buf(prs); struct ggml_tensor * t10 = expand(gb, ggml_rope_inplace (ctx0, t09, n_past, n_rot, rope_mode, n_ctx)); assert_shape_4d(t10, n_embd/n_head, n_head, N, n_batch); use_buf(prs); struct ggml_tensor * t11 = expand(gb, ggml_mul_mat (ctx0, t04, layer.wv)); assert_shape_2d(t11, N*n_batch, n_embd); use_buf(prs); struct ggml_tensor * t12 = expand(gb, ggml_reshape_4d (ctx0, t11, N, n_batch, n_embd/n_head, n_head)); assert_shape_4d(t12, N, n_batch, n_embd/n_head, n_head); use_buf(prs); struct ggml_tensor * t13 = expand(gb, ggml_permute (ctx0, t07, 0, 2, 1, 3)); 
assert_shape_4d(t13, n_embd/n_head, N, n_head, n_batch); @@ -2426,10 +2426,10 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train_grad_checkpointing( t12->grad = expand(gb, ggml_permute(ctx0, t15->grad, 0, 2, 3, 1)); assert_shape_4d(t12->grad, N, n_batch, n_embd/n_head, n_head); t11->grad = expand(gb, ggml_reshape_2d(ctx0, ggml_cont(ctx0, t12->grad), N*n_batch, n_embd)); assert_shape_2d(t11->grad, N*n_batch, n_embd); t10->grad = expand(gb, ggml_permute(ctx0, t14->grad, 0, 2, 1, 3)); assert_shape_4d(t10->grad, n_embd/n_head, n_head, N, n_batch); - t09->grad = expand(gb, ggml_rope_back(ctx0, t10->grad, n_past, n_rot, rope_mode)); assert_shape_4d(t09->grad, n_embd/n_head, n_head, N, n_batch); + t09->grad = expand(gb, ggml_rope_back(ctx0, t10->grad, n_past, n_rot, rope_mode, n_ctx)); assert_shape_4d(t09->grad, n_embd/n_head, n_head, N, n_batch); t08->grad = expand(gb, ggml_reshape_2d(ctx0, t09->grad, n_embd, N*n_batch)); assert_shape_2d(t08->grad, n_embd, N*n_batch); t07->grad = expand(gb, ggml_permute(ctx0, t13->grad, 0, 2, 1, 3)); assert_shape_4d(t07->grad, n_embd/n_head, n_head, N, n_batch); - t06->grad = expand(gb, ggml_rope_back(ctx0, t07->grad, n_past, n_rot, rope_mode)); assert_shape_4d(t06->grad, n_embd/n_head, n_head, N, n_batch); + t06->grad = expand(gb, ggml_rope_back(ctx0, t07->grad, n_past, n_rot, rope_mode, n_ctx)); assert_shape_4d(t06->grad, n_embd/n_head, n_head, N, n_batch); t05->grad = expand(gb, ggml_reshape_2d(ctx0, t06->grad, n_embd, N*n_batch)); assert_shape_2d(t05->grad, n_embd, N*n_batch); t04->grad = expand(gb, ggml_add_inplace(ctx0, ggml_add_inplace(ctx0, From c6a18e15c1f255f06ce03fee200f7e7c710989e8 Mon Sep 17 00:00:00 2001 From: xaedes Date: Sun, 2 Jul 2023 21:33:47 +0200 Subject: [PATCH 019/100] add more training parameters: --enable-restart N Only for Adam optimizer. Enable restarts of cos-decay --disable-restart N Only for Adam optimizer. Disable restarts of cos-decay --opt-past N Number of optimization iterations to track for delta convergence test. Disabled when zero. --opt-delta N Maximum delta for delta convergence test. Disabled when <= zero. --opt-max-no-improvement N Maximum number of optimization iterations with no improvement. Disabled when <= zero. --adam-epsf N AdamW epsilon for convergence test. Disabled when <= zero. 
--adam-min-alpha N Adam minimum learning rate alpha, usually 0.1 * alpha --- .../train-text-from-scratch.cpp | 110 ++++++++++++++---- 1 file changed, 89 insertions(+), 21 deletions(-) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index b597fc82979d3..f6e146b8091ae 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -3333,10 +3333,12 @@ float cosine_decay(const int decay_steps, const float alpha, int step) { return decay; } -float cosine_decay_restart(int decay_steps, const float alpha, int step, float restart_step_mult) { - while (step > decay_steps) { - step -= decay_steps; - decay_steps = (int) restart_step_mult * decay_steps; +float cosine_decay_restart(int decay_steps, const float alpha, int step, float restart_step_mult, bool enable_restart) { + if (enable_restart) { + while (step > decay_steps) { + step -= decay_steps; + decay_steps = (int) restart_step_mult * decay_steps; + } } return cosine_decay(decay_steps, alpha, step); } @@ -3376,14 +3378,21 @@ struct train_params { int cos_decay_steps; float cos_decay_restart; float cos_decay_alpha; + bool enable_restart; + + int opt_past; + float opt_delta; + int opt_max_no_improvement; int lbfgs_n_iter; int adam_n_iter; float adam_alpha; + float adam_min_alpha; float adam_decay; float adam_beta1; float adam_beta2; float adam_gclip; + float adam_eps_f; int mem_model_gb; int mem_compute_gb; @@ -3424,19 +3433,26 @@ struct train_params get_default_train_params() { params.use_scratch = true; params.use_checkpointing = true; + params.opt_past = 0; + params.opt_delta = 1e-5f; + params.opt_max_no_improvement = 0; + // only adam params.warmup = 100; params.cos_decay_steps = 1000; params.cos_decay_restart = 1.1f; params.cos_decay_alpha = 0.0f; + params.enable_restart = false; params.lbfgs_n_iter = 16; params.adam_n_iter = 16; params.adam_alpha = 1e-3f; + params.adam_min_alpha = 1e-4f; params.adam_decay = 1e-1f; params.adam_beta1 = 0.9f; params.adam_beta2 = 0.999f; params.adam_gclip = 1.0f; + params.adam_eps_f = 0.0f; params.mem_model_gb = 2; params.mem_compute_gb = 24; @@ -3482,13 +3498,20 @@ void train_print_usage(int /*argc*/, char ** argv, const struct train_params * p fprintf(stderr, " --cos-decay-steps N Only for Adam optimizer. Number of cosine decay steps (default %d)\n", params->cos_decay_steps); fprintf(stderr, " --cos-decay-restart N Only for Adam optimizer. Increase of cosine decay steps after restart (default %f)\n", params->cos_decay_restart); fprintf(stderr, " --cos-decay-alpha N Only for Adam optimizer. Cosine decay alpha (default %f)\n", params->cos_decay_alpha); - fprintf(stderr, " --lbfgs-iter N Maximum number of LBFGS optimization iterations for each batch (default %d)\n", params->lbfgs_n_iter); + fprintf(stderr, " --enable-restart N Only for Adam optimizer. Enable restarts of cos-decay %s\n", params->enable_restart ? "(default)" : ""); + fprintf(stderr, " --disable-restart N Only for Adam optimizer. Disable restarts of cos-decay %s\n", !params->enable_restart ? "(default)" : ""); + fprintf(stderr, " --opt-past N Number of optimization iterations to track for delta convergence test. Disabled when zero. (default %d)\n", params->opt_past); + fprintf(stderr, " --opt-delta N Maximum delta for delta convergence test. Disabled when <= zero. 
(default %f)\n", params->opt_delta); + fprintf(stderr, " --opt-max-no-improvement N Maximum number of optimization iterations with no improvement. Disabled when <= zero. (default %d)\n", params->opt_max_no_improvement); + fprintf(stderr, " --adam-epsf N AdamW epsilon for convergence test. Disabled when <= zero. (default %f)\n", params->adam_eps_f); fprintf(stderr, " --adam-iter N Maximum number of Adam optimization iterations for each batch (default %d)\n", params->adam_n_iter); fprintf(stderr, " --adam-alpha N Adam learning rate alpha (default %f)\n", params->adam_alpha); + fprintf(stderr, " --adam-min-alpha N Adam minimum learning rate alpha, usually 0.1 * alpha (default %f)\n", params->adam_min_alpha); fprintf(stderr, " --adam-decay N AdamW weight decay. Values greater zero enable AdamW instead of regular Adam. (default %f)\n", params->adam_decay); fprintf(stderr, " --adam-beta1 N AdamW beta1 in interval [0,1). How much to smooth the first moment of gradients. (default %f)\n", params->adam_beta1); fprintf(stderr, " --adam-beta2 N AdamW beta2 in interval [0,1). How much to smooth the second moment of gradients. (default %f)\n", params->adam_beta2); fprintf(stderr, " --adam-gclip N AdamW gradient clipping. Disabled when zero. (default %f)\n", params->adam_gclip); + fprintf(stderr, " --lbfgs-iter N Maximum number of LBFGS optimization iterations for each batch (default %d)\n", params->lbfgs_n_iter); fprintf(stderr, " --mem-model N Memory to allocate for model and cache in gigabytes. (default %d)\n", params->mem_model_gb); fprintf(stderr, " --mem-compute N Memory to allocate for compute in gigabytes. (default %d)\n", params->mem_compute_gb); fprintf(stderr, " --mem-compute0 N Memory to allocate for compute in gigabytes. (default %d)\n", params->mem_compute0_gb); @@ -3659,12 +3682,34 @@ bool train_params_parse(int argc, char ** argv, struct train_params * params) { break; } params->cos_decay_alpha = std::stof(argv[i]); - } else if (arg == "--lbfgs-iter") { + } else if (arg == "--enable-restart") { + params->enable_restart = true; + } else if (arg == "--disable-restart") { + params->enable_restart = false; + } else if (arg == "--opt-past") { if (++i >= argc) { invalid_param = true; break; } - params->lbfgs_n_iter = std::stoi(argv[i]); + params->opt_past = std::stoi(argv[i]); + } else if (arg == "--opt-delta") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->opt_delta = std::stof(argv[i]); + } else if (arg == "--opt-max-no-improvement") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->opt_max_no_improvement = std::stoi(argv[i]); + } else if (arg == "--adam-epsf") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->adam_eps_f = std::stof(argv[i]); } else if (arg == "--adam-iter") { if (++i >= argc) { invalid_param = true; @@ -3677,6 +3722,12 @@ bool train_params_parse(int argc, char ** argv, struct train_params * params) { break; } params->adam_alpha = std::stof(argv[i]); + } else if (arg == "--adam-min-alpha") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->adam_min_alpha = std::stof(argv[i]); } else if (arg == "--adam-decay") { if (++i >= argc) { invalid_param = true; @@ -3701,6 +3752,12 @@ bool train_params_parse(int argc, char ** argv, struct train_params * params) { break; } params->adam_gclip = std::stof(argv[i]); + } else if (arg == "--lbfgs-iter") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->lbfgs_n_iter = std::stoi(argv[i]); } else if (arg == "--mem-model") { if (++i >= argc) { 
invalid_param = true; @@ -3846,21 +3903,28 @@ int main(int argc, char ** argv) { struct ggml_opt_params opt_params_adam = ggml_opt_default_params(GGML_OPT_ADAM); struct ggml_opt_params opt_params_lbfgs = ggml_opt_default_params(GGML_OPT_LBFGS); - opt_params_adam.print_forward_graph = false; + opt_params_adam.print_forward_graph = false; opt_params_adam.print_backward_graph = false; - opt_params_adam.n_threads = params.n_threads; - opt_params_adam.adam.n_iter = params.adam_n_iter; - opt_params_adam.adam.sched = 1.0f; - opt_params_adam.adam.alpha = params.adam_alpha; - opt_params_adam.adam.decay = params.adam_decay; - opt_params_adam.adam.beta1 = params.adam_beta1; - opt_params_adam.adam.beta2 = params.adam_beta2; - opt_params_adam.adam.gclip = params.adam_gclip; - - opt_params_lbfgs.print_forward_graph = false; + opt_params_adam.n_threads = params.n_threads; + opt_params_adam.past = params.opt_past; + opt_params_adam.delta = params.opt_delta; + opt_params_adam.max_no_improvement = params.opt_max_no_improvement; + opt_params_adam.adam.n_iter = params.adam_n_iter; + opt_params_adam.adam.sched = 1.0f; + opt_params_adam.adam.alpha = params.adam_alpha; + opt_params_adam.adam.decay = params.adam_decay; + opt_params_adam.adam.beta1 = params.adam_beta1; + opt_params_adam.adam.beta2 = params.adam_beta2; + opt_params_adam.adam.gclip = params.adam_gclip; + opt_params_adam.adam.eps_f = params.adam_eps_f; + + opt_params_lbfgs.print_forward_graph = false; opt_params_lbfgs.print_backward_graph = false; - opt_params_lbfgs.n_threads = params.n_threads; - opt_params_lbfgs.lbfgs.n_iter = params.lbfgs_n_iter; + opt_params_lbfgs.n_threads = params.n_threads; + opt_params_adam.past = params.opt_past; + opt_params_adam.delta = params.opt_delta; + opt_params_adam.max_no_improvement = params.opt_max_no_improvement; + opt_params_lbfgs.lbfgs.n_iter = params.lbfgs_n_iter; opt->ctx = model.ctx; opt->params = params.use_adam ? 
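// ---- illustrative sketch (not part of the patch) ----
// How the pieces of the learning-rate schedule introduced here fit together: linear
// warmup, then cosine decay (optionally with restarts), then a floor derived from
// --adam-min-alpha. Uses cosine_decay_restart() from this file; the constants below are
// just example values mirroring the defaults above.
static float example_sched(int iter) {
    const int   warmup            = 100;
    const int   cos_decay_steps   = 1000;
    const float cos_decay_alpha   = 0.0f;    // value the cosine decays towards
    const float restart_step_mult = 1.1f;
    const bool  enable_restart    = false;
    const float adam_alpha        = 1e-3f;
    const float adam_min_alpha    = 1e-4f;

    float sched = (iter < warmup)
        ? (float) iter / (float) warmup
        : cosine_decay_restart(cos_decay_steps, cos_decay_alpha,
                               iter - warmup, restart_step_mult, enable_restart);

    // clamp the multiplier so the effective rate never drops below adam_min_alpha
    const float min_sched = adam_min_alpha / adam_alpha;   // here: 0.1
    sched = min_sched + sched * (1.0f - min_sched);
    return sched;                                          // effective lr = sched * adam_alpha
}
// ---- end of sketch ----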
opt_params_adam : opt_params_lbfgs; @@ -3996,7 +4060,11 @@ int main(int argc, char ** argv) { params.cos_decay_steps, params.cos_decay_alpha, opt->iter - params.warmup, - params.cos_decay_restart); + params.cos_decay_restart, + params.enable_restart); + + float min_sched = params.adam_min_alpha / params.adam_alpha; + opt->params.adam.sched = min_sched + opt->params.adam.sched * (1.0f - min_sched); printf("%s: opt->params.adam.sched %.5f\n", __func__, opt->params.adam.sched); From ce937bc431f7ac88f5e5b0bab2475bcc673369ca Mon Sep 17 00:00:00 2001 From: xaedes Date: Sun, 2 Jul 2023 21:36:56 +0200 Subject: [PATCH 020/100] replace memcpy with reshape operation so that the graph is not cut at the input this makes it possible to store other values into the input tensor and then simply recompute the graph without rebuilding it --- .../train-text-from-scratch.cpp | 21 ++++++++++--------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index f6e146b8091ae..db7a528426f30 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -965,8 +965,8 @@ struct ggml_tensor * forward_batch_wo_cache( const int n_rot = hparams.n_rot; const int n_ff = get_n_ff(&hparams); - struct ggml_tensor * tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N*n_batch); - memcpy(tokens->data, tokens_input->data, ggml_element_size(tokens)*N*n_batch); + GGML_ASSERT(tokens_input->type == GGML_TYPE_I32); + struct ggml_tensor * tokens = ggml_reshape_1d(ctx0, tokens_input, N*n_batch); // inpL shape [n_embd,N*n_batch,1] struct ggml_tensor * inpL = ggml_get_rows(ctx0, model->tok_embeddings, tokens); @@ -1168,7 +1168,7 @@ struct ggml_tensor * forward_batch_wo_cache( } // run the computation - ggml_build_forward_expand(gf, inpL); + // ggml_build_forward_expand(gf, inpL); return inpL; } @@ -1193,8 +1193,9 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn( const int n_rot = hparams.n_rot; const int n_ff = get_n_ff(&hparams); - struct ggml_tensor * tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N*n_batch); - memcpy(tokens->data, tokens_input->data, ggml_element_size(tokens)*N*n_batch); + + GGML_ASSERT(tokens_input->type == GGML_TYPE_I32); + struct ggml_tensor * tokens = ggml_reshape_1d(ctx0, tokens_input, N*n_batch); struct ggml_tensor * inpL = ggml_get_rows(ctx0, model->tok_embeddings, tokens); assert_shape_2d(inpL, n_embd, N*n_batch); @@ -1336,7 +1337,7 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn( } // run the computation - ggml_build_forward_expand(gf, inpL); + // ggml_build_forward_expand(gf, inpL); return inpL; } @@ -1563,8 +1564,8 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train( use_buf(-1); - struct ggml_tensor * t00 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N*n_batch); assert_shape_1d(t00, N*n_batch); - memcpy(t00->data, tokens_input->data, ggml_element_size(t00)*N*n_batch); + GGML_ASSERT(tokens_input->type == GGML_TYPE_I32); + struct ggml_tensor * t00 = ggml_reshape_1d(ctx0, tokens_input, N*n_batch); assert_shape_1d(t00, N*n_batch); use_buf(-1); @@ -2082,8 +2083,8 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train_grad_checkpointing( use_buf(-1); - struct ggml_tensor * t00 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N*n_batch); assert_shape_1d(t00, N*n_batch); - memcpy(t00->data, tokens_input->data, ggml_element_size(t00)*N*n_batch); + 
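// ---- illustrative sketch (not part of the patch) ----
// Why the reshape below matters: ggml_reshape_1d() creates a view that keeps tokens_input
// as a source of the graph, whereas the old memcpy into a fresh tensor cut the graph at
// the copy. With the view, new token ids can be written into tokens_input->data and the
// same graph recomputed without rebuilding it. Minimal toy sketch; the buffer size and
// shapes are made-up example values.
#include "ggml.h"

static void recompute_example(struct ggml_tensor * embeddings /* 2d: n_embd x n_vocab */) {
    struct ggml_init_params ip = { 16*1024*1024, NULL, false };
    struct ggml_context * ctx = ggml_init(ip);

    struct ggml_tensor * tokens_input = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, 4, 2); // N=4, n_batch=2
    struct ggml_tensor * tokens = ggml_reshape_1d(ctx, tokens_input, 4*2);            // a view, not a copy
    struct ggml_tensor * inp    = ggml_get_rows(ctx, embeddings, tokens);

    struct ggml_cgraph gf = ggml_build_forward(inp);

    for (int batch = 0; batch < 3; ++batch) {
        // write new token ids directly into the input tensor ...
        for (int i = 0; i < 4*2; ++i) {
            ggml_set_i32_1d(tokens_input, i, /*example token id*/ batch + i);
        }
        // ... and recompute the unchanged graph
        ggml_graph_compute_with_ctx(ctx, &gf, /*n_threads*/ 1);
    }

    ggml_free(ctx);
}
// ---- end of sketch ----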
GGML_ASSERT(tokens_input->type == GGML_TYPE_I32); + struct ggml_tensor * t00 = ggml_reshape_1d(ctx0, tokens_input, N*n_batch); assert_shape_1d(t00, N*n_batch); use_buf(-1); From ff759d957c34ef98c4700a014dd00de2a15d7435 Mon Sep 17 00:00:00 2001 From: xaedes Date: Sun, 2 Jul 2023 21:38:03 +0200 Subject: [PATCH 021/100] remove unused function argument from get_example_targets_batch --- .../train-text-from-scratch.cpp | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index db7a528426f30..de71dc99671ed 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -2581,7 +2581,7 @@ void get_example_targets(const int * train_samples, size_t n_train_samples, cons } } -void get_example_targets_batch(struct llama_context * /*lctx*/, const int * train_samples, size_t n_train_samples, const llama_token * train_data, size_t n_train_data, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * target_logits, struct ggml_tensor * target_probs) { +void get_example_targets_batch(const int * train_samples, size_t n_train_samples, const llama_token * train_data, size_t n_train_data, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * target_logits, struct ggml_tensor * target_probs) { GGML_ASSERT(tokens_input->n_dims == 2); GGML_ASSERT(target_logits->n_dims == 3); GGML_ASSERT(target_probs->n_dims == 3); @@ -2596,27 +2596,23 @@ void get_example_targets_batch(struct llama_context * /*lctx*/, const int * trai ggml_set_f32(target_logits, -1.0f/n_vocab); ggml_set_f32(target_probs, 0.0f); + // printf("%s: example_id=%d n_batch=%d n_train_samples=%zu\n", __func__, example_id, n_batch, n_train_samples); for (int k=0; kdata; struct ggml_cgraph * gb = (struct ggml_cgraph *) gbbuf->data; - - get_example_targets_batch(lctx, train_samples.data(), train_samples.size(), train_tokens.data(), train_tokens.size(), ex, tokens_input, target_logits, target_probs); + get_example_targets_batch(train_samples.data(), train_samples.size(), train_tokens.data(), train_tokens.size(), ex, tokens_input, target_logits, target_probs); GGML_ASSERT(n_past == 0); From e843d6e71cea22eaa9a4288138ef02bc8cc50e7d Mon Sep 17 00:00:00 2001 From: xaedes Date: Sun, 2 Jul 2023 21:38:52 +0200 Subject: [PATCH 022/100] measure and print total training time --- .../train-text-from-scratch/train-text-from-scratch.cpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index de71dc99671ed..0f330fd4ab7b9 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -3975,6 +3975,8 @@ int main(int argc, char ** argv) { printf("%s: begin training\n", __func__); + int64_t t0 = ggml_time_ms(); + for (int ex = 0; ex < params.n_examples; ++ex) { if (ex*n_batch >= (int) train_samples.size()) { shuffle_ints(train_samples.data(), train_samples.data() + train_samples.size()); @@ -4112,6 +4114,11 @@ int main(int argc, char ** argv) { ggml_free(ctx0); } + int64_t t1 = ggml_time_ms(); + int64_t d = t1-t0; + double dd = (double) d * 1e-3; + printf("%s: total training time=%f seconds\n", __func__, dd); + if (params.n_examples > 0) { save_checkpoint(&model, opt, params.fn_checkpoint_out); } From 
bfc311913991c75cc2d3c2978d9d2273a1370ac6 Mon Sep 17 00:00:00 2001 From: xaedes Date: Sun, 2 Jul 2023 22:15:08 +0200 Subject: [PATCH 023/100] add optimization callback to ggml_opt_resume_g this callback is called before each iteration with custom data and pointer to learning schedule parameter (only used in Adam(W)). can be used for dynamic learning schedule and setting input data for batches before each iteration --- .../train-text-from-scratch.cpp | 14 +--- ggml.c | 71 ++++++++++++++----- ggml.h | 15 ++-- 3 files changed, 69 insertions(+), 31 deletions(-) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index 0f330fd4ab7b9..6adbece4cc24c 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -4046,12 +4046,8 @@ int main(int argc, char ** argv) { ggml_build_backward_expand(ctx0, gf, gb, true); } - ggml_graph_compute_helper(work_buffer, gf, params.n_threads); - size_t used_mem_before_opt = ggml_used_mem(ctx0); - float error_before_opt = ggml_get_f32_1d(loss, 0); - opt->params.adam.sched = (opt->iter < params.warmup) ? (float) opt->iter / (float) params.warmup : cosine_decay_restart( @@ -4066,7 +4062,7 @@ int main(int argc, char ** argv) { printf("%s: opt->params.adam.sched %.5f\n", __func__, opt->params.adam.sched); - ggml_opt_resume_g(ctx0, opt, loss, gf, gb); + ggml_opt_resume_g(ctx0, opt, loss, gf, gb, NULL, NULL); size_t used_mem_after_opt = ggml_used_mem(ctx0); @@ -4074,14 +4070,10 @@ int main(int argc, char ** argv) { model.train_samples += n_batch; model.train_tokens += n_batch * n_tokens; - ggml_graph_compute_helper(work_buffer, gf, params.n_threads); - - float error_after_opt = ggml_get_f32_1d(loss, 0); - if (params.print_info_interval > 0 && ex % params.print_info_interval == 0) { printf("Example %d, opt iter %d\n", ex, opt->iter); - printf("error_before_opt: %.6f\n", error_before_opt); - printf("error_after_opt: %.6f\n", error_after_opt); + printf("error_before_opt: %.6f\n", opt->loss_before); + printf("error_after_opt: %.6f\n", opt->loss_after); printf("used_mem_before_opt: %zu bytes\n", used_mem_before_opt); printf("used_mem_after_opt: %zu bytes\n", used_mem_after_opt); } diff --git a/ggml.c b/ggml.c index 07d100bf070d3..e0f91ed5a0d02 100644 --- a/ggml.c +++ b/ggml.c @@ -17281,7 +17281,9 @@ static enum ggml_opt_result ggml_opt_adam( struct ggml_opt_params params, struct ggml_tensor * f, struct ggml_cgraph * gf, - struct ggml_cgraph * gb) { + struct ggml_cgraph * gb, + ggml_opt_callback callback, + void * callback_data) { GGML_ASSERT(ggml_is_scalar(f)); // these will store the parameters we want to optimize @@ -17307,8 +17309,8 @@ static enum ggml_opt_result ggml_opt_adam( } // constants - const float sched = params.adam.sched; - const float alpha = params.adam.alpha * sched; + float sched = params.adam.sched; + const float alpha = params.adam.alpha; const float decay = params.adam.decay * alpha; const float beta1 = params.adam.beta1; const float beta2 = params.adam.beta2; @@ -17320,6 +17322,10 @@ static enum ggml_opt_result ggml_opt_adam( float * pf = params.past > 0 ? 
opt->adam.pf->data : NULL; // past function values + if (callback) { + callback(callback_data, &sched); + } + // compute the function value ggml_graph_reset (gf); ggml_set_f32 (f->grad, 1.0f); @@ -17332,6 +17338,9 @@ static enum ggml_opt_result ggml_opt_adam( pf[opt->iter % params.past] = opt->adam.fx_prev; } + opt->loss_before = opt->adam.fx_prev; + opt->loss_after = opt->adam.fx_prev; + // initialize if (opt->just_initialized) { opt->adam.n_no_improvement = 0; @@ -17380,11 +17389,12 @@ static enum ggml_opt_result ggml_opt_adam( gnorm = (float) ((ggml_float) gclip / norm); } } - const float beta1h = alpha/(1.0f - powf(beta1, opt->iter)); - const float beta2h = 1.0f/(1.0f - powf(beta2, opt->iter)); + const float beta1h = alpha*sched/(1.0f - powf(beta1, opt->iter)); + const float beta2h = 1.0f/(1.0f - powf(beta2, opt->iter)); int64_t i = 0; for (int p = 0; p < np; ++p) { const int64_t ne = ggml_nelements(ps[p]); + const float p_decay = decay * sched; for (int64_t j = 0; j < ne; ++j) { float x = ggml_get_f32_1d(ps[p], j); float g = ggml_get_f32_1d(ps[p]->grad, j)*gnorm; @@ -17393,13 +17403,13 @@ static enum ggml_opt_result ggml_opt_adam( float mh = m[i]*beta1h; float vh = v[i]*beta2h; vh = sqrtf(vh) + eps; - x = x*(1.0f - decay) - mh/vh; + x = x*(1.0f - p_decay) - mh/vh; ggml_set_f32_1d(ps[p], j, x); ++i; } } } - // { + { // // update the gradient // ggml_opt_get_grad(np, ps, g1); @@ -17436,7 +17446,11 @@ static enum ggml_opt_result ggml_opt_adam( // // update the parameters // ggml_opt_set_params(np, ps, x); - // } + } + + if (callback) { + callback(callback_data, &sched); + } ggml_graph_reset (gf); ggml_set_f32 (f->grad, 1.0f); @@ -17444,6 +17458,8 @@ static enum ggml_opt_result ggml_opt_adam( ggml_graph_compute_with_ctx(ctx, gb, params.n_threads); const float fx = ggml_get_f32_1d(f, 0); + opt->loss_after = fx; + // check convergence if (fabsf(fx - fx_prev[0])/fx < params.adam.eps_f) { @@ -17525,7 +17541,9 @@ static enum ggml_opt_result linesearch_backtracking( struct ggml_cgraph * gf, struct ggml_cgraph * gb, const int np, - struct ggml_tensor * ps[]) { + struct ggml_tensor * ps[], + ggml_opt_callback callback, + void * callback_data) { int count = 0; float width = 0.0f; @@ -17554,6 +17572,12 @@ static enum ggml_opt_result linesearch_backtracking( dgtest = params->lbfgs.ftol*dginit; while (true) { + if (callback) { + // LBFG-S does not support learning rate -> ignore learning schedule + float sched = 0; + callback(callback_data, &sched); + } + ggml_vec_cpy_f32(nx, x, xp); ggml_vec_mad_f32(nx, x, d, *step); @@ -17624,7 +17648,9 @@ static enum ggml_opt_result ggml_opt_lbfgs( struct ggml_opt_params params, struct ggml_tensor * f, struct ggml_cgraph * gf, - struct ggml_cgraph * gb) { + struct ggml_cgraph * gb, + ggml_opt_callback callback, + void * callback_data) { if (params.lbfgs.linesearch == GGML_LINESEARCH_BACKTRACKING_WOLFE || params.lbfgs.linesearch == GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE) { if (params.lbfgs.wolfe <= params.lbfgs.ftol || 1.f <= params.lbfgs.wolfe) { @@ -17677,6 +17703,12 @@ static enum ggml_opt_result ggml_opt_lbfgs( float * lm_s = opt->lbfgs.lms->data; float * lm_y = opt->lbfgs.lmy->data; + if (callback) { + // LBFG-S does not support learning rate -> ignore learning schedule + float sched = 0; + callback(callback_data, &sched); + } + // evaluate the function value and its gradient { ggml_opt_set_params(np, ps, x); @@ -17689,6 +17721,9 @@ static enum ggml_opt_result ggml_opt_lbfgs( ggml_opt_get_grad(np, ps, g); fx = ggml_get_f32_1d(f, 0); + + opt->loss_before = 
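// ---- illustrative sketch (not part of the patch) ----
// The per-parameter AdamW update performed by the loop above, written out for a single
// scalar. `g` is the (optionally clipped) gradient, `sched` is the schedule multiplier
// now driven by the callback, and `decay` is the decoupled weight decay. Simplified
// sketch, not the exact ggml code.
#include <math.h>

static float adamw_step(float x, float g, float * m, float * v, int iter,
                        float alpha, float sched, float beta1, float beta2,
                        float eps, float decay) {
    *m = (*m)*beta1 + g*(1.0f - beta1);                              // first moment
    *v = (*v)*beta2 + g*g*(1.0f - beta2);                            // second moment
    const float mh = (*m) * alpha*sched/(1.0f - powf(beta1, iter));  // bias-corrected, lr folded in
    const float vh = sqrtf((*v) / (1.0f - powf(beta2, iter))) + eps; // bias-corrected RMS + eps
    return x*(1.0f - decay*sched) - mh/vh;                           // decoupled weight decay + Adam step
}
// ---- end of sketch ----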
fx; + opt->loss_after = fx; } // search direction = -gradient @@ -17743,7 +17778,7 @@ static enum ggml_opt_result ggml_opt_lbfgs( ggml_vec_cpy_f32(nx, xp, x); ggml_vec_cpy_f32(nx, gp, g); - ls = linesearch_backtracking(ctx, ¶ms, nx, x, &fx, g, d, step, xp, f, gf, gb, np, ps); + ls = linesearch_backtracking(ctx, ¶ms, nx, x, &fx, g, d, step, xp, f, gf, gb, np, ps, callback, callback_data); if (ls < 0) { // linesearch failed - go back to the previous point and return @@ -17753,6 +17788,8 @@ static enum ggml_opt_result ggml_opt_lbfgs( return ls; } + opt->loss_after = fx; + ggml_vec_norm_f32(nx, &xnorm, x); ggml_vec_norm_f32(nx, &gnorm, g); @@ -17810,7 +17847,7 @@ static enum ggml_opt_result ggml_opt_lbfgs( // ys = y^t \cdot s -> 1 / \rho. // yy = y^t \cdot y. // - ggml_vec_dot_f32(nx, &ys, &lm_y[end[0]*nx], &lm_s[end[0] *nx]); + ggml_vec_dot_f32(nx, &ys, &lm_y[end[0]*nx], &lm_s[end[0]*nx]); ggml_vec_dot_f32(nx, &yy, &lm_y[end[0]*nx], &lm_y[end[0]*nx]); lm_ys[end[0]] = ys; @@ -18020,7 +18057,7 @@ enum ggml_opt_result ggml_opt_resume( *gf = ggml_build_forward (f); *gb = ggml_build_backward(ctx, gf, true); - return ggml_opt_resume_g(ctx, opt, f, gf, gb); + return ggml_opt_resume_g(ctx, opt, f, gf, gb, NULL, NULL); } enum ggml_opt_result ggml_opt_resume_g( @@ -18028,7 +18065,9 @@ enum ggml_opt_result ggml_opt_resume_g( struct ggml_opt_context * opt, struct ggml_tensor * f, struct ggml_cgraph * gf, - struct ggml_cgraph * gb) { + struct ggml_cgraph * gb, + ggml_opt_callback callback, + void * callback_data) { // build forward + backward compute graphs enum ggml_opt_result result = GGML_OPT_OK; @@ -18036,11 +18075,11 @@ enum ggml_opt_result ggml_opt_resume_g( switch (opt->params.type) { case GGML_OPT_ADAM: { - result = ggml_opt_adam(ctx, opt, opt->params, f, gf, gb); + result = ggml_opt_adam(ctx, opt, opt->params, f, gf, gb, callback, callback_data); } break; case GGML_OPT_LBFGS: { - result = ggml_opt_lbfgs(ctx, opt, opt->params, f, gf, gb); + result = ggml_opt_lbfgs(ctx, opt, opt->params, f, gf, gb, callback, callback_data); } break; } diff --git a/ggml.h b/ggml.h index 8f51f5d222099..fadc343eef41c 100644 --- a/ggml.h +++ b/ggml.h @@ -1469,6 +1469,8 @@ extern "C" { GGML_LINESEARCH_INVALID_PARAMETERS, }; + typedef void (*ggml_opt_callback)(void * data, float * sched); + // optimization parameters // // see ggml.c (ggml_opt_default_params) for default values @@ -1538,6 +1540,9 @@ extern "C" { bool just_initialized; + float loss_before; + float loss_after; + struct { struct ggml_tensor * m; // first moment struct ggml_tensor * v; // second moment @@ -1577,10 +1582,10 @@ extern "C" { // initialize optimizer context GGML_API void ggml_opt_init( - struct ggml_context * ctx, + struct ggml_context * ctx, struct ggml_opt_context * opt, - struct ggml_opt_params params, - int64_t nx); + struct ggml_opt_params params, + int64_t nx); // continue optimizing the function defined by the tensor f GGML_API enum ggml_opt_result ggml_opt_resume( @@ -1594,7 +1599,9 @@ extern "C" { struct ggml_opt_context * opt, struct ggml_tensor * f, struct ggml_cgraph * gf, - struct ggml_cgraph * gb); + struct ggml_cgraph * gb, + ggml_opt_callback callback, + void * callback_data); // // quantization From d7aa4d9576cbcdd24578e9dc9be81777fd1611ec Mon Sep 17 00:00:00 2001 From: xaedes Date: Sun, 2 Jul 2023 22:18:50 +0200 Subject: [PATCH 024/100] use optimization callback in training allows dynamic learning schedule and different batch data for each iteration without relying on low n_iter and high n_examples parameters reduces runtime by 
avoiding restart of optimization function and improves training convergence by providing a different batch for each iteration --- .../train-text-from-scratch.cpp | 81 +++++++++++++++++-- 1 file changed, 75 insertions(+), 6 deletions(-) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index 6adbece4cc24c..bde29c5b0714a 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -3418,7 +3418,7 @@ struct train_params get_default_train_params() { params.n_threads = 6; params.n_batch = 8; - params.n_examples = 8; + params.n_examples = 1; params.n_predict = 1024; params.print_info_interval = 1; @@ -3441,8 +3441,8 @@ struct train_params get_default_train_params() { params.cos_decay_alpha = 0.0f; params.enable_restart = false; - params.lbfgs_n_iter = 16; - params.adam_n_iter = 16; + params.lbfgs_n_iter = 256; + params.adam_n_iter = 256; params.adam_alpha = 1e-3f; params.adam_min_alpha = 1e-4f; params.adam_decay = 1e-1f; @@ -3803,6 +3803,61 @@ bool train_params_parse(int argc, char ** argv, struct train_params * params) { return true; } +struct opt_callback_data { + struct train_params * params; + struct ggml_opt_context * opt; + llama_token * tokens_data; + size_t tokens_size; + int * samples_data; + size_t samples_size; + int shuffle_countdown; + struct ggml_tensor * tokens_input; + struct ggml_tensor * target_logits; + struct ggml_tensor * target_probs; +}; + +void opt_callback(void * vdata, float * sched) { + struct opt_callback_data * data = (struct opt_callback_data *) vdata; + struct train_params * params = data->params; + struct ggml_opt_context * opt = data->opt; + int n_batch = params->n_batch; + + *sched = (opt->iter < params->warmup) + ? 
(float) opt->iter / (float) params->warmup + : cosine_decay_restart( + params->cos_decay_steps, + params->cos_decay_alpha, + opt->iter - params->warmup, + params->cos_decay_restart, + params->enable_restart); + float min_sched = params->adam_min_alpha / params->adam_alpha; + *sched = min_sched + *sched * (1.0f - min_sched); + + int impr_plot = -(int)(1 + (opt->loss_before - opt->loss_after) * 10.0f + 0.5f); + printf("%s: iter=%*d, sched=%f loss0=%f loss=%f | improvement: %*d>\n", __func__, 6, opt->iter, *sched, opt->loss_before, opt->loss_after, impr_plot, (int)0); + + if (data->shuffle_countdown < n_batch) { + printf("%s: reshuffle samples\n", __func__); + shuffle_ints(data->samples_data, data->samples_data + data->samples_size); + for (int i = 0; i < (int) data->samples_size; ++i) { + GGML_ASSERT(data->samples_data[i]+params->n_ctx-1 < (int) data->tokens_size); + } + data->shuffle_countdown = data->samples_size; + } + + get_example_targets_batch( + data->samples_data, + data->samples_size, + data->tokens_data, + data->tokens_size, + opt->iter, + data->tokens_input, + data->target_logits, + data->target_probs); + + data->shuffle_countdown -= n_batch; +} + int main(int argc, char ** argv) { struct train_params params = get_default_train_params(); @@ -3975,6 +4030,18 @@ int main(int argc, char ** argv) { printf("%s: begin training\n", __func__); + struct opt_callback_data opt_cb_data; + opt_cb_data.params = ¶ms; + opt_cb_data.opt = opt; + opt_cb_data.tokens_data = train_tokens.data(); + opt_cb_data.tokens_size = train_tokens.size(); + opt_cb_data.samples_data = train_samples.data(); + opt_cb_data.samples_size = train_samples.size(); + opt_cb_data.shuffle_countdown = train_samples.size(); + opt_cb_data.tokens_input = NULL; + opt_cb_data.target_logits = NULL; + opt_cb_data.target_probs = NULL; + int64_t t0 = ggml_time_ms(); for (int ex = 0; ex < params.n_examples; ++ex) { @@ -3998,6 +4065,10 @@ int main(int argc, char ** argv) { struct ggml_tensor * target_logits = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_vocab, n_tokens, n_batch); struct ggml_tensor * target_probs = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_vocab, n_tokens, n_batch); + opt_cb_data.tokens_input = tokens_input; + opt_cb_data.target_logits = target_logits; + opt_cb_data.target_probs = target_probs; + int n_past = 0; struct ggml_tensor * gfbuf = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, sizeof(struct ggml_cgraph) / ggml_type_size(GGML_TYPE_I32) + (sizeof(struct ggml_cgraph) % ggml_type_size(GGML_TYPE_I32) ? 
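// ---- illustrative note (not part of the patch) ----
// The expression being built here just rounds up: the cgraph struct is stored inside the
// data of an I32 tensor, so the element count must be ceil(sizeof(struct ggml_cgraph) /
// sizeof(int32_t)). For example, if sizeof(struct ggml_cgraph) were 164742 bytes (an
// example value only), this yields 164742/4 + 1 = 41186 elements. A generic form of the
// same idea:
//   #define CEIL_DIV(a, b) (((a) + (b) - 1) / (b))
//   ggml_new_tensor_1d(ctx0, GGML_TYPE_I32,
//                      CEIL_DIV(sizeof(struct ggml_cgraph), ggml_type_size(GGML_TYPE_I32)));
// ---- end of note ----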
1 : 0)); @@ -4009,8 +4080,6 @@ int main(int argc, char ** argv) { struct ggml_cgraph * gf = (struct ggml_cgraph *) gfbuf->data; struct ggml_cgraph * gb = (struct ggml_cgraph *) gbbuf->data; - get_example_targets_batch(train_samples.data(), train_samples.size(), train_tokens.data(), train_tokens.size(), ex, tokens_input, target_logits, target_probs); - GGML_ASSERT(n_past == 0); struct ggml_tensor * loss = NULL; @@ -4062,7 +4131,7 @@ int main(int argc, char ** argv) { printf("%s: opt->params.adam.sched %.5f\n", __func__, opt->params.adam.sched); - ggml_opt_resume_g(ctx0, opt, loss, gf, gb, NULL, NULL); + ggml_opt_resume_g(ctx0, opt, loss, gf, gb, &opt_callback, (void *) &opt_cb_data); size_t used_mem_after_opt = ggml_used_mem(ctx0); From e6ff0728e0c311e27d99c47e4a84a650119a5661 Mon Sep 17 00:00:00 2001 From: xaedes Date: Sun, 2 Jul 2023 23:01:38 +0200 Subject: [PATCH 025/100] add minimum number of tensor dimensions to apply weight decay (default 2) this allows to not apply weight decay to bias parameters --- .../train-text-from-scratch.cpp | 10 ++++++++++ ggml.c | 4 +++- ggml.h | 1 + 3 files changed, 14 insertions(+), 1 deletion(-) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index bde29c5b0714a..aaaf954be9ea7 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -3386,6 +3386,7 @@ struct train_params { float adam_alpha; float adam_min_alpha; float adam_decay; + int adam_decay_min_ndim; float adam_beta1; float adam_beta2; float adam_gclip; @@ -3446,6 +3447,7 @@ struct train_params get_default_train_params() { params.adam_alpha = 1e-3f; params.adam_min_alpha = 1e-4f; params.adam_decay = 1e-1f; + params.adam_decay_min_ndim = 2; params.adam_beta1 = 0.9f; params.adam_beta2 = 0.999f; params.adam_gclip = 1.0f; @@ -3505,6 +3507,7 @@ void train_print_usage(int /*argc*/, char ** argv, const struct train_params * p fprintf(stderr, " --adam-alpha N Adam learning rate alpha (default %f)\n", params->adam_alpha); fprintf(stderr, " --adam-min-alpha N Adam minimum learning rate alpha, usually 0.1 * alpha (default %f)\n", params->adam_min_alpha); fprintf(stderr, " --adam-decay N AdamW weight decay. Values greater zero enable AdamW instead of regular Adam. (default %f)\n", params->adam_decay); + fprintf(stderr, " --adam-decay-min-ndim N Minimum number of tensor dimensions to apply AdamW weight decay. Weight decay is not applied to tensors with less n_dims. (default %d)\n", params->adam_decay_min_ndim); fprintf(stderr, " --adam-beta1 N AdamW beta1 in interval [0,1). How much to smooth the first moment of gradients. (default %f)\n", params->adam_beta1); fprintf(stderr, " --adam-beta2 N AdamW beta2 in interval [0,1). How much to smooth the second moment of gradients. (default %f)\n", params->adam_beta2); fprintf(stderr, " --adam-gclip N AdamW gradient clipping. Disabled when zero. 
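// ---- illustrative note (not part of the patch) ----
// Effect of adam_decay_min_ndim = 2 in this example model: 2-D tensors such as
// tok_embeddings, output, wq/wk/wv/wo and w1/w2/w3 keep weight decay, while 1-D tensors
// (the rms-norm weights here, and biases in models that have them) are excluded via
//   p_decay = ((n_dims >= decay_min_ndim) ? decay : 0.0f) * sched;   // i.e. no decay for 1-D tensors
// ---- end of note ----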
(default %f)\n", params->adam_gclip); @@ -3731,6 +3734,12 @@ bool train_params_parse(int argc, char ** argv, struct train_params * params) { break; } params->adam_decay = std::stof(argv[i]); + } else if (arg == "--adam-decay-min-ndim") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->adam_decay_min_ndim = std::stoi(argv[i]); } else if (arg == "--adam-beta1") { if (++i >= argc) { invalid_param = true; @@ -3965,6 +3974,7 @@ int main(int argc, char ** argv) { opt_params_adam.adam.sched = 1.0f; opt_params_adam.adam.alpha = params.adam_alpha; opt_params_adam.adam.decay = params.adam_decay; + opt_params_adam.adam.decay_min_ndim = params.adam_decay_min_ndim; opt_params_adam.adam.beta1 = params.adam_beta1; opt_params_adam.adam.beta2 = params.adam_beta2; opt_params_adam.adam.gclip = params.adam_gclip; diff --git a/ggml.c b/ggml.c index e0f91ed5a0d02..2138cb8bc9e3d 100644 --- a/ggml.c +++ b/ggml.c @@ -17316,6 +17316,7 @@ static enum ggml_opt_result ggml_opt_adam( const float beta2 = params.adam.beta2; const float eps = params.adam.eps; const float gclip = params.adam.gclip; + const int decay_min_ndim = params.adam.decay_min_ndim; float * m = opt->adam.m->data; // first moment float * v = opt->adam.v->data; // second moment @@ -17394,7 +17395,7 @@ static enum ggml_opt_result ggml_opt_adam( int64_t i = 0; for (int p = 0; p < np; ++p) { const int64_t ne = ggml_nelements(ps[p]); - const float p_decay = decay * sched; + const float p_decay = ((ps[p]->n_dims >= decay_min_ndim) ? decay : 0.0) * sched; for (int64_t j = 0; j < ne; ++j) { float x = ggml_get_f32_1d(ps[p], j); float g = ggml_get_f32_1d(ps[p]->grad, j)*gnorm; @@ -17911,6 +17912,7 @@ struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type) { .n_iter = 10000, .sched = 1.000f, .decay = 0.0f, + .decay_min_ndim = 2, .alpha = 0.001f, .beta1 = 0.9f, .beta2 = 0.999f, diff --git a/ggml.h b/ggml.h index fadc343eef41c..3980c005036bb 100644 --- a/ggml.h +++ b/ggml.h @@ -1506,6 +1506,7 @@ extern "C" { float sched; // schedule multiplier (fixed, decay or warmup) float decay; // weight decay for AdamW, use 0.0f to disable + int decay_min_ndim; // minimum number of tensor dimension to apply weight decay float alpha; // learning rate float beta1; float beta2; From 58024d3e5f316a3d792a096089e937abbdf362f7 Mon Sep 17 00:00:00 2001 From: xaedes Date: Mon, 3 Jul 2023 17:57:08 +0200 Subject: [PATCH 026/100] rename training parameter cos-decay-alpha to cos-decay-min and clarify that adam-min-alpha also applies to warmup --- .../train-text-from-scratch.cpp | 44 +++++++++---------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index aaaf954be9ea7..4c98b8bafb44d 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -3321,23 +3321,23 @@ void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * mod } } -float cosine_decay(const int decay_steps, const float alpha, int step) { +float cosine_decay(const int decay_steps, const float minimum, int step) { if (step > decay_steps) { step = decay_steps; } const float cosine_decay = 0.50f*(1.0f + cosf(3.14159265359f*step/decay_steps)); - const float decay = (1 - alpha)*cosine_decay + alpha; + const float decay = (1 - minimum)*cosine_decay + minimum; return decay; } -float cosine_decay_restart(int decay_steps, const float alpha, int step, float 
restart_step_mult, bool enable_restart) { +float cosine_decay_restart(int decay_steps, const float minimum, int step, float restart_step_mult, bool enable_restart) { if (enable_restart) { while (step > decay_steps) { step -= decay_steps; decay_steps = (int) restart_step_mult * decay_steps; } } - return cosine_decay(decay_steps, alpha, step); + return cosine_decay(decay_steps, minimum, step); } struct train_params { @@ -3374,7 +3374,7 @@ struct train_params { int warmup; int cos_decay_steps; float cos_decay_restart; - float cos_decay_alpha; + float cos_decay_min; bool enable_restart; int opt_past; @@ -3439,21 +3439,21 @@ struct train_params get_default_train_params() { params.warmup = 100; params.cos_decay_steps = 1000; params.cos_decay_restart = 1.1f; - params.cos_decay_alpha = 0.0f; + params.cos_decay_min = 0.1f; params.enable_restart = false; - params.lbfgs_n_iter = 256; - params.adam_n_iter = 256; - params.adam_alpha = 1e-3f; - params.adam_min_alpha = 1e-4f; - params.adam_decay = 1e-1f; + params.lbfgs_n_iter = 256; + params.adam_n_iter = 256; + params.adam_alpha = 1e-3f; + params.adam_min_alpha = 0; + params.adam_decay = 1e-1f; params.adam_decay_min_ndim = 2; - params.adam_beta1 = 0.9f; - params.adam_beta2 = 0.999f; - params.adam_gclip = 1.0f; - params.adam_eps_f = 0.0f; + params.adam_beta1 = 0.9f; + params.adam_beta2 = 0.999f; + params.adam_gclip = 1.0f; + params.adam_eps_f = 0.0f; - params.mem_model_gb = 2; + params.mem_model_gb = 2; params.mem_compute_gb = 24; params.mem_compute0_gb = 8; params.mem_compute1_gb = 1; @@ -3496,7 +3496,7 @@ void train_print_usage(int /*argc*/, char ** argv, const struct train_params * p fprintf(stderr, " --warmup N Only for Adam optimizer. Number of warmup steps (default %d)\n", params->warmup); fprintf(stderr, " --cos-decay-steps N Only for Adam optimizer. Number of cosine decay steps (default %d)\n", params->cos_decay_steps); fprintf(stderr, " --cos-decay-restart N Only for Adam optimizer. Increase of cosine decay steps after restart (default %f)\n", params->cos_decay_restart); - fprintf(stderr, " --cos-decay-alpha N Only for Adam optimizer. Cosine decay alpha (default %f)\n", params->cos_decay_alpha); + fprintf(stderr, " --cos-decay-min N Only for Adam optimizer. Cosine decay minimum (default %f)\n", params->cos_decay_min); fprintf(stderr, " --enable-restart N Only for Adam optimizer. Enable restarts of cos-decay %s\n", params->enable_restart ? "(default)" : ""); fprintf(stderr, " --disable-restart N Only for Adam optimizer. Disable restarts of cos-decay %s\n", !params->enable_restart ? "(default)" : ""); fprintf(stderr, " --opt-past N Number of optimization iterations to track for delta convergence test. Disabled when zero. (default %d)\n", params->opt_past); @@ -3505,7 +3505,7 @@ void train_print_usage(int /*argc*/, char ** argv, const struct train_params * p fprintf(stderr, " --adam-epsf N AdamW epsilon for convergence test. Disabled when <= zero. (default %f)\n", params->adam_eps_f); fprintf(stderr, " --adam-iter N Maximum number of Adam optimization iterations for each batch (default %d)\n", params->adam_n_iter); fprintf(stderr, " --adam-alpha N Adam learning rate alpha (default %f)\n", params->adam_alpha); - fprintf(stderr, " --adam-min-alpha N Adam minimum learning rate alpha, usually 0.1 * alpha (default %f)\n", params->adam_min_alpha); + fprintf(stderr, " --adam-min-alpha N Adam minimum learning rate alpha - including warmup phase (default %f)\n", params->adam_min_alpha); fprintf(stderr, " --adam-decay N AdamW weight decay. 
Values greater zero enable AdamW instead of regular Adam. (default %f)\n", params->adam_decay); fprintf(stderr, " --adam-decay-min-ndim N Minimum number of tensor dimensions to apply AdamW weight decay. Weight decay is not applied to tensors with less n_dims. (default %d)\n", params->adam_decay_min_ndim); fprintf(stderr, " --adam-beta1 N AdamW beta1 in interval [0,1). How much to smooth the first moment of gradients. (default %f)\n", params->adam_beta1); @@ -3676,12 +3676,12 @@ bool train_params_parse(int argc, char ** argv, struct train_params * params) { break; } params->cos_decay_restart = std::stof(argv[i]); - } else if (arg == "--cos-decay-alpha") { + } else if (arg == "--cos-decay-min") { if (++i >= argc) { invalid_param = true; break; } - params->cos_decay_alpha = std::stof(argv[i]); + params->cos_decay_min = std::stof(argv[i]); } else if (arg == "--enable-restart") { params->enable_restart = true; } else if (arg == "--disable-restart") { @@ -3835,7 +3835,7 @@ void opt_callback(void * vdata, float * sched) { ? (float) opt->iter / (float) params->warmup : cosine_decay_restart( params->cos_decay_steps, - params->cos_decay_alpha, + params->cos_decay_min, opt->iter - params->warmup, params->cos_decay_restart, params->enable_restart); @@ -4131,7 +4131,7 @@ int main(int argc, char ** argv) { ? (float) opt->iter / (float) params.warmup : cosine_decay_restart( params.cos_decay_steps, - params.cos_decay_alpha, + params.cos_decay_min, opt->iter - params.warmup, params.cos_decay_restart, params.enable_restart); From 17a0898d50a5ce653093ee9a8f6528dbdc2e7e61 Mon Sep 17 00:00:00 2001 From: xaedes Date: Mon, 3 Jul 2023 17:58:09 +0200 Subject: [PATCH 027/100] fix increase of model.train_samples and model.train_tokens now that each optimizer iteration gets its own batch we need to multiply by number of opt iterations --- examples/train-text-from-scratch/train-text-from-scratch.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index 4c98b8bafb44d..770b41e7b377f 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -4145,9 +4145,10 @@ int main(int argc, char ** argv) { size_t used_mem_after_opt = ggml_used_mem(ctx0); + int n_iter = params.use_adam ? 
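// ---- illustrative note (not part of the patch) ----
// Worked example of the corrected accounting: with n_batch = 8, adam_n_iter = 256 and,
// say, n_tokens = n_ctx = 128, one call to ggml_opt_resume_g now consumes
// 8 * 256 = 2048 samples and 8 * 128 * 256 = 262144 tokens, since every optimizer
// iteration gets its own batch instead of all iterations sharing one.
// ---- end of note ----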
params.adam_n_iter : params.lbfgs_n_iter; model.train_its = opt->iter; - model.train_samples += n_batch; - model.train_tokens += n_batch * n_tokens; + model.train_samples += n_batch * n_iter; + model.train_tokens += n_batch * n_tokens * n_iter; if (params.print_info_interval > 0 && ex % params.print_info_interval == 0) { printf("Example %d, opt iter %d\n", ex, opt->iter); From 24a4b099f37ae2deef2296a0dae4b6fc5f27b266 Mon Sep 17 00:00:00 2001 From: xaedes Date: Mon, 3 Jul 2023 18:24:57 +0200 Subject: [PATCH 028/100] change sampling parameters for prediction after training to defaults of common.h and clarify what is context for prediction and what are generated tokens --- .../train-text-from-scratch.cpp | 50 +++++++++++-------- 1 file changed, 30 insertions(+), 20 deletions(-) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index 770b41e7b377f..2c17d0b99e349 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -2799,19 +2799,19 @@ void shuffle_ints(int * begin, int * end) { } struct my_llama_sampler_params { - float temp = 0.0f; // <= 0.0 disabled - int top_k = 20; // <= 0 to use vocab size - float top_p = 0.95f; // 1.0 = disabled - float tfs_z = 1.00f; // 1.0 = disabled - float typical_p = 1.00f; // 1.0 = disabled - int repeat_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size) - float repeat_penalty = 1.0f; // 1.0 = disabled - float alpha_presence = 0.0f; // 0.0 = disabled - float alpha_frequency = 0.0f; // 0.0 = disabled - int mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0 - float mirostat_tau = 5.00f; // target entropy - float mirostat_eta = 0.10f; // learning rate - bool penalize_nl = true; // consider newlines as a repeatable token + float temp = 0.0f; // <= 0.0 disabled + int top_k = 20; // <= 0 to use vocab size + float top_p = 0.95f; // 1.0 = disabled + float tfs_z = 1.00f; // 1.0 = disabled + float typical_p = 1.00f; // 1.0 = disabled + int repeat_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size) + float repeat_penalty = 1.0f; // 1.0 = disabled + float presence_penalty = 0.0f; // 0.0 = disabled + float frequency_penalty = 0.0f; // 0.0 = disabled + int mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0 + float mirostat_tau = 5.00f; // target entropy + float mirostat_eta = 0.10f; // learning rate + bool penalize_nl = true; // consider newlines as a repeatable token }; struct my_llama_sampler { @@ -2871,8 +2871,8 @@ llama_token sample(struct my_llama_sampler * sampler, float * logits, const llam candidates_p, last_tokens + n_last_tokens - n_last, n_last, - params.alpha_frequency, - params.alpha_presence); + params.frequency_penalty, + params.presence_penalty); if (!params.penalize_nl) { logits[llama_token_nl()] = nl_logit; @@ -4203,12 +4203,22 @@ int main(int argc, char ** argv) { int n_gen = params.n_predict; int sample_ctx = n_tokens - n_tokens/8; - sampler.params.temp = 0.2f; - sampler.params.repeat_penalty = 1.1f; - sampler.params.mirostat = 2; + // use defaults from common.h + sampler.params.top_k = 40; + sampler.params.top_p = 0.95f; + sampler.params.tfs_z = 1.00f; + sampler.params.typical_p = 1.00f; + sampler.params.temp = 0.8f; + sampler.params.repeat_penalty = 1.1f; + sampler.params.repeat_last_n = 64; + sampler.params.frequency_penalty = 0.0f; + sampler.params.presence_penalty = 0.0f; + sampler.params.mirostat 
= 0; + sampler.params.mirostat_tau = 5.00f; + sampler.params.mirostat_eta = 0.10f; init_sampler(&sampler, lctx); - printf("Generating %d tokens.\n", n_gen); + printf("[Prediction context]\n"); struct ggml_tensor * tokens_input = ggml_new_tensor_1d(model.ctx, GGML_TYPE_I32, n_tokens); struct ggml_tensor * target_logits = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, n_vocab, n_tokens); @@ -4223,7 +4233,7 @@ int main(int argc, char ** argv) { print_token(lctx, ggml_get_i32_1d(tokens_input, i)); } - printf("---\n"); + printf("\n[Generating %d tokens]\n", n_gen); for (int i=0; i Date: Mon, 3 Jul 2023 18:35:11 +0200 Subject: [PATCH 029/100] tighten abs error bounds for cross_entropy_loss in test-grad0 --- tests/test-grad0.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test-grad0.c b/tests/test-grad0.c index fe2ca212f82a0..0bbeff2707a91 100644 --- a/tests/test-grad0.c +++ b/tests/test-grad0.c @@ -1386,7 +1386,7 @@ int main(int argc, const char ** argv) { struct ggml_tensor * f = ggml_cross_entropy_loss(ctx0, x[0], x[1]); - check_gradient("cross_entropy_loss", ctx0, x, f, ndims, nargs, 1e-4f, 1e-1f, INFINITY); + check_gradient("cross_entropy_loss", ctx0, x, f, ndims, nargs, 1e-4f, 1e-3f, INFINITY); } } From dbbc2633137f15205a80466451a0ebe5ba8baf2f Mon Sep 17 00:00:00 2001 From: xaedes Date: Mon, 3 Jul 2023 18:45:18 +0200 Subject: [PATCH 030/100] add conditional compilation of using F16 exp in flash attention uncomment `// #define GGML_FLASH_ATTN_EXP_FP16` to enable usage of f16 exp in flash attention --- ggml.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/ggml.c b/ggml.c index 2138cb8bc9e3d..53f2c425450af 100644 --- a/ggml.c +++ b/ggml.c @@ -124,6 +124,7 @@ typedef void * thread_ret_t; #define GGML_GELU_QUICK_FP16 #define GGML_SILU_FP16 // #define GGML_CROSS_ENTROPY_EXP_FP16 +// #define GGML_FLASH_ATTN_EXP_FP16 #define GGML_SOFT_MAX_UNROLL 4 #define GGML_VEC_DOT_UNROLL 2 @@ -13111,10 +13112,13 @@ static void ggml_compute_forward_flash_attn_f32( if (SS[j] == -INFINITY) { SS[j] = 0.0f; } else { - // const float val = expf(SS[j] - max); +#ifndef GGML_FLASH_ATTN_EXP_FP16 + const float val = expf(SS[j] - max); +#else ggml_fp16_t s = GGML_FP32_TO_FP16(SS[j] - max); memcpy(&scvt[j], &s, sizeof(uint16_t)); const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt[j]]); +#endif sump[j] += (ggml_float)val; SS[j] = val; } @@ -13703,10 +13707,13 @@ static void ggml_compute_forward_flash_attn_back_f32( if (SR[j] == -INFINITY) { SW[j] = 0.0f; } else { - // const float val = expf(SR[j] - max); +#ifndef GGML_FLASH_ATTN_EXP_FP16 + const float val = expf(SR[j] - max); +#else ggml_fp16_t s = GGML_FP32_TO_FP16(SR[j] - max); memcpy(&scvt[j], &s, sizeof(uint16_t)); const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt[j]]); +#endif sump[j] += (ggml_float)val; SW[j] = val; } From 47055c929fa4696a87c0ea10fc818d86359e622f Mon Sep 17 00:00:00 2001 From: xaedes Date: Mon, 3 Jul 2023 18:45:54 +0200 Subject: [PATCH 031/100] tighten abs error bounds for flash_attn in test-grad0 --- tests/test-grad0.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test-grad0.c b/tests/test-grad0.c index 0bbeff2707a91..aba4b9c20b2a1 100644 --- a/tests/test-grad0.c +++ b/tests/test-grad0.c @@ -1493,7 +1493,7 @@ int main(int argc, const char ** argv) { struct ggml_tensor * f = ggml_sum(ctx0, ggml_flash_attn(ctx0, x[0], x[1], x[2], (masked == 0))); - check_gradient("flash_attn f32", ctx0, x, f, ndims, nargs, 1.5e-4f, INFINITY, 3.5f); + check_gradient("flash_attn 
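// ---- illustrative sketch (not part of the patch) ----
// What GGML_FLASH_ATTN_EXP_FP16 switches on: instead of calling expf(), the argument is
// rounded to fp16 and its 16-bit pattern indexes a precomputed 65536-entry exp table,
// trading a little precision for speed. Sketch of the idea, assuming ggml.c's internal
// fp16 helpers (ggml_fp16_t, GGML_FP32_TO_FP16, GGML_FP16_TO_FP32) are in scope.
static ggml_fp16_t exp_table[1 << 16];   // filled once at init with exp() of every fp16 value

static void init_exp_table(void) {
    for (int i = 0; i < (1 << 16); ++i) {
        uint16_t bits = (uint16_t) i;
        ggml_fp16_t h;
        memcpy(&h, &bits, sizeof(h));
        exp_table[i] = GGML_FP32_TO_FP16(expf(GGML_FP16_TO_FP32(h)));
    }
}

static inline float exp_f16_lookup(float x) {
    ggml_fp16_t h = GGML_FP32_TO_FP16(x);
    uint16_t bits;
    memcpy(&bits, &h, sizeof(bits));
    return GGML_FP16_TO_FP32(exp_table[bits]);
}
// ---- end of sketch ----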
f32", ctx0, x, f, ndims, nargs, 1.5e-4f, 1e-3f, INFINITY); } } } @@ -1534,7 +1534,7 @@ int main(int argc, const char ** argv) { struct ggml_tensor * f = ggml_sum(ctx0, ggml_flash_attn(ctx0, x[0], x[1], x[2], (masked == 0))); - check_gradient("flash_attn f16", ctx0, x, f, ndims, nargs, 1.5e-4f, INFINITY, 3.5f); + check_gradient("flash_attn f16", ctx0, x, f, ndims, nargs, 1.5e-4f, 1e-3f, INFINITY); } } } From 0f6a8ab51958d9dc12ab4b311b95b2dd53d4e9ae Mon Sep 17 00:00:00 2001 From: xaedes Date: Mon, 3 Jul 2023 18:48:57 +0200 Subject: [PATCH 032/100] tighten abs error bounds for sqrt in test-grad0 --- tests/test-grad0.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test-grad0.c b/tests/test-grad0.c index aba4b9c20b2a1..ef608a01d3a45 100644 --- a/tests/test-grad0.c +++ b/tests/test-grad0.c @@ -531,7 +531,7 @@ int main(int argc, const char ** argv) { struct ggml_tensor * f = ggml_sum(ctx0, ggml_sqrt(ctx0, x[0])); - check_gradient("sqrt", ctx0, x, f, ndims, nargs, 1e-3f, INFINITY, 1e-1f); + check_gradient("sqrt", ctx0, x, f, ndims, nargs, 1e-3f, 2e-2f, 1e-1f); } } From 87035b96f78170fc3b6eba071efd9075eb750cb3 Mon Sep 17 00:00:00 2001 From: xaedes Date: Mon, 3 Jul 2023 18:56:05 +0200 Subject: [PATCH 033/100] remove out-commented vectorized code of opt_adam the vectorized code might be bit faster for low number of parameters, but it had a big memory usage overhead --- ggml.c | 38 -------------------------------------- 1 file changed, 38 deletions(-) diff --git a/ggml.c b/ggml.c index 53f2c425450af..4ddd154bf473f 100644 --- a/ggml.c +++ b/ggml.c @@ -17417,44 +17417,6 @@ static enum ggml_opt_result ggml_opt_adam( } } } - { - // // update the gradient - // ggml_opt_get_grad(np, ps, g1); - - // // m_t = beta1*m_t-1 + (1 - beta1)*g_t - // ggml_vec_scale_f32(nx, m, beta1); - // ggml_vec_mad_f32 (nx, m, g1, 1.0f - beta1); - - // // g2 = g1^2 - // ggml_vec_sqr_f32 (nx, g2, g1); - - // // v_t = beta2*v_t-1 + (1 - beta2)*g_t^2 - // ggml_vec_scale_f32(nx, v, beta2); - // ggml_vec_mad_f32 (nx, v, g2, 1.0f - beta2); - - // // m^hat = m_t / (1 - beta1^t) - // // v^hat = v_t / (1 - beta2^t) - // // x_t = x_t-1 - sched*(alpha*m^hat/(sqrt(v^hat) + eps) + decay*x_t-1) - // // x_t = x_t-1 - sched*alpha*m^hat/(sqrt(v^hat) + eps) - sched*decay*x_t-1 - // // x_t = x_t-1*(1-sched*decay) - sched*alpha*m^hat/(sqrt(v^hat) + eps) - // // x_t = x_t-1*(1-sched*decay) + sched*decay*(-alpha/decay)*m^hat/(sqrt(v^hat) + eps) - // // x_t = mix(x_t-1, (-alpha/decay)*m^hat/(sqrt(v^hat) + eps), sched*decay) - // ggml_vec_cpy_f32 (nx, mh, m); - // ggml_vec_cpy_f32 (nx, vh, v); - - // ggml_vec_scale_f32(nx, mh, alpha/(1.0f - powf(beta1, opt->iter))); - // ggml_vec_scale_f32(nx, vh, 1.0f/(1.0f - powf(beta2, opt->iter))); - - // ggml_vec_sqrt_f32 (nx, vh, vh); - // ggml_vec_acc1_f32 (nx, vh, eps); - - // ggml_vec_div_f32 (nx, mh, mh, vh); - // ggml_vec_scale_f32(nx, x, 1.0f - decay); - // ggml_vec_sub_f32 (nx, x, x, mh); - - // // update the parameters - // ggml_opt_set_params(np, ps, x); - } if (callback) { callback(callback_data, &sched); From ecdc16163efa41fc41ac2dfca63cb7af60e2362c Mon Sep 17 00:00:00 2001 From: xaedes Date: Fri, 28 Jul 2023 23:09:56 +0200 Subject: [PATCH 034/100] ggml : update ggml_rms_norm_back with configurable eps --- ggml.c | 13 ++++++++++--- ggml.h | 4 ++-- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/ggml.c b/ggml.c index 4ddd154bf473f..756000cffcc10 100644 --- a/ggml.c +++ b/ggml.c @@ -5824,7 +5824,8 @@ struct ggml_tensor * ggml_rms_norm_inplace( struct ggml_tensor * 
ggml_rms_norm_back( struct ggml_context * ctx, struct ggml_tensor * a, - struct ggml_tensor * b) { + struct ggml_tensor * b, + float eps) { bool is_node = false; if (a->grad) { @@ -5834,6 +5835,8 @@ struct ggml_tensor * ggml_rms_norm_back( struct ggml_tensor * result = ggml_dup_tensor(ctx, a); + ggml_set_op_params(result, &eps, sizeof(eps)); + result->op = GGML_OP_RMS_NORM_BACK; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; @@ -10211,7 +10214,8 @@ static void ggml_compute_forward_rms_norm_back_f32( GGML_TENSOR_BINARY_OP_LOCALS; - const float eps = 1e-6f; // TODO: make this a parameter + float eps; + memcpy(&eps, dst->op_params, sizeof(float)); // TODO: optimize for (int64_t i03 = 0; i03 < ne03; i03++) { @@ -15029,9 +15033,12 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor { // necessary for llama if (src0->grad) { + float eps; + memcpy(&eps, tensor->op_params, sizeof(float)); + src0->grad = ggml_add_impl(ctx, src0->grad, - ggml_rms_norm_back(ctx, src0, tensor->grad), + ggml_rms_norm_back(ctx, src0, tensor->grad, eps), inplace); } } break; diff --git a/ggml.h b/ggml.h index 3980c005036bb..9e8ed956eae4e 100644 --- a/ggml.h +++ b/ggml.h @@ -894,11 +894,11 @@ extern "C" { // a - x // b - dy - // TODO: update with configurable eps GGML_API struct ggml_tensor * ggml_rms_norm_back( struct ggml_context * ctx, struct ggml_tensor * a, - struct ggml_tensor * b); + struct ggml_tensor * b, + float eps); // A: n columns, m rows // B: n columns, p rows (i.e. we transpose it internally) From c1a5e116a45227fcd48b14b9db27995b922a7b0d Mon Sep 17 00:00:00 2001 From: xaedes Date: Fri, 28 Jul 2023 23:10:55 +0200 Subject: [PATCH 035/100] llama training : fix ggml_rms_norm_back calls to pass configurable eps --- .../train-text-from-scratch.cpp | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index 2c17d0b99e349..70fcdc5decfc5 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -1838,7 +1838,7 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train( clr_buf(0); use_buf(0); - t30->grad = expand(gb, ggml_rms_norm_back(ctx0, t30, back_layer_inp->grad)); assert_shape_2d(t30->grad, n_embd, N*n_batch); + t30->grad = expand(gb, ggml_rms_norm_back(ctx0, t30, back_layer_inp->grad, rms_norm_eps)); assert_shape_2d(t30->grad, n_embd, N*n_batch); if (grad_layer_inp) { t30->grad = expand(gb, ggml_add(ctx0, t30->grad, grad_layer_inp->grad)); assert_shape_2d(t30->grad, n_embd, N*n_batch); } @@ -1854,7 +1854,7 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train( t23->grad = expand(gb, ggml_mul(ctx0, t24->grad, t22)); assert_shape_2d(t23->grad, n_embd, N*n_batch); t22->grad = expand(gb, ggml_mul(ctx0, t24->grad, ggml_repeat(ctx0, layer.ffn_norm, t24->grad))); assert_shape_2d(t22->grad, n_embd, N*n_batch); use_buf(1); - t21->grad = expand(gb, ggml_add(ctx0, t30->grad, ggml_rms_norm_back(ctx0, t21, t22->grad))); assert_shape_2d(t21->grad, n_embd, N*n_batch); + t21->grad = expand(gb, ggml_add(ctx0, t30->grad, ggml_rms_norm_back(ctx0, t21, t22->grad, rms_norm_eps))); assert_shape_2d(t21->grad, n_embd, N*n_batch); grad_layer_inp = t21; use_buf(0); t20->grad = t21->grad; assert_shape_2d(t20->grad, n_embd, N*n_batch); @@ -1899,9 +1899,9 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train( } 
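// ---- illustrative sketch (not part of the patch) ----
// What the configurable eps refers to: RMS normalization divides by sqrt(mean(x^2) + eps),
// so the backward pass has to use the same eps as the forward pass. Reference version for
// one row of n floats:
#include <math.h>

static void rms_norm_row(const float * x, float * y, int n, float eps) {
    float sum = 0.0f;
    for (int i = 0; i < n; ++i) {
        sum += x[i]*x[i];
    }
    const float scale = 1.0f / sqrtf(sum/n + eps);   // 1/RMS, stabilized by eps
    for (int i = 0; i < n; ++i) {
        y[i] = x[i] * scale;
    }
}
// ---- end of sketch ----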
clr_buf(0); use_buf(0); - t01->grad = expand(gb, ggml_add_inplace(ctx0, grad_layer_inp->grad, ggml_rms_norm_back(ctx0, t01, back_layer_inp->grad))); assert_shape_2d(t01->grad, n_embd, N*n_batch); + t01->grad = expand(gb, ggml_add_inplace(ctx0, grad_layer_inp->grad, ggml_rms_norm_back(ctx0, t01, back_layer_inp->grad, rms_norm_eps))); assert_shape_2d(t01->grad, n_embd, N*n_batch); use_buf(-1); - model->tok_embeddings->grad = expand(gb, ggml_get_rows_back(ctx0, t01->grad, t00, model->tok_embeddings)); assert_shape_2d(model->tok_embeddings->grad, n_embd, n_vocab); + model->tok_embeddings->grad = expand(gb, ggml_get_rows_back(ctx0, t01->grad, t00, model->tok_embeddings)); assert_shape_2d(model->tok_embeddings->grad, n_embd, n_vocab); // clr_buf(1); // clr_buf(0); @@ -2396,9 +2396,9 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train_grad_checkpointing( clr_buf(0); use_buf(0); - t30->grad = expand(gb, ggml_rms_norm_back(ctx0, t30, back_layer_inp->grad)); assert_shape_2d(t30->grad, n_embd, N*n_batch); + t30->grad = expand(gb, ggml_rms_norm_back(ctx0, t30, back_layer_inp->grad, rms_norm_eps)); assert_shape_2d(t30->grad, n_embd, N*n_batch); if (grad_layer_inp) { - t30->grad = expand(gb, ggml_add(ctx0, t30->grad, grad_layer_inp->grad)); assert_shape_2d(t30->grad, n_embd, N*n_batch); + t30->grad = expand(gb, ggml_add(ctx0, t30->grad, grad_layer_inp->grad)); assert_shape_2d(t30->grad, n_embd, N*n_batch); } clr_buf(1); t29->grad = t30->grad; assert_shape_2d(t29->grad, n_embd, N*n_batch); @@ -2412,7 +2412,7 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train_grad_checkpointing( t23->grad = expand(gb, ggml_mul(ctx0, t24->grad, t22)); assert_shape_2d(t23->grad, n_embd, N*n_batch); t22->grad = expand(gb, ggml_mul(ctx0, t24->grad, ggml_repeat(ctx0, layer.ffn_norm, t24->grad))); assert_shape_2d(t22->grad, n_embd, N*n_batch); use_buf(1); - t21->grad = expand(gb, ggml_add(ctx0, t30->grad, ggml_rms_norm_back(ctx0, t21, t22->grad))); assert_shape_2d(t21->grad, n_embd, N*n_batch); + t21->grad = expand(gb, ggml_add(ctx0, t30->grad, ggml_rms_norm_back(ctx0, t21, t22->grad, rms_norm_eps))); assert_shape_2d(t21->grad, n_embd, N*n_batch); grad_layer_inp = t21; use_buf(0); t20->grad = t21->grad; assert_shape_2d(t20->grad, n_embd, N*n_batch); @@ -2458,9 +2458,9 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train_grad_checkpointing( GGML_ASSERT(avail_begin == 0); clr_buf(0); use_buf(0); - t01->grad = expand(gb, ggml_add_inplace(ctx0, grad_layer_inp->grad, ggml_rms_norm_back(ctx0, t01, back_layer_inp->grad))); assert_shape_2d(t01->grad, n_embd, N*n_batch); + t01->grad = expand(gb, ggml_add_inplace(ctx0, grad_layer_inp->grad, ggml_rms_norm_back(ctx0, t01, back_layer_inp->grad, rms_norm_eps))); assert_shape_2d(t01->grad, n_embd, N*n_batch); use_buf(-1); - model->tok_embeddings->grad = expand(gb, ggml_get_rows_back(ctx0, t01->grad, t00, model->tok_embeddings)); assert_shape_2d(model->tok_embeddings->grad, n_embd, n_vocab); + model->tok_embeddings->grad = expand(gb, ggml_get_rows_back(ctx0, t01->grad, t00, model->tok_embeddings)); assert_shape_2d(model->tok_embeddings->grad, n_embd, n_vocab); *logits = t35; From 22cb368dd964cb0506da1002e084fdc5ee92b23e Mon Sep 17 00:00:00 2001 From: xaedes Date: Fri, 28 Jul 2023 23:55:30 +0200 Subject: [PATCH 036/100] remove trailing whitespace --- tests/test-grad0.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/test-grad0.c b/tests/test-grad0.c index ef608a01d3a45..079eef7e02e37 100644 --- a/tests/test-grad0.c +++ 
b/tests/test-grad0.c @@ -1348,9 +1348,9 @@ int main(int argc, const char ** argv) { float eps = 1e-6f; // dont use only sum as aggregation, because sum of softmax is always 1 -> finite differences should not work // instead use sum(log(soft_max()*(1-eps)+eps)); use eps to avoid log(0) - struct ggml_tensor * f = ggml_sum(ctx0, - ggml_log(ctx0, - ggml_add1(ctx0, + struct ggml_tensor * f = ggml_sum(ctx0, + ggml_log(ctx0, + ggml_add1(ctx0, ggml_scale(ctx0, ggml_soft_max(ctx0, x[0]), ggml_new_f32(ctx0, 1.0f - eps)), From 2bf422eafd9fd8a7bd5a065b51975f84cdf3ca2b Mon Sep 17 00:00:00 2001 From: xaedes Date: Sun, 6 Aug 2023 23:07:57 +0200 Subject: [PATCH 037/100] add train function using automatic gradient checkpointing backward pass and allocator --- .../train-text-from-scratch.cpp | 286 ++++++++++++++++++ 1 file changed, 286 insertions(+) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index 70fcdc5decfc5..76e6ace640a66 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -1,4 +1,5 @@ #include "ggml.h" +#include "ggml-alloc.h" #include "llama.h" #include #include @@ -1342,6 +1343,291 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn( return inpL; } + +static size_t hash(void * p) { + return (size_t)p % GGML_GRAPH_HASHTABLE_SIZE; +} + +static size_t hash_find(void * hash_table[], void * p) { + size_t h = hash(p); + + // linear probing + size_t i = h; + while (hash_table[i] != NULL && hash_table[i] != p) { + i = (i + 1) % GGML_GRAPH_HASHTABLE_SIZE; + if (i == h) { + // visited all hash table entries -> not found + return GGML_GRAPH_HASHTABLE_SIZE; + } + } + return i; +} + +static bool hash_insert(void * hash_table[], void * p) { + size_t h = hash(p); + size_t i = hash_find(hash_table, p); + + GGML_ASSERT(i < GGML_GRAPH_HASHTABLE_SIZE); // assert that not full + + if (hash_table[i] == p) { + return true; + } + + // insert + GGML_ASSERT(hash_table[i] == NULL); + hash_table[i] = p; + return false; +} + +static bool hash_contains(void * hash_table[], void * p) { + size_t i = hash_find(hash_table, p); + return (i < GGML_GRAPH_HASHTABLE_SIZE) && (hash_table[i] == p); +} + +struct hash_map { + void * keys[GGML_GRAPH_HASHTABLE_SIZE]; + void * vals[GGML_GRAPH_HASHTABLE_SIZE]; +}; +static const size_t HASH_MAP_SIZE = sizeof(struct hash_map); + +struct hash_map * new_hash_map(struct ggml_context * ctx, struct ggml_tensor * * out_buf) { + struct ggml_tensor * buf = ggml_new_tensor_1d(ctx, GGML_TYPE_I8, HASH_MAP_SIZE); + if (out_buf) { + * out_buf = buf; + } + struct hash_map * result = (struct hash_map *) ((char *) buf->data); + *result = (struct hash_map) { + /*.keys =*/ { NULL }, + /*.vals =*/ { NULL }, + }; + for (int i=0; i<GGML_GRAPH_HASHTABLE_SIZE; ++i) { + result->keys[i] = NULL; + result->vals[i] = NULL; + } + return result; +}; + +struct ggml_tensor * ggml_recompute_graph_node( + struct ggml_context * ctx, + struct ggml_cgraph * graph, + struct hash_map * replacements, + struct ggml_tensor * node) { + + if (node == NULL) { + return NULL; + } + + if (node->is_param) { + return node; + } + + if (!hash_contains(graph->visited_hash_table, node)) { + return node; + } + + size_t i = hash_find(replacements->keys, node); + GGML_ASSERT(i < GGML_GRAPH_HASHTABLE_SIZE); // assert that not full + if (replacements->keys[i] == node) { + return replacements->vals[i]; + } + + struct ggml_tensor * clone = ggml_new_tensor(ctx, node->type, node->n_dims, node->ne); + + // insert clone 
into replacements + GGML_ASSERT(replacements->keys[i] == NULL); // assert that we don't overwrite + replacements->keys[i] = node; + replacements->vals[i] = clone; + + clone->op = node->op; + clone->grad = node->grad; + clone->is_param = node->is_param; + clone->extra = node->extra; + for (int k = 0; k < GGML_MAX_SRC; ++k) { + clone->src[k] = ggml_recompute_graph_node(ctx, graph, replacements, node->src[k]); + } + + GGML_ASSERT(sizeof(node->op_params) == sizeof(int32_t) * (GGML_MAX_OP_PARAMS / sizeof(int32_t))); + GGML_ASSERT(sizeof(node->name) == GGML_MAX_NAME); + memcpy(clone->op_params, node->op_params, sizeof(node->op_params)); + memcpy(clone->name, node->name, sizeof(node->name)); + + return clone; +}; + +void ggml_build_backward_gradient_checkpointing( + struct ggml_context * ctx, + struct ggml_cgraph * gf, + struct ggml_cgraph * gb, + struct ggml_cgraph * gb_tmp, + struct ggml_tensor * * checkpoints, + int n_checkpoints) { + *gb_tmp = *gf; + ggml_build_backward_expand(ctx, gf, gb_tmp, true); + + if (n_checkpoints <= 0) { + *gb = *gb_tmp; + return; + } + + struct hash_map * replacements = new_hash_map(ctx, NULL); + + // insert checkpoints in replacements + for (int i = 0; i < n_checkpoints; ++i) { + size_t k = hash_find(replacements->keys, checkpoints[i]); + GGML_ASSERT(k < GGML_GRAPH_HASHTABLE_SIZE); // assert that not full + GGML_ASSERT(replacements->keys[k] == NULL); // assert that we don't overwrite + replacements->keys[k] = checkpoints[i]; + replacements->vals[k] = checkpoints[i]; + } + + *gb = *gf; + // rewrite gb_tmp->nodes[gf->n_nodes:gb_tmp->n_nodes], + // replacing references to gb_tmp->nodes[0:gf->n_nodes] ( == gf->nodes[0:gf->n_nodes]), + // by recomputing them from checkpoints + for (int i = gf->n_nodes; i<gb_tmp->n_nodes; ++i) { + struct ggml_tensor * node = gb_tmp->nodes[i]; + for (int k = 0; k < GGML_MAX_SRC; ++k) { + // insert new tensors recomputing src, reusing already made replacements, + // remember replacements: remember new tensors with mapping from corresponding gf nodes + // recurse for input tensors, + // unless (i.e. 
terminating when) input tensors are checkpoints + node->src[k] = ggml_recompute_graph_node(ctx, gf, replacements, node->src[k]); + } + // insert rewritten backward node with replacements made into resulting backward graph gb + ggml_build_forward_expand(gb, node); + } +} + +struct ggml_tensor * llama_build_train_graphs( + struct my_llama_model * model, + struct ggml_allocr * alloc, + struct ggml_context * ctx, + struct ggml_cgraph * gf, + struct ggml_cgraph * gb, + struct ggml_cgraph * gb_tmp, + struct ggml_tensor * * logits, + struct ggml_tensor * tokens_input, + struct ggml_tensor * targets, + const int n_tokens, + const int n_batch, + const bool enable_flash_attn, + const bool enable_checkpointing) { + + ggml_set_scratch(ctx, { 0, 0, nullptr, }); + const int n_past = 0; + const int N = n_tokens; + const auto & hparams = model->hparams; + const int n_ctx = hparams.n_ctx; + const int n_vocab = hparams.n_vocab; + const int n_embd = hparams.n_embd; + const int n_layer = hparams.n_layer; + const int n_head = hparams.n_head; + const int n_rot = hparams.n_rot; + const int n_ff = get_n_ff(&hparams); + const int rope_mode = 0; + + GGML_ASSERT(tokens_input->type == GGML_TYPE_I32); + struct ggml_tensor * t00 = ggml_reshape_1d(ctx, tokens_input, N*n_batch); assert_shape_1d(t00, N*n_batch); + struct ggml_tensor * t01 = ggml_get_rows(ctx, model->tok_embeddings, t00); assert_shape_2d(t01, n_embd, N*n_batch); + + struct ggml_tensor * cur = t01; + + std::vector<struct ggml_tensor *> checkpoints; + checkpoints.push_back(cur); + + struct ggml_tensor * kv_scale; + if (!enable_flash_attn) { + kv_scale = ggml_new_f32(ctx, 1.0f/sqrtf(float(n_embd)/n_head)); + } + + for (int il = 0; il < n_layer; ++il) { + struct my_llama_layer & layer = model->layers[il]; + struct ggml_tensor * t02 = ggml_rms_norm (ctx, cur, rms_norm_eps); assert_shape_2d(t02, n_embd, N*n_batch); + struct ggml_tensor * t03 = ggml_repeat (ctx, layer.attention_norm, t02); assert_shape_2d(t03, n_embd, N*n_batch); + struct ggml_tensor * t04 = ggml_mul (ctx, t02, t03); assert_shape_2d(t04, n_embd, N*n_batch); + struct ggml_tensor * t05 = ggml_mul_mat (ctx, layer.wq, t04); assert_shape_2d(t05, n_embd, N*n_batch); + struct ggml_tensor * t06 = ggml_reshape_4d (ctx, t05, n_embd/n_head, n_head, N, n_batch); assert_shape_4d(t06, n_embd/n_head, n_head, N, n_batch); + struct ggml_tensor * t07 = ggml_rope_inplace (ctx, t06, n_past, n_rot, rope_mode, n_ctx); assert_shape_4d(t07, n_embd/n_head, n_head, N, n_batch); + struct ggml_tensor * t08 = ggml_mul_mat (ctx, layer.wk, t04); assert_shape_2d(t08, n_embd, N*n_batch); + struct ggml_tensor * t09 = ggml_reshape_4d (ctx, t08, n_embd/n_head, n_head, N, n_batch); assert_shape_4d(t09, n_embd/n_head, n_head, N, n_batch); + struct ggml_tensor * t10 = ggml_rope_inplace (ctx, t09, n_past, n_rot, rope_mode, n_ctx); assert_shape_4d(t10, n_embd/n_head, n_head, N, n_batch); + struct ggml_tensor * t11 = ggml_mul_mat (ctx, t04, layer.wv); assert_shape_2d(t11, N*n_batch, n_embd); + struct ggml_tensor * t12 = ggml_reshape_4d (ctx, t11, N, n_batch, n_embd/n_head, n_head); assert_shape_4d(t12, N, n_batch, n_embd/n_head, n_head); + struct ggml_tensor * t13 = ggml_permute (ctx, t07, 0, 2, 1, 3); assert_shape_4d(t13, n_embd/n_head, N, n_head, n_batch); + struct ggml_tensor * t14 = ggml_permute (ctx, t10, 0, 2, 1, 3); assert_shape_4d(t14, n_embd/n_head, N, n_head, n_batch); + struct ggml_tensor * t15 = ggml_permute (ctx, t12, 0, 3, 1, 2); assert_shape_4d(t15, N, n_embd/n_head, n_head, n_batch); + struct ggml_tensor * t16; + if (enable_flash_attn) { + t16 = 
ggml_flash_attn(ctx, t13, t14, t15, true); assert_shape_4d(t16, n_embd/n_head, N, n_head, n_batch); + } else { + struct ggml_tensor * t16_0 = ggml_mul_mat (ctx, t14, t13); assert_shape_4d(t16_0, N, N, n_head, n_batch); + struct ggml_tensor * t16_1 = ggml_scale_inplace (ctx, t16_0, kv_scale); assert_shape_4d(t16_1, N, N, n_head, n_batch); + struct ggml_tensor * t16_2 = ggml_diag_mask_inf_inplace(ctx, t16_1, n_past); assert_shape_4d(t16_2, N, N, n_head, n_batch); + struct ggml_tensor * t16_3 = ggml_soft_max_inplace (ctx, t16_2); assert_shape_4d(t16_3, N, N, n_head, n_batch); + t16 = ggml_mul_mat(ctx, t15, t16_3); assert_shape_4d(t16, n_embd/n_head, N, n_head, n_batch); + } + struct ggml_tensor * t17 = ggml_permute (ctx, t16, 0, 2, 1, 3); assert_shape_4d(t17, n_embd/n_head, n_head, N, n_batch); + struct ggml_tensor * t18 = ggml_cont (ctx, t17); assert_shape_4d(t18, n_embd/n_head, n_head, N, n_batch); + struct ggml_tensor * t19 = ggml_reshape_2d (ctx, t18, n_embd, N*n_batch); assert_shape_2d(t19, n_embd, N*n_batch); + struct ggml_tensor * t20 = ggml_mul_mat (ctx, layer.wo, t19); assert_shape_2d(t20, n_embd, N*n_batch); + struct ggml_tensor * t21 = ggml_add (ctx, t20, cur); assert_shape_2d(t21, n_embd, N*n_batch); + struct ggml_tensor * t22 = ggml_rms_norm (ctx, t21, rms_norm_eps); assert_shape_2d(t22, n_embd, N*n_batch); + struct ggml_tensor * t23 = ggml_repeat (ctx, layer.ffn_norm, t22); assert_shape_2d(t23, n_embd, N*n_batch); + struct ggml_tensor * t24 = ggml_mul (ctx, t23, t22); assert_shape_2d(t24, n_embd, N*n_batch); + struct ggml_tensor * t25 = ggml_mul_mat (ctx, layer.w3, t24); assert_shape_2d(t25, n_ff, N*n_batch); + struct ggml_tensor * t26 = ggml_mul_mat (ctx, layer.w1, t24); assert_shape_2d(t26, n_ff, N*n_batch); + struct ggml_tensor * t27 = ggml_silu (ctx, t26); assert_shape_2d(t27, n_ff, N*n_batch); + struct ggml_tensor * t28 = ggml_mul (ctx, t27, t25); assert_shape_2d(t28, n_ff, N*n_batch); + struct ggml_tensor * t29 = ggml_mul_mat (ctx, layer.w2, t28); assert_shape_2d(t29, n_embd, N*n_batch); + struct ggml_tensor * t30 = ggml_add (ctx, t21, t29); assert_shape_2d(t30, n_embd, N*n_batch); + cur = t30; + checkpoints.push_back(cur); + } + struct ggml_tensor * t31 = ggml_rms_norm (ctx, cur, rms_norm_eps); assert_shape_2d(t31, n_embd, N*n_batch); + struct ggml_tensor * t32 = ggml_repeat (ctx, model->norm, t31); assert_shape_2d(t32, n_embd, N*n_batch); + struct ggml_tensor * t33 = ggml_mul (ctx, t32, t31); assert_shape_2d(t33, n_embd, N*n_batch); + struct ggml_tensor * t34 = ggml_mul_mat (ctx, model->output, t33); assert_shape_2d(t34, n_vocab, N*n_batch); + struct ggml_tensor * t35 = ggml_reshape_3d (ctx, t34, n_vocab, N, n_batch); assert_shape_3d(t35, n_vocab, N, n_batch); + struct ggml_tensor * t36 = ggml_cross_entropy_loss(ctx, t35, targets); assert_shape_1d(t36, 1); + + checkpoints.push_back(t31); + checkpoints.push_back(t32); + checkpoints.push_back(t33); + checkpoints.push_back(t34); + checkpoints.push_back(t35); + checkpoints.push_back(t36); + + ggml_build_forward_expand(gf, t36); + + if (enable_checkpointing) { + ggml_build_backward_gradient_checkpointing(ctx, gf, gb, gb_tmp, checkpoints.data(), (int) checkpoints.size()); + } else { + *gb = *gf; + ggml_build_backward_expand(ctx, gf, gb, true); + } + + if (alloc) { + // make sure t35 and t36 are not reallocated by inserting new temporary node depending on them + struct ggml_tensor * dep = ggml_scale_inplace(ctx, t35, t36); + int n_nodes_before = gb->n_nodes; + ggml_build_forward_expand(gb, dep); + + int n_nodes_after = 
gb->n_nodes; + GGML_ASSERT(n_nodes_after == n_nodes_before + 1); + + ggml_allocr_reset(alloc); + ggml_allocr_alloc_graph(alloc, gb); + + // remove the additional node that was insert + gb->nodes[n_nodes_after-1] = NULL; + gb->n_nodes = n_nodes_before; + } + + *logits = t35; + return t36; +} + + // expand the graph nodes without creating leafs. struct ggml_tensor * expand(struct ggml_cgraph * g, struct ggml_tensor * t) { // check if already visited From fc826c8ea81136961a499d48fb3ab2e221bdbeee Mon Sep 17 00:00:00 2001 From: xaedes Date: Mon, 14 Aug 2023 17:49:22 +0200 Subject: [PATCH 038/100] in train function replace add_inplace by regular add because using add_inplace seems to result in different gradients --- .../train-text-from-scratch/train-text-from-scratch.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index 76e6ace640a66..51eb96fc9510c 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -1264,7 +1264,8 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn( assert_shape_2d(cur, n_embd, N*n_batch); } - struct ggml_tensor * inpFF = ggml_add_inplace(ctx0, cur, inpSA); + // struct ggml_tensor * inpFF = ggml_add_inplace(ctx0, cur, inpSA); + struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA); assert_shape_2d(inpFF, n_embd, N*n_batch); // feed-forward network @@ -1304,7 +1305,8 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn( assert_shape_2d(cur, n_embd, N*n_batch); } - cur = ggml_add_inplace(ctx0, cur, inpFF); + // cur = ggml_add_inplace(ctx0, cur, inpFF); + cur = ggml_add(ctx0, cur, inpFF); assert_shape_2d(cur, n_embd, N*n_batch); // input for next layer From d43741540b74fb083aa36ce625409695f39229bf Mon Sep 17 00:00:00 2001 From: xaedes Date: Mon, 14 Aug 2023 17:51:20 +0200 Subject: [PATCH 039/100] don't use allocate hash_map on context because the context has no_alloc=True when using memory allocator resulting in NULL data pointers --- .../train-text-from-scratch.cpp | 20 +++++++++---------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index 51eb96fc9510c..03ec39d860018 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -1392,16 +1392,8 @@ struct hash_map { }; static const size_t HASH_MAP_SIZE = sizeof(struct hash_map); -struct hash_map * new_hash_map(struct ggml_context * ctx, struct ggml_tensor * * out_buf) { - struct ggml_tensor * buf = ggml_new_tensor_1d(ctx, GGML_TYPE_I8, HASH_MAP_SIZE); - if (out_buf) { - * out_buf = buf; - } - struct hash_map * result = (struct hash_map *) ((char *) buf->data); - *result = (struct hash_map) { - /*.keys =*/ { NULL }, - /*.vals =*/ { NULL }, - }; +struct hash_map * new_hash_map() { + struct hash_map * result = new struct hash_map; for (int i=0; ikeys[i] = NULL; result->vals[i] = NULL; @@ -1409,6 +1401,10 @@ struct hash_map * new_hash_map(struct ggml_context * ctx, struct ggml_tensor * * return result; }; +void free_hash_map(struct hash_map * map) { + delete map; +} + struct ggml_tensor * ggml_recompute_graph_node( struct ggml_context * ctx, struct ggml_cgraph * graph, @@ -1471,7 +1467,7 @@ void ggml_build_backward_gradient_checkpointing( return; } - struct 
hash_map * replacements = new_hash_map(ctx, NULL); + struct hash_map * replacements = new_hash_map(); // insert checkpoints in replacements for (int i = 0; i < n_checkpoints; ++i) { @@ -1498,6 +1494,8 @@ void ggml_build_backward_gradient_checkpointing( // insert rewritten backward node with replacements made into resulting backward graph gb ggml_build_forward_expand(gb, node); } + + free_hash_map(replacements); } struct ggml_tensor * llama_build_train_graphs( From cfddc36be220a035ceaab4bb7365b399cc0cf700 Mon Sep 17 00:00:00 2001 From: xaedes Date: Mon, 14 Aug 2023 17:52:15 +0200 Subject: [PATCH 040/100] correctly clone reshape and permute operations by also cloning tensor->nb values --- examples/train-text-from-scratch/train-text-from-scratch.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index 03ec39d860018..92e6315dcb061 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -1440,6 +1440,9 @@ struct ggml_tensor * ggml_recompute_graph_node( clone->grad = node->grad; clone->is_param = node->is_param; clone->extra = node->extra; + for (int k = 0; k < GGML_MAX_DIMS; ++k) { + clone->nb[k] = node->nb[k]; + } for (int k = 0; k < GGML_MAX_SRC; ++k) { clone->src[k] = ggml_recompute_graph_node(ctx, graph, replacements, node->src[k]); } From 0dd496c5e21a6baeb377babd769efa8e01981e9e Mon Sep 17 00:00:00 2001 From: xaedes Date: Mon, 14 Aug 2023 17:52:48 +0200 Subject: [PATCH 041/100] fix variable name and add missing type cast --- examples/train-text-from-scratch/train-text-from-scratch.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index 92e6315dcb061..266f378b9e4ed 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -1425,8 +1425,8 @@ struct ggml_tensor * ggml_recompute_graph_node( size_t i = hash_find(replacements->keys, node); GGML_ASSERT(i < GGML_GRAPH_HASHTABLE_SIZE); // assert that not full - if (replacements->keys[i] == p) { - return replacements->vals[i]; + if (replacements->keys[i] == node) { + return (struct ggml_tensor *) replacements->vals[i]; } struct ggml_tensor * clone = ggml_new_tensor(ctx, node->type, node->n_dims, node->ne); From 52c92c0a8cc88f45bcde9556b4f6d4481a36bd9d Mon Sep 17 00:00:00 2001 From: xaedes Date: Mon, 14 Aug 2023 17:53:36 +0200 Subject: [PATCH 042/100] terminate recursive tensor cloning when reaching tensor without src tensors --- .../train-text-from-scratch.cpp | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index 266f378b9e4ed..9b73361ca7e60 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -1423,6 +1423,17 @@ struct ggml_tensor * ggml_recompute_graph_node( return node; } + int count_children = 0; + for (int k = 0; k < GGML_MAX_SRC; ++k) { + if (node->src[k]) { + ++count_children; + } + } + + if (count_children == 0) { + return node; + } + size_t i = hash_find(replacements->keys, node); GGML_ASSERT(i < GGML_GRAPH_HASHTABLE_SIZE); // assert that not full if 
(replacements->keys[i] == node) { From 345f516f7c3384e38e610e413f9060dd729049f4 Mon Sep 17 00:00:00 2001 From: xaedes Date: Mon, 14 Aug 2023 17:55:13 +0200 Subject: [PATCH 043/100] correctly clone view tensors by setting data pointers without this the checkpointing would only work when being used together with memory allocator --- .../train-text-from-scratch.cpp | 32 +++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index 9b73361ca7e60..410ba69b9fbec 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -1405,6 +1405,33 @@ void free_hash_map(struct hash_map * map) { delete map; } +static bool ggml_is_view(struct ggml_tensor * t) { + return t->op == GGML_OP_RESHAPE || t->op == GGML_OP_VIEW || t->op == GGML_OP_TRANSPOSE || + t->op == GGML_OP_PERMUTE || t->op == GGML_OP_CPY; +} + +static struct ggml_tensor * get_view_parent(struct ggml_tensor * t) { + switch (t->op) { + case GGML_OP_PERMUTE: + case GGML_OP_RESHAPE: + case GGML_OP_TRANSPOSE: + case GGML_OP_VIEW: + return t->src[0]; + case GGML_OP_CPY: + return t->src[1]; + default: + return NULL; + } +} + +static struct ggml_tensor * get_view_source(struct ggml_tensor * t) { + struct ggml_tensor * parent = t; + do { + parent = get_view_parent(parent); + } while (ggml_is_view(parent)); + return parent; +} + struct ggml_tensor * ggml_recompute_graph_node( struct ggml_context * ctx, struct ggml_cgraph * graph, @@ -1457,6 +1484,11 @@ struct ggml_tensor * ggml_recompute_graph_node( for (int k = 0; k < GGML_MAX_SRC; ++k) { clone->src[k] = ggml_recompute_graph_node(ctx, graph, replacements, node->src[k]); } + if (ggml_is_view(clone)) { + struct ggml_tensor * source = get_view_source(clone); + GGML_ASSERT(source != NULL); + clone->data = source->data; + } GGML_ASSERT(sizeof(node->op_params) == sizeof(int32_t) * (GGML_MAX_OP_PARAMS / sizeof(int32_t))); GGML_ASSERT(sizeof(node->name) == GGML_MAX_NAME); From 5a11b75875e2e82501c1bdaf7c4528a3ecc4f4e3 Mon Sep 17 00:00:00 2001 From: xaedes Date: Mon, 14 Aug 2023 17:55:51 +0200 Subject: [PATCH 044/100] fix variable names --- examples/train-text-from-scratch/train-text-from-scratch.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index 410ba69b9fbec..bdc7cdade9a1b 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -1517,9 +1517,9 @@ void ggml_build_backward_gradient_checkpointing( // insert checkpoints in replacements for (int i = 0; i < n_checkpoints; ++i) { - size_t k = hash_find(replacements->keys, node); + size_t k = hash_find(replacements->keys, checkpoints[i]); GGML_ASSERT(k < GGML_GRAPH_HASHTABLE_SIZE); // assert that not full - GGML_ASSERT(replacements->keys[i] == NULL); // assert that we don't overwrite + GGML_ASSERT(replacements->keys[k] == NULL); // assert that we don't overwrite replacements->keys[k] = checkpoints[i]; replacements->vals[k] = checkpoints[i]; } From b2f13101961825ec4ab4b86907d611daff739ffe Mon Sep 17 00:00:00 2001 From: xaedes Date: Mon, 14 Aug 2023 17:57:13 +0200 Subject: [PATCH 045/100] swap arguments to commutative ops to be the same as in `forward_batch_wo_cache_flash_attn` --- 
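Note, as a plain C++ illustration of the checkpointing scheme the preceding patches implement (a toy sketch; the squaring chain and all names below are made up and do not appear in the diffs): the forward pass keeps only every k-th activation as a checkpoint, and the backward pass recomputes each segment's activations from its checkpoint before applying the chain rule, so roughly n/k intermediates are stored instead of n, at the cost of one extra forward pass over each segment.

    // toy gradient checkpointing on a chain y_{i+1} = y_i * y_i (so dy_{i+1}/dy_i = 2*y_i)
    #include <algorithm>
    #include <cstdio>
    #include <vector>

    int main() {
        const int    n_steps = 12;   // length of the op chain
        const int    k       = 4;    // keep a checkpoint every k steps
        const double x0      = 1.01; // input value

        // forward: store only the activations at steps 0, k, 2k, ...
        std::vector<double> checkpoints;
        double y = x0;
        for (int i = 0; i < n_steps; ++i) {
            if (i % k == 0) {
                checkpoints.push_back(y);
            }
            y = y*y;
        }
        const double loss = y; // pretend the final activation is the loss

        // backward: walk the segments in reverse; recompute each segment's
        // activations from its checkpoint, then apply the chain rule through it
        double grad = 1.0; // dloss/dy_n
        for (int seg = (n_steps - 1)/k; seg >= 0; --seg) {
            const int begin = seg*k;
            const int end   = std::min(begin + k, n_steps);

            std::vector<double> acts; // recomputed y_begin .. y_{end-1}
            double t = checkpoints[seg];
            for (int i = begin; i < end; ++i) {
                acts.push_back(t);
                t = t*t;
            }
            for (int i = end - 1; i >= begin; --i) {
                grad *= 2.0*acts[i - begin];
            }
        }
        printf("loss = %g, dloss/dx0 = %g\n", loss, grad);
        return 0;
    }

ggml_build_backward_gradient_checkpointing achieves the same effect declaratively: instead of recomputing eagerly, it splices cloned recomputation nodes into the backward graph and stops the recursion at the supplied checkpoint tensors.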
examples/train-text-from-scratch/train-text-from-scratch.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index bdc7cdade9a1b..d5fde1ca59461 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -1590,7 +1590,7 @@ struct ggml_tensor * llama_build_train_graphs( struct my_llama_layer & layer = model->layers[il]; struct ggml_tensor * t02 = ggml_rms_norm (ctx, cur, rms_norm_eps); assert_shape_2d(t02, n_embd, N*n_batch); struct ggml_tensor * t03 = ggml_repeat (ctx, layer.attention_norm, t02); assert_shape_2d(t03, n_embd, N*n_batch); - struct ggml_tensor * t04 = ggml_mul (ctx, t02, t03); assert_shape_2d(t04, n_embd, N*n_batch); + struct ggml_tensor * t04 = ggml_mul (ctx, t03, t02); assert_shape_2d(t04, n_embd, N*n_batch); struct ggml_tensor * t05 = ggml_mul_mat (ctx, layer.wq, t04); assert_shape_2d(t05, n_embd, N*n_batch); struct ggml_tensor * t06 = ggml_reshape_4d (ctx, t05, n_embd/n_head, n_head, N, n_batch); assert_shape_4d(t06, n_embd/n_head, n_head, N, n_batch); struct ggml_tensor * t07 = ggml_rope_inplace (ctx, t06, n_past, n_rot, rope_mode, n_ctx); assert_shape_4d(t07, n_embd/n_head, n_head, N, n_batch); @@ -1625,7 +1625,7 @@ struct ggml_tensor * llama_build_train_graphs( struct ggml_tensor * t27 = ggml_silu (ctx, t26); assert_shape_2d(t27, n_ff, N*n_batch); struct ggml_tensor * t28 = ggml_mul (ctx, t27, t25); assert_shape_2d(t28, n_ff, N*n_batch); struct ggml_tensor * t29 = ggml_mul_mat (ctx, layer.w2, t28); assert_shape_2d(t29, n_embd, N*n_batch); - struct ggml_tensor * t30 = ggml_add (ctx, t21, t29); assert_shape_2d(t30, n_embd, N*n_batch); + struct ggml_tensor * t30 = ggml_add (ctx, t29, t21); assert_shape_2d(t30, n_embd, N*n_batch); cur = t30; checkpoints.push_back(cur); } From 5884b43a622a88f7f2fddf73f97ae9f50137efdc Mon Sep 17 00:00:00 2001 From: xaedes Date: Mon, 14 Aug 2023 17:58:49 +0200 Subject: [PATCH 046/100] add input tensors as checkpoints so that recursive tensor cloning of gradient checkpointing terminates on input tensors --- examples/train-text-from-scratch/train-text-from-scratch.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index d5fde1ca59461..48edf3651145f 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -1579,7 +1579,10 @@ struct ggml_tensor * llama_build_train_graphs( struct ggml_tensor * cur = t01; std::vector checkpoints; - checkpoints.push_back(cur); + checkpoints.push_back(tokens_input); + checkpoints.push_back(targets); + checkpoints.push_back(t00); + checkpoints.push_back(t01); struct ggml_tensor * kv_scale; if (flash_attn) { From 9716eb8ef0830e54badb729244175329744d9b99 Mon Sep 17 00:00:00 2001 From: xaedes Date: Mon, 14 Aug 2023 17:59:19 +0200 Subject: [PATCH 047/100] fix variable name and add missing boolean negation --- examples/train-text-from-scratch/train-text-from-scratch.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index 48edf3651145f..88a1c3a504108 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp 
+++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -1585,8 +1585,8 @@ struct ggml_tensor * llama_build_train_graphs( checkpoints.push_back(t01); struct ggml_tensor * kv_scale; - if (flash_attn) { - kv_scale = ggml_new_f32(ctx, 1.0f/sqrtf(float(n_embd)/n_head))); + if (!enable_flash_attn) { + kv_scale = ggml_new_f32(ctx, 1.0f/sqrtf(float(n_embd)/n_head)); } for (int il = 0; il < n_layer; ++il) { From 38f4438c32def72e7a0fd42f9caba9df80a5cc32 Mon Sep 17 00:00:00 2001 From: xaedes Date: Mon, 14 Aug 2023 18:07:16 +0200 Subject: [PATCH 048/100] make sure some tensors are not reallocated by inserting new temporary nodes depending on them: output and parameter gradient tensors need to be available at the end of the graph execution parameter gradient tensors also need to be available before the graph execution because they are set to zero before each optimizer iteration checkpoint tensors are allocated all together to reduce memory allocator fragmentation afterwards, in addition to the temporary nodes, we also need to reset the temporary leafs --- .../train-text-from-scratch.cpp | 31 ++++++++++++++----- 1 file changed, 24 insertions(+), 7 deletions(-) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index 88a1c3a504108..0583250598762 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -1656,19 +1656,36 @@ struct ggml_tensor * llama_build_train_graphs( } if (alloc) { - // make sure t35 and t36 are not reallocated by inserting new temporary node depending on them - struct ggml_tensor * dep = ggml_scale_inplace(ctx, t35, t36); + // make sure some tensors are not reallocated by inserting new temporary nodes depending on them + int n_leafs_before = gb->n_leafs; int n_nodes_before = gb->n_nodes; - ggml_build_forward_expand(gb, dep); + struct ggml_tensor * one = ggml_new_f32(ctx, 1.0f); + // output tensors + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t35, one)); + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t36, one)); + // gradient tensors (will be set to zero by ggml_graph_reset) + for (int i = 0; i < gf->n_nodes; ++i) { + if (!gf->grads[i]) continue; + ggml_allocr_alloc(alloc, gf->grads[i]); + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, gf->grads[i], one)); + } + for (int i = 0; i < checkpoints.size(); ++i) { + ggml_allocr_alloc(alloc, checkpoints[i]); + } + int n_leafs_after = gb->n_leafs; int n_nodes_after = gb->n_nodes; - GGML_ASSERT(n_nodes_after == n_nodes_before + 1); - ggml_allocr_reset(alloc); ggml_allocr_alloc_graph(alloc, gb); - // remove the additional node that was insert - gb->nodes[n_nodes_after-1] = NULL; + // remove the additional nodes and leafs + for (int i = n_leafs_before; i < gb->n_leafs; ++i) { + gb->leafs[i] = NULL; + } + for (int i = n_nodes_before; i < gb->n_nodes; ++i) { + gb->nodes[i] = NULL; + } + gb->n_leafs = n_leafs_before; gb->n_nodes = n_nodes_before; } From d6c5b03858aff6d68f1549d64343b566c53b3830 Mon Sep 17 00:00:00 2001 From: xaedes Date: Mon, 14 Aug 2023 18:08:19 +0200 Subject: [PATCH 049/100] fix ASSERT to work with zero layers --- examples/train-text-from-scratch/train-text-from-scratch.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index 0583250598762..28fbd2dc84ea6 100644 --- 
a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -2806,7 +2806,7 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train_grad_checkpointing( layer.w3->grad = expand(gb, add_or_set(layer.w3->grad, ggml_out_prod(ctx0, t24, t25->grad))); assert_shape_2d(layer.w3->grad, n_embd, n_ff); } // printf("%s: chk_idx=%d avail_begin=%d avail_end=%d\n", __func__, chk_idx, avail_begin, avail_end); - GGML_ASSERT(chk_idx == -2); + GGML_ASSERT(n_check == 0 || chk_idx == -2); GGML_ASSERT(avail_begin == 0); clr_buf(0); use_buf(0); From 4ed096c6b086af80032d4b41138a4cc932bb3426 Mon Sep 17 00:00:00 2001 From: xaedes Date: Mon, 14 Aug 2023 18:10:02 +0200 Subject: [PATCH 050/100] add training options whether to use allocator and/or unified training function --- .../train-text-from-scratch.cpp | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index 28fbd2dc84ea6..15f60513f3be2 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -3721,6 +3721,8 @@ struct train_params { bool use_flash; bool use_scratch; bool use_checkpointing; + bool use_alloc; + bool use_unified; // only adam int warmup; @@ -3782,6 +3784,8 @@ struct train_params get_default_train_params() { params.use_flash = true; params.use_scratch = true; params.use_checkpointing = true; + params.use_alloc = true; + params.use_unified = true; params.opt_past = 0; params.opt_delta = 1e-5f; @@ -3845,6 +3849,10 @@ void train_print_usage(int /*argc*/, char ** argv, const struct train_params * p fprintf(stderr, " --use-scratch Use scratch buffers. Implies use-flash. (default)\n"); fprintf(stderr, " --no-checkpointing Don't use gradient checkpointing\n"); fprintf(stderr, " --use-checkpointing Use gradient checkpointing. Implies use-scratch and use-flash. (default)\n"); + fprintf(stderr, " --no-alloc Don't use allocator\n"); + fprintf(stderr, " --use-alloc Use allocator. Implies use-unified. (default)\n"); + fprintf(stderr, " --no-unified Don't use unified\n"); + fprintf(stderr, " --use-unified Use unified. (default)\n"); fprintf(stderr, " --warmup N Only for Adam optimizer. Number of warmup steps (default %d)\n", params->warmup); fprintf(stderr, " --cos-decay-steps N Only for Adam optimizer. Number of cosine decay steps (default %d)\n", params->cos_decay_steps); fprintf(stderr, " --cos-decay-restart N Only for Adam optimizer. 
Increase of cosine decay steps after restart (default %f)\n", params->cos_decay_restart); @@ -4010,6 +4018,14 @@ bool train_params_parse(int argc, char ** argv, struct train_params * params) { params->use_checkpointing = false; } else if (arg == "--use-checkpointing") { params->use_checkpointing = true; + } else if (arg == "--no-alloc") { + params->use_alloc = false; + } else if (arg == "--use-alloc") { + params->use_alloc = true; + } else if (arg == "--no-unified") { + params->use_unified = false; + } else if (arg == "--use-unified") { + params->use_unified = true; } else if (arg == "--warmup") { if (++i >= argc) { invalid_param = true; From 865c4cd3c1ceceab3e7a4b537b03051befbbc6bc Mon Sep 17 00:00:00 2001 From: xaedes Date: Mon, 14 Aug 2023 18:12:58 +0200 Subject: [PATCH 051/100] integrate unified training function which may use memory allocator the unified training function also supports arguments whether to use flash attention and/or gradient checkpointing --- .../train-text-from-scratch.cpp | 43 +++++++++++++++---- 1 file changed, 34 insertions(+), 9 deletions(-) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index 15f60513f3be2..a4b41e7fb8c44 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -4391,6 +4391,12 @@ int main(int argc, char ** argv) { uint8_t * compute_buf_1 = new uint8_t[size_buf_1]; uint8_t * compute_buf_2 = new uint8_t[size_buf_2]; + ggml_allocr * alloc = NULL; + if (params.use_alloc) { + static const size_t tensor_alignment = 32; + alloc = ggml_allocr_new(compute_buf_0, size_buf_0, tensor_alignment); + } + GGML_ASSERT(n_tokens < (int) train_tokens.size()); std::vector train_samples; train_samples.push_back(0); @@ -4437,33 +4443,48 @@ int main(int argc, char ** argv) { }; struct ggml_context * ctx0 = ggml_init(cparams); + ggml_set_no_alloc(ctx0, false); + + // don't use alloc for input tensors, so we can safely fill them with data struct ggml_tensor * after_opt_best_samples = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_tokens, n_batch); //struct ggml_tensor * after_opt_probs = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_vocab, n_tokens, n_batch); struct ggml_tensor * tokens_input = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_tokens, n_batch); struct ggml_tensor * target_logits = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_vocab, n_tokens, n_batch); struct ggml_tensor * target_probs = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_vocab, n_tokens, n_batch); + ggml_set_no_alloc(ctx0, (alloc != NULL)); + + if (alloc) { + ggml_allocr_reset(alloc); + } + opt_cb_data.tokens_input = tokens_input; opt_cb_data.target_logits = target_logits; opt_cb_data.target_probs = target_probs; int n_past = 0; - struct ggml_tensor * gfbuf = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, sizeof(struct ggml_cgraph) / ggml_type_size(GGML_TYPE_I32) + (sizeof(struct ggml_cgraph) % ggml_type_size(GGML_TYPE_I32) ? 1 : 0)); - struct ggml_tensor * gbbuf = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, sizeof(struct ggml_cgraph) / ggml_type_size(GGML_TYPE_I32) + (sizeof(struct ggml_cgraph) % ggml_type_size(GGML_TYPE_I32) ? 
1 : 0)); - - memset(gfbuf->data, 0, ggml_nbytes(gfbuf)); - memset(gbbuf->data, 0, ggml_nbytes(gbbuf)); - - struct ggml_cgraph * gf = (struct ggml_cgraph *) gfbuf->data; - struct ggml_cgraph * gb = (struct ggml_cgraph *) gbbuf->data; + struct ggml_cgraph * gf = ggml_new_graph(ctx0); + struct ggml_cgraph * gb = ggml_new_graph(ctx0); + struct ggml_cgraph * gb_tmp = (params.use_unified || params.use_alloc) + ? ggml_new_graph(ctx0) + : NULL; GGML_ASSERT(n_past == 0); struct ggml_tensor * loss = NULL; struct ggml_tensor * logits = NULL; - if (params.use_checkpointing) { + if (params.use_alloc || params.use_unified) { + loss = llama_build_train_graphs( + &model, alloc, ctx0, + gf, gb, gb_tmp, + &logits, tokens_input, target_probs, + n_tokens, n_batch, + params.use_flash, + params.use_checkpointing + ); + } else if (params.use_checkpointing) { loss = forward_batch_wo_cache_flash_attn_train_grad_checkpointing( &model, ctx0, gf, gb, @@ -4641,6 +4662,10 @@ int main(int argc, char ** argv) { } } + if (alloc) { + ggml_allocr_free(alloc); + } + delete[] compute_addr; delete[] compute_buf_0; delete[] compute_buf_1; From 3e99a8d65369ba2bdcc8eff1e6036fe11966cadc Mon Sep 17 00:00:00 2001 From: xaedes Date: Mon, 14 Aug 2023 18:15:09 +0200 Subject: [PATCH 052/100] format name of cloned tensors with " (clone)" suffix --- examples/train-text-from-scratch/train-text-from-scratch.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index a4b41e7fb8c44..7983b3bfab027 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -1493,7 +1493,7 @@ struct ggml_tensor * ggml_recompute_graph_node( GGML_ASSERT(sizeof(node->op_params) == sizeof(int32_t) * (GGML_MAX_OP_PARAMS / sizeof(int32_t))); GGML_ASSERT(sizeof(node->name) == GGML_MAX_NAME); memcpy(clone->op_params, node->op_params, sizeof(node->op_params)); - memcpy(clone->name, node->name, sizeof(node->name)); + ggml_format_name(clone, "%s (clone)", ggml_get_name(node)); return clone; }; From 75baed230cedf1929e93bcc006160016e2672a70 Mon Sep 17 00:00:00 2001 From: xaedes Date: Mon, 14 Aug 2023 18:17:14 +0200 Subject: [PATCH 053/100] set names for tensors in unified train function for easier debugging --- .../train-text-from-scratch.cpp | 94 ++++++++++--------- 1 file changed, 52 insertions(+), 42 deletions(-) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index 7983b3bfab027..07982706330d7 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -1572,9 +1572,19 @@ struct ggml_tensor * llama_build_train_graphs( const int n_ff = get_n_ff(&hparams); const int rope_mode = 0; + auto set_name = [](struct ggml_tensor * t, const char * n) { + ggml_set_name(t, n); + if (t->grad) { + ggml_format_name(t->grad, "%s->grad", n); + } + }; + + set_name(tokens_input, "tokens_input"); + set_name(targets, "targets"); + GGML_ASSERT(tokens_input->type == GGML_TYPE_I32); - struct ggml_tensor * t00 = ggml_reshape_1d(ctx, tokens_input, N*n_batch); assert_shape_1d(t00, N*n_batch); - struct ggml_tensor * t01 = ggml_get_rows(ctx, model->tok_embeddings, t00); assert_shape_2d(t01, n_embd, N*n_batch); + struct ggml_tensor * t00 = ggml_reshape_1d(ctx, tokens_input, N*n_batch); set_name(t00, "t00"); 
assert_shape_1d(t00, N*n_batch); + struct ggml_tensor * t01 = ggml_get_rows(ctx, model->tok_embeddings, t00); set_name(t01, "t01"); assert_shape_2d(t01, n_embd, N*n_batch); struct ggml_tensor * cur = t01; @@ -1591,53 +1601,53 @@ struct ggml_tensor * llama_build_train_graphs( for (int il = 0; il < n_layer; ++il) { struct my_llama_layer & layer = model->layers[il]; - struct ggml_tensor * t02 = ggml_rms_norm (ctx, cur, rms_norm_eps); assert_shape_2d(t02, n_embd, N*n_batch); - struct ggml_tensor * t03 = ggml_repeat (ctx, layer.attention_norm, t02); assert_shape_2d(t03, n_embd, N*n_batch); - struct ggml_tensor * t04 = ggml_mul (ctx, t03, t02); assert_shape_2d(t04, n_embd, N*n_batch); - struct ggml_tensor * t05 = ggml_mul_mat (ctx, layer.wq, t04); assert_shape_2d(t05, n_embd, N*n_batch); - struct ggml_tensor * t06 = ggml_reshape_4d (ctx, t05, n_embd/n_head, n_head, N, n_batch); assert_shape_4d(t06, n_embd/n_head, n_head, N, n_batch); - struct ggml_tensor * t07 = ggml_rope_inplace (ctx, t06, n_past, n_rot, rope_mode, n_ctx); assert_shape_4d(t07, n_embd/n_head, n_head, N, n_batch); - struct ggml_tensor * t08 = ggml_mul_mat (ctx, layer.wk, t04); assert_shape_2d(t08, n_embd, N*n_batch); - struct ggml_tensor * t09 = ggml_reshape_4d (ctx, t08, n_embd/n_head, n_head, N, n_batch); assert_shape_4d(t09, n_embd/n_head, n_head, N, n_batch); - struct ggml_tensor * t10 = ggml_rope_inplace (ctx, t09, n_past, n_rot, rope_mode, n_ctx); assert_shape_4d(t10, n_embd/n_head, n_head, N, n_batch); - struct ggml_tensor * t11 = ggml_mul_mat (ctx, t04, layer.wv); assert_shape_2d(t11, N*n_batch, n_embd); - struct ggml_tensor * t12 = ggml_reshape_4d (ctx, t11, N, n_batch, n_embd/n_head, n_head); assert_shape_4d(t12, N, n_batch, n_embd/n_head, n_head); - struct ggml_tensor * t13 = ggml_permute (ctx, t07, 0, 2, 1, 3); assert_shape_4d(t13, n_embd/n_head, N, n_head, n_batch); - struct ggml_tensor * t14 = ggml_permute (ctx, t10, 0, 2, 1, 3); assert_shape_4d(t14, n_embd/n_head, N, n_head, n_batch); - struct ggml_tensor * t15 = ggml_permute (ctx, t12, 0, 3, 1, 2); assert_shape_4d(t15, N, n_embd/n_head, n_head, n_batch); + struct ggml_tensor * t02 = ggml_rms_norm (ctx, cur, rms_norm_eps); set_name(t02, "t02"); assert_shape_2d(t02, n_embd, N*n_batch); + struct ggml_tensor * t03 = ggml_repeat (ctx, layer.attention_norm, t02); set_name(t03, "t03"); assert_shape_2d(t03, n_embd, N*n_batch); + struct ggml_tensor * t04 = ggml_mul (ctx, t03, t02); set_name(t04, "t04"); assert_shape_2d(t04, n_embd, N*n_batch); + struct ggml_tensor * t05 = ggml_mul_mat (ctx, layer.wq, t04); set_name(t05, "t05"); assert_shape_2d(t05, n_embd, N*n_batch); + struct ggml_tensor * t06 = ggml_reshape_4d (ctx, t05, n_embd/n_head, n_head, N, n_batch); set_name(t06, "t06"); assert_shape_4d(t06, n_embd/n_head, n_head, N, n_batch); + struct ggml_tensor * t07 = ggml_rope_inplace (ctx, t06, n_past, n_rot, rope_mode, n_ctx); set_name(t07, "t07"); assert_shape_4d(t07, n_embd/n_head, n_head, N, n_batch); + struct ggml_tensor * t08 = ggml_mul_mat (ctx, layer.wk, t04); set_name(t08, "t08"); assert_shape_2d(t08, n_embd, N*n_batch); + struct ggml_tensor * t09 = ggml_reshape_4d (ctx, t08, n_embd/n_head, n_head, N, n_batch); set_name(t09, "t09"); assert_shape_4d(t09, n_embd/n_head, n_head, N, n_batch); + struct ggml_tensor * t10 = ggml_rope_inplace (ctx, t09, n_past, n_rot, rope_mode, n_ctx); set_name(t10, "t10"); assert_shape_4d(t10, n_embd/n_head, n_head, N, n_batch); + struct ggml_tensor * t11 = ggml_mul_mat (ctx, t04, layer.wv); set_name(t11, "t11"); assert_shape_2d(t11, 
N*n_batch, n_embd); + struct ggml_tensor * t12 = ggml_reshape_4d (ctx, t11, N, n_batch, n_embd/n_head, n_head); set_name(t12, "t12"); assert_shape_4d(t12, N, n_batch, n_embd/n_head, n_head); + struct ggml_tensor * t13 = ggml_permute (ctx, t07, 0, 2, 1, 3); set_name(t13, "t13"); assert_shape_4d(t13, n_embd/n_head, N, n_head, n_batch); + struct ggml_tensor * t14 = ggml_permute (ctx, t10, 0, 2, 1, 3); set_name(t14, "t14"); assert_shape_4d(t14, n_embd/n_head, N, n_head, n_batch); + struct ggml_tensor * t15 = ggml_permute (ctx, t12, 0, 3, 1, 2); set_name(t15, "t15"); assert_shape_4d(t15, N, n_embd/n_head, n_head, n_batch); struct ggml_tensor * t16; if (enable_flash_attn) { - t16 = ggml_flash_attn(ctx, t13, t14, t15, true); assert_shape_4d(t16, n_embd/n_head, N, n_head, n_batch); + t16 = ggml_flash_attn(ctx, t13, t14, t15, true); set_name(t16, "t16"); assert_shape_4d(t16, n_embd/n_head, N, n_head, n_batch); } else { - struct ggml_tensor * t16_0 = ggml_mul_mat (ctx, t14, t13); assert_shape_4d(t16_0, N, N, n_head, n_batch); - struct ggml_tensor * t16_1 = ggml_scale_inplace (ctx, t16_0, kv_scale); assert_shape_4d(t16_1, N, N, n_head, n_batch); - struct ggml_tensor * t16_2 = ggml_diag_mask_inf_inplace(ctx, t16_1, n_past); assert_shape_4d(t16_2, N, N, n_head, n_batch); - struct ggml_tensor * t16_3 = ggml_soft_max_inplace (ctx, t16_2); assert_shape_4d(t16_3, N, N, n_head, n_batch); - t16 = ggml_mul_mat(ctx, t15, t16_3); assert_shape_4d(t16, n_embd/n_head, N, n_head, n_batch); + struct ggml_tensor * t16_0 = ggml_mul_mat (ctx, t14, t13); set_name(t16_0, "t16_0"); assert_shape_4d(t16_0, N, N, n_head, n_batch); + struct ggml_tensor * t16_1 = ggml_scale_inplace (ctx, t16_0, kv_scale); set_name(t16_1, "t16_1"); assert_shape_4d(t16_1, N, N, n_head, n_batch); + struct ggml_tensor * t16_2 = ggml_diag_mask_inf_inplace(ctx, t16_1, n_past); set_name(t16_2, "t16_2"); assert_shape_4d(t16_2, N, N, n_head, n_batch); + struct ggml_tensor * t16_3 = ggml_soft_max_inplace (ctx, t16_2); set_name(t16_3, "t16_3"); assert_shape_4d(t16_3, N, N, n_head, n_batch); + t16 = ggml_mul_mat(ctx, t15, t16_3); set_name(t16, "t16"); assert_shape_4d(t16, n_embd/n_head, N, n_head, n_batch); } - struct ggml_tensor * t17 = ggml_permute (ctx, t16, 0, 2, 1, 3); assert_shape_4d(t17, n_embd/n_head, n_head, N, n_batch); - struct ggml_tensor * t18 = ggml_cont (ctx, t17); assert_shape_4d(t18, n_embd/n_head, n_head, N, n_batch); - struct ggml_tensor * t19 = ggml_reshape_2d (ctx, t18, n_embd, N*n_batch); assert_shape_2d(t19, n_embd, N*n_batch); - struct ggml_tensor * t20 = ggml_mul_mat (ctx, layer.wo, t19); assert_shape_2d(t20, n_embd, N*n_batch); - struct ggml_tensor * t21 = ggml_add (ctx, t20, cur); assert_shape_2d(t21, n_embd, N*n_batch); - struct ggml_tensor * t22 = ggml_rms_norm (ctx, t21, rms_norm_eps); assert_shape_2d(t22, n_embd, N*n_batch); - struct ggml_tensor * t23 = ggml_repeat (ctx, layer.ffn_norm, t22); assert_shape_2d(t23, n_embd, N*n_batch); - struct ggml_tensor * t24 = ggml_mul (ctx, t23, t22); assert_shape_2d(t24, n_embd, N*n_batch); - struct ggml_tensor * t25 = ggml_mul_mat (ctx, layer.w3, t24); assert_shape_2d(t25, n_ff, N*n_batch); - struct ggml_tensor * t26 = ggml_mul_mat (ctx, layer.w1, t24); assert_shape_2d(t26, n_ff, N*n_batch); - struct ggml_tensor * t27 = ggml_silu (ctx, t26); assert_shape_2d(t27, n_ff, N*n_batch); - struct ggml_tensor * t28 = ggml_mul (ctx, t27, t25); assert_shape_2d(t28, n_ff, N*n_batch); - struct ggml_tensor * t29 = ggml_mul_mat (ctx, layer.w2, t28); assert_shape_2d(t29, n_embd, N*n_batch); - struct 
ggml_tensor * t30 = ggml_add (ctx, t29, t21); assert_shape_2d(t30, n_embd, N*n_batch); + struct ggml_tensor * t17 = ggml_permute (ctx, t16, 0, 2, 1, 3); set_name(t17, "t17"); assert_shape_4d(t17, n_embd/n_head, n_head, N, n_batch); + struct ggml_tensor * t18 = ggml_cont (ctx, t17); set_name(t18, "t18"); assert_shape_4d(t18, n_embd/n_head, n_head, N, n_batch); + struct ggml_tensor * t19 = ggml_reshape_2d (ctx, t18, n_embd, N*n_batch); set_name(t19, "t19"); assert_shape_2d(t19, n_embd, N*n_batch); + struct ggml_tensor * t20 = ggml_mul_mat (ctx, layer.wo, t19); set_name(t20, "t20"); assert_shape_2d(t20, n_embd, N*n_batch); + struct ggml_tensor * t21 = ggml_add (ctx, t20, cur); set_name(t21, "t21"); assert_shape_2d(t21, n_embd, N*n_batch); + struct ggml_tensor * t22 = ggml_rms_norm (ctx, t21, rms_norm_eps); set_name(t22, "t22"); assert_shape_2d(t22, n_embd, N*n_batch); + struct ggml_tensor * t23 = ggml_repeat (ctx, layer.ffn_norm, t22); set_name(t23, "t23"); assert_shape_2d(t23, n_embd, N*n_batch); + struct ggml_tensor * t24 = ggml_mul (ctx, t23, t22); set_name(t24, "t24"); assert_shape_2d(t24, n_embd, N*n_batch); + struct ggml_tensor * t25 = ggml_mul_mat (ctx, layer.w3, t24); set_name(t25, "t25"); assert_shape_2d(t25, n_ff, N*n_batch); + struct ggml_tensor * t26 = ggml_mul_mat (ctx, layer.w1, t24); set_name(t26, "t26"); assert_shape_2d(t26, n_ff, N*n_batch); + struct ggml_tensor * t27 = ggml_silu (ctx, t26); set_name(t27, "t27"); assert_shape_2d(t27, n_ff, N*n_batch); + struct ggml_tensor * t28 = ggml_mul (ctx, t27, t25); set_name(t28, "t28"); assert_shape_2d(t28, n_ff, N*n_batch); + struct ggml_tensor * t29 = ggml_mul_mat (ctx, layer.w2, t28); set_name(t29, "t29"); assert_shape_2d(t29, n_embd, N*n_batch); + struct ggml_tensor * t30 = ggml_add (ctx, t29, t21); set_name(t30, "t30"); assert_shape_2d(t30, n_embd, N*n_batch); cur = t30; checkpoints.push_back(cur); } - struct ggml_tensor * t31 = ggml_rms_norm (ctx, cur, rms_norm_eps); assert_shape_2d(t31, n_embd, N*n_batch); - struct ggml_tensor * t32 = ggml_repeat (ctx, model->norm, t31); assert_shape_2d(t32, n_embd, N*n_batch); - struct ggml_tensor * t33 = ggml_mul (ctx, t32, t31); assert_shape_2d(t33, n_embd, N*n_batch); - struct ggml_tensor * t34 = ggml_mul_mat (ctx, model->output, t33); assert_shape_2d(t34, n_vocab, N*n_batch); - struct ggml_tensor * t35 = ggml_reshape_3d (ctx, t34, n_vocab, N, n_batch); assert_shape_3d(t35, n_vocab, N, n_batch); - struct ggml_tensor * t36 = ggml_cross_entropy_loss(ctx, t35, targets); assert_shape_1d(t36, 1); + struct ggml_tensor * t31 = ggml_rms_norm (ctx, cur, rms_norm_eps); set_name(t31, "t31"); assert_shape_2d(t31, n_embd, N*n_batch); + struct ggml_tensor * t32 = ggml_repeat (ctx, model->norm, t31); set_name(t32, "t32"); assert_shape_2d(t32, n_embd, N*n_batch); + struct ggml_tensor * t33 = ggml_mul (ctx, t32, t31); set_name(t33, "t33"); assert_shape_2d(t33, n_embd, N*n_batch); + struct ggml_tensor * t34 = ggml_mul_mat (ctx, model->output, t33); set_name(t34, "t34"); assert_shape_2d(t34, n_vocab, N*n_batch); + struct ggml_tensor * t35 = ggml_reshape_3d (ctx, t34, n_vocab, N, n_batch); set_name(t35, "t35"); assert_shape_3d(t35, n_vocab, N, n_batch); + struct ggml_tensor * t36 = ggml_cross_entropy_loss(ctx, t35, targets); set_name(t36, "t36"); assert_shape_1d(t36, 1); checkpoints.push_back(t31); checkpoints.push_back(t32); From fe788a1c7a7bbaf286b12b99a3df75dad4c7403b Mon Sep 17 00:00:00 2001 From: xaedes Date: Mon, 14 Aug 2023 18:24:13 +0200 Subject: [PATCH 054/100] allocate graph on context using 
ggml_new_graph --- examples/train-text-from-scratch/train-text-from-scratch.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index 07982706330d7..eb3ac9ac314d7 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -4641,9 +4641,7 @@ int main(int argc, char ** argv) { }; struct ggml_context * ctx0 = ggml_init(cparams); - struct ggml_tensor * gfbuf = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, sizeof(struct ggml_cgraph) / ggml_type_size(GGML_TYPE_I32) + (sizeof(struct ggml_cgraph) % ggml_type_size(GGML_TYPE_I32) ? 1 : 0)); - memset(gfbuf->data, 0, ggml_nbytes(gfbuf)); - struct ggml_cgraph * gf = (struct ggml_cgraph *) gfbuf->data; + struct ggml_cgraph * gf = ggml_new_graph(ctx0); int n_past = 0; struct ggml_tensor * logits = forward(&model, &kv_self, ctx0, gf, tokens_input, sample_ctx, n_past); From c954f41ca43e0c0d6e8f1225d9e598722e3a1dff Mon Sep 17 00:00:00 2001 From: xaedes Date: Mon, 14 Aug 2023 18:27:01 +0200 Subject: [PATCH 055/100] remove handwritten training functions --- .../train-text-from-scratch.cpp | 1586 +---------------- 1 file changed, 8 insertions(+), 1578 deletions(-) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index eb3ac9ac314d7..9d94bdfcf6984 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -946,406 +946,6 @@ struct ggml_tensor * forward_batch( return inpL; } -struct ggml_tensor * forward_batch_wo_cache( - struct my_llama_model * model, - struct ggml_context * ctx0, - struct ggml_cgraph * gf, - struct ggml_tensor * tokens_input, - const int n_tokens, - const int n_batch) { - - const int n_past = 0; - const int N = n_tokens; - - const auto & hparams = model->hparams; - const int n_ctx = hparams.n_ctx; - const int n_vocab = hparams.n_vocab; - const int n_embd = hparams.n_embd; - const int n_layer = hparams.n_layer; - const int n_head = hparams.n_head; - const int n_rot = hparams.n_rot; - const int n_ff = get_n_ff(&hparams); - - GGML_ASSERT(tokens_input->type == GGML_TYPE_I32); - struct ggml_tensor * tokens = ggml_reshape_1d(ctx0, tokens_input, N*n_batch); - - // inpL shape [n_embd,N*n_batch,1] - struct ggml_tensor * inpL = ggml_get_rows(ctx0, model->tok_embeddings, tokens); - assert_shape_2d(inpL, n_embd, N*n_batch); - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * inpSA = inpL; - - struct ggml_tensor * cur; - - // lctx.use_buf(ctx0, 0); - - // norm - { - // cur shape [n_embd,N*n_batch,1,1] - cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps); - assert_shape_2d(cur, n_embd, N*n_batch); - - // cur = attention_norm*cur - cur = ggml_mul(ctx0, - ggml_repeat(ctx0, model->layers[il].attention_norm, cur), - cur); - assert_shape_2d(cur, n_embd, N*n_batch); - } - - // self-attention - { - // compute Q and K and RoPE them - // wq shape [n_embd, n_embd, 1, 1] - // wk shape [n_embd, n_embd, 1, 1] - // Qcur shape [n_embd/n_head, n_head, N, n_batch] - // Kcur shape [n_embd/n_head, n_head, N, n_batch] - struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0, n_ctx); - struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_4d(ctx0, 
ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0, n_ctx); - assert_shape_4d(Qcur, n_embd/n_head, n_head, N, n_batch); - assert_shape_4d(Kcur, n_embd/n_head, n_head, N, n_batch); - - // Vcur shape [N, n_batch, n_embd/n_head, n_head] - struct ggml_tensor * Vcur = ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, cur, model->layers[il].wv), N, n_batch, n_embd/n_head, n_head); - assert_shape_4d(Vcur, N, n_batch, n_embd/n_head, n_head); - - // Qcur shape [n_embd/n_head, n_head, N, n_batch] - // Q shape [n_embd/n_head, N, n_head, n_batch] - struct ggml_tensor * Q = - ggml_permute(ctx0, - Qcur, - 0, 2, 1, 3); - assert_shape_4d(Q, n_embd/n_head, N, n_head, n_batch); - - // kv_self.k shape [n_embd * n_ctx * n_batch * n_layer] - // K shape [n_embd/n_head, N, n_head, n_batch] - struct ggml_tensor * K = - ggml_permute(ctx0, - Kcur, - 0, 2, 1, 3); - assert_shape_4d(K, n_embd/n_head, N, n_head, n_batch); - - // K * Q - // KQ shape [N, N, n_head, n_batch] - struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); - assert_shape_4d(KQ, N, N, n_head, n_batch); - - // KQ_scaled = KQ / sqrt(n_embd/n_head) - // KQ_scaled shape [N, N, n_head, n_batch] - struct ggml_tensor * KQ_scaled = - ggml_scale_inplace(ctx0, - KQ, - ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head))); - assert_shape_4d(KQ_scaled, N, N, n_head, n_batch); - - // KQ_masked = mask_past(KQ_scaled) - // KQ_masked shape [N, N, n_head, n_batch] - struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past); - assert_shape_4d(KQ_masked, N, N, n_head, n_batch); - - // KQ = soft_max(KQ_masked) - // KQ_soft_max shape [N, N, n_head, n_batch] - struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked); - assert_shape_4d(KQ_soft_max, N, N, n_head, n_batch); - - // Vcur shape [N, n_batch, n_embd/n_head, n_head] - // V shape [N, n_embd/n_head, n_head, n_batch] - struct ggml_tensor * V = - ggml_permute(ctx0, - Vcur, - 0, 3, 1, 2); - assert_shape_4d(V, N, n_embd/n_head, n_head, n_batch); - - // KQV shape [n_embd/n_head, N, n_head, n_batch] - struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); - assert_shape_4d(KQV, n_embd/n_head, N, n_head, n_batch); - - // KQV_merged = KQV.permute(0, 2, 1, 3) - // KQV_merged shape [n_embd/n_head, n_head, N, n_batch] - struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - assert_shape_4d(KQV_merged, n_embd/n_head, n_head, N, n_batch); - // KQV_merged shape - - // cur shape [n_embd,N*n_batch,1,1] - cur = ggml_reshape_2d(ctx0, ggml_cont(ctx0, KQV_merged), n_embd, N*n_batch); - assert_shape_2d(cur, n_embd, N*n_batch); - - // projection (no bias) - // cur shape [n_embd,N*n_batch,1,1] - cur = ggml_mul_mat(ctx0, - model->layers[il].wo, - cur); - assert_shape_2d(cur, n_embd, N*n_batch); - } - - // lctx.use_buf(ctx0, 1); - - // inpFF shape [n_embd,N*n_batch,1,1] - struct ggml_tensor * inpFF = ggml_add_inplace(ctx0, cur, inpSA); - assert_shape_2d(inpFF, n_embd, N*n_batch); - - // feed-forward network - { - // norm - { - // cur shape [n_embd,N*n_batch,1,1] - cur = ggml_rms_norm(ctx0, inpFF, rms_norm_eps); - assert_shape_2d(cur, n_embd, N*n_batch); - - // cur = ffn_norm*cur - // cur shape [n_embd,N*n_batch,1,1] - cur = ggml_mul(ctx0, - ggml_repeat(ctx0, model->layers[il].ffn_norm, cur), - cur); - assert_shape_2d(cur, n_embd, N*n_batch); - } - - // tmp shape [n_ff,N*n_batch,1,1] - struct ggml_tensor * tmp = ggml_mul_mat(ctx0, - model->layers[il].w3, - cur); - assert_shape_2d(tmp, n_ff, N*n_batch); - - // cur shape 
[n_ff,N*n_batch,1,1] - cur = ggml_mul_mat(ctx0, - model->layers[il].w1, - cur); - assert_shape_2d(cur, n_ff, N*n_batch); - - // SILU activation - // cur shape [n_ff,N*n_batch,1,1] - cur = ggml_silu(ctx0, cur); - assert_shape_2d(cur, n_ff, N*n_batch); - - // cur shape [n_ff,N*n_batch,1,1] - cur = ggml_mul(ctx0, cur, tmp); - assert_shape_2d(cur, n_ff, N*n_batch); - - // cur shape [n_embd,N*n_batch,1,1] - cur = ggml_mul_mat(ctx0, - model->layers[il].w2, - cur); - assert_shape_2d(cur, n_embd, N*n_batch); - } - - // cur shape [n_embd,N*n_batch,1,1] - cur = ggml_add_inplace(ctx0, cur, inpFF); - assert_shape_2d(cur, n_embd, N*n_batch); - - // input for next layer - // inpL shape [n_embd,N*n_batch,1,1] - inpL = cur; - assert_shape_2d(inpL, n_embd, N*n_batch); - } - - // norm - { - - // inpL shape [n_embd,N*n_batch,1,1] - inpL = ggml_rms_norm(ctx0, inpL, rms_norm_eps); - assert_shape_2d(inpL, n_embd, N*n_batch); - - // inpL = norm*inpL - // inpL shape [n_embd,N*n_batch,1,1] - inpL = ggml_mul(ctx0, - ggml_repeat(ctx0, model->norm, inpL), - inpL); - - assert_shape_2d(inpL, n_embd, N*n_batch); - - //embeddings = inpL; - } - - // lm_head - // inpL shape [n_vocab,N*n_batch,1,1] - inpL = ggml_mul_mat(ctx0, model->output, inpL); - assert_shape_2d(inpL, n_vocab, N*n_batch); - - { - // inpL shape [n_vocab,N,n_batch,1] - inpL = ggml_reshape_3d(ctx0, - inpL, - n_vocab, N, n_batch); - assert_shape_3d(inpL, n_vocab, N, n_batch); - } - - // run the computation - // ggml_build_forward_expand(gf, inpL); - - return inpL; -} - -struct ggml_tensor * forward_batch_wo_cache_flash_attn( - struct my_llama_model * model, - struct ggml_context * ctx0, - struct ggml_cgraph * gf, - struct ggml_tensor * tokens_input, - const int n_tokens, - const int n_batch) { - - const int n_past = 0; - const int N = n_tokens; - - const auto & hparams = model->hparams; - const int n_ctx = hparams.n_ctx; - const int n_vocab = hparams.n_vocab; - const int n_embd = hparams.n_embd; - const int n_layer = hparams.n_layer; - const int n_head = hparams.n_head; - const int n_rot = hparams.n_rot; - const int n_ff = get_n_ff(&hparams); - - - GGML_ASSERT(tokens_input->type == GGML_TYPE_I32); - struct ggml_tensor * tokens = ggml_reshape_1d(ctx0, tokens_input, N*n_batch); - - struct ggml_tensor * inpL = ggml_get_rows(ctx0, model->tok_embeddings, tokens); - assert_shape_2d(inpL, n_embd, N*n_batch); - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * inpSA = inpL; - - struct ggml_tensor * cur; - - // norm - { - cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps); - assert_shape_2d(cur, n_embd, N*n_batch); - - // cur = attention_norm*cur - cur = ggml_mul(ctx0, - ggml_repeat(ctx0, model->layers[il].attention_norm, cur), - cur); - assert_shape_2d(cur, n_embd, N*n_batch); - } - - // self-attention - { - // compute Q and K and RoPE them - // wq shape [n_embd, n_embd, 1, 1] - // wk shape [n_embd, n_embd, 1, 1] - struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0, n_ctx); - struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0, n_ctx); - assert_shape_4d(Qcur, n_embd/n_head, n_head, N, n_batch); - assert_shape_4d(Kcur, n_embd/n_head, n_head, N, n_batch); - - struct ggml_tensor * Vcur = ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, cur, model->layers[il].wv), N, n_batch, n_embd/n_head, n_head); - assert_shape_4d(Vcur, N, 
n_batch, n_embd/n_head, n_head); - - struct ggml_tensor * Q = - ggml_permute(ctx0, - Qcur, - 0, 2, 1, 3); - assert_shape_4d(Q, n_embd/n_head, N, n_head, n_batch); - - struct ggml_tensor * K = - ggml_permute(ctx0, - Kcur, - 0, 2, 1, 3); - assert_shape_4d(K, n_embd/n_head, N, n_head, n_batch); - - struct ggml_tensor * V = - ggml_permute(ctx0, - Vcur, - 0, 3, 1, 2); - assert_shape_4d(V, N, n_embd/n_head, n_head, n_batch); - - bool masked = true; - struct ggml_tensor * KQV = ggml_flash_attn(ctx0, Q, K, V, masked); - assert_shape_4d(KQV, n_embd/n_head, N, n_head, n_batch); - - struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - assert_shape_4d(KQV_merged, n_embd/n_head, n_head, N, n_batch); - cur = ggml_reshape_2d(ctx0, ggml_cont(ctx0, KQV_merged), n_embd, N*n_batch); - assert_shape_2d(cur, n_embd, N*n_batch); - - // projection (no bias) - cur = ggml_mul_mat(ctx0, - model->layers[il].wo, - cur); - assert_shape_2d(cur, n_embd, N*n_batch); - } - - // struct ggml_tensor * inpFF = ggml_add_inplace(ctx0, cur, inpSA); - struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA); - assert_shape_2d(inpFF, n_embd, N*n_batch); - - // feed-forward network - { - // norm - { - cur = ggml_rms_norm(ctx0, inpFF, rms_norm_eps); - assert_shape_2d(cur, n_embd, N*n_batch); - - // cur = ffn_norm*cur - cur = ggml_mul(ctx0, - ggml_repeat(ctx0, model->layers[il].ffn_norm, cur), - cur); - assert_shape_2d(cur, n_embd, N*n_batch); - } - - struct ggml_tensor * tmp = ggml_mul_mat(ctx0, - model->layers[il].w3, - cur); - assert_shape_2d(tmp, n_ff, N*n_batch); - - cur = ggml_mul_mat(ctx0, - model->layers[il].w1, - cur); - assert_shape_2d(cur, n_ff, N*n_batch); - - // SILU activation - cur = ggml_silu(ctx0, cur); - assert_shape_2d(cur, n_ff, N*n_batch); - - cur = ggml_mul(ctx0, cur, tmp); - assert_shape_2d(cur, n_ff, N*n_batch); - - cur = ggml_mul_mat(ctx0, - model->layers[il].w2, - cur); - assert_shape_2d(cur, n_embd, N*n_batch); - } - - // cur = ggml_add_inplace(ctx0, cur, inpFF); - cur = ggml_add(ctx0, cur, inpFF); - assert_shape_2d(cur, n_embd, N*n_batch); - - // input for next layer - inpL = cur; - assert_shape_2d(inpL, n_embd, N*n_batch); - } - - // norm - { - - inpL = ggml_rms_norm(ctx0, inpL, rms_norm_eps); - assert_shape_2d(inpL, n_embd, N*n_batch); - - // inpL = norm*inpL - inpL = ggml_mul(ctx0, - ggml_repeat(ctx0, model->norm, inpL), - inpL); - - assert_shape_2d(inpL, n_embd, N*n_batch); - } - - // lm_head - inpL = ggml_mul_mat(ctx0, model->output, inpL); - assert_shape_2d(inpL, n_vocab, N*n_batch); - - { - inpL = ggml_reshape_3d(ctx0, - inpL, - n_vocab, N, n_batch); - assert_shape_3d(inpL, n_vocab, N, n_batch); - } - - // run the computation - // ggml_build_forward_expand(gf, inpL); - - return inpL; -} - - static size_t hash(void * p) { return (size_t)p % GGML_GRAPH_HASHTABLE_SIZE; } @@ -1703,1146 +1303,6 @@ struct ggml_tensor * llama_build_train_graphs( return t36; } - -// expand the graph nodes without creating leafs. 
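As a reading aid for the removal hunk that follows: expand() appends a tensor and, recursively, all of its sources to the graph as nodes only, without creating leafs, which lets each intermediate result be placed into whichever scratch buffer use_buf() has currently selected while the graph is being built; graph_set_leafs_grads() is then called once all gradients exist and moves tensors that are plain data with no gradient (op == GGML_OP_NONE and grad == NULL, such as the f32 constant created for the loss gradient) into g->leafs. A minimal usage sketch, reusing names from this file (gf, ctx0, layer, cur); this is an illustration, not part of the patch:

    // reset the graph, as the (removed) training graph builders do
    gf->n_nodes = 0;
    gf->n_leafs = 0;
    // recursively adds cur, layer.wq and the product as graph nodes (no leafs yet)
    struct ggml_tensor * t = expand(gf, ggml_mul_mat(ctx0, layer.wq, cur));
    // ... build the rest of the forward and backward graph, assigning grads ...
    // data-only tensors without grads are moved from gf->nodes to gf->leafs
    graph_set_leafs_grads(gf);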
-struct ggml_tensor * expand(struct ggml_cgraph * g, struct ggml_tensor * t) { - // check if already visited - for (int i = 0; i < g->n_nodes; i++) { - if (g->nodes[i] == t) { - return t; - } - } - - for (int i = 0; i < g->n_leafs; i++) { - if (g->leafs[i] == t) { - return t; - } - } - - for (int i = 0; i < GGML_MAX_SRC; ++i) { - if (t->src[i]) { - expand(g, t->src[i]); - } - } - - GGML_ASSERT(g->n_nodes < GGML_MAX_NODES); - - if (strlen(t->name) == 0) { - snprintf(t->name, sizeof(t->name), "node_%d", g->n_nodes); - } - - g->nodes[g->n_nodes] = t; - g->grads[g->n_nodes] = t->grad; - g->n_nodes++; - return t; -} - -void graph_set_leafs_grads(struct ggml_cgraph * g) { - // moves leaf nodes to g->leafs. - // i.e. g->n_nodes might change. - int n_nodes = 0; - for (int i = 0; i < g->n_nodes; ++i) { - struct ggml_tensor * node = g->nodes[i]; - const bool is_leaf = node->op == GGML_OP_NONE && node->grad == NULL; - if (is_leaf) { - GGML_ASSERT(g->n_leafs < GGML_MAX_NODES); - - if (strlen(node->name) == 0) { - snprintf(node->name, sizeof(node->name), "leaf_%d", g->n_leafs); - } - - g->leafs[g->n_leafs] = node; - g->n_leafs++; - } else { - GGML_ASSERT(n_nodes < GGML_MAX_NODES); - - if (strlen(node->name) == 0) { - snprintf(node->name, sizeof(node->name), "node_%d", n_nodes); - } - - g->nodes[n_nodes] = node; - g->grads[n_nodes] = node->grad; - n_nodes++; - } - } - for (int i=n_nodes; i < g->n_nodes; ++i) { - g->nodes[i] = NULL; - g->grads[i] = NULL; - } - g->n_nodes = n_nodes; -} - -struct ggml_tensor * forward_batch_wo_cache_flash_attn_train( - struct my_llama_model * model, - struct ggml_context * ctx0, - struct ggml_cgraph * gf, - struct ggml_cgraph * gb, - struct ggml_tensor * * logits, - struct ggml_tensor * tokens_input, - struct ggml_tensor * targets, - void * compute_buf_0, - void * compute_buf_1, - size_t size_buf_0, - size_t size_buf_1, - const int n_tokens, - const int n_batch) { - - ggml_set_scratch(ctx0, { 0, 0, nullptr, }); - - const int n_past = 0; - const int N = n_tokens; - - gf->n_nodes = 0; - gf->n_leafs = 0; - gf->perf_runs = 0; - gf->perf_cycles = 0; - gf->perf_time_us = 0; - - const auto & hparams = model->hparams; - const int n_ctx = hparams.n_ctx; - const int n_vocab = hparams.n_vocab; - const int n_embd = hparams.n_embd; - const int n_layer = hparams.n_layer; - const int n_head = hparams.n_head; - const int n_rot = hparams.n_rot; - const int n_ff = get_n_ff(&hparams); - const int rope_mode = 0; - - bool track_max_mem = true; - - int last_buf = -1; - size_t buf_offs[2] = { 0, 0 }; - size_t buf_size[2] = { size_buf_0, - size_buf_1 }; - void * buf_data[2] = { compute_buf_0, - compute_buf_1 }; - size_t buf_maxs[2] = { 0, 0 }; - - auto use_buf = [ctx0, &last_buf, &buf_offs, &buf_size, &buf_data, &buf_maxs] (int buf) { - size_t last_offs = 0; - last_offs = ggml_set_scratch(ctx0, { 0, 0, nullptr, }); - if (last_buf >= 0) { - buf_offs[last_buf] = last_offs; - buf_maxs[last_buf] = std::max(buf_maxs[last_buf], buf_offs[last_buf]); - } - if (buf >= 0) { - size_t offs = buf_offs[buf]; - size_t size = buf_size[buf]; - void * data = buf_data[buf]; - ggml_set_scratch(ctx0, { offs, size, data, }); - } - last_buf = buf; - }; - - - auto clr_buf = [ctx0, &last_buf, &buf_offs, &buf_size, &buf_data, &buf_maxs, track_max_mem] (int buf) { - if (buf < 0) return; - if (track_max_mem) { - size_t last_offs = 0; - last_offs = ggml_set_scratch(ctx0, { 0, 0, nullptr, }); - if (last_buf >= 0) { - buf_offs[last_buf] = last_offs; - buf_maxs[last_buf] = std::max(buf_maxs[last_buf], buf_offs[last_buf]); - } - 
} - buf_offs[buf] = 0; - if (track_max_mem && last_buf >= 0) { - size_t offs = buf_offs[last_buf]; - size_t size = buf_size[last_buf]; - void * data = buf_data[last_buf]; - ggml_set_scratch(ctx0, { offs, size, data, }); - } - }; - - - auto view__q = [ctx0, n_embd, n_head, N, n_batch] (struct ggml_tensor * t) -> struct ggml_tensor * { - int64_t ne0 = n_embd/n_head; - int64_t ne1 = N; - int64_t ne2 = n_head; - int64_t ne3 = n_batch; - size_t nb0 = ggml_element_size(t); - size_t nb1 = nb0*ne0; - size_t nb2 = nb1*ne1; - size_t nb3 = nb2*ne2; - size_t offset = 0; - return ggml_view_4d(ctx0, t, ne0, ne1, ne2, ne3, nb1, nb2, nb3, offset); - }; - - auto view__k = [ctx0, n_embd, n_head, N, n_batch] (struct ggml_tensor * t) -> struct ggml_tensor * { - int64_t ne0 = n_embd/n_head; - int64_t ne1 = N; - int64_t ne2 = n_head; - int64_t ne3 = n_batch; - size_t nb0 = ggml_element_size(t); - size_t nb1 = nb0*ne0; - size_t nb2 = nb1*ne1; - size_t nb3 = nb2*ne2; - size_t offset = nb3*ne3; - return ggml_view_4d(ctx0, t, ne0, ne1, ne2, ne3, nb1, nb2, nb3, offset); - }; - - auto view__v = [ctx0, n_embd, n_head, N, n_batch] (struct ggml_tensor * t) -> struct ggml_tensor * { - int64_t ne0 = N; - int64_t ne1 = n_embd/n_head; - int64_t ne2 = n_head; - int64_t ne3 = n_batch; - size_t nb0 = ggml_element_size(t); - size_t nb1 = nb0*ne0; - size_t nb2 = nb1*ne1; - size_t nb3 = nb2*ne2; - size_t offset = 2*nb3*ne3; - return ggml_view_4d(ctx0, t, ne0, ne1, ne2, ne3, nb1, nb2, nb3, offset); - }; - - auto add_or_set = [ctx0] (struct ggml_tensor * a, struct ggml_tensor * b) -> struct ggml_tensor * { - if (a == NULL) { - return b; - } else { - return ggml_add_inplace(ctx0, a, b); - } - }; - - use_buf(-1); - - model->tok_embeddings->grad = NULL; - model->norm->grad = NULL; - model->output->grad = NULL; - - for (int il = 0; il < n_layer; ++il) { - struct my_llama_layer & layer = model->layers[il]; - layer.attention_norm->grad = NULL; - layer.wq->grad = NULL; - layer.wk->grad = NULL; - layer.wv->grad = NULL; - layer.wo->grad = NULL; - layer.ffn_norm->grad = NULL; - layer.w1->grad = NULL; - layer.w2->grad = NULL; - layer.w3->grad = NULL; - } - - clr_buf(0); - clr_buf(1); - - use_buf(-1); - - GGML_ASSERT(tokens_input->type == GGML_TYPE_I32); - struct ggml_tensor * t00 = ggml_reshape_1d(ctx0, tokens_input, N*n_batch); assert_shape_1d(t00, N*n_batch); - - use_buf(-1); - - struct ggml_tensor * t01 = expand(gf, ggml_get_rows(ctx0, model->tok_embeddings, t00)); assert_shape_2d(t01, n_embd, N*n_batch); - - // need to remember these for the backward pass - std::vector t02L; t02L.resize(n_layer, NULL); - std::vector t03L; t03L.resize(n_layer, NULL); - std::vector t04L; t04L.resize(n_layer, NULL); - std::vector t05L; t05L.resize(n_layer, NULL); - std::vector t06L; t06L.resize(n_layer, NULL); - std::vector t07L; t07L.resize(n_layer, NULL); - std::vector t08L; t08L.resize(n_layer, NULL); - std::vector t09L; t09L.resize(n_layer, NULL); - std::vector t10L; t10L.resize(n_layer, NULL); - std::vector t11L; t11L.resize(n_layer, NULL); - std::vector t12L; t12L.resize(n_layer, NULL); - std::vector t13L; t13L.resize(n_layer, NULL); - std::vector t14L; t14L.resize(n_layer, NULL); - std::vector t15L; t15L.resize(n_layer, NULL); - std::vector t16L; t16L.resize(n_layer, NULL); - std::vector t17L; t17L.resize(n_layer, NULL); - std::vector t18L; t18L.resize(n_layer, NULL); - std::vector t19L; t19L.resize(n_layer, NULL); - std::vector t20L; t20L.resize(n_layer, NULL); - std::vector t21L; t21L.resize(n_layer, NULL); - std::vector t22L; t22L.resize(n_layer, 
NULL); - std::vector t23L; t23L.resize(n_layer, NULL); - std::vector t24L; t24L.resize(n_layer, NULL); - std::vector t25L; t25L.resize(n_layer, NULL); - std::vector t26L; t26L.resize(n_layer, NULL); - std::vector t27L; t27L.resize(n_layer, NULL); - std::vector t28L; t28L.resize(n_layer, NULL); - std::vector t29L; t29L.resize(n_layer, NULL); - std::vector t30L; t30L.resize(n_layer, NULL); - - struct ggml_tensor * cur = t01; - - for (int il = 0; il < n_layer; ++il) { - clr_buf(0); - struct my_llama_layer & layer = model->layers[il]; - // tensors with values necessary for backward pass are in persistent buf(-1) - // other tensors with buf(0) and buf(1) are only temporary needed, and their memory reused after layer is completed. - use_buf(-1); struct ggml_tensor * t02 = expand(gf, ggml_rms_norm (ctx0, cur, rms_norm_eps)); assert_shape_2d(t02, n_embd, N*n_batch); - use_buf( 0); struct ggml_tensor * t03 = expand(gf, ggml_repeat (ctx0, layer.attention_norm, t02)); assert_shape_2d(t03, n_embd, N*n_batch); - use_buf(-1); struct ggml_tensor * t04 = expand(gf, ggml_mul (ctx0, t02, t03)); assert_shape_2d(t04, n_embd, N*n_batch); - use_buf(-1); struct ggml_tensor * t05 = expand(gf, ggml_mul_mat (ctx0, layer.wq, t04)); assert_shape_2d(t05, n_embd, N*n_batch); - use_buf(-1); struct ggml_tensor * t06 = expand(gf, ggml_reshape_4d (ctx0, t05, n_embd/n_head, n_head, N, n_batch)); assert_shape_4d(t06, n_embd/n_head, n_head, N, n_batch); - use_buf(-1); struct ggml_tensor * t07 = expand(gf, ggml_rope_inplace (ctx0, t06, n_past, n_rot, rope_mode, n_ctx)); assert_shape_4d(t07, n_embd/n_head, n_head, N, n_batch); - use_buf(-1); struct ggml_tensor * t08 = expand(gf, ggml_mul_mat (ctx0, layer.wk, t04)); assert_shape_2d(t08, n_embd, N*n_batch); - use_buf(-1); struct ggml_tensor * t09 = expand(gf, ggml_reshape_4d (ctx0, t08, n_embd/n_head, n_head, N, n_batch)); assert_shape_4d(t09, n_embd/n_head, n_head, N, n_batch); - use_buf(-1); struct ggml_tensor * t10 = expand(gf, ggml_rope_inplace (ctx0, t09, n_past, n_rot, rope_mode, n_ctx)); assert_shape_4d(t10, n_embd/n_head, n_head, N, n_batch); - use_buf(-1); struct ggml_tensor * t11 = expand(gf, ggml_mul_mat (ctx0, t04, layer.wv)); assert_shape_2d(t11, N*n_batch, n_embd); - use_buf(-1); struct ggml_tensor * t12 = expand(gf, ggml_reshape_4d (ctx0, t11, N, n_batch, n_embd/n_head, n_head)); assert_shape_4d(t12, N, n_batch, n_embd/n_head, n_head); - use_buf(-1); struct ggml_tensor * t13 = expand(gf, ggml_permute (ctx0, t07, 0, 2, 1, 3)); assert_shape_4d(t13, n_embd/n_head, N, n_head, n_batch); - use_buf(-1); struct ggml_tensor * t14 = expand(gf, ggml_permute (ctx0, t10, 0, 2, 1, 3)); assert_shape_4d(t14, n_embd/n_head, N, n_head, n_batch); - use_buf(-1); struct ggml_tensor * t15 = expand(gf, ggml_permute (ctx0, t12, 0, 3, 1, 2)); assert_shape_4d(t15, N, n_embd/n_head, n_head, n_batch); - use_buf(-1); struct ggml_tensor * t16 = expand(gf, ggml_flash_attn (ctx0, t13, t14, t15, true)); assert_shape_4d(t16, n_embd/n_head, N, n_head, n_batch); - use_buf( 0); struct ggml_tensor * t17 = expand(gf, ggml_permute (ctx0, t16, 0, 2, 1, 3)); assert_shape_4d(t17, n_embd/n_head, n_head, N, n_batch); - use_buf(-1); struct ggml_tensor * t18 = expand(gf, ggml_cont (ctx0, t17)); assert_shape_4d(t18, n_embd/n_head, n_head, N, n_batch); - use_buf(-1); struct ggml_tensor * t19 = expand(gf, ggml_reshape_2d (ctx0, t18, n_embd, N*n_batch)); assert_shape_2d(t19, n_embd, N*n_batch); - use_buf( 0); struct ggml_tensor * t20 = expand(gf, ggml_mul_mat (ctx0, layer.wo, t19)); assert_shape_2d(t20, n_embd, 
N*n_batch); - use_buf(-1); struct ggml_tensor * t21 = expand(gf, ggml_add (ctx0, t20, cur)); assert_shape_2d(t21, n_embd, N*n_batch); - use_buf(-1); struct ggml_tensor * t22 = expand(gf, ggml_rms_norm (ctx0, t21, rms_norm_eps)); assert_shape_2d(t22, n_embd, N*n_batch); - use_buf( 0); struct ggml_tensor * t23 = expand(gf, ggml_repeat (ctx0, layer.ffn_norm, t22)); assert_shape_2d(t23, n_embd, N*n_batch); - use_buf(-1); struct ggml_tensor * t24 = expand(gf, ggml_mul (ctx0, t23, t22)); assert_shape_2d(t24, n_embd, N*n_batch); - use_buf(-1); struct ggml_tensor * t25 = expand(gf, ggml_mul_mat (ctx0, layer.w3, t24)); assert_shape_2d(t25, n_ff, N*n_batch); - use_buf(-1); struct ggml_tensor * t26 = expand(gf, ggml_mul_mat (ctx0, layer.w1, t24)); assert_shape_2d(t26, n_ff, N*n_batch); - use_buf(-1); struct ggml_tensor * t27 = expand(gf, ggml_silu (ctx0, t26)); assert_shape_2d(t27, n_ff, N*n_batch); - use_buf(-1); struct ggml_tensor * t28 = expand(gf, ggml_mul (ctx0, t27, t25)); assert_shape_2d(t28, n_ff, N*n_batch); - use_buf( 0); struct ggml_tensor * t29 = expand(gf, ggml_mul_mat (ctx0, layer.w2, t28)); assert_shape_2d(t29, n_embd, N*n_batch); - use_buf(-1); struct ggml_tensor * t30 = expand(gf, ggml_add (ctx0, t21, t29)); assert_shape_2d(t30, n_embd, N*n_batch); - t02L[il] = t02; - t03L[il] = t03; - t04L[il] = t04; - t05L[il] = t05; - t06L[il] = t06; - t07L[il] = t07; - t08L[il] = t08; - t09L[il] = t09; - t10L[il] = t10; - t11L[il] = t11; - t12L[il] = t12; - t13L[il] = t13; - t14L[il] = t14; - t15L[il] = t15; - t16L[il] = t16; - t17L[il] = t17; - t18L[il] = t18; - t19L[il] = t19; - t20L[il] = t20; - t21L[il] = t21; - t22L[il] = t22; - t23L[il] = t23; - t24L[il] = t24; - t25L[il] = t25; - t26L[il] = t26; - t27L[il] = t27; - t28L[il] = t28; - t29L[il] = t29; - t30L[il] = t30; - - cur = t30; - } - clr_buf(0); - use_buf(0); - struct ggml_tensor * t31 = expand(gf, ggml_rms_norm (ctx0, cur, rms_norm_eps)); assert_shape_2d(t31, n_embd, N*n_batch); - struct ggml_tensor * t32 = expand(gf, ggml_repeat (ctx0, model->norm, t31)); assert_shape_2d(t32, n_embd, N*n_batch); - struct ggml_tensor * t33 = expand(gf, ggml_mul (ctx0, t32, t31)); assert_shape_2d(t33, n_embd, N*n_batch); - use_buf(-1); - struct ggml_tensor * t34 = expand(gf, ggml_mul_mat (ctx0, model->output, t33)); assert_shape_2d(t34, n_vocab, N*n_batch); - struct ggml_tensor * t35 = expand(gf, ggml_reshape_3d(ctx0, t34, n_vocab, N, n_batch)); assert_shape_3d(t35, n_vocab, N, n_batch); - struct ggml_tensor * t36 = expand(gf, ggml_cross_entropy_loss(ctx0, t35, targets)); assert_shape_1d(t36, 1); - - { - /* - tok_embeddings | grad_tok_embeddings = ggml_get_rows_back(grad_t01, t00) - L0_att_norm | grad_L0_att_norm = ggml_repeat_back(grad_t03L0, L0_att_norm.shape) - L0_wq | grad_L0_wq = ggml_out_prod(t04L0, grad_t05L0) - L0_wk | grad_L0_wk = ggml_out_prod(t04L0, grad_t08L0) - L0_wv | grad_L0_wv = ggml_out_prod(t04L0, ggml_transpose(grad_t11L0)) - L0_wo | grad_L0_wo = ggml_out_prod(t19L0, grad_t20L0) - L0_ffn_norm | grad_L0_ffn_norm = ggml_repeat_back(grad_t23L0, L0_ffn_norm.shape) - L0_w1 | grad_L0_w1 = ggml_out_prod(t24L0, grad_t26L0) - L0_w2 | grad_L0_w2 = ggml_out_prod(t28L0, grad_t29L0) - L0_w3 | grad_L0_w3 = ggml_out_prod(t24L0, grad_t25L0) - L1_att_norm | grad_L1_att_norm = ggml_repeat_back(grad_t03L1, L1_att_norm.shape) - L1_wq | grad_L1_wq = ggml_out_prod(t04L1, grad_t05L1) - L1_wk | grad_L1_wk = ggml_out_prod(t04L1, grad_t08L1) - L1_wv | grad_L1_wv = ggml_out_prod(t04L1, ggml_transpose(grad_t11L1)) - L1_wo | grad_L1_wo = ggml_out_prod(t19L1, 
grad_t20L1) - L1_ffn_norm | grad_L1_ffn_norm = ggml_repeat_back(grad_t23L1, L1_ffn_norm.shape) - L1_w1 | grad_L1_w1 = ggml_out_prod(t24L1, grad_t26L1) - L1_w2 | grad_L1_w2 = ggml_out_prod(t28L1, grad_t29L1) - L1_w3 | grad_L1_w3 = ggml_out_prod(t24L1, grad_t25L1) - norm | grad_norm = ggml_repeat_back(grad_t32, norm.shape) - output | grad_output = ggml_out_prod(t33, grad_t34) - | - t01 = ggml_get_rows(tok_embeddings, t00) | grad_t01 = grad_t21L0 + ggml_rms_norm_back(t01, grad_t02L0) - for layer: | - t02L0*= ggml_rms_norm (t01) | grad_t02L0 = ggml_mul(grad_t04L0, t03L0) - t03L0 = ggml_repeat (L0_att_norm, t02L0_shape) | grad_t03L0 = ggml_mul(grad_t04L0, t02L0) - t04L0*= ggml_mul (t02L0, t03L0) | grad_t04L0 = ggml_out_prod(L0_wv, grad_t11L0) + ggml_out_prod(L0_wk, ggml_transpose(grad_t08L0)) + ggml_out_prod(L0_wq, ggml_transpose(grad_t05L0)) - t05L0 = ggml_mul_mat (L0_wq, t04L0) | grad_t05L0 = ggml_reshape(grad_t06L0, t05L0_shape) - t06L0 = ggml_reshape_4d (t05L0, n_embd/n_head, n_head, N, n_batch) | grad_t06L0 = ggml_rope_back(grad_t07L0) - t07L0 = ggml_rope_inplace (t06L0) | grad_t07L0 = ggml_permute_back(grad_t13L0, 0, 2, 1, 3) = ggml_permute(grad_t13L0, 0, 2, 1, 3) - t08L0 = ggml_mul_mat (L0_wk, t04L0) | grad_t08L0 = ggml_reshape(grad_t09L0, t08L0_shape) - t09L0 = ggml_reshape_4d (t08L0, n_embd/n_head, n_head, N, n_batch) | grad_t09L0 = ggml_rope_back(grad_t10L0) - t10L0 = ggml_rope_inplace (t09L0) | grad_t10L0 = ggml_permute_back(grad_t14L0, 0, 2, 1, 3) = ggml_permute(grad_t14L0, 0, 2, 1, 3) - t11L0 = ggml_mul_mat (t04L0, L0_wv) | grad_t11L0 = ggml_reshape(grad_t12L0, t11L0_shape) - t12L0 = ggml_reshape_4d (t11L0, N, n_batch, n_embd/n_head, n_head) | grad_t12L0 = ggml_permute_back(grad_t15L0, 0, 3, 1, 2) = ggml_permute(grad_t15L0, 0, 2, 3, 1) - t13L0*= ggml_permute (t07L0, 0, 2, 1, 3) | grad_t13L0 = view__q(ggml_flash_attn_back(t13L0, t14L0, t15L0, grad_t16L0)) - t14L0*= ggml_permute (t10L0, 0, 2, 1, 3) | grad_t14L0 = view__k(ggml_flash_attn_back(t13L0, t14L0, t15L0, grad_t16L0)) - t15L0*= ggml_permute (t12L0, 0, 3, 1, 2) | grad_t15L0 = view__v(ggml_flash_attn_back(t13L0, t14L0, t15L0, grad_t16L0)) - t16L0 = ggml_flash_attn (t13L0, t14L0, t15L0) | grad_t16L0 = ggml_permute_back(grad_t17L0, 0, 2, 1, 3) = ggml_permute(grad_t17L0, 0, 2, 1, 3) - t17L0 = ggml_permute (t16L0, 0, 2, 1, 3) | grad_t17L0 = grad_t18L0 - t18L0 = ggml_cont (t17L0) | grad_t18L0 = ggml_reshape(grad_t19L0, t18L0_shape) - t19L0*= ggml_reshape_2d (t18L0, n_embd, N*n_batch) | grad_t19L0 = ggml_out_prod(L0_wo, ggml_transpose(grad_t20L0)) - t20L0 = ggml_mul_mat (L0_wo, t19L0) | grad_t20L0 = grad_t21L0 - t21L0*= ggml_add (t20L0, t01) | grad_t21L0 = grad_t30L0 + ggml_rms_norm_back(t21L0, grad_t22L0) - t22L0*= ggml_rms_norm (t21L0) | grad_t22L0 = ggml_mul(grad_t24L0, t23L0) - t23L0 = ggml_repeat (L0_ffn_norm, t22L0_shape) | grad_t23L0 = ggml_mul(grad_t24L0, t22L0) - t24L0*= ggml_mul (t23L0, t22L0) | grad_t24L0 = ggml_out_prod(L0_w1, ggml_transpose(grad_t26L0)) + ggml_out_prod(L0_w3, ggml_transpose(grad_t25L0)) - t25L0*= ggml_mul_mat (L0_w3, t24L0) | grad_t25L0 = ggml_mul(grad_t28L0, t27L0) - t26L0*= ggml_mul_mat (L0_w1, t24L0) | grad_t26L0 = ggml_silu_back(t26L0, grad_t27L0) - t27L0*= ggml_silu (t26L0) | grad_t27L0 = ggml_mul(grad_t28L0, t25L0) - t28L0*= ggml_mul (t27L0, t25L0) | grad_t28L0 = ggml_out_prod(L0_w2, ggml_transpose(grad_t29L0)) - t29L0 = ggml_mul_mat (L0_w2, t28L0) | grad_t29L0 = grad_t30L0 - t30L0*= ggml_add (t21L0, t29L0) | grad_t30L0 = ggml_rms_norm_back(t30L0, grad_t02L1) + grad_t21L1 - ^ - t02L1*= ggml_rms_norm 
(t30L0) | grad_t02L1 = ggml_mul(grad_t04L1, t03L1) - t03L1 = ggml_repeat (L1_att_norm, t02L1_shape) | grad_t03L1 = ggml_mul(grad_t04L1, t02L1) - t04L1*= ggml_mul (t02L1, t03L1) | grad_t04L1 = ggml_out_prod(L1_wv, grad_t11L1) + ggml_out_prod(L1_wk, ggml_transpose(grad_t08L1)) + ggml_out_prod(L1_wq, ggml_transpose(grad_t05L1)) - t05L1 = ggml_mul_mat (L1_wq, t04L1) | grad_t05L1 = ggml_reshape(grad_t06L1, t05L1_shape) - t06L1 = ggml_reshape_4d (t05L1, n_embd/n_head, n_head, N, n_batch) | grad_t06L1 = ggml_rope_back(grad_t07L1) - t07L1 = ggml_rope_inplace (t06L1) | grad_t07L1 = ggml_permute_back(grad_t13L1, 0, 2, 1, 3) = ggml_permute(grad_t13L1, 0, 2, 1, 3) - t08L1 = ggml_mul_mat (L1_wk, t04L1) | grad_t08L1 = ggml_reshape(grad_t09L1, t08L1_shape) - t09L1 = ggml_reshape_4d (t08L1, n_embd/n_head, n_head, N, n_batch) | grad_t09L1 = ggml_rope_back(grad_t10L1) - t10L1 = ggml_rope_inplace (t09L1) | grad_t10L1 = ggml_permute_back(grad_t14L1, 0, 2, 1, 3) = ggml_permute(grad_t14L1, 0, 2, 1, 3) - t11L1 = ggml_mul_mat (t04L1, L1_wv) | grad_t11L1 = ggml_reshape(grad_t12L1, t11L1_shape) - t12L1 = ggml_reshape_4d (t11L1, N, n_batch, n_embd/n_head, n_head) | grad_t12L1 = ggml_permute_back(grad_t15L1, 0, 3, 1, 2) = ggml_permute(grad_t15L1, 0, 2, 3, 1) - t13L1*= ggml_permute (t07L1, 0, 2, 1, 3) | grad_t13L1 = view__q(ggml_flash_attn_back(t13L1, t14L1, t15L1, grad_t16L1)) - t14L1*= ggml_permute (t10L1, 0, 2, 1, 3) | grad_t14L1 = view__k(ggml_flash_attn_back(t13L1, t14L1, t15L1, grad_t16L1)) - t15L1*= ggml_permute (t12L1, 0, 3, 1, 2) | grad_t15L1 = view__v(ggml_flash_attn_back(t13L1, t14L1, t15L1, grad_t16L1)) - t16L1 = ggml_flash_attn (t13L1, t14L1, t15L1) | grad_t16L1 = ggml_permute_back(grad_t17L1, 0, 2, 1, 3) = ggml_permute(grad_t17L1, 0, 2, 1, 3) - t17L1 = ggml_permute (t16L1, 0, 2, 1, 3) | grad_t17L1 = grad_t18L1 - t18L1 = ggml_cont (t17L1) | grad_t18L1 = ggml_reshape(grad_t19L1, t18L1_shape) - t19L1*= ggml_reshape_2d (t18L1, n_embd, N*n_batch) | grad_t19L1 = ggml_out_prod(L1_wo, ggml_transpose(grad_t20L1)) - t20L1 = ggml_mul_mat (L1_wo, t19L1) | grad_t20L1 = grad_t21L1 - t21L1*= ggml_add (t20L1, t30L0) | grad_t21L1 = grad_t30L1 + ggml_rms_norm_back(t21L1, grad_t22L1) - t22L1*= ggml_rms_norm (t21L1) | grad_t22L1 = ggml_mul(grad_t24L1, t23L1) - t23L1 = ggml_repeat (L1_ffn_norm, t22L1_shape) | grad_t23L1 = ggml_mul(grad_t24L1, t22L1) - t24L1*= ggml_mul (t23L1, t22L1) | grad_t24L1 = ggml_out_prod(L1_w1, ggml_transpose(grad_t26L1)) + ggml_out_prod(L1_w3, ggml_transpose(grad_t25L1)) - t25L1*= ggml_mul_mat (L1_w3, t24L1) | grad_t25L1 = ggml_mul(grad_t28L1, t27L1) - t26L1*= ggml_mul_mat (L1_w1, t24L1) | grad_t26L1 = ggml_silu_back(t26L1, grad_t27L1) - t27L1*= ggml_silu (t26L1) | grad_t27L1 = ggml_mul(grad_t28L1, t25L1) - t28L1*= ggml_mul (t27L1, t25L1) | grad_t28L1 = ggml_out_prod(L1_w2, ggml_transpose(grad_t29L1)) - t29L1 = ggml_mul_mat (L1_w2, t28L1) | grad_t29L1 = grad_t30L1 - t30L1*= ggml_add (t21L1, t29L1) | grad_t30L1 = ggml_rms_norm_back(t30L1, grad_t31) - ^ - t31 = ggml_rms_norm (t30L1) | grad_t31 = ggml_mul(grad_t33, t32) - t32 = ggml_repeat (norm, t31.shape) | grad_t32 = ggml_mul(grad_t33, t31) - t33 = ggml_mul (t32, t31) | grad_t33 = ggml_out_prod(output, ggml_transpose(grad_t34)) - t34 = ggml_mul_mat (output, t33) | grad_t34 = ggml_reshape(grad_t35, t34.shape) - t35 = ggml_reshape_3d (t34, n_vocab, N, n_batch) | grad_t35 = ggml_cross_entropy_loss_back(t35, targets, grad_t36) - t36 = ggml_cross_entropy_loss(t35, targets) | grad_t36 = 1 (optimizer) - tensors marked with * need to be stored until grad 
computation - tensors during grad computation are all temporary - */ - } - - *gb = *gf; - - // t36->grad gets set to one by optimizer, so we need the tensor. - // initialize it with 1.0f to make sure. - use_buf(-1); - t36->grad = expand(gb, ggml_new_f32(ctx0, 1.0f)); - - use_buf(0); - t35->grad = expand(gb, ggml_cross_entropy_loss_back(ctx0, t35, targets, t36->grad)); assert_shape_3d(t35->grad, n_vocab, N, n_batch); - t34->grad = expand(gb, ggml_reshape_2d (ctx0, t35->grad, n_vocab, N*n_batch)); assert_shape_2d(t34->grad, n_vocab, N*n_batch); - t33->grad = expand(gb, ggml_out_prod (ctx0, model->output, ggml_transpose(ctx0, t34->grad))); assert_shape_2d(t33->grad, n_embd, N*n_batch); - t32->grad = expand(gb, ggml_mul (ctx0, t33->grad, t31)); assert_shape_2d(t32->grad, n_embd, N*n_batch); - - use_buf(-1); - - model->norm->grad = expand(gb, add_or_set(model->norm->grad, ggml_repeat_back(ctx0, t32->grad, model->norm))); assert_shape_1d(model->norm->grad, n_embd); - model->output->grad = expand(gb, add_or_set(model->output->grad, ggml_out_prod(ctx0, t33, t34->grad))); assert_shape_2d(model->output->grad, n_embd, n_vocab); - - clr_buf(1); - use_buf(1); - t31->grad = expand(gb, ggml_mul(ctx0, t33->grad, t32)); assert_shape_2d(t31->grad, n_embd, N*n_batch); - - struct ggml_tensor * back_layer_inp = t31; - struct ggml_tensor * grad_layer_inp = NULL; - - for (int k = 0; k < n_layer; ++k) { - int il = n_layer-1-k; - struct my_llama_layer & layer = model->layers[il]; - - struct ggml_tensor * t02 = t02L[il]; - struct ggml_tensor * t03 = t03L[il]; - struct ggml_tensor * t04 = t04L[il]; - struct ggml_tensor * t05 = t05L[il]; - struct ggml_tensor * t06 = t06L[il]; - struct ggml_tensor * t07 = t07L[il]; - struct ggml_tensor * t08 = t08L[il]; - struct ggml_tensor * t09 = t09L[il]; - struct ggml_tensor * t10 = t10L[il]; - struct ggml_tensor * t11 = t11L[il]; - struct ggml_tensor * t12 = t12L[il]; - struct ggml_tensor * t13 = t13L[il]; - struct ggml_tensor * t14 = t14L[il]; - struct ggml_tensor * t15 = t15L[il]; - struct ggml_tensor * t16 = t16L[il]; - struct ggml_tensor * t17 = t17L[il]; - struct ggml_tensor * t18 = t18L[il]; - struct ggml_tensor * t19 = t19L[il]; - struct ggml_tensor * t20 = t20L[il]; - struct ggml_tensor * t21 = t21L[il]; - struct ggml_tensor * t22 = t22L[il]; - struct ggml_tensor * t23 = t23L[il]; - struct ggml_tensor * t24 = t24L[il]; - struct ggml_tensor * t25 = t25L[il]; - struct ggml_tensor * t26 = t26L[il]; - struct ggml_tensor * t27 = t27L[il]; - struct ggml_tensor * t28 = t28L[il]; - struct ggml_tensor * t29 = t29L[il]; - struct ggml_tensor * t30 = t30L[il]; - - clr_buf(0); - use_buf(0); - t30->grad = expand(gb, ggml_rms_norm_back(ctx0, t30, back_layer_inp->grad, rms_norm_eps)); assert_shape_2d(t30->grad, n_embd, N*n_batch); - if (grad_layer_inp) { - t30->grad = expand(gb, ggml_add(ctx0, t30->grad, grad_layer_inp->grad)); assert_shape_2d(t30->grad, n_embd, N*n_batch); - } - clr_buf(1); - t29->grad = t30->grad; assert_shape_2d(t29->grad, n_embd, N*n_batch); - t28->grad = expand(gb, ggml_out_prod(ctx0, layer.w2, ggml_transpose(ctx0, t29->grad))); assert_shape_2d(t28->grad, n_ff, N*n_batch); - t27->grad = expand(gb, ggml_mul(ctx0, t28->grad, t25)); assert_shape_2d(t27->grad, n_ff, N*n_batch); - t26->grad = expand(gb, ggml_silu_back(ctx0, t26, t27->grad)); assert_shape_2d(t26->grad, n_ff, N*n_batch); - t25->grad = expand(gb, ggml_mul(ctx0, t28->grad, t27)); assert_shape_2d(t25->grad, n_ff, N*n_batch); - t24->grad = expand(gb, ggml_add_inplace(ctx0, - ggml_out_prod(ctx0, layer.w1, 
ggml_transpose(ctx0, t26->grad)), - ggml_out_prod(ctx0, layer.w3, ggml_transpose(ctx0, t25->grad)))); assert_shape_2d(t24->grad, n_embd, N*n_batch); - t23->grad = expand(gb, ggml_mul(ctx0, t24->grad, t22)); assert_shape_2d(t23->grad, n_embd, N*n_batch); - t22->grad = expand(gb, ggml_mul(ctx0, t24->grad, ggml_repeat(ctx0, layer.ffn_norm, t24->grad))); assert_shape_2d(t22->grad, n_embd, N*n_batch); - use_buf(1); - t21->grad = expand(gb, ggml_add(ctx0, t30->grad, ggml_rms_norm_back(ctx0, t21, t22->grad, rms_norm_eps))); assert_shape_2d(t21->grad, n_embd, N*n_batch); - grad_layer_inp = t21; - use_buf(0); - t20->grad = t21->grad; assert_shape_2d(t20->grad, n_embd, N*n_batch); - t19->grad = expand(gb, ggml_out_prod(ctx0, layer.wo, ggml_transpose(ctx0, t20->grad))); assert_shape_2d(t19->grad, n_embd, N*n_batch); - t18->grad = expand(gb, ggml_reshape_4d(ctx0, t19->grad, n_embd/n_head, n_head, N, n_batch)); assert_shape_4d(t18->grad, n_embd/n_head, n_head, N, n_batch); - t17->grad = t18->grad; assert_shape_4d(t17->grad, n_embd/n_head, n_head, N, n_batch); - t16->grad = expand(gb, ggml_permute(ctx0, t17->grad, 0, 2, 1, 3)); assert_shape_4d(t16->grad, n_embd/n_head, N, n_head, n_batch); - struct ggml_tensor * flash_attn = expand(gb, ggml_flash_attn_back(ctx0, t13, t14, t15, t16->grad, true)); assert_shape_4d(flash_attn, n_embd/n_head, N*3, n_head, n_batch); - t15->grad = expand(gb, view__v(flash_attn)); assert_shape_4d(t15->grad, N, n_embd/n_head, n_head, n_batch); - t14->grad = expand(gb, view__k(flash_attn)); assert_shape_4d(t14->grad, n_embd/n_head, N, n_head, n_batch); - t13->grad = expand(gb, view__q(flash_attn)); assert_shape_4d(t13->grad, n_embd/n_head, N, n_head, n_batch); - t12->grad = expand(gb, ggml_permute(ctx0, t15->grad, 0, 2, 3, 1)); assert_shape_4d(t12->grad, N, n_batch, n_embd/n_head, n_head); - t11->grad = expand(gb, ggml_reshape_2d(ctx0, ggml_cont(ctx0, t12->grad), N*n_batch, n_embd)); assert_shape_2d(t11->grad, N*n_batch, n_embd); - t10->grad = expand(gb, ggml_permute(ctx0, t14->grad, 0, 2, 1, 3)); assert_shape_4d(t10->grad, n_embd/n_head, n_head, N, n_batch); - t09->grad = expand(gb, ggml_rope_back(ctx0, t10->grad, n_past, n_rot, rope_mode, n_ctx)); assert_shape_4d(t09->grad, n_embd/n_head, n_head, N, n_batch); - t08->grad = expand(gb, ggml_reshape_2d(ctx0, t09->grad, n_embd, N*n_batch)); assert_shape_2d(t08->grad, n_embd, N*n_batch); - t07->grad = expand(gb, ggml_permute(ctx0, t13->grad, 0, 2, 1, 3)); assert_shape_4d(t07->grad, n_embd/n_head, n_head, N, n_batch); - t06->grad = expand(gb, ggml_rope_back(ctx0, t07->grad, n_past, n_rot, rope_mode, n_ctx)); assert_shape_4d(t06->grad, n_embd/n_head, n_head, N, n_batch); - t05->grad = expand(gb, ggml_reshape_2d(ctx0, t06->grad, n_embd, N*n_batch)); assert_shape_2d(t05->grad, n_embd, N*n_batch); - t04->grad = expand(gb, ggml_add_inplace(ctx0, - ggml_add_inplace(ctx0, - ggml_out_prod(ctx0, layer.wv, t11->grad), - ggml_out_prod(ctx0, layer.wk, ggml_transpose(ctx0, t08->grad))), - ggml_out_prod(ctx0, layer.wq, ggml_transpose(ctx0, t05->grad)))); assert_shape_2d(t04->grad, n_embd, N*n_batch); - t03->grad = expand(gb, ggml_mul(ctx0, t04->grad, t02)); assert_shape_2d(t04->grad, n_embd, N*n_batch); - use_buf(1); - t02->grad = expand(gb, ggml_mul(ctx0, t04->grad, ggml_repeat(ctx0, layer.attention_norm, t02))); assert_shape_2d(t02->grad, n_embd, N*n_batch); - back_layer_inp = t02; - // use_buf(0); - - use_buf(-1); - layer.attention_norm->grad = expand(gb, add_or_set(layer.attention_norm->grad, ggml_repeat_back(ctx0, t03->grad, 
layer.attention_norm))); assert_shape_1d(layer.attention_norm->grad, n_embd); - layer.wq->grad = expand(gb, add_or_set(layer.wq->grad, ggml_out_prod(ctx0, t04, t05->grad))); assert_shape_2d(layer.wq->grad, n_embd, n_embd); - layer.wk->grad = expand(gb, add_or_set(layer.wk->grad, ggml_out_prod(ctx0, t04, t08->grad))); assert_shape_2d(layer.wk->grad, n_embd, n_embd); - layer.wv->grad = expand(gb, add_or_set(layer.wv->grad, ggml_out_prod(ctx0, t04, ggml_transpose(ctx0, t11->grad)))); assert_shape_2d(layer.wv->grad, n_embd, n_embd); - layer.wo->grad = expand(gb, add_or_set(layer.wo->grad, ggml_out_prod(ctx0, t19, t20->grad))); assert_shape_2d(layer.wo->grad, n_embd, n_embd); - layer.ffn_norm->grad = expand(gb, add_or_set(layer.ffn_norm->grad, ggml_repeat_back(ctx0, t23->grad, layer.ffn_norm))); assert_shape_1d(layer.ffn_norm->grad, n_embd); - layer.w1->grad = expand(gb, add_or_set(layer.w1->grad, ggml_out_prod(ctx0, t24, t26->grad))); assert_shape_2d(layer.w1->grad, n_embd, n_ff); - layer.w2->grad = expand(gb, add_or_set(layer.w2->grad, ggml_out_prod(ctx0, t28, t29->grad))); assert_shape_2d(layer.w2->grad, n_ff, n_embd); - layer.w3->grad = expand(gb, add_or_set(layer.w3->grad, ggml_out_prod(ctx0, t24, t25->grad))); assert_shape_2d(layer.w3->grad, n_embd, n_ff); - // use_buf(0); - } - clr_buf(0); - use_buf(0); - t01->grad = expand(gb, ggml_add_inplace(ctx0, grad_layer_inp->grad, ggml_rms_norm_back(ctx0, t01, back_layer_inp->grad, rms_norm_eps))); assert_shape_2d(t01->grad, n_embd, N*n_batch); - use_buf(-1); - model->tok_embeddings->grad = expand(gb, ggml_get_rows_back(ctx0, t01->grad, t00, model->tok_embeddings)); assert_shape_2d(model->tok_embeddings->grad, n_embd, n_vocab); - // clr_buf(1); - // clr_buf(0); - - *logits = t35; - - clr_buf(0); - clr_buf(1); - - if (track_max_mem) { - printf("%s: max size compute buf0: %zu\n", __func__, buf_maxs[0]); - printf("%s: max size compute buf1: %zu\n", __func__, buf_maxs[1]); - } - - // now that all grads are created, set the graph leafs and grads - graph_set_leafs_grads(gf); - graph_set_leafs_grads(gb); - - return t36; -} - -struct ggml_tensor * forward_batch_wo_cache_flash_attn_train_grad_checkpointing( - struct my_llama_model * model, - struct ggml_context * ctx0, - struct ggml_cgraph * gf, - struct ggml_cgraph * gb, - struct ggml_tensor * * logits, - struct ggml_tensor * tokens_input, - struct ggml_tensor * targets, - void * compute_buf_0, - void * compute_buf_1, - void * compute_buf_2, - size_t size_buf_0, - size_t size_buf_1, - size_t size_buf_2, - const int n_tokens, - const int n_batch) { - - // implements gradient-checkpointing as explained in readme of https://github.com/cybertronai/gradient-checkpointing - - ggml_set_scratch(ctx0, { 0, 0, nullptr, }); - - const int n_past = 0; - const int N = n_tokens; - - gf->n_nodes = 0; - gf->n_leafs = 0; - gf->perf_runs = 0; - gf->perf_cycles = 0; - gf->perf_time_us = 0; - - const auto & hparams = model->hparams; - const int n_ctx = hparams.n_ctx; - const int n_vocab = hparams.n_vocab; - const int n_embd = hparams.n_embd; - const int n_layer = hparams.n_layer; - const int n_head = hparams.n_head; - const int n_rot = hparams.n_rot; - const int n_ff = get_n_ff(&hparams); - const int rope_mode = 0; - - bool track_max_mem = true; - - int last_buf = -1; - size_t buf_offs[3] = { 0, 0, 0 }; - size_t buf_size[3] = { size_buf_0, - size_buf_1, - size_buf_2 }; - void * buf_data[3] = { compute_buf_0, - compute_buf_1, - compute_buf_2 }; - size_t buf_maxs[3] = { 0, 0, 0 }; - - auto use_buf = [ctx0, &last_buf, 
&buf_offs, &buf_size, &buf_data, &buf_maxs] (int buf) { - size_t last_offs = 0; - last_offs = ggml_set_scratch(ctx0, { 0, 0, nullptr, }); - if (last_buf >= 0) { - buf_offs[last_buf] = last_offs; - buf_maxs[last_buf] = std::max(buf_maxs[last_buf], buf_offs[last_buf]); - } - if (buf >= 0) { - size_t offs = buf_offs[buf]; - size_t size = buf_size[buf]; - void * data = buf_data[buf]; - ggml_set_scratch(ctx0, { offs, size, data, }); - } - last_buf = buf; - }; - - - auto clr_buf = [ctx0, &last_buf, &buf_offs, &buf_size, &buf_data, &buf_maxs, track_max_mem] (int buf) { - if (buf < 0) return; - if (track_max_mem) { - size_t last_offs = 0; - last_offs = ggml_set_scratch(ctx0, { 0, 0, nullptr, }); - if (last_buf >= 0) { - buf_offs[last_buf] = last_offs; - buf_maxs[last_buf] = std::max(buf_maxs[last_buf], buf_offs[last_buf]); - } - } - buf_offs[buf] = 0; - if (track_max_mem && last_buf >= 0) { - size_t offs = buf_offs[last_buf]; - size_t size = buf_size[last_buf]; - void * data = buf_data[last_buf]; - ggml_set_scratch(ctx0, { offs, size, data, }); - } - }; - - - auto view__q = [ctx0, n_embd, n_head, N, n_batch] (struct ggml_tensor * t) -> struct ggml_tensor * { - int64_t ne0 = n_embd/n_head; - int64_t ne1 = N; - int64_t ne2 = n_head; - int64_t ne3 = n_batch; - size_t nb0 = ggml_element_size(t); - size_t nb1 = nb0*ne0; - size_t nb2 = nb1*ne1; - size_t nb3 = nb2*ne2; - size_t offset = 0; - return ggml_view_4d(ctx0, t, ne0, ne1, ne2, ne3, nb1, nb2, nb3, offset); - }; - - auto view__k = [ctx0, n_embd, n_head, N, n_batch] (struct ggml_tensor * t) -> struct ggml_tensor * { - int64_t ne0 = n_embd/n_head; - int64_t ne1 = N; - int64_t ne2 = n_head; - int64_t ne3 = n_batch; - size_t nb0 = ggml_element_size(t); - size_t nb1 = nb0*ne0; - size_t nb2 = nb1*ne1; - size_t nb3 = nb2*ne2; - size_t offset = nb3*ne3; - return ggml_view_4d(ctx0, t, ne0, ne1, ne2, ne3, nb1, nb2, nb3, offset); - }; - - auto view__v = [ctx0, n_embd, n_head, N, n_batch] (struct ggml_tensor * t) -> struct ggml_tensor * { - int64_t ne0 = N; - int64_t ne1 = n_embd/n_head; - int64_t ne2 = n_head; - int64_t ne3 = n_batch; - size_t nb0 = ggml_element_size(t); - size_t nb1 = nb0*ne0; - size_t nb2 = nb1*ne1; - size_t nb3 = nb2*ne2; - size_t offset = 2*nb3*ne3; - return ggml_view_4d(ctx0, t, ne0, ne1, ne2, ne3, nb1, nb2, nb3, offset); - }; - - auto add_or_set = [ctx0] (struct ggml_tensor * a, struct ggml_tensor * b) -> struct ggml_tensor * { - if (a == NULL) { - return b; - } else { - return ggml_add_inplace(ctx0, a, b); - } - }; - - use_buf(-1); - - model->tok_embeddings->grad = NULL; - model->norm->grad = NULL; - model->output->grad = NULL; - - for (int il = 0; il < n_layer; ++il) { - struct my_llama_layer & layer = model->layers[il]; - layer.attention_norm->grad = NULL; - layer.wq->grad = NULL; - layer.wk->grad = NULL; - layer.wv->grad = NULL; - layer.wo->grad = NULL; - layer.ffn_norm->grad = NULL; - layer.w1->grad = NULL; - layer.w2->grad = NULL; - layer.w3->grad = NULL; - } - - clr_buf(0); - clr_buf(1); - clr_buf(2); - - use_buf(-1); - - GGML_ASSERT(tokens_input->type == GGML_TYPE_I32); - struct ggml_tensor * t00 = ggml_reshape_1d(ctx0, tokens_input, N*n_batch); assert_shape_1d(t00, N*n_batch); - - use_buf(-1); - - struct ggml_tensor * t01 = expand(gf, ggml_get_rows(ctx0, model->tok_embeddings, t00)); assert_shape_2d(t01, n_embd, N*n_batch); - - - { - // given: n, u, v - // objective: minimize(a*u+b*v) where a*b=n, a>0, b>0 - // b=n/a - // minimize(a*u+v*n/a) - // diff(a*u+v*n/a, a) = u - (v*n/a)/a - // diff(a*u+v*n/a, a) == 0 - // u - (v*n/a)/a 
== 0 - // u == v*n/(a*a) - // u*a*a = v*n - // a*a = v*n/u - // a = sqrt(n*v/u) - } - - float memcost_checkpoint = n_embd; // (..)*N*n_batch - float memcost_snd_fwd_pass = 14*n_embd+4*n_ff; // (..)*N*n_batch - - int n_checkstep = (int)(sqrtf(n_layer*memcost_checkpoint/memcost_snd_fwd_pass) + 0.5f); - if (n_checkstep < 1) { - n_checkstep = 1; - } - std::vector checkpoints; - for (int chk = n_checkstep-1; chk+1 < n_layer; chk += n_checkstep) { - checkpoints.push_back(chk); - } - int n_check = checkpoints.size(); - // printf("%s: n_check = %d n_checkstep = %d\n", __func__, n_check, n_checkstep); - - // for (int i = 0; i < n_check; ++i) { - // printf("%s: checkpoint #%d = %d\n", __func__, i, checkpoints[i]); - // } - - // example for 16 layers and memcost_checkpoint=memcost_snd_fwd_pass: - // inp ~ implicit zeroth checkpoint == input - // L00 f 4b [ - // L01 f 4b 4th second forward pass - // L02 f 4b - // L03 fc4b ] first checkpoint - // L04 f 3b [ - // L05 f 3b 3rd second forward pass - // L06 f 3b - // L07 fc3b ] second checkpoint - // L08 f 2b [ - // L09 f 2b 2nd second forward pass - // L10 f 2b - // L11 fc2b ] third checkpoint - // L12 f 1b [ - // L13 f 1b 1st second forward pass - // L14 f 1b - // L15 f 1b ] - - // need to remember these for the backward pass - std::vector t02L; t02L.resize(n_layer, NULL); - std::vector t03L; t03L.resize(n_layer, NULL); - std::vector t04L; t04L.resize(n_layer, NULL); - std::vector t05L; t05L.resize(n_layer, NULL); - std::vector t06L; t06L.resize(n_layer, NULL); - std::vector t07L; t07L.resize(n_layer, NULL); - std::vector t08L; t08L.resize(n_layer, NULL); - std::vector t09L; t09L.resize(n_layer, NULL); - std::vector t10L; t10L.resize(n_layer, NULL); - std::vector t11L; t11L.resize(n_layer, NULL); - std::vector t12L; t12L.resize(n_layer, NULL); - std::vector t13L; t13L.resize(n_layer, NULL); - std::vector t14L; t14L.resize(n_layer, NULL); - std::vector t15L; t15L.resize(n_layer, NULL); - std::vector t16L; t16L.resize(n_layer, NULL); - std::vector t17L; t17L.resize(n_layer, NULL); - std::vector t18L; t18L.resize(n_layer, NULL); - std::vector t19L; t19L.resize(n_layer, NULL); - std::vector t20L; t20L.resize(n_layer, NULL); - std::vector t21L; t21L.resize(n_layer, NULL); - std::vector t22L; t22L.resize(n_layer, NULL); - std::vector t23L; t23L.resize(n_layer, NULL); - std::vector t24L; t24L.resize(n_layer, NULL); - std::vector t25L; t25L.resize(n_layer, NULL); - std::vector t26L; t26L.resize(n_layer, NULL); - std::vector t27L; t27L.resize(n_layer, NULL); - std::vector t28L; t28L.resize(n_layer, NULL); - std::vector t29L; t29L.resize(n_layer, NULL); - std::vector t30L; t30L.resize(n_layer, NULL); - - struct ggml_tensor * cur = t01; - - int chk_idx = 0; - for (int il = 0; il < n_layer; ++il) { - struct my_llama_layer & layer = model->layers[il]; - // tensors with values necessary for backward pass are in persistent buf(-1) - // other tensors with buf(0), buf(1), etc are only temporary needed, and their memory reused - bool is_checkpoint = (chk_idx < n_check && il == checkpoints[chk_idx]); - if (is_checkpoint) { - // printf("%s: layer %d is_checkpoint\n", __func__, il); - chk_idx += 1; - } - const int prs = 0; // in first forward pass even persistent tensors are only temporary - const int tmp = 0; // temporary - // nxt is required to compute next layer. - // for checkpoints we need to remember this for usage in backward pass, - // otherwise temporary until next of this kind - const int nxt = is_checkpoint ? 
-1 : 1; - clr_buf(0); - use_buf(prs); struct ggml_tensor * t02 = expand(gf, ggml_rms_norm (ctx0, cur, rms_norm_eps)); assert_shape_2d(t02, n_embd, N*n_batch); - use_buf(tmp); struct ggml_tensor * t03 = expand(gf, ggml_repeat (ctx0, layer.attention_norm, t02)); assert_shape_2d(t03, n_embd, N*n_batch); - use_buf(prs); struct ggml_tensor * t04 = expand(gf, ggml_mul (ctx0, t02, t03)); assert_shape_2d(t04, n_embd, N*n_batch); - use_buf(prs); struct ggml_tensor * t05 = expand(gf, ggml_mul_mat (ctx0, layer.wq, t04)); assert_shape_2d(t05, n_embd, N*n_batch); - use_buf(prs); struct ggml_tensor * t06 = expand(gf, ggml_reshape_4d (ctx0, t05, n_embd/n_head, n_head, N, n_batch)); assert_shape_4d(t06, n_embd/n_head, n_head, N, n_batch); - use_buf(prs); struct ggml_tensor * t07 = expand(gf, ggml_rope_inplace (ctx0, t06, n_past, n_rot, rope_mode, n_ctx)); assert_shape_4d(t07, n_embd/n_head, n_head, N, n_batch); - use_buf(prs); struct ggml_tensor * t08 = expand(gf, ggml_mul_mat (ctx0, layer.wk, t04)); assert_shape_2d(t08, n_embd, N*n_batch); - use_buf(prs); struct ggml_tensor * t09 = expand(gf, ggml_reshape_4d (ctx0, t08, n_embd/n_head, n_head, N, n_batch)); assert_shape_4d(t09, n_embd/n_head, n_head, N, n_batch); - use_buf(prs); struct ggml_tensor * t10 = expand(gf, ggml_rope_inplace (ctx0, t09, n_past, n_rot, rope_mode, n_ctx)); assert_shape_4d(t10, n_embd/n_head, n_head, N, n_batch); - use_buf(prs); struct ggml_tensor * t11 = expand(gf, ggml_mul_mat (ctx0, t04, layer.wv)); assert_shape_2d(t11, N*n_batch, n_embd); - use_buf(prs); struct ggml_tensor * t12 = expand(gf, ggml_reshape_4d (ctx0, t11, N, n_batch, n_embd/n_head, n_head)); assert_shape_4d(t12, N, n_batch, n_embd/n_head, n_head); - use_buf(prs); struct ggml_tensor * t13 = expand(gf, ggml_permute (ctx0, t07, 0, 2, 1, 3)); assert_shape_4d(t13, n_embd/n_head, N, n_head, n_batch); - use_buf(prs); struct ggml_tensor * t14 = expand(gf, ggml_permute (ctx0, t10, 0, 2, 1, 3)); assert_shape_4d(t14, n_embd/n_head, N, n_head, n_batch); - use_buf(prs); struct ggml_tensor * t15 = expand(gf, ggml_permute (ctx0, t12, 0, 3, 1, 2)); assert_shape_4d(t15, N, n_embd/n_head, n_head, n_batch); - use_buf(prs); struct ggml_tensor * t16 = expand(gf, ggml_flash_attn (ctx0, t13, t14, t15, true)); assert_shape_4d(t16, n_embd/n_head, N, n_head, n_batch); - use_buf(tmp); struct ggml_tensor * t17 = expand(gf, ggml_permute (ctx0, t16, 0, 2, 1, 3)); assert_shape_4d(t17, n_embd/n_head, n_head, N, n_batch); - use_buf(prs); struct ggml_tensor * t18 = expand(gf, ggml_cont (ctx0, t17)); assert_shape_4d(t18, n_embd/n_head, n_head, N, n_batch); - use_buf(prs); struct ggml_tensor * t19 = expand(gf, ggml_reshape_2d (ctx0, t18, n_embd, N*n_batch)); assert_shape_2d(t19, n_embd, N*n_batch); - use_buf(tmp); struct ggml_tensor * t20 = expand(gf, ggml_mul_mat (ctx0, layer.wo, t19)); assert_shape_2d(t20, n_embd, N*n_batch); - use_buf(prs); struct ggml_tensor * t21 = expand(gf, ggml_add (ctx0, t20, cur)); assert_shape_2d(t21, n_embd, N*n_batch); - use_buf(prs); struct ggml_tensor * t22 = expand(gf, ggml_rms_norm (ctx0, t21, rms_norm_eps)); assert_shape_2d(t22, n_embd, N*n_batch); - use_buf(tmp); struct ggml_tensor * t23 = expand(gf, ggml_repeat (ctx0, layer.ffn_norm, t22)); assert_shape_2d(t23, n_embd, N*n_batch); - use_buf(prs); struct ggml_tensor * t24 = expand(gf, ggml_mul (ctx0, t23, t22)); assert_shape_2d(t24, n_embd, N*n_batch); - use_buf(prs); struct ggml_tensor * t25 = expand(gf, ggml_mul_mat (ctx0, layer.w3, t24)); assert_shape_2d(t25, n_ff, N*n_batch); - use_buf(prs); struct ggml_tensor 
* t26 = expand(gf, ggml_mul_mat (ctx0, layer.w1, t24)); assert_shape_2d(t26, n_ff, N*n_batch); - use_buf(prs); struct ggml_tensor * t27 = expand(gf, ggml_silu (ctx0, t26)); assert_shape_2d(t27, n_ff, N*n_batch); - use_buf(prs); struct ggml_tensor * t28 = expand(gf, ggml_mul (ctx0, t27, t25)); assert_shape_2d(t28, n_ff, N*n_batch); - use_buf(tmp); struct ggml_tensor * t29 = expand(gf, ggml_mul_mat (ctx0, layer.w2, t28)); assert_shape_2d(t29, n_embd, N*n_batch); - clr_buf( 1); - use_buf(nxt); struct ggml_tensor * t30 = expand(gf, ggml_add (ctx0, t21, t29)); assert_shape_2d(t30, n_embd, N*n_batch); - - // only t30L is remembered for checkpointing in first forward pass - if (is_checkpoint) { - t30L[il] = t30; - } - cur = t30; - } - clr_buf(0); - use_buf(0); - struct ggml_tensor * t31 = expand(gf, ggml_rms_norm (ctx0, cur, rms_norm_eps)); assert_shape_2d(t31, n_embd, N*n_batch); - struct ggml_tensor * t32 = expand(gf, ggml_repeat (ctx0, model->norm, t31)); assert_shape_2d(t32, n_embd, N*n_batch); - struct ggml_tensor * t33 = expand(gf, ggml_mul (ctx0, t32, t31)); assert_shape_2d(t33, n_embd, N*n_batch); - use_buf(-1); - struct ggml_tensor * t34 = expand(gf, ggml_mul_mat (ctx0, model->output, t33)); assert_shape_2d(t34, n_vocab, N*n_batch); - struct ggml_tensor * t35 = expand(gf, ggml_reshape_3d(ctx0, t34, n_vocab, N, n_batch)); assert_shape_3d(t35, n_vocab, N, n_batch); - struct ggml_tensor * t36 = expand(gf, ggml_cross_entropy_loss(ctx0, t35, targets)); assert_shape_1d(t36, 1); - - *gb = *gf; - - // t36->grad gets set to one by optimizer, so we need the tensor. - // initialize it with 1.0f to make sure. - use_buf(-1); - t36->grad = expand(gb, ggml_new_f32(ctx0, 1.0f)); - - use_buf(0); - t35->grad = expand(gb, ggml_cross_entropy_loss_back(ctx0, t35, targets, t36->grad)); assert_shape_3d(t35->grad, n_vocab, N, n_batch); - t34->grad = expand(gb, ggml_reshape_2d (ctx0, t35->grad, n_vocab, N*n_batch)); assert_shape_2d(t34->grad, n_vocab, N*n_batch); - t33->grad = expand(gb, ggml_out_prod (ctx0, model->output, ggml_transpose(ctx0, t34->grad))); assert_shape_2d(t33->grad, n_embd, N*n_batch); - t32->grad = expand(gb, ggml_mul (ctx0, t33->grad, t31)); assert_shape_2d(t32->grad, n_embd, N*n_batch); - - use_buf(-1); - - model->norm->grad = expand(gb, add_or_set(model->norm->grad, ggml_repeat_back(ctx0, t32->grad, model->norm))); assert_shape_1d(model->norm->grad, n_embd); - model->output->grad = expand(gb, add_or_set(model->output->grad, ggml_out_prod(ctx0, t33, t34->grad))); assert_shape_2d(model->output->grad, n_embd, n_vocab); - - clr_buf(1); - use_buf(1); - t31->grad = expand(gb, ggml_mul(ctx0, t33->grad, t32)); assert_shape_2d(t31->grad, n_embd, N*n_batch); - - struct ggml_tensor * back_layer_inp = t31; - struct ggml_tensor * grad_layer_inp = NULL; - - // printf("%s: n_check = %u\n", __func__, n_check); - chk_idx = n_check-1; - int avail_begin = n_layer; - int avail_end = n_layer; - // printf("%s: chk_idx=%d avail_begin=%d avail_end=%d\n", __func__, chk_idx, avail_begin, avail_end); - for (int k = 0; k < n_layer; ++k) { - // second forward pass for checkpointing - int il = n_layer-1-k; - if (il < avail_begin) { - // make sure, that txxL[il] is available - // forward pass from last checkpoint - GGML_ASSERT(chk_idx >= -1); - int begin = (chk_idx == -1) - ? 0 - : checkpoints[chk_idx] + 1; // checkpoint[chk_idx] contains t30 for computing following layers -> +1 - int end = (chk_idx+1 < n_check) - ? 
(checkpoints[chk_idx+1] + 1) - : n_layer; - GGML_ASSERT(begin <= il); - GGML_ASSERT(il < end); - cur = (chk_idx == -1) ? t01 : t30L[checkpoints[chk_idx]]; - clr_buf(2); - // printf("%s: second forward pass chk_idx=%d begin=%d end=%d\n", __func__, chk_idx, begin, end); - for (int i = begin; i < end; ++i) { - struct my_llama_layer & layer = model->layers[i]; - const int prs = 2; // persistent until next checkpoint - const int tmp = 0; // temporary for this layer - const bool is_checkpoint = (i == end-1); - clr_buf(0); - use_buf(prs); struct ggml_tensor * t02 = expand(gb, ggml_rms_norm (ctx0, cur, rms_norm_eps)); assert_shape_2d(t02, n_embd, N*n_batch); - use_buf(tmp); struct ggml_tensor * t03 = expand(gb, ggml_repeat (ctx0, layer.attention_norm, t02)); assert_shape_2d(t03, n_embd, N*n_batch); - use_buf(prs); struct ggml_tensor * t04 = expand(gb, ggml_mul (ctx0, t02, t03)); assert_shape_2d(t04, n_embd, N*n_batch); - use_buf(prs); struct ggml_tensor * t05 = expand(gb, ggml_mul_mat (ctx0, layer.wq, t04)); assert_shape_2d(t05, n_embd, N*n_batch); - use_buf(prs); struct ggml_tensor * t06 = expand(gb, ggml_reshape_4d (ctx0, t05, n_embd/n_head, n_head, N, n_batch)); assert_shape_4d(t06, n_embd/n_head, n_head, N, n_batch); - use_buf(prs); struct ggml_tensor * t07 = expand(gb, ggml_rope_inplace (ctx0, t06, n_past, n_rot, rope_mode, n_ctx)); assert_shape_4d(t07, n_embd/n_head, n_head, N, n_batch); - use_buf(prs); struct ggml_tensor * t08 = expand(gb, ggml_mul_mat (ctx0, layer.wk, t04)); assert_shape_2d(t08, n_embd, N*n_batch); - use_buf(prs); struct ggml_tensor * t09 = expand(gb, ggml_reshape_4d (ctx0, t08, n_embd/n_head, n_head, N, n_batch)); assert_shape_4d(t09, n_embd/n_head, n_head, N, n_batch); - use_buf(prs); struct ggml_tensor * t10 = expand(gb, ggml_rope_inplace (ctx0, t09, n_past, n_rot, rope_mode, n_ctx)); assert_shape_4d(t10, n_embd/n_head, n_head, N, n_batch); - use_buf(prs); struct ggml_tensor * t11 = expand(gb, ggml_mul_mat (ctx0, t04, layer.wv)); assert_shape_2d(t11, N*n_batch, n_embd); - use_buf(prs); struct ggml_tensor * t12 = expand(gb, ggml_reshape_4d (ctx0, t11, N, n_batch, n_embd/n_head, n_head)); assert_shape_4d(t12, N, n_batch, n_embd/n_head, n_head); - use_buf(prs); struct ggml_tensor * t13 = expand(gb, ggml_permute (ctx0, t07, 0, 2, 1, 3)); assert_shape_4d(t13, n_embd/n_head, N, n_head, n_batch); - use_buf(prs); struct ggml_tensor * t14 = expand(gb, ggml_permute (ctx0, t10, 0, 2, 1, 3)); assert_shape_4d(t14, n_embd/n_head, N, n_head, n_batch); - use_buf(prs); struct ggml_tensor * t15 = expand(gb, ggml_permute (ctx0, t12, 0, 3, 1, 2)); assert_shape_4d(t15, N, n_embd/n_head, n_head, n_batch); - use_buf(prs); struct ggml_tensor * t16 = expand(gb, ggml_flash_attn (ctx0, t13, t14, t15, true)); assert_shape_4d(t16, n_embd/n_head, N, n_head, n_batch); - use_buf(tmp); struct ggml_tensor * t17 = expand(gb, ggml_permute (ctx0, t16, 0, 2, 1, 3)); assert_shape_4d(t17, n_embd/n_head, n_head, N, n_batch); - use_buf(prs); struct ggml_tensor * t18 = expand(gb, ggml_cont (ctx0, t17)); assert_shape_4d(t18, n_embd/n_head, n_head, N, n_batch); - use_buf(prs); struct ggml_tensor * t19 = expand(gb, ggml_reshape_2d (ctx0, t18, n_embd, N*n_batch)); assert_shape_2d(t19, n_embd, N*n_batch); - use_buf(tmp); struct ggml_tensor * t20 = expand(gb, ggml_mul_mat (ctx0, layer.wo, t19)); assert_shape_2d(t20, n_embd, N*n_batch); - use_buf(prs); struct ggml_tensor * t21 = expand(gb, ggml_add (ctx0, t20, cur)); assert_shape_2d(t21, n_embd, N*n_batch); - use_buf(prs); struct ggml_tensor * t22 = expand(gb, 
ggml_rms_norm (ctx0, t21, rms_norm_eps)); assert_shape_2d(t22, n_embd, N*n_batch); - use_buf(tmp); struct ggml_tensor * t23 = expand(gb, ggml_repeat (ctx0, layer.ffn_norm, t22)); assert_shape_2d(t23, n_embd, N*n_batch); - use_buf(prs); struct ggml_tensor * t24 = expand(gb, ggml_mul (ctx0, t23, t22)); assert_shape_2d(t24, n_embd, N*n_batch); - use_buf(prs); struct ggml_tensor * t25 = expand(gb, ggml_mul_mat (ctx0, layer.w3, t24)); assert_shape_2d(t25, n_ff, N*n_batch); - use_buf(prs); struct ggml_tensor * t26 = expand(gb, ggml_mul_mat (ctx0, layer.w1, t24)); assert_shape_2d(t26, n_ff, N*n_batch); - use_buf(prs); struct ggml_tensor * t27 = expand(gb, ggml_silu (ctx0, t26)); assert_shape_2d(t27, n_ff, N*n_batch); - use_buf(prs); struct ggml_tensor * t28 = expand(gb, ggml_mul (ctx0, t27, t25)); assert_shape_2d(t28, n_ff, N*n_batch); - use_buf(tmp); struct ggml_tensor * t29 = expand(gb, ggml_mul_mat (ctx0, layer.w2, t28)); assert_shape_2d(t29, n_embd, N*n_batch); - if (t30L[i] == NULL) { - use_buf(prs); struct ggml_tensor * t30 = expand(gb, ggml_add (ctx0, t21, t29)); assert_shape_2d(t30, n_embd, N*n_batch); - t30L[i] = t30; - cur = t30; - } - t02L[i] = t02; - t03L[i] = t03; - t04L[i] = t04; - t05L[i] = t05; - t06L[i] = t06; - t07L[i] = t07; - t08L[i] = t08; - t09L[i] = t09; - t10L[i] = t10; - t11L[i] = t11; - t12L[i] = t12; - t13L[i] = t13; - t14L[i] = t14; - t15L[i] = t15; - t16L[i] = t16; - t17L[i] = t17; - t18L[i] = t18; - t19L[i] = t19; - t20L[i] = t20; - t21L[i] = t21; - t22L[i] = t22; - t23L[i] = t23; - t24L[i] = t24; - t25L[i] = t25; - t26L[i] = t26; - t27L[i] = t27; - t28L[i] = t28; - t29L[i] = t29; - } - --chk_idx; - avail_begin = begin; - avail_end = end; - // printf("%s: chk_idx=%d avail_begin=%d avail_end=%d\n", __func__, chk_idx, avail_begin, avail_end); - } - // printf("%s: backward pass il=%d\n", __func__, il); - - struct my_llama_layer & layer = model->layers[il]; - - struct ggml_tensor * t02 = t02L[il]; - struct ggml_tensor * t03 = t03L[il]; - struct ggml_tensor * t04 = t04L[il]; - struct ggml_tensor * t05 = t05L[il]; - struct ggml_tensor * t06 = t06L[il]; - struct ggml_tensor * t07 = t07L[il]; - struct ggml_tensor * t08 = t08L[il]; - struct ggml_tensor * t09 = t09L[il]; - struct ggml_tensor * t10 = t10L[il]; - struct ggml_tensor * t11 = t11L[il]; - struct ggml_tensor * t12 = t12L[il]; - struct ggml_tensor * t13 = t13L[il]; - struct ggml_tensor * t14 = t14L[il]; - struct ggml_tensor * t15 = t15L[il]; - struct ggml_tensor * t16 = t16L[il]; - struct ggml_tensor * t17 = t17L[il]; - struct ggml_tensor * t18 = t18L[il]; - struct ggml_tensor * t19 = t19L[il]; - struct ggml_tensor * t20 = t20L[il]; - struct ggml_tensor * t21 = t21L[il]; - struct ggml_tensor * t22 = t22L[il]; - struct ggml_tensor * t23 = t23L[il]; - struct ggml_tensor * t24 = t24L[il]; - struct ggml_tensor * t25 = t25L[il]; - struct ggml_tensor * t26 = t26L[il]; - struct ggml_tensor * t27 = t27L[il]; - struct ggml_tensor * t28 = t28L[il]; - struct ggml_tensor * t29 = t29L[il]; - struct ggml_tensor * t30 = t30L[il]; - - clr_buf(0); - use_buf(0); - t30->grad = expand(gb, ggml_rms_norm_back(ctx0, t30, back_layer_inp->grad, rms_norm_eps)); assert_shape_2d(t30->grad, n_embd, N*n_batch); - if (grad_layer_inp) { - t30->grad = expand(gb, ggml_add(ctx0, t30->grad, grad_layer_inp->grad)); assert_shape_2d(t30->grad, n_embd, N*n_batch); - } - clr_buf(1); - t29->grad = t30->grad; assert_shape_2d(t29->grad, n_embd, N*n_batch); - t28->grad = expand(gb, ggml_out_prod(ctx0, layer.w2, ggml_transpose(ctx0, t29->grad))); 
assert_shape_2d(t28->grad, n_ff, N*n_batch); - t27->grad = expand(gb, ggml_mul(ctx0, t28->grad, t25)); assert_shape_2d(t27->grad, n_ff, N*n_batch); - t26->grad = expand(gb, ggml_silu_back(ctx0, t26, t27->grad)); assert_shape_2d(t26->grad, n_ff, N*n_batch); - t25->grad = expand(gb, ggml_mul(ctx0, t28->grad, t27)); assert_shape_2d(t25->grad, n_ff, N*n_batch); - t24->grad = expand(gb, ggml_add_inplace(ctx0, - ggml_out_prod(ctx0, layer.w1, ggml_transpose(ctx0, t26->grad)), - ggml_out_prod(ctx0, layer.w3, ggml_transpose(ctx0, t25->grad)))); assert_shape_2d(t24->grad, n_embd, N*n_batch); - t23->grad = expand(gb, ggml_mul(ctx0, t24->grad, t22)); assert_shape_2d(t23->grad, n_embd, N*n_batch); - t22->grad = expand(gb, ggml_mul(ctx0, t24->grad, ggml_repeat(ctx0, layer.ffn_norm, t24->grad))); assert_shape_2d(t22->grad, n_embd, N*n_batch); - use_buf(1); - t21->grad = expand(gb, ggml_add(ctx0, t30->grad, ggml_rms_norm_back(ctx0, t21, t22->grad, rms_norm_eps))); assert_shape_2d(t21->grad, n_embd, N*n_batch); - grad_layer_inp = t21; - use_buf(0); - t20->grad = t21->grad; assert_shape_2d(t20->grad, n_embd, N*n_batch); - t19->grad = expand(gb, ggml_out_prod(ctx0, layer.wo, ggml_transpose(ctx0, t20->grad))); assert_shape_2d(t19->grad, n_embd, N*n_batch); - t18->grad = expand(gb, ggml_reshape_4d(ctx0, t19->grad, n_embd/n_head, n_head, N, n_batch)); assert_shape_4d(t18->grad, n_embd/n_head, n_head, N, n_batch); - t17->grad = t18->grad; assert_shape_4d(t17->grad, n_embd/n_head, n_head, N, n_batch); - t16->grad = expand(gb, ggml_permute(ctx0, t17->grad, 0, 2, 1, 3)); assert_shape_4d(t16->grad, n_embd/n_head, N, n_head, n_batch); - struct ggml_tensor * flash_attn = expand(gb, ggml_flash_attn_back(ctx0, t13, t14, t15, t16->grad, true)); assert_shape_4d(flash_attn, n_embd/n_head, N*3, n_head, n_batch); - t15->grad = expand(gb, view__v(flash_attn)); assert_shape_4d(t15->grad, N, n_embd/n_head, n_head, n_batch); - t14->grad = expand(gb, view__k(flash_attn)); assert_shape_4d(t14->grad, n_embd/n_head, N, n_head, n_batch); - t13->grad = expand(gb, view__q(flash_attn)); assert_shape_4d(t13->grad, n_embd/n_head, N, n_head, n_batch); - t12->grad = expand(gb, ggml_permute(ctx0, t15->grad, 0, 2, 3, 1)); assert_shape_4d(t12->grad, N, n_batch, n_embd/n_head, n_head); - t11->grad = expand(gb, ggml_reshape_2d(ctx0, ggml_cont(ctx0, t12->grad), N*n_batch, n_embd)); assert_shape_2d(t11->grad, N*n_batch, n_embd); - t10->grad = expand(gb, ggml_permute(ctx0, t14->grad, 0, 2, 1, 3)); assert_shape_4d(t10->grad, n_embd/n_head, n_head, N, n_batch); - t09->grad = expand(gb, ggml_rope_back(ctx0, t10->grad, n_past, n_rot, rope_mode, n_ctx)); assert_shape_4d(t09->grad, n_embd/n_head, n_head, N, n_batch); - t08->grad = expand(gb, ggml_reshape_2d(ctx0, t09->grad, n_embd, N*n_batch)); assert_shape_2d(t08->grad, n_embd, N*n_batch); - t07->grad = expand(gb, ggml_permute(ctx0, t13->grad, 0, 2, 1, 3)); assert_shape_4d(t07->grad, n_embd/n_head, n_head, N, n_batch); - t06->grad = expand(gb, ggml_rope_back(ctx0, t07->grad, n_past, n_rot, rope_mode, n_ctx)); assert_shape_4d(t06->grad, n_embd/n_head, n_head, N, n_batch); - t05->grad = expand(gb, ggml_reshape_2d(ctx0, t06->grad, n_embd, N*n_batch)); assert_shape_2d(t05->grad, n_embd, N*n_batch); - t04->grad = expand(gb, ggml_add_inplace(ctx0, - ggml_add_inplace(ctx0, - ggml_out_prod(ctx0, layer.wv, t11->grad), - ggml_out_prod(ctx0, layer.wk, ggml_transpose(ctx0, t08->grad))), - ggml_out_prod(ctx0, layer.wq, ggml_transpose(ctx0, t05->grad)))); assert_shape_2d(t04->grad, n_embd, N*n_batch); - t03->grad = 
expand(gb, ggml_mul(ctx0, t04->grad, t02)); assert_shape_2d(t04->grad, n_embd, N*n_batch); - use_buf(1); - t02->grad = expand(gb, ggml_mul(ctx0, t04->grad, ggml_repeat(ctx0, layer.attention_norm, t02))); assert_shape_2d(t02->grad, n_embd, N*n_batch); - back_layer_inp = t02; - - use_buf(-1); - layer.attention_norm->grad = expand(gb, add_or_set(layer.attention_norm->grad, ggml_repeat_back(ctx0, t03->grad, layer.attention_norm))); assert_shape_1d(layer.attention_norm->grad, n_embd); - layer.wq->grad = expand(gb, add_or_set(layer.wq->grad, ggml_out_prod(ctx0, t04, t05->grad))); assert_shape_2d(layer.wq->grad, n_embd, n_embd); - layer.wk->grad = expand(gb, add_or_set(layer.wk->grad, ggml_out_prod(ctx0, t04, t08->grad))); assert_shape_2d(layer.wk->grad, n_embd, n_embd); - layer.wv->grad = expand(gb, add_or_set(layer.wv->grad, ggml_out_prod(ctx0, t04, ggml_transpose(ctx0, t11->grad)))); assert_shape_2d(layer.wv->grad, n_embd, n_embd); - layer.wo->grad = expand(gb, add_or_set(layer.wo->grad, ggml_out_prod(ctx0, t19, t20->grad))); assert_shape_2d(layer.wo->grad, n_embd, n_embd); - layer.ffn_norm->grad = expand(gb, add_or_set(layer.ffn_norm->grad, ggml_repeat_back(ctx0, t23->grad, layer.ffn_norm))); assert_shape_1d(layer.ffn_norm->grad, n_embd); - layer.w1->grad = expand(gb, add_or_set(layer.w1->grad, ggml_out_prod(ctx0, t24, t26->grad))); assert_shape_2d(layer.w1->grad, n_embd, n_ff); - layer.w2->grad = expand(gb, add_or_set(layer.w2->grad, ggml_out_prod(ctx0, t28, t29->grad))); assert_shape_2d(layer.w2->grad, n_ff, n_embd); - layer.w3->grad = expand(gb, add_or_set(layer.w3->grad, ggml_out_prod(ctx0, t24, t25->grad))); assert_shape_2d(layer.w3->grad, n_embd, n_ff); - } - // printf("%s: chk_idx=%d avail_begin=%d avail_end=%d\n", __func__, chk_idx, avail_begin, avail_end); - GGML_ASSERT(n_check == 0 || chk_idx == -2); - GGML_ASSERT(avail_begin == 0); - clr_buf(0); - use_buf(0); - t01->grad = expand(gb, ggml_add_inplace(ctx0, grad_layer_inp->grad, ggml_rms_norm_back(ctx0, t01, back_layer_inp->grad, rms_norm_eps))); assert_shape_2d(t01->grad, n_embd, N*n_batch); - use_buf(-1); - model->tok_embeddings->grad = expand(gb, ggml_get_rows_back(ctx0, t01->grad, t00, model->tok_embeddings)); assert_shape_2d(model->tok_embeddings->grad, n_embd, n_vocab); - - *logits = t35; - - clr_buf(0); - clr_buf(1); - clr_buf(2); - - if (track_max_mem) { - printf("%s: max size compute buf0: %zu\n", __func__, buf_maxs[0]); - printf("%s: max size compute buf1: %zu\n", __func__, buf_maxs[1]); - printf("%s: max size compute buf2: %zu\n", __func__, buf_maxs[2]); - } - - // now that all grads are created, set the graph leafs and grads - graph_set_leafs_grads(gf); - graph_set_leafs_grads(gb); - - return t36; -} - void set_f32_3d(struct ggml_tensor * tensor, int64_t i0, int64_t i1, int64_t i2, float value) { float * ptr = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2]); *ptr = value; @@ -4485,44 +2945,14 @@ int main(int argc, char ** argv) { struct ggml_tensor * loss = NULL; struct ggml_tensor * logits = NULL; - if (params.use_alloc || params.use_unified) { - loss = llama_build_train_graphs( - &model, alloc, ctx0, - gf, gb, gb_tmp, - &logits, tokens_input, target_probs, - n_tokens, n_batch, - params.use_flash, - params.use_checkpointing - ); - } else if (params.use_checkpointing) { - loss = forward_batch_wo_cache_flash_attn_train_grad_checkpointing( - &model, ctx0, - gf, gb, - &logits, tokens_input, target_probs, - compute_buf_0, compute_buf_1, compute_buf_2, - size_buf_0, size_buf_1, 
size_buf_2, - n_tokens, n_batch); - } else if (params.use_scratch) { - loss = forward_batch_wo_cache_flash_attn_train( - &model, ctx0, - gf, gb, - &logits, tokens_input, target_probs, - compute_buf_0, compute_buf_1, - size_buf_0, size_buf_1, - n_tokens, n_batch); - } else if (params.use_flash) { - logits = forward_batch_wo_cache_flash_attn(&model, ctx0, gf, tokens_input, n_tokens, n_batch); - loss = cross_entropy_loss(ctx0, logits, target_probs); - ggml_build_forward_expand(gf, loss); - *gb = *gf; - ggml_build_backward_expand(ctx0, gf, gb, true); - } else { - logits = forward_batch_wo_cache(&model, ctx0, gf, tokens_input, n_tokens, n_batch); - loss = cross_entropy_loss(ctx0, logits, target_probs); - ggml_build_forward_expand(gf, loss); - *gb = *gf; - ggml_build_backward_expand(ctx0, gf, gb, true); - } + loss = llama_build_train_graphs( + &model, alloc, ctx0, + gf, gb, gb_tmp, + &logits, tokens_input, target_probs, + n_tokens, n_batch, + params.use_flash, + params.use_checkpointing + ); size_t used_mem_before_opt = ggml_used_mem(ctx0); From 271e4d64b56fc4cb10535f473b5ee65ebe063441 Mon Sep 17 00:00:00 2001 From: xaedes Date: Mon, 14 Aug 2023 18:31:59 +0200 Subject: [PATCH 056/100] remove unused training parameters "use_scratch" and "use_unified" --- .../train-text-from-scratch.cpp | 24 ++++--------------- 1 file changed, 4 insertions(+), 20 deletions(-) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index 9d94bdfcf6984..b6d6db4b80df1 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -2189,10 +2189,8 @@ struct train_params { bool samples_start_after_nl; bool use_adam; bool use_flash; - bool use_scratch; bool use_checkpointing; bool use_alloc; - bool use_unified; // only adam int warmup; @@ -2252,10 +2250,8 @@ struct train_params get_default_train_params() { params.samples_start_after_nl = false; params.use_adam = true; params.use_flash = true; - params.use_scratch = true; params.use_checkpointing = true; params.use_alloc = true; - params.use_unified = true; params.opt_past = 0; params.opt_delta = 1e-5f; @@ -2313,16 +2309,12 @@ void train_print_usage(int /*argc*/, char ** argv, const struct train_params * p fprintf(stderr, " --samples-after-nl Training samples start after newlines. (default %s)\n", params->samples_start_after_nl ? "on" : "off"); fprintf(stderr, " --use-lbfgs Use LBFGS optimizer instead of default Adam\n"); fprintf(stderr, " --use-adam Use Adam optimizer (default)\n"); - fprintf(stderr, " --no-flash Don't use flash attention. Implies no-scratch and no-checkpointing.\n"); + fprintf(stderr, " --no-flash Don't use flash attention \n"); fprintf(stderr, " --use-flash Use flash attention (default)\n"); - fprintf(stderr, " --no-scratch Don't use scratch buffers. Implies no-checkpointing.\n"); - fprintf(stderr, " --use-scratch Use scratch buffers. Implies use-flash. (default)\n"); fprintf(stderr, " --no-checkpointing Don't use gradient checkpointing\n"); - fprintf(stderr, " --use-checkpointing Use gradient checkpointing. Implies use-scratch and use-flash. (default)\n"); + fprintf(stderr, " --use-checkpointing Use gradient checkpointing (default)\n"); fprintf(stderr, " --no-alloc Don't use allocator\n"); - fprintf(stderr, " --use-alloc Use allocator. Implies use-unified. (default)\n"); - fprintf(stderr, " --no-unified Don't use unified\n"); - fprintf(stderr, " --use-unified Use unified. 
(default)\n"); + fprintf(stderr, " --use-alloc Use allocator (default)\n"); fprintf(stderr, " --warmup N Only for Adam optimizer. Number of warmup steps (default %d)\n", params->warmup); fprintf(stderr, " --cos-decay-steps N Only for Adam optimizer. Number of cosine decay steps (default %d)\n", params->cos_decay_steps); fprintf(stderr, " --cos-decay-restart N Only for Adam optimizer. Increase of cosine decay steps after restart (default %f)\n", params->cos_decay_restart); @@ -2480,10 +2472,6 @@ bool train_params_parse(int argc, char ** argv, struct train_params * params) { params->use_flash = false; } else if (arg == "--use-flash") { params->use_flash = true; - } else if (arg == "--no-scratch") { - params->use_scratch = false; - } else if (arg == "--use-scratch") { - params->use_scratch = true; } else if (arg == "--no-checkpointing") { params->use_checkpointing = false; } else if (arg == "--use-checkpointing") { @@ -2492,10 +2480,6 @@ bool train_params_parse(int argc, char ** argv, struct train_params * params) { params->use_alloc = false; } else if (arg == "--use-alloc") { params->use_alloc = true; - } else if (arg == "--no-unified") { - params->use_unified = false; - } else if (arg == "--use-unified") { - params->use_unified = true; } else if (arg == "--warmup") { if (++i >= argc) { invalid_param = true; @@ -2936,7 +2920,7 @@ int main(int argc, char ** argv) { struct ggml_cgraph * gf = ggml_new_graph(ctx0); struct ggml_cgraph * gb = ggml_new_graph(ctx0); - struct ggml_cgraph * gb_tmp = (params.use_unified || params.use_alloc) + struct ggml_cgraph * gb_tmp = params.use_alloc ? ggml_new_graph(ctx0) : NULL; From 6f161c784b94c0ab6ae84f96257983378abe1a70 Mon Sep 17 00:00:00 2001 From: xaedes Date: Mon, 14 Aug 2023 18:33:27 +0200 Subject: [PATCH 057/100] remove trailing whitespace --- examples/train-text-from-scratch/train-text-from-scratch.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index b6d6db4b80df1..5d4a1c2c25af1 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -2934,7 +2934,7 @@ int main(int argc, char ** argv) { gf, gb, gb_tmp, &logits, tokens_input, target_probs, n_tokens, n_batch, - params.use_flash, + params.use_flash, params.use_checkpointing ); From 3794dceb7f164e6688f4895448cbfdee04164d20 Mon Sep 17 00:00:00 2001 From: xaedes Date: Mon, 14 Aug 2023 18:44:42 +0200 Subject: [PATCH 058/100] remove unused train params: mem_compute1_gb & mem_compute2_gb mem_compute_gb is used for compute when automatic memory allocator is not enabled, otherwise it can be very small to only hold the tensor definitions mem_compute0_gb is used for automatic memory allocator (as long as measurement of max required size is not implemented) --- .../train-text-from-scratch.cpp | 26 +------------------ 1 file changed, 1 insertion(+), 25 deletions(-) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index 5d4a1c2c25af1..e7b43bf9ac036 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -2217,9 +2217,6 @@ struct train_params { int mem_model_gb; int mem_compute_gb; int mem_compute0_gb; - int mem_compute1_gb; - int mem_compute2_gb; - int mem_compute3_gb; }; struct train_params 
get_default_train_params() { @@ -2278,8 +2275,6 @@ struct train_params get_default_train_params() { params.mem_model_gb = 2; params.mem_compute_gb = 24; params.mem_compute0_gb = 8; - params.mem_compute1_gb = 1; - params.mem_compute2_gb = 2; return params; } @@ -2336,9 +2331,7 @@ void train_print_usage(int /*argc*/, char ** argv, const struct train_params * p fprintf(stderr, " --lbfgs-iter N Maximum number of LBFGS optimization iterations for each batch (default %d)\n", params->lbfgs_n_iter); fprintf(stderr, " --mem-model N Memory to allocate for model and cache in gigabytes. (default %d)\n", params->mem_model_gb); fprintf(stderr, " --mem-compute N Memory to allocate for compute in gigabytes. (default %d)\n", params->mem_compute_gb); - fprintf(stderr, " --mem-compute0 N Memory to allocate for compute in gigabytes. (default %d)\n", params->mem_compute0_gb); - fprintf(stderr, " --mem-compute1 N Memory to allocate for compute in gigabytes. (default %d)\n", params->mem_compute1_gb); - fprintf(stderr, " --mem-compute2 N Memory to allocate for compute in gigabytes. (default %d)\n", params->mem_compute2_gb); + fprintf(stderr, " --mem-compute0 N Memory to allocate for automatic memory allocator in gigabytes. (default %d)\n", params->mem_compute0_gb); fprintf(stderr, "\n"); } @@ -2604,18 +2597,6 @@ bool train_params_parse(int argc, char ** argv, struct train_params * params) { break; } params->mem_compute0_gb = std::stoi(argv[i]); - } else if (arg == "--mem-compute1") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->mem_compute1_gb = std::stoi(argv[i]); - } else if (arg == "--mem-compute2") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->mem_compute2_gb = std::stoi(argv[i]); } else if (arg == "-h" || arg == "--help") { train_print_usage(argc, argv, &default_params); exit(0); @@ -2839,11 +2820,7 @@ int main(int argc, char ** argv) { uint8_t * compute_addr = new uint8_t[compute_size]; size_t size_buf_0 = 1024ll*1024ll*1024ll*((size_t) params.mem_compute0_gb); - size_t size_buf_1 = 1024ll*1024ll*1024ll*((size_t) params.mem_compute1_gb); - size_t size_buf_2 = 1024ll*1024ll*1024ll*((size_t) params.mem_compute2_gb); uint8_t * compute_buf_0 = new uint8_t[size_buf_0]; - uint8_t * compute_buf_1 = new uint8_t[size_buf_1]; - uint8_t * compute_buf_2 = new uint8_t[size_buf_2]; ggml_allocr * alloc = NULL; if (params.use_alloc) { @@ -3090,7 +3067,6 @@ int main(int argc, char ** argv) { delete[] compute_addr; delete[] compute_buf_0; - delete[] compute_buf_1; ggml_free(model.ctx); llama_free(lctx); llama_free_model(lmodel); From 6e280b24dcf5d751e12b8a0b3ced7dc41c589e2d Mon Sep 17 00:00:00 2001 From: xaedes Date: Mon, 14 Aug 2023 19:02:12 +0200 Subject: [PATCH 059/100] remove unused forward_batch function --- .../train-text-from-scratch.cpp | 290 ------------------ 1 file changed, 290 deletions(-) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index e7b43bf9ac036..94a2a766d1c8b 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -656,296 +656,6 @@ void assert_shape_4d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int6 GGML_ASSERT(tensor->ne[3] == ne3); } -struct ggml_tensor * forward_batch( - struct my_llama_model * model, - struct my_llama_kv_cache * cache, - struct ggml_context * ctx0, - struct ggml_cgraph * gf, - struct ggml_tensor * tokens_input, - const int n_tokens, - const int 
n_past, - const int n_batch) { - - const int N = n_tokens; - - struct my_llama_kv_cache& kv_self = *cache; - const auto & hparams = model->hparams; - const int n_ctx = hparams.n_ctx; - const int n_vocab = hparams.n_vocab; - const int n_embd = hparams.n_embd; - const int n_layer = hparams.n_layer; - const int n_head = hparams.n_head; - const int n_rot = hparams.n_rot; - const int n_ff = get_n_ff(&hparams); - - struct ggml_tensor * tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N*n_batch); - memcpy(tokens->data, tokens_input->data, ggml_element_size(tokens)*N*n_batch); - - struct ggml_tensor * kc = kv_self.k; - struct ggml_tensor * vc = kv_self.v; - - // inpL shape [n_embd,N*n_batch,1] - struct ggml_tensor * inpL = ggml_get_rows(ctx0, model->tok_embeddings, tokens); - assert_shape_2d(inpL, n_embd, N*n_batch); - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * inpSA = inpL; - - struct ggml_tensor * cur; - - // lctx.use_buf(ctx0, 0); - - // norm - { - // cur shape [n_embd,N*n_batch,1,1] - cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps); - assert_shape_2d(cur, n_embd, N*n_batch); - - // cur = attention_norm*cur - cur = ggml_mul(ctx0, - ggml_repeat(ctx0, model->layers[il].attention_norm, cur), - cur); - assert_shape_2d(cur, n_embd, N*n_batch); - } - - // self-attention - { - // compute Q and K and RoPE them - // wq shape [n_embd, n_embd, 1, 1] - // wk shape [n_embd, n_embd, 1, 1] - // Qcur shape [n_embd/n_head, n_head, N, n_batch] - // Kcur shape [n_embd/n_head, n_head, N, n_batch] - struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0, n_ctx); - struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0, n_ctx); - assert_shape_4d(Qcur, n_embd/n_head, n_head, N, n_batch); - assert_shape_4d(Kcur, n_embd/n_head, n_head, N, n_batch); - - // store key and value to memory - { - // compute the transposed [N, n_embd] V matrix - // wv shape [n_embd, n_embd, 1, 1] - // Vcur shape [N, n_embd, n_batch, 1] - struct ggml_tensor * Vcur = ggml_cont(ctx0, - ggml_permute(ctx0, - ggml_reshape_3d(ctx0, - ggml_mul_mat(ctx0, - model->layers[il].wv, - cur), - n_embd, N, n_batch), - 1, 0, 2, 3)); - assert_shape_3d(Vcur, N, n_embd, n_batch); - - // kv_self.k shape [n_embd * n_ctx * n_batch * n_layer] - // kv_self.v shape [n_ctx * n_embd * n_batch * n_layer] - // k shape [n_embd * N, n_batch] == kv_self.k[:,n_past:n_past+N,:,il] - // v shape [N, n_embd, n_batch, 1] == kv_self.v[:,n_past:n_past+N,:,il] - - /* { - struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past)); - struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd, - ( n_ctx)*ggml_element_size(kv_self.v), - (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v)); - - // important: storing RoPE-ed version of K in the KV cache! 
- ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); - ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); - } //*/ - - kc = ggml_set_2d_inplace(ctx0, kc, - ggml_reshape_2d(ctx0, Kcur, n_embd*N, n_batch), - ggml_element_size(kc)*n_embd*n_ctx, - (ggml_element_size(kc)*n_embd)*(il*n_batch*n_ctx + n_past)); - vc = ggml_set_2d_inplace(ctx0, vc, - ggml_reshape_2d(ctx0, Vcur, N*n_embd, n_batch), - ggml_element_size(vc)*n_ctx*n_embd, - ggml_element_size(vc)*(n_past + il*n_embd*n_batch*n_ctx)); - - assert_shape_1d(kc, n_embd * n_ctx * n_batch * n_layer); - assert_shape_1d(vc, n_embd * n_ctx * n_batch * n_layer); - } - - // Qcur shape [n_embd/n_head, n_head, N, n_batch] - // Q shape [n_embd/n_head, N, n_head, n_batch] - struct ggml_tensor * Q = - ggml_permute(ctx0, - Qcur, - 0, 2, 1, 3); - assert_shape_4d(Q, n_embd/n_head, N, n_head, n_batch); - - // kv_self.k shape [n_embd * n_ctx * n_batch * n_layer] - // K shape [n_embd/n_head, n_past + N, n_head, n_batch] - struct ggml_tensor * K = - ggml_permute(ctx0, - ggml_reshape_4d(ctx0, - ggml_view_3d(ctx0, - kc, - n_embd, - (n_past + N), - n_batch, - n_embd*ggml_element_size(kc), - n_ctx*n_embd*ggml_element_size(kc), - il*n_batch*n_ctx*n_embd*ggml_element_size(kc)), - n_embd/n_head, n_head, n_past + N, n_batch), - 0, 2, 1, 3); - assert_shape_4d(K, n_embd/n_head, n_past + N, n_head, n_batch); - - // K * Q - // KQ shape [n_past + N, N, n_head, n_batch] - struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); - assert_shape_4d(KQ, n_past + N, N, n_head, n_batch); - - // KQ_scaled = KQ / sqrt(n_embd/n_head) - // KQ_scaled shape [n_past + N, N, n_head, n_batch] - struct ggml_tensor * KQ_scaled = - ggml_scale_inplace(ctx0, - KQ, - ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head))); - assert_shape_4d(KQ_scaled, n_past + N, N, n_head, n_batch); - - // KQ_masked = mask_past(KQ_scaled) - // KQ_masked shape [n_past + N, N, n_head, n_batch] - struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past); - assert_shape_4d(KQ_masked, n_past + N, N, n_head, n_batch); - - // KQ = soft_max(KQ_masked) - // KQ_soft_max shape [n_past + N, N, n_head, n_batch] - struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked); - assert_shape_4d(KQ_soft_max, n_past + N, N, n_head, n_batch); - - // split cached V into n_head heads - // kv_self.v shape [n_ctx * n_embd * n_batch * n_layer] - // V shape [n_past + N, n_embd/n_head, n_head, n_batch] == kv_self.v[:(n_past+N),:,:,il] - struct ggml_tensor * V = - ggml_view_4d(ctx0, vc, - n_past + N, n_embd/n_head, n_head, n_batch, - ggml_element_size(vc)*n_ctx, - ggml_element_size(vc)*n_ctx*n_embd/n_head, - ggml_element_size(vc)*n_ctx*n_embd, - il*n_batch*n_ctx*n_embd*ggml_element_size(vc)); - assert_shape_4d(V, n_past + N, n_embd/n_head, n_head, n_batch); - - // KQV shape [n_embd/n_head, N, n_head, n_batch] - struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); - assert_shape_4d(KQV, n_embd/n_head, N, n_head, n_batch); - - // KQV_merged = KQV.permute(0, 2, 1, 3) - // KQV_merged shape [n_embd/n_head, n_head, N, n_batch] - struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - assert_shape_4d(KQV_merged, n_embd/n_head, n_head, N, n_batch); - // KQV_merged shape - - // cur = KQV_merged.contiguous().view(n_embd, N) - // cur shape [n_embd,N*n_batch,1,1] - cur = ggml_reshape_2d(ctx0, ggml_cont(ctx0, KQV_merged), n_embd, N*n_batch); - assert_shape_2d(cur, n_embd, N*n_batch); - // cur = ggml_cpy(ctx0, - // KQV_merged, - // ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, 
N)); - - // projection (no bias) - // cur shape [n_embd,N*n_batch,1,1] - cur = ggml_mul_mat(ctx0, - model->layers[il].wo, - cur); - assert_shape_2d(cur, n_embd, N*n_batch); - } - - // lctx.use_buf(ctx0, 1); - - // inpFF shape [n_embd,N*n_batch,1,1] - struct ggml_tensor * inpFF = ggml_add_inplace(ctx0, cur, inpSA); - assert_shape_2d(inpFF, n_embd, N*n_batch); - - // feed-forward network - { - // norm - { - // cur shape [n_embd,N*n_batch,1,1] - cur = ggml_rms_norm(ctx0, inpFF, rms_norm_eps); - assert_shape_2d(cur, n_embd, N*n_batch); - - // cur = ffn_norm*cur - // cur shape [n_embd,N*n_batch,1,1] - cur = ggml_mul(ctx0, - ggml_repeat(ctx0, model->layers[il].ffn_norm, cur), - cur); - assert_shape_2d(cur, n_embd, N*n_batch); - } - - // tmp shape [n_ff,N*n_batch,1,1] - struct ggml_tensor * tmp = ggml_mul_mat(ctx0, - model->layers[il].w3, - cur); - assert_shape_2d(tmp, n_ff, N*n_batch); - - // cur shape [n_ff,N*n_batch,1,1] - cur = ggml_mul_mat(ctx0, - model->layers[il].w1, - cur); - assert_shape_2d(cur, n_ff, N*n_batch); - - // SILU activation - // cur shape [n_ff,N*n_batch,1,1] - cur = ggml_silu(ctx0, cur); - assert_shape_2d(cur, n_ff, N*n_batch); - - // cur shape [n_ff,N*n_batch,1,1] - cur = ggml_mul(ctx0, cur, tmp); - assert_shape_2d(cur, n_ff, N*n_batch); - - // cur shape [n_embd,N*n_batch,1,1] - cur = ggml_mul_mat(ctx0, - model->layers[il].w2, - cur); - assert_shape_2d(cur, n_embd, N*n_batch); - } - - // cur shape [n_embd,N*n_batch,1,1] - cur = ggml_add_inplace(ctx0, cur, inpFF); - assert_shape_2d(cur, n_embd, N*n_batch); - - // input for next layer - // inpL shape [n_embd,N*n_batch,1,1] - inpL = cur; - assert_shape_2d(inpL, n_embd, N*n_batch); - } - - // norm - { - - // inpL shape [n_embd,N*n_batch,1,1] - inpL = ggml_rms_norm(ctx0, inpL, rms_norm_eps); - assert_shape_2d(inpL, n_embd, N*n_batch); - - // inpL = norm*inpL - // inpL shape [n_embd,N*n_batch,1,1] - inpL = ggml_mul(ctx0, - ggml_repeat(ctx0, model->norm, inpL), - inpL); - - assert_shape_2d(inpL, n_embd, N*n_batch); - - //embeddings = inpL; - } - - // lm_head - // inpL shape [n_vocab,N*n_batch,1,1] - inpL = ggml_mul_mat(ctx0, model->output, inpL); - assert_shape_2d(inpL, n_vocab, N*n_batch); - - { - // inpL shape [n_vocab,N,n_batch,1] - inpL = ggml_reshape_3d(ctx0, - inpL, - n_vocab, N, n_batch); - assert_shape_3d(inpL, n_vocab, N, n_batch); - } - - // run the computation - ggml_build_forward_expand(gf, inpL); - - return inpL; -} - static size_t hash(void * p) { return (size_t)p % GGML_GRAPH_HASHTABLE_SIZE; } From faf3e21eaf8ac4667a233fc7d03e8cb29477183a Mon Sep 17 00:00:00 2001 From: xaedes Date: Mon, 14 Aug 2023 20:50:09 +0200 Subject: [PATCH 060/100] add debug asserts in ggml_allocr_alloc to some common pitfalls when using this function directly --- ggml-alloc.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/ggml-alloc.c b/ggml-alloc.c index 5e1be61ff6cef..ddf973daec7e4 100644 --- a/ggml-alloc.c +++ b/ggml-alloc.c @@ -104,6 +104,10 @@ static size_t ggml_allocator_get_alloc_size(struct ggml_allocr * alloc, struct g } void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor) { +#ifdef GGML_ALLOCATOR_DEBUG + GGML_ASSERT(ggml_is_view(tensor) == false); // views generally get data pointer from one of their sources + GGML_ASSERT(tensor->data == NULL); // avoid allocating tensor which already has memory allocated +#endif size_t size = ggml_allocator_get_alloc_size(alloc, tensor); size = aligned_offset(NULL, size, alloc->alignment); From 098654c27760f837bddf6b114d0d3e53788e7043 Mon Sep 17 00:00:00 2001 From: 
xaedes Date: Mon, 14 Aug 2023 20:56:56 +0200 Subject: [PATCH 061/100] only use ggml_allocr_alloc when tensor has NULL data and is no view --- .../train-text-from-scratch/train-text-from-scratch.cpp | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index 94a2a766d1c8b..a30291a1c7169 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -986,11 +986,15 @@ struct ggml_tensor * llama_build_train_graphs( // gradient tensors (will be set to zero by ggml_graph_reset) for (int i = 0; i < gf->n_nodes; ++i) { if (!gf->grads[i]) continue; - ggml_allocr_alloc(alloc, gf->grads[i]); + if (gf->grads[i]->data == NULL && !ggml_is_view(gf->grads[i])) { + ggml_allocr_alloc(alloc, gf->grads[i]); + } ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, gf->grads[i], one)); } for (int i = 0; i < checkpoints.size(); ++i) { - ggml_allocr_alloc(alloc, checkpoints[i]); + if (checkpoints[i]->data == NULL && !ggml_is_view(checkpoints[i])) { + ggml_allocr_alloc(alloc, checkpoints[i]); + } } int n_leafs_after = gb->n_leafs; From 3e6468b0976dbfe329b9f9511e770f8ad4092fe1 Mon Sep 17 00:00:00 2001 From: xaedes Date: Mon, 14 Aug 2023 20:56:03 +0200 Subject: [PATCH 062/100] fix test when to create temporary backward graph temporary backward graph is only necessary when using checkpointing --- examples/train-text-from-scratch/train-text-from-scratch.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index a30291a1c7169..11754ffd9442b 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -2611,7 +2611,7 @@ int main(int argc, char ** argv) { struct ggml_cgraph * gf = ggml_new_graph(ctx0); struct ggml_cgraph * gb = ggml_new_graph(ctx0); - struct ggml_cgraph * gb_tmp = params.use_alloc + struct ggml_cgraph * gb_tmp = params.use_checkpointing ? ggml_new_graph(ctx0) : NULL; From 56228461c83010c1ee848c8e6a88bc8e23a576ca Mon Sep 17 00:00:00 2001 From: xaedes Date: Mon, 14 Aug 2023 21:12:02 +0200 Subject: [PATCH 063/100] fix memory "leak" in optimizers each iteration a new cplan with new memory for work data was allocated. now cplan creation only happens at the start of optimization, with each iteration reusing the cplan and its work data. 
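A minimal sketch of the reuse pattern this patch applies (plain C; the calls ggml_graph_plan, ggml_new_object with GGML_OBJECT_WORK_BUFFER, and ggml_graph_compute are the ones used in the diff below, while the optimizer loop skeleton is simplified and only illustrative):

    // before: ggml_graph_compute_with_ctx() was called inside the loop,
    //         allocating a fresh cplan and work buffer on every iteration
    //
    // after: build the plan and reserve its work buffer once, then reuse it
    struct ggml_cplan cplan = ggml_graph_plan(gb, params.n_threads);   // sizes the work buffer once
    struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_WORK_BUFFER, cplan.work_size);
    cplan.work_data = (uint8_t *) ctx->mem_buffer + obj->offs;         // work memory lives in the ggml context

    for (int t = 0; t < params.adam.n_iter; ++t) {
        // ... reset graph, recompute gradients for this iteration ...
        ggml_graph_compute(gb, &cplan);                                // reuses cplan.work_data, no new allocation
        // ... parameter update ...
    }
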
--- ggml.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/ggml.c b/ggml.c index 47f912683cd50..142d5965a1c8b 100644 --- a/ggml.c +++ b/ggml.c @@ -17368,7 +17368,10 @@ static enum ggml_opt_result ggml_opt_adam( ggml_graph_reset (gf); ggml_set_f32 (f->grad, 1.0f); - ggml_graph_compute_with_ctx(ctx, gb, params.n_threads); + struct ggml_cplan cplan = ggml_graph_plan(gb, params.n_threads); + struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_WORK_BUFFER, cplan.work_size); + cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs; + ggml_graph_compute(gb, &cplan); opt->adam.fx_prev = ggml_get_f32_1d(f, 0); opt->adam.fx_best = opt->adam.fx_prev; @@ -17455,7 +17458,7 @@ static enum ggml_opt_result ggml_opt_adam( ggml_graph_reset (gf); ggml_set_f32 (f->grad, 1.0f); - ggml_graph_compute_with_ctx(ctx, gb, params.n_threads); + ggml_graph_compute(gb, &cplan); const float fx = ggml_get_f32_1d(f, 0); opt->loss_after = fx; @@ -17528,7 +17531,6 @@ struct ggml_lbfgs_iteration_data { }; static enum ggml_opt_result linesearch_backtracking( - struct ggml_context * ctx, const struct ggml_opt_params * params, int nx, float * x, @@ -17540,6 +17542,7 @@ static enum ggml_opt_result linesearch_backtracking( struct ggml_tensor * f, struct ggml_cgraph * gf, struct ggml_cgraph * gb, + struct ggml_cplan * cplan, const int np, struct ggml_tensor * ps[], ggml_opt_callback callback, @@ -17588,7 +17591,7 @@ static enum ggml_opt_result linesearch_backtracking( ggml_graph_reset (gf); ggml_set_f32 (f->grad, 1.0f); - ggml_graph_compute_with_ctx(ctx, gb, params->n_threads); + ggml_graph_compute(gb, cplan); ggml_opt_get_grad(np, ps, g); @@ -17682,6 +17685,10 @@ static enum ggml_opt_result ggml_opt_lbfgs( opt->iter = iter; } + struct ggml_cplan cplan = ggml_graph_plan(gb, params.n_threads); + struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_WORK_BUFFER, cplan.work_size); + cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs; + float * x = opt->lbfgs.x->data; // current parameters float * xp = opt->lbfgs.xp->data; // previous parameters float * g = opt->lbfgs.g->data; // current gradient @@ -17716,7 +17723,7 @@ static enum ggml_opt_result ggml_opt_lbfgs( ggml_graph_reset (gf); ggml_set_f32 (f->grad, 1.0f); - ggml_graph_compute_with_ctx(ctx, gb, params.n_threads); + ggml_graph_compute(gb, &cplan); ggml_opt_get_grad(np, ps, g); @@ -17778,7 +17785,7 @@ static enum ggml_opt_result ggml_opt_lbfgs( ggml_vec_cpy_f32(nx, xp, x); ggml_vec_cpy_f32(nx, gp, g); - ls = linesearch_backtracking(ctx, ¶ms, nx, x, &fx, g, d, step, xp, f, gf, gb, np, ps, callback, callback_data); + ls = linesearch_backtracking(¶ms, nx, x, &fx, g, d, step, xp, f, gf, gb, &cplan, np, ps, callback, callback_data); if (ls < 0) { // linesearch failed - go back to the previous point and return From 3b5515bbe0e2224425986ba24f1f5d84aa38dce9 Mon Sep 17 00:00:00 2001 From: xaedes Date: Mon, 14 Aug 2023 22:09:36 +0200 Subject: [PATCH 064/100] reverse order of for loop in ggml_build_backward_expand to save memory when using gradient checkpointing and allocator with this loop order gradient checkpointing with allocator on 16 layer model saves 13% memory; 2 layer memory it saves 2% memory. 
the computation results are the same --- ggml.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml.c b/ggml.c index 142d5965a1c8b..79098a2fccb38 100644 --- a/ggml.c +++ b/ggml.c @@ -15809,7 +15809,7 @@ void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * } } - for (int i = gf->n_nodes - 1; i >= 0; i--) { + for (int i = 0; i < gf->n_nodes; i++) { struct ggml_tensor * node = gf->nodes[i]; if (node->is_param) { From 4072f20bbacd6920f354a772ba3c555ed638af64 Mon Sep 17 00:00:00 2001 From: xaedes Date: Thu, 24 Aug 2023 15:49:34 +0200 Subject: [PATCH 065/100] add missing lctx argument to get_example_targets_batch --- examples/train-text-from-scratch/train-text-from-scratch.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index a60475d097c9d..52f5fb608d476 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -2337,6 +2337,7 @@ bool train_params_parse(int argc, char ** argv, struct train_params * params) { struct opt_callback_data { struct train_params * params; struct ggml_opt_context * opt; + struct llama_context * lctx; llama_token * tokens_data; size_t tokens_size; int * samples_data; @@ -2377,6 +2378,7 @@ void opt_callback(void * vdata, float * sched) { } get_example_targets_batch( + data->lctx, data->samples_data, data->samples_size, data->tokens_data, @@ -2560,6 +2562,7 @@ int main(int argc, char ** argv) { struct opt_callback_data opt_cb_data; opt_cb_data.params = ¶ms; opt_cb_data.opt = opt; + opt_cb_data.lctx = lctx; opt_cb_data.tokens_data = train_tokens.data(); opt_cb_data.tokens_size = train_tokens.size(); opt_cb_data.samples_data = train_samples.data(); From f51c5d76203cd43ff3e9b0ecba18bf312ce9e446 Mon Sep 17 00:00:00 2001 From: xaedes Date: Thu, 24 Aug 2023 19:25:39 +0200 Subject: [PATCH 066/100] implement llama model file saving using gguf checkpoint loading and saving disabled, to be replaced by loading and saving via gguf --- .../train-text-from-scratch.cpp | 1022 ++++++++++------- 1 file changed, 587 insertions(+), 435 deletions(-) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index 52f5fb608d476..9b12991bfe9a0 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -168,29 +168,20 @@ struct ggml_tensor * randomize_tensor_uniform(struct ggml_tensor * tensor, struc return tensor; } -struct llama_vocab { - using id = int32_t; - using token = std::string; - using ttype = llama_token_type; - - struct token_data { - token text; - float score; - ttype type; - }; - - std::unordered_map token_to_id; - std::vector id_to_token; -}; - struct my_llama_hparams { uint32_t n_vocab = 32000; - uint32_t n_ctx = 512; // this is provided as user input? 
+ uint32_t n_ctx = 512; uint32_t n_embd = 4096; - uint32_t n_mult = 4; uint32_t n_head = 32; uint32_t n_layer = 32; uint32_t n_rot = 64; + uint32_t n_ff = 11008; + + // float f_norm_eps = 1e-5; // falcon + float f_norm_rms_eps = 1e-5; // llama + + float rope_freq_base = 10000.0f; + float rope_freq_scale = 1.0f; bool operator!=(const my_llama_hparams& other) const { return memcmp(this, &other, sizeof(my_llama_hparams)); @@ -244,18 +235,12 @@ struct my_llama_model { uint32_t train_tokens = 0; }; -uint32_t get_n_ff(const struct my_llama_hparams* hparams) { - const uint32_t n_ff = ((2*(4*hparams->n_embd)/3 + hparams->n_mult - 1)/hparams->n_mult)*hparams->n_mult; - return n_ff; -} - void print_params(struct my_llama_hparams * params) { printf("%s: n_vocab: %d\n", __func__, params->n_vocab); printf("%s: n_ctx: %d\n", __func__, params->n_ctx); printf("%s: n_embd: %d\n", __func__, params->n_embd); - printf("%s: n_mult: %d\n", __func__, params->n_mult); printf("%s: n_head: %d\n", __func__, params->n_head); - printf("%s: n_ff: %d\n", __func__, get_n_ff(params)); + printf("%s: n_ff: %d\n", __func__, params->n_ff); printf("%s: n_layer: %d\n", __func__, params->n_layer); printf("%s: n_rot: %d\n", __func__, params->n_rot); } @@ -266,8 +251,7 @@ void init_model(struct my_llama_model * model) { const uint32_t n_embd = hparams.n_embd; const uint32_t n_layer = hparams.n_layer; const uint32_t n_vocab = hparams.n_vocab; - - const uint32_t n_ff = get_n_ff(&hparams); + const uint32_t n_ff = hparams.n_ff; struct ggml_context * ctx = model->ctx; @@ -275,20 +259,48 @@ void init_model(struct my_llama_model * model) { model->train_samples = 0; model->train_tokens = 0; + const char * arch = "llama"; + + // gguf constants (sync with gguf.py) + + const char * LLM_TENSOR_TOKEN_EMBD = "token_embd"; + const char * LLM_TENSOR_OUTPUT_NORM = "output_norm"; + const char * LLM_TENSOR_OUTPUT = "output"; + const char * LLM_TENSOR_ATTN_NORM = "blk.%d.attn_norm"; + const char * LLM_TENSOR_ATTN_Q = "blk.%d.attn_q"; + const char * LLM_TENSOR_ATTN_K = "blk.%d.attn_k"; + const char * LLM_TENSOR_ATTN_V = "blk.%d.attn_v"; + const char * LLM_TENSOR_ATTN_OUT = "blk.%d.attn_output"; + const char * LLM_TENSOR_FFN_NORM = "blk.%d.ffn_norm"; + const char * LLM_TENSOR_FFN_GATE = "blk.%d.ffn_gate"; + const char * LLM_TENSOR_FFN_DOWN = "blk.%d.ffn_down"; + const char * LLM_TENSOR_FFN_UP = "blk.%d.ffn_up"; + + std::vector tn_buf; + tn_buf.resize(GGML_MAX_NAME); + auto tn = [arch, &tn_buf](const char * key) -> const char * { + snprintf(tn_buf.data(), tn_buf.size(), "%s.weight", key); + return tn_buf.data(); + }; + auto tni = [arch, &tn_buf](const char * key, int bid) -> const char * { + snprintf(tn_buf.data(), tn_buf.size(), key, bid); + std::string s = tn_buf.data(); + snprintf(tn_buf.data(), tn_buf.size(), "%s.weight", s.c_str()); + return tn_buf.data(); + }; + model->tok_embeddings = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab); model->norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); model->output = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab); - ggml_set_name(model->tok_embeddings, "tok_embeddings.weight"); - ggml_set_name(model->norm, "norm.weight"); - ggml_set_name(model->output, "output.weight"); + ggml_set_name(model->tok_embeddings, tn(LLM_TENSOR_TOKEN_EMBD)); + ggml_set_name(model->norm, tn(LLM_TENSOR_OUTPUT_NORM)); + ggml_set_name(model->output, tn(LLM_TENSOR_OUTPUT)); model->layers.resize(n_layer); for (uint32_t i = 0; i < n_layer; ++i) { auto & layer = model->layers[i]; - std::string layers_i = "layers." 
+ std::to_string(i); - layer.attention_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); layer.wq = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd); @@ -302,18 +314,18 @@ void init_model(struct my_llama_model * model) { layer.w2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_ff, n_embd); layer.w3 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff); - ggml_set_name(layer.attention_norm, (layers_i + ".attention_norm.weight").c_str()); + ggml_set_name(layer.attention_norm, tni(LLM_TENSOR_ATTN_NORM, i)); - ggml_set_name(layer.wq, (layers_i + ".attention.wq.weight").c_str()); - ggml_set_name(layer.wk, (layers_i + ".attention.wk.weight").c_str()); - ggml_set_name(layer.wv, (layers_i + ".attention.wv.weight").c_str()); - ggml_set_name(layer.wo, (layers_i + ".attention.wo.weight").c_str()); + ggml_set_name(layer.wq, tni(LLM_TENSOR_ATTN_Q, i)); + ggml_set_name(layer.wk, tni(LLM_TENSOR_ATTN_K, i)); + ggml_set_name(layer.wv, tni(LLM_TENSOR_ATTN_V, i)); + ggml_set_name(layer.wo, tni(LLM_TENSOR_ATTN_OUT, i)); - ggml_set_name(layer.ffn_norm, (layers_i + ".ffn_norm.weight").c_str()); + ggml_set_name(layer.ffn_norm, tni(LLM_TENSOR_FFN_NORM, i)); - ggml_format_name(layer.w1, "%s.feed_forward.w1.weight", layers_i.c_str()); - ggml_format_name(layer.w2, "%s.feed_forward.w2.weight", layers_i.c_str()); - ggml_format_name(layer.w3, "%s.feed_forward.w3.weight", layers_i.c_str()); + ggml_set_name(layer.w1, tni(LLM_TENSOR_FFN_GATE, i)); + ggml_set_name(layer.w2, tni(LLM_TENSOR_FFN_DOWN, i)); + ggml_set_name(layer.w3, tni(LLM_TENSOR_FFN_UP, i)); } } @@ -882,7 +894,7 @@ struct ggml_tensor * llama_build_train_graphs( const int n_layer = hparams.n_layer; const int n_head = hparams.n_head; const int n_rot = hparams.n_rot; - const int n_ff = get_n_ff(&hparams); + const int n_ff = hparams.n_ff; const int rope_mode = 0; auto set_name = [](struct ggml_tensor * t, const char * n) { @@ -986,7 +998,12 @@ struct ggml_tensor * llama_build_train_graphs( // output tensors ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t35, one)); ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t36, one)); + // input gradient + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t36->grad, one)); + GGML_ASSERT(t36->grad->data == NULL && !ggml_is_view(t36->grad)); + ggml_allocr_alloc(alloc, t36->grad); // gradient tensors (will be set to zero by ggml_graph_reset) + // pinning these produces large unnecessary memory overhead, which will be resolved by PR 2632 for (int i = 0; i < gf->n_nodes; ++i) { if (!gf->grads[i]) continue; if (gf->grads[i]->data == NULL && !ggml_is_view(gf->grads[i])) { @@ -994,6 +1011,8 @@ struct ggml_tensor * llama_build_train_graphs( } ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, gf->grads[i], one)); } + // allocating checkpoints in one block to reduce memory fragmentation + // note: they will be freed in reverse order for (int i = 0; i < checkpoints.size(); ++i) { if (checkpoints[i]->data == NULL && !ggml_is_view(checkpoints[i])) { ggml_allocr_alloc(alloc, checkpoints[i]); @@ -1455,411 +1474,564 @@ void set_logits_masked(struct ggml_tensor * logits, std::vector& mask, flo } } -void write_tensor(struct llama_file * file, struct ggml_tensor * tensor) { - if (tensor == NULL) { - file->write_u32(0); - file->write_u32(0); - file->write_u32(GGML_TYPE_F32); - file->seek((0-file->tell()) & 31, SEEK_CUR); - return; - } - const char * name = ggml_get_name(tensor); - uint32_t name_len = strlen(name); - uint32_t nd = tensor->n_dims; - uint32_t ne[4] = { (uint32_t)tensor->ne[0], - 
(uint32_t)tensor->ne[1], - (uint32_t)tensor->ne[2], - (uint32_t)tensor->ne[3] }; - file->write_u32(nd); - file->write_u32(name_len); - file->write_u32(tensor->type); - file->write_raw(ne, sizeof(ne[0]) * nd); - file->write_raw(name, name_len); - file->seek((0-file->tell()) & 31, SEEK_CUR); - file->write_raw(tensor->data, ggml_nbytes(tensor)); -} +// void write_tensor(struct llama_file * file, struct ggml_tensor * tensor) { +// if (tensor == NULL) { +// file->write_u32(0); +// file->write_u32(0); +// file->write_u32(GGML_TYPE_F32); +// file->seek((0-file->tell()) & 31, SEEK_CUR); +// return; +// } +// const char * name = ggml_get_name(tensor); +// uint32_t name_len = strlen(name); +// uint32_t nd = tensor->n_dims; +// uint32_t ne[4] = { (uint32_t)tensor->ne[0], +// (uint32_t)tensor->ne[1], +// (uint32_t)tensor->ne[2], +// (uint32_t)tensor->ne[3] }; +// file->write_u32(nd); +// file->write_u32(name_len); +// file->write_u32(tensor->type); +// file->write_raw(ne, sizeof(ne[0]) * nd); +// file->write_raw(name, name_len); +// file->seek((0-file->tell()) & 31, SEEK_CUR); +// file->write_raw(tensor->data, ggml_nbytes(tensor)); +// } -void read_tensor(struct llama_file * file, struct ggml_tensor * tensor) { - int32_t nd = file->read_u32(); - GGML_ASSERT(nd == tensor->n_dims); +// void read_tensor(struct llama_file * file, struct ggml_tensor * tensor) { +// int32_t nd = file->read_u32(); +// GGML_ASSERT(nd == tensor->n_dims); - uint32_t name_len = file->read_u32(); - enum ggml_type type = (enum ggml_type) file->read_u32(); - GGML_ASSERT(type == tensor->type); +// uint32_t name_len = file->read_u32(); +// enum ggml_type type = (enum ggml_type) file->read_u32(); +// GGML_ASSERT(type == tensor->type); - uint32_t ne[4]; - file->read_raw(ne, sizeof(ne[0]) * nd); - for (int i=0; ine[i]); - } +// uint32_t ne[4]; +// file->read_raw(ne, sizeof(ne[0]) * nd); +// for (int i=0; ine[i]); +// } - std::string name = file->read_string(name_len); - GGML_ASSERT(strncmp(ggml_get_name(tensor), name.c_str(), sizeof(tensor->name)-1) == 0); +// std::string name = file->read_string(name_len); +// GGML_ASSERT(strncmp(ggml_get_name(tensor), name.c_str(), sizeof(tensor->name)-1) == 0); - file->seek((0-file->tell()) & 31, SEEK_CUR); - file->read_raw(tensor->data, ggml_nbytes(tensor)); -} +// file->seek((0-file->tell()) & 31, SEEK_CUR); +// file->read_raw(tensor->data, ggml_nbytes(tensor)); +// } -void skip_tensor(struct llama_file * file) { - int32_t nd = file->read_u32(); +// void skip_tensor(struct llama_file * file) { +// int32_t nd = file->read_u32(); - uint32_t name_len = file->read_u32(); - enum ggml_type type = (enum ggml_type) file->read_u32(); +// uint32_t name_len = file->read_u32(); +// enum ggml_type type = (enum ggml_type) file->read_u32(); - uint32_t ne[4] = { 1, 1, 1, 1 }; +// uint32_t ne[4] = { 1, 1, 1, 1 }; - file->read_raw(ne, sizeof(ne[0]) * nd); +// file->read_raw(ne, sizeof(ne[0]) * nd); - std::string name = file->read_string(name_len); +// std::string name = file->read_string(name_len); - file->seek(-file->tell() & 31, SEEK_CUR); +// file->seek(-file->tell() & 31, SEEK_CUR); - size_t nelements = ne[0]*ne[1]*ne[2]*ne[3]; - size_t nbytes = nelements*ggml_type_size(type)/ggml_blck_size(type); - file->seek(nbytes, SEEK_CUR); -} +// size_t nelements = ne[0]*ne[1]*ne[2]*ne[3]; +// size_t nbytes = nelements*ggml_type_size(type)/ggml_blck_size(type); +// file->seek(nbytes, SEEK_CUR); +// } void write_opt_context(struct llama_file * file, struct ggml_opt_context * opt) { - const uint32_t version = 1; - 
GGML_ASSERT(opt->nx >= 0); - GGML_ASSERT(opt->iter >= 0); - file->write_u32(version); - file->write_u32(opt->params.past); - file->write_u32(opt->params.lbfgs.m); - file->write_raw(&opt->nx, sizeof(opt->nx)); - file->write_raw(&opt->iter, sizeof(opt->iter)); - file->write_u32((uint32_t) opt->just_initialized); - switch (opt->params.type) { - case GGML_OPT_ADAM: - { - GGML_ASSERT(opt->adam.m != NULL); - GGML_ASSERT(opt->adam.v != NULL); - write_tensor(file, opt->adam.m); - write_tensor(file, opt->adam.v); - write_tensor(file, opt->adam.pf); - file->write_raw(&opt->adam.fx_best, sizeof(opt->adam.fx_best)); - file->write_raw(&opt->adam.fx_prev, sizeof(opt->adam.fx_prev)); - file->write_raw(&opt->adam.n_no_improvement, sizeof(opt->adam.n_no_improvement)); - } break; - case GGML_OPT_LBFGS: - { - GGML_ASSERT(opt->lbfgs.x != NULL); - write_tensor(file, opt->lbfgs.x); - write_tensor(file, opt->lbfgs.xp); - write_tensor(file, opt->lbfgs.g); - write_tensor(file, opt->lbfgs.gp); - write_tensor(file, opt->lbfgs.d); - write_tensor(file, opt->lbfgs.pf); - write_tensor(file, opt->lbfgs.lmal); - write_tensor(file, opt->lbfgs.lmys); - write_tensor(file, opt->lbfgs.lms); - write_tensor(file, opt->lbfgs.lmy); - file->write_raw(&opt->lbfgs.fx_best, sizeof(opt->lbfgs.fx_best)); - file->write_raw(&opt->lbfgs.step, sizeof(opt->lbfgs.step)); - file->write_raw(&opt->lbfgs.j, sizeof(opt->lbfgs.j)); - file->write_raw(&opt->lbfgs.k, sizeof(opt->lbfgs.k)); - file->write_raw(&opt->lbfgs.end, sizeof(opt->lbfgs.end)); - file->write_raw(&opt->lbfgs.n_no_improvement, sizeof(opt->lbfgs.n_no_improvement)); - } break; - } -} - -struct ggml_opt_params_v0 { - enum ggml_opt_type type; - int n_threads; - int past; - float delta; - int max_no_improvement; - bool print_forward_graph; - bool print_backward_graph; - struct { - int n_iter; - float sched; - float decay; - float alpha; - float beta1; - float beta2; - float eps; - float eps_f; - float eps_g; - } adam; - struct { - int m; - int n_iter; - int max_linesearch; - float eps; - float ftol; - float wolfe; - float min_step; - float max_step; - enum ggml_linesearch linesearch; - } lbfgs; -}; - -void read_opt_context_v0(struct llama_file * file, struct ggml_context * ctx, struct ggml_opt_context * opt) { - ggml_opt_params_v0 pv0; - file->read_raw(&pv0, sizeof(pv0)); - opt->params.past = pv0.past; - opt->params.lbfgs.m = pv0.lbfgs.m; - file->read_raw(&opt->nx, sizeof(opt->nx)); - ggml_opt_init(ctx, opt, opt->params, opt->nx); - - file->read_raw(&opt->iter, sizeof(opt->iter)); - opt->just_initialized = (bool) file->read_u32(); - - switch (opt->params.type) { - case GGML_OPT_ADAM: - { - skip_tensor(file); - skip_tensor(file); - skip_tensor(file); - read_tensor(file, opt->adam.m); - read_tensor(file, opt->adam.v); - skip_tensor(file); - skip_tensor(file); - if (opt->adam.pf) { read_tensor(file, opt->adam.pf); } - file->read_raw(&opt->adam.fx_best, sizeof(opt->adam.fx_best)); - file->read_raw(&opt->adam.fx_prev, sizeof(opt->adam.fx_prev)); - file->read_raw(&opt->adam.n_no_improvement, sizeof(opt->adam.n_no_improvement)); - } break; - case GGML_OPT_LBFGS: - { - GGML_ASSERT(opt->lbfgs.x != NULL); - read_tensor(file, opt->lbfgs.x); - read_tensor(file, opt->lbfgs.xp); - read_tensor(file, opt->lbfgs.g); - read_tensor(file, opt->lbfgs.gp); - read_tensor(file, opt->lbfgs.d); - if (opt->lbfgs.pf) { read_tensor(file, opt->lbfgs.pf); } - read_tensor(file, opt->lbfgs.lmal); - read_tensor(file, opt->lbfgs.lmys); - read_tensor(file, opt->lbfgs.lms); - read_tensor(file, opt->lbfgs.lmy); - 
file->read_raw(&opt->lbfgs.fx_best, sizeof(opt->lbfgs.fx_best)); - file->read_raw(&opt->lbfgs.step, sizeof(opt->lbfgs.step)); - file->read_raw(&opt->lbfgs.j, sizeof(opt->lbfgs.j)); - file->read_raw(&opt->lbfgs.k, sizeof(opt->lbfgs.k)); - file->read_raw(&opt->lbfgs.end, sizeof(opt->lbfgs.end)); - file->read_raw(&opt->lbfgs.n_no_improvement, sizeof(opt->lbfgs.n_no_improvement)); - } break; - } -} - -void read_opt_context_v1(struct llama_file * file, struct ggml_context * ctx, struct ggml_opt_context * opt) { - opt->params.past = (int) file->read_u32(); - opt->params.lbfgs.m = (int) file->read_u32(); - file->read_raw(&opt->nx, sizeof(opt->nx)); - ggml_opt_init(ctx, opt, opt->params, opt->nx); - - file->read_raw(&opt->iter, sizeof(opt->iter)); - opt->just_initialized = (bool) file->read_u32(); - - switch (opt->params.type) { - case GGML_OPT_ADAM: - { - read_tensor(file, opt->adam.m); - read_tensor(file, opt->adam.v); - if (opt->adam.pf) { read_tensor(file, opt->adam.pf); } - file->read_raw(&opt->adam.fx_best, sizeof(opt->adam.fx_best)); - file->read_raw(&opt->adam.fx_prev, sizeof(opt->adam.fx_prev)); - file->read_raw(&opt->adam.n_no_improvement, sizeof(opt->adam.n_no_improvement)); - } break; - case GGML_OPT_LBFGS: - { - GGML_ASSERT(opt->lbfgs.x != NULL); - read_tensor(file, opt->lbfgs.x); - read_tensor(file, opt->lbfgs.xp); - read_tensor(file, opt->lbfgs.g); - read_tensor(file, opt->lbfgs.gp); - read_tensor(file, opt->lbfgs.d); - if (opt->lbfgs.pf) { read_tensor(file, opt->lbfgs.pf); } - read_tensor(file, opt->lbfgs.lmal); - read_tensor(file, opt->lbfgs.lmys); - read_tensor(file, opt->lbfgs.lms); - read_tensor(file, opt->lbfgs.lmy); - file->read_raw(&opt->lbfgs.fx_best, sizeof(opt->lbfgs.fx_best)); - file->read_raw(&opt->lbfgs.step, sizeof(opt->lbfgs.step)); - file->read_raw(&opt->lbfgs.j, sizeof(opt->lbfgs.j)); - file->read_raw(&opt->lbfgs.k, sizeof(opt->lbfgs.k)); - file->read_raw(&opt->lbfgs.end, sizeof(opt->lbfgs.end)); - file->read_raw(&opt->lbfgs.n_no_improvement, sizeof(opt->lbfgs.n_no_improvement)); - } break; - } -} +#pragma message("TODO: implement file saving using gguf: write_opt_context") + // const uint32_t version = 1; + // GGML_ASSERT(opt->nx >= 0); + // GGML_ASSERT(opt->iter >= 0); + // file->write_u32(version); + // file->write_u32(opt->params.past); + // file->write_u32(opt->params.lbfgs.m); + // file->write_raw(&opt->nx, sizeof(opt->nx)); + // file->write_raw(&opt->iter, sizeof(opt->iter)); + // file->write_u32((uint32_t) opt->just_initialized); + // switch (opt->params.type) { + // case GGML_OPT_ADAM: + // { + // GGML_ASSERT(opt->adam.m != NULL); + // GGML_ASSERT(opt->adam.v != NULL); + // write_tensor(file, opt->adam.m); + // write_tensor(file, opt->adam.v); + // write_tensor(file, opt->adam.pf); + // file->write_raw(&opt->adam.fx_best, sizeof(opt->adam.fx_best)); + // file->write_raw(&opt->adam.fx_prev, sizeof(opt->adam.fx_prev)); + // file->write_raw(&opt->adam.n_no_improvement, sizeof(opt->adam.n_no_improvement)); + // } break; + // case GGML_OPT_LBFGS: + // { + // GGML_ASSERT(opt->lbfgs.x != NULL); + // write_tensor(file, opt->lbfgs.x); + // write_tensor(file, opt->lbfgs.xp); + // write_tensor(file, opt->lbfgs.g); + // write_tensor(file, opt->lbfgs.gp); + // write_tensor(file, opt->lbfgs.d); + // write_tensor(file, opt->lbfgs.pf); + // write_tensor(file, opt->lbfgs.lmal); + // write_tensor(file, opt->lbfgs.lmys); + // write_tensor(file, opt->lbfgs.lms); + // write_tensor(file, opt->lbfgs.lmy); + // file->write_raw(&opt->lbfgs.fx_best, sizeof(opt->lbfgs.fx_best)); + 
// file->write_raw(&opt->lbfgs.step, sizeof(opt->lbfgs.step)); + // file->write_raw(&opt->lbfgs.j, sizeof(opt->lbfgs.j)); + // file->write_raw(&opt->lbfgs.k, sizeof(opt->lbfgs.k)); + // file->write_raw(&opt->lbfgs.end, sizeof(opt->lbfgs.end)); + // file->write_raw(&opt->lbfgs.n_no_improvement, sizeof(opt->lbfgs.n_no_improvement)); + // } break; + // } +} + +// struct ggml_opt_params_v0 { +// enum ggml_opt_type type; +// int n_threads; +// int past; +// float delta; +// int max_no_improvement; +// bool print_forward_graph; +// bool print_backward_graph; +// struct { +// int n_iter; +// float sched; +// float decay; +// float alpha; +// float beta1; +// float beta2; +// float eps; +// float eps_f; +// float eps_g; +// } adam; +// struct { +// int m; +// int n_iter; +// int max_linesearch; +// float eps; +// float ftol; +// float wolfe; +// float min_step; +// float max_step; +// enum ggml_linesearch linesearch; +// } lbfgs; +// }; + +// void read_opt_context_v0(struct llama_file * file, struct ggml_context * ctx, struct ggml_opt_context * opt) { +// ggml_opt_params_v0 pv0; +// file->read_raw(&pv0, sizeof(pv0)); +// opt->params.past = pv0.past; +// opt->params.lbfgs.m = pv0.lbfgs.m; +// file->read_raw(&opt->nx, sizeof(opt->nx)); +// ggml_opt_init(ctx, opt, opt->params, opt->nx); + +// file->read_raw(&opt->iter, sizeof(opt->iter)); +// opt->just_initialized = (bool) file->read_u32(); + +// switch (opt->params.type) { +// case GGML_OPT_ADAM: +// { +// skip_tensor(file); +// skip_tensor(file); +// skip_tensor(file); +// read_tensor(file, opt->adam.m); +// read_tensor(file, opt->adam.v); +// skip_tensor(file); +// skip_tensor(file); +// if (opt->adam.pf) { read_tensor(file, opt->adam.pf); } +// file->read_raw(&opt->adam.fx_best, sizeof(opt->adam.fx_best)); +// file->read_raw(&opt->adam.fx_prev, sizeof(opt->adam.fx_prev)); +// file->read_raw(&opt->adam.n_no_improvement, sizeof(opt->adam.n_no_improvement)); +// } break; +// case GGML_OPT_LBFGS: +// { +// GGML_ASSERT(opt->lbfgs.x != NULL); +// read_tensor(file, opt->lbfgs.x); +// read_tensor(file, opt->lbfgs.xp); +// read_tensor(file, opt->lbfgs.g); +// read_tensor(file, opt->lbfgs.gp); +// read_tensor(file, opt->lbfgs.d); +// if (opt->lbfgs.pf) { read_tensor(file, opt->lbfgs.pf); } +// read_tensor(file, opt->lbfgs.lmal); +// read_tensor(file, opt->lbfgs.lmys); +// read_tensor(file, opt->lbfgs.lms); +// read_tensor(file, opt->lbfgs.lmy); +// file->read_raw(&opt->lbfgs.fx_best, sizeof(opt->lbfgs.fx_best)); +// file->read_raw(&opt->lbfgs.step, sizeof(opt->lbfgs.step)); +// file->read_raw(&opt->lbfgs.j, sizeof(opt->lbfgs.j)); +// file->read_raw(&opt->lbfgs.k, sizeof(opt->lbfgs.k)); +// file->read_raw(&opt->lbfgs.end, sizeof(opt->lbfgs.end)); +// file->read_raw(&opt->lbfgs.n_no_improvement, sizeof(opt->lbfgs.n_no_improvement)); +// } break; +// } +// } + +// void read_opt_context_v1(struct llama_file * file, struct ggml_context * ctx, struct ggml_opt_context * opt) { +// opt->params.past = (int) file->read_u32(); +// opt->params.lbfgs.m = (int) file->read_u32(); +// file->read_raw(&opt->nx, sizeof(opt->nx)); +// ggml_opt_init(ctx, opt, opt->params, opt->nx); + +// file->read_raw(&opt->iter, sizeof(opt->iter)); +// opt->just_initialized = (bool) file->read_u32(); + +// switch (opt->params.type) { +// case GGML_OPT_ADAM: +// { +// read_tensor(file, opt->adam.m); +// read_tensor(file, opt->adam.v); +// if (opt->adam.pf) { read_tensor(file, opt->adam.pf); } +// file->read_raw(&opt->adam.fx_best, sizeof(opt->adam.fx_best)); +// 
file->read_raw(&opt->adam.fx_prev, sizeof(opt->adam.fx_prev)); +// file->read_raw(&opt->adam.n_no_improvement, sizeof(opt->adam.n_no_improvement)); +// } break; +// case GGML_OPT_LBFGS: +// { +// GGML_ASSERT(opt->lbfgs.x != NULL); +// read_tensor(file, opt->lbfgs.x); +// read_tensor(file, opt->lbfgs.xp); +// read_tensor(file, opt->lbfgs.g); +// read_tensor(file, opt->lbfgs.gp); +// read_tensor(file, opt->lbfgs.d); +// if (opt->lbfgs.pf) { read_tensor(file, opt->lbfgs.pf); } +// read_tensor(file, opt->lbfgs.lmal); +// read_tensor(file, opt->lbfgs.lmys); +// read_tensor(file, opt->lbfgs.lms); +// read_tensor(file, opt->lbfgs.lmy); +// file->read_raw(&opt->lbfgs.fx_best, sizeof(opt->lbfgs.fx_best)); +// file->read_raw(&opt->lbfgs.step, sizeof(opt->lbfgs.step)); +// file->read_raw(&opt->lbfgs.j, sizeof(opt->lbfgs.j)); +// file->read_raw(&opt->lbfgs.k, sizeof(opt->lbfgs.k)); +// file->read_raw(&opt->lbfgs.end, sizeof(opt->lbfgs.end)); +// file->read_raw(&opt->lbfgs.n_no_improvement, sizeof(opt->lbfgs.n_no_improvement)); +// } break; +// } +// } void read_opt_context(struct llama_file * file, struct ggml_context * ctx, struct ggml_opt_context * opt) { - uint32_t version = file->read_u32(); - printf("%s: opt context version %u\n", __func__, version); - switch (version) { - case 0: - { - read_opt_context_v0(file, ctx, opt); - } break; - case 1: - { - read_opt_context_v1(file, ctx, opt); - } break; - default: - { - fprintf(stderr, "%s: unknown version %u\n", __func__, version); - } - } +#pragma message("TODO: implement file loading using gguf: read_opt_context") + // uint32_t version = file->read_u32(); + // printf("%s: opt context version %u\n", __func__, version); + // switch (version) { + // case 0: + // { + // read_opt_context_v0(file, ctx, opt); + // } break; + // case 1: + // { + // read_opt_context_v1(file, ctx, opt); + // } break; + // default: + // { + // fprintf(stderr, "%s: unknown version %u\n", __func__, version); + // } + // } } void save_checkpoint(struct my_llama_model * model, struct ggml_opt_context * opt, const char * filename) { +#pragma message("TODO: implement file saving using gguf: save_checkpoint") + // struct llama_file file(filename, "wb"); + // if (file.fp == NULL) { + // return; + // } + + // const uint32_t magic = 'ggcp'; + // const uint32_t version = 0; + + // file.write_u32(magic); + // file.write_u32(version); + // file.write_u32(model->train_its); + // file.write_u32(model->train_samples); + // file.write_u32(model->train_tokens); + // file.write_u32(model->hparams.n_vocab); + // file.write_u32(model->hparams.n_embd); + // // file.write_u32(model->hparams.n_mult); + // file.write_u32(model->hparams.n_head); + // file.write_u32(model->hparams.n_layer); + // file.write_u32(model->hparams.n_rot); + + // write_tensor(&file, model->tok_embeddings); + // write_tensor(&file, model->norm); + // write_tensor(&file, model->output); + + // for (uint32_t i = 0; i < model->hparams.n_layer; ++i) { + // auto & layer = model->layers[i]; + + // write_tensor(&file, layer.attention_norm); + // write_tensor(&file, layer.wq); + // write_tensor(&file, layer.wk); + // write_tensor(&file, layer.wv); + // write_tensor(&file, layer.wo); + // write_tensor(&file, layer.ffn_norm); + // write_tensor(&file, layer.w1); + // write_tensor(&file, layer.w2); + // write_tensor(&file, layer.w3); + // } + + // write_opt_context(&file, opt); +} + +bool load_checkpoint(struct my_llama_model * model, struct ggml_opt_context * opt, const char * filename, bool init) { +#pragma message("TODO: implement file 
loading using gguf: load_checkpoint") + return false; + // struct llama_file file(filename, "rb"); + + // uint32_t magic; + // uint32_t version; + + // uint32_t train_its = 0; + // uint32_t train_samples = 0; + // uint32_t train_tokens = 0; + + // if (file.fp) { + // printf("%s: Loading model from '%s'.\n", __func__, filename); + // magic = file.read_u32(); + // GGML_ASSERT(magic == 'ggcp'); + // version = file.read_u32(); + // GGML_ASSERT(version == 0); + // train_its = file.read_u32(); + // train_samples = file.read_u32(); + // train_tokens = file.read_u32(); + // model->hparams.n_vocab = file.read_u32(); + // model->hparams.n_embd = file.read_u32(); + // // model->hparams.n_mult = file.read_u32(); + // model->hparams.n_head = file.read_u32(); + // model->hparams.n_layer = file.read_u32(); + // model->hparams.n_rot = file.read_u32(); + // print_params(&model->hparams); + // } + + // if (init) { + // init_model(model); + // } + + // if (file.fp) { + // model->train_its = train_its; + // model->train_samples = train_samples; + // model->train_tokens = train_tokens; + // } + + // printf("%s: Training iterations: %u.\n", __func__, model->train_its); + // printf("%s: Training samples: %u.\n", __func__, model->train_samples); + // printf("%s: Training tokens: %u.\n", __func__, model->train_tokens); + + // if (file.fp) { + // read_tensor(&file, model->tok_embeddings); + // read_tensor(&file, model->norm); + // read_tensor(&file, model->output); + + // for (uint32_t i = 0; i < model->hparams.n_layer; ++i) { + // auto & layer = model->layers[i]; + + // read_tensor(&file, layer.attention_norm); + // read_tensor(&file, layer.wq); + // read_tensor(&file, layer.wk); + // read_tensor(&file, layer.wv); + // read_tensor(&file, layer.wo); + // read_tensor(&file, layer.ffn_norm); + // read_tensor(&file, layer.w1); + // read_tensor(&file, layer.w2); + // read_tensor(&file, layer.w3); + // } + + // read_opt_context(&file, model->ctx, opt); + // } + + // return (file.fp != NULL); +} + +#define GGUF_GET_KEY(ctx, dst, func, type, req, key) \ +{ \ + const std::string skey(key); \ + const int kid = gguf_find_key(ctx, skey.c_str()); \ + if (kid >= 0) { \ + enum gguf_type ktype = gguf_get_kv_type(ctx, kid); \ + if (ktype != (type)) { \ + throw std::runtime_error("key has wrong type"); \ + } \ + (dst) = func(ctx, kid); \ + } else if (req) { \ + throw std::runtime_error("key not found in model"); \ + } \ +} + +void save_as_llama_model(const char * fn_vocab_model, struct my_llama_model * model, const char * filename) { struct llama_file file(filename, "wb"); if (file.fp == NULL) { return; } - const uint32_t magic = 'ggcp'; - const uint32_t version = 0; + const char * arch = "llama"; + enum llama_ftype ftype = LLAMA_FTYPE_ALL_F32; - file.write_u32(magic); - file.write_u32(version); - file.write_u32(model->train_its); - file.write_u32(model->train_samples); - file.write_u32(model->train_tokens); - file.write_u32(model->hparams.n_vocab); - file.write_u32(model->hparams.n_embd); - file.write_u32(model->hparams.n_mult); - file.write_u32(model->hparams.n_head); - file.write_u32(model->hparams.n_layer); - file.write_u32(model->hparams.n_rot); + std::vector keybuf; + keybuf.resize(512); + auto kv = [arch, &keybuf](const char * key) -> const char * { + snprintf(keybuf.data(), keybuf.size(), key, arch); + return keybuf.data(); + }; - write_tensor(&file, model->tok_embeddings); - write_tensor(&file, model->norm); - write_tensor(&file, model->output); + // gguf constants (sync with gguf.py) + + const char * 
LLM_KV_GENERAL_ARCHITECTURE = "general.architecture"; + const char * LLM_KV_GENERAL_FILE_TYPE = "general.file_type"; + + const char * LLM_KV_CONTEXT_LENGTH = "%s.context_length"; + const char * LLM_KV_EMBEDDING_LENGTH = "%s.embedding_length"; + const char * LLM_KV_BLOCK_COUNT = "%s.block_count"; + const char * LLM_KV_FEED_FORWARD_LENGTH = "%s.feed_forward_length"; + const char * LLM_KV_ATTENTION_HEAD_COUNT = "%s.attention.head_count"; + const char * LLM_KV_ATTENTION_LAYERNORM_RMS_EPS = "%s.attention.layer_norm_rms_epsilon"; + const char * LLM_KV_ROPE_DIMENSION_COUNT = "%s.rope.dimension_count"; + const char * LLM_KV_ROPE_FREQ_BASE = "%s.rope.freq_base"; // TODO load in llama.cpp + const char * LLM_KV_ROPE_SCALE_LINEAR = "%s.rope.scale_linear"; + + const char * LLM_KV_TOKENIZER_MODEL = "tokenizer.ggml.model"; + const char * LLM_KV_TOKENIZER_LIST = "tokenizer.ggml.tokens"; + const char * LLM_KV_TOKENIZER_TOKEN_TYPE = "tokenizer.ggml.token_type"; + const char * LLM_KV_TOKENIZER_SCORES = "tokenizer.ggml.scores"; + const char * LLM_KV_TOKENIZER_MERGES = "tokenizer.ggml.merges"; + const char * LLM_KV_TOKENIZER_BOS_ID = "tokenizer.ggml.bos_token_id"; + const char * LLM_KV_TOKENIZER_EOS_ID = "tokenizer.ggml.eos_token_id"; + const char * LLM_KV_TOKENIZER_UNK_ID = "tokenizer.ggml.unknown_token_id"; + const char * LLM_KV_TOKENIZER_SEP_ID = "tokenizer.ggml.seperator_token_id"; + const char * LLM_KV_TOKENIZER_PAD_ID = "tokenizer.ggml.padding_token_id"; + + struct gguf_context * fctx = gguf_init_empty(); + + // set arch + gguf_set_val_str(fctx, LLM_KV_GENERAL_ARCHITECTURE, arch); + gguf_set_val_u32(fctx, LLM_KV_GENERAL_FILE_TYPE, ftype); + + // set hparams + gguf_set_val_u32(fctx, kv(LLM_KV_CONTEXT_LENGTH), model->hparams.n_ctx ); + gguf_set_val_u32(fctx, kv(LLM_KV_EMBEDDING_LENGTH), model->hparams.n_embd ); + gguf_set_val_u32(fctx, kv(LLM_KV_FEED_FORWARD_LENGTH), model->hparams.n_ff ); + gguf_set_val_u32(fctx, kv(LLM_KV_ATTENTION_HEAD_COUNT), model->hparams.n_head ); + gguf_set_val_u32(fctx, kv(LLM_KV_BLOCK_COUNT), model->hparams.n_layer ); + gguf_set_val_u32(fctx, kv(LLM_KV_ROPE_DIMENSION_COUNT), model->hparams.n_rot ); + + gguf_set_val_u32(fctx, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS), model->hparams.f_norm_rms_eps ); + gguf_set_val_u32(fctx, kv(LLM_KV_ROPE_FREQ_BASE), model->hparams.rope_freq_base ); // TODO load in llama.cpp + gguf_set_val_u32(fctx, kv(LLM_KV_ROPE_SCALE_LINEAR), 1.0f / model->hparams.rope_freq_scale ); + + // set vocab by copying from vocab_model gguf file + { + struct gguf_init_params params = { + /*.no_alloc = */ false, + /*.ctx = */ NULL, + }; + struct gguf_context * vctx = gguf_init_from_file(fn_vocab_model, params); - for (uint32_t i = 0; i < model->hparams.n_layer; ++i) { - auto & layer = model->layers[i]; + const int token_idx = gguf_find_key(vctx, kv(LLM_KV_TOKENIZER_LIST)); + if (token_idx == -1) { + throw std::runtime_error("cannot find tokenizer vocab in model file\n"); + } + const uint32_t n_vocab = gguf_get_arr_n(vctx, token_idx); - write_tensor(&file, layer.attention_norm); - write_tensor(&file, layer.wq); - write_tensor(&file, layer.wk); - write_tensor(&file, layer.wv); - write_tensor(&file, layer.wo); - write_tensor(&file, layer.ffn_norm); - write_tensor(&file, layer.w1); - write_tensor(&file, layer.w2); - write_tensor(&file, layer.w3); - } + const int score_idx = gguf_find_key(vctx, kv(LLM_KV_TOKENIZER_SCORES)); + if (score_idx == -1) { + throw std::runtime_error("cannot find tokenizer scores in model file\n"); + } - write_opt_context(&file, opt); -} + const float * 
scores = (const float * ) gguf_get_arr_data(vctx, score_idx); -bool load_checkpoint(struct my_llama_model * model, struct ggml_opt_context * opt, const char * filename, bool init) { - struct llama_file file(filename, "rb"); + const int toktype_idx = gguf_find_key(vctx, kv(LLM_KV_TOKENIZER_TOKEN_TYPE)); + if (toktype_idx == -1) { + throw std::runtime_error("cannot find token type list in GGUF file\n"); + } - uint32_t magic; - uint32_t version; + const int * toktypes = (const int * ) gguf_get_arr_data(vctx, toktype_idx); + + std::string tokenizer_name; + GGUF_GET_KEY(vctx, tokenizer_name, gguf_get_val_str, GGUF_TYPE_STRING, true, kv(LLM_KV_TOKENIZER_MODEL)); + + gguf_set_val_str(fctx, kv(LLM_KV_TOKENIZER_MODEL), tokenizer_name.c_str()); + gguf_set_arr_data(fctx, kv(LLM_KV_TOKENIZER_SCORES), GGUF_TYPE_FLOAT32, scores, n_vocab); + gguf_set_arr_data(fctx, kv(LLM_KV_TOKENIZER_TOKEN_TYPE), GGUF_TYPE_INT32, toktypes, n_vocab); + + int32_t special_bos_id = 1; + int32_t special_eos_id = 2; + int32_t special_unk_id = 0; + int32_t special_sep_id = -1; + int32_t special_pad_id = -1; + if (tokenizer_name == "llama") { + // default special tokens + special_bos_id = 1; + special_eos_id = 2; + special_unk_id = 0; + special_sep_id = -1; + special_pad_id = -1; + } else if (tokenizer_name == "gpt2") { + // read and copy bpe merges + const int merges_keyidx = gguf_find_key(vctx, kv(LLM_KV_TOKENIZER_MERGES)); + if (merges_keyidx == -1) { + throw std::runtime_error("cannot find tokenizer merges in model file\n"); + } - uint32_t train_its = 0; - uint32_t train_samples = 0; - uint32_t train_tokens = 0; + const int n_merges = gguf_get_arr_n(vctx, merges_keyidx); - if (file.fp) { - printf("%s: Loading model from '%s'.\n", __func__, filename); - magic = file.read_u32(); - GGML_ASSERT(magic == 'ggcp'); - version = file.read_u32(); - GGML_ASSERT(version == 0); - train_its = file.read_u32(); - train_samples = file.read_u32(); - train_tokens = file.read_u32(); - model->hparams.n_vocab = file.read_u32(); - model->hparams.n_embd = file.read_u32(); - model->hparams.n_mult = file.read_u32(); - model->hparams.n_head = file.read_u32(); - model->hparams.n_layer = file.read_u32(); - model->hparams.n_rot = file.read_u32(); - print_params(&model->hparams); - } - - if (init) { - init_model(model); - } - - if (file.fp) { - model->train_its = train_its; - model->train_samples = train_samples; - model->train_tokens = train_tokens; - } - - printf("%s: Training iterations: %u.\n", __func__, model->train_its); - printf("%s: Training samples: %u.\n", __func__, model->train_samples); - printf("%s: Training tokens: %u.\n", __func__, model->train_tokens); - - if (file.fp) { - read_tensor(&file, model->tok_embeddings); - read_tensor(&file, model->norm); - read_tensor(&file, model->output); - - for (uint32_t i = 0; i < model->hparams.n_layer; ++i) { - auto & layer = model->layers[i]; - - read_tensor(&file, layer.attention_norm); - read_tensor(&file, layer.wq); - read_tensor(&file, layer.wk); - read_tensor(&file, layer.wv); - read_tensor(&file, layer.wo); - read_tensor(&file, layer.ffn_norm); - read_tensor(&file, layer.w1); - read_tensor(&file, layer.w2); - read_tensor(&file, layer.w3); + std::vector merges; + merges.resize(n_merges); + for (int i = 0; i < n_merges; i++) { + merges[i] = gguf_get_arr_str(vctx, merges_keyidx, i); + } + gguf_set_arr_str(fctx, kv(LLM_KV_TOKENIZER_MERGES), merges.data(), n_merges); + + // default special tokens + special_bos_id = 11; + special_eos_id = 11; + special_unk_id = -1; + special_sep_id = -1; + 
special_pad_id = -1; + } else { + fprintf(stderr, "%s: unknown tokenizer: '%s'", __func__, tokenizer_name.c_str()); + fprintf(stderr, "%s: using default tokenizer: 'llama'", __func__); + } + + std::vector tokens; + tokens.resize(n_vocab); + for (uint32_t i = 0; i < n_vocab; i++) { + tokens[i] = gguf_get_arr_str(vctx, token_idx, i); } + gguf_set_arr_str(fctx, kv(LLM_KV_TOKENIZER_LIST), tokens.data(), n_vocab); + + GGUF_GET_KEY(vctx, special_bos_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_BOS_ID)); + GGUF_GET_KEY(vctx, special_eos_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_EOS_ID)); + GGUF_GET_KEY(vctx, special_unk_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_UNK_ID)); + GGUF_GET_KEY(vctx, special_sep_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_SEP_ID)); + GGUF_GET_KEY(vctx, special_pad_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_PAD_ID)); + + gguf_set_val_u32(fctx, kv(LLM_KV_TOKENIZER_BOS_ID), special_bos_id); + gguf_set_val_u32(fctx, kv(LLM_KV_TOKENIZER_EOS_ID), special_eos_id); + gguf_set_val_u32(fctx, kv(LLM_KV_TOKENIZER_UNK_ID), special_unk_id); + gguf_set_val_u32(fctx, kv(LLM_KV_TOKENIZER_SEP_ID), special_sep_id); + gguf_set_val_u32(fctx, kv(LLM_KV_TOKENIZER_PAD_ID), special_pad_id); - read_opt_context(&file, model->ctx, opt); + gguf_free(vctx); } - return (file.fp != NULL); -} + // add tensors + gguf_add_tensor(fctx, model->tok_embeddings); + gguf_add_tensor(fctx, model->norm); + gguf_add_tensor(fctx, model->output); + for (uint32_t i = 0; i < model->hparams.n_layer; ++i) { + auto & layer = model->layers[i]; -void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * model, const char * filename) { - struct llama_file file(filename, "wb"); - if (file.fp == NULL) { - return; + gguf_add_tensor(fctx, layer.attention_norm); + gguf_add_tensor(fctx, layer.wq); + gguf_add_tensor(fctx, layer.wk); + gguf_add_tensor(fctx, layer.wv); + gguf_add_tensor(fctx, layer.wo); + gguf_add_tensor(fctx, layer.ffn_norm); + gguf_add_tensor(fctx, layer.w1); + gguf_add_tensor(fctx, layer.w2); + gguf_add_tensor(fctx, layer.w3); } -#pragma message("TODO: implement file saving using gguf") - (void) vocab; - (void) model; -// // write_magic -// file.write_u32(LLAMA_FILE_MAGIC); // magic -// file.write_u32(LLAMA_FILE_VERSION); // version -// // write_hparams -// file.write_u32(model->hparams.n_vocab); -// file.write_u32(model->hparams.n_embd); -// file.write_u32(model->hparams.n_mult); -// file.write_u32(model->hparams.n_head); -// file.write_u32(model->hparams.n_layer); -// file.write_u32(model->hparams.n_rot); -// file.write_u32(LLAMA_FTYPE_ALL_F32); -// // write_vocab -// uint32_t n_vocab = model->hparams.n_vocab; -// for (uint32_t i = 0; i < n_vocab; i++) { -// const auto & token_data = vocab->id_to_token.at(i); -// file.write_u32((uint32_t) token_data.tok.size()); -// file.write_raw(token_data.tok.data(), token_data.tok.size()); -// file.write_raw(&token_data.score, sizeof(token_data.score)); -// } -// // write tensors -// write_tensor(&file, model->tok_embeddings); -// write_tensor(&file, model->norm); -// write_tensor(&file, model->output); -// for (uint32_t i = 0; i < model->hparams.n_layer; ++i) { -// auto & layer = model->layers[i]; -// -// write_tensor(&file, layer.attention_norm); -// write_tensor(&file, layer.wq); -// write_tensor(&file, layer.wk); -// write_tensor(&file, layer.wv); -// write_tensor(&file, layer.wo); -// write_tensor(&file, layer.ffn_norm); -// 
write_tensor(&file, layer.w1); -// write_tensor(&file, layer.w2); -// write_tensor(&file, layer.w3); -// } + // write file + const bool only_meta = false; + gguf_write_to_file(fctx, filename, only_meta); + gguf_free(fctx); } float cosine_decay(const int decay_steps, const float minimum, int step) { @@ -1892,10 +2064,9 @@ struct train_params { int n_ctx; int n_embd; - int n_mult; int n_head; int n_layer; - int n_rotmax; + int n_ff; int n_threads; int n_batch; @@ -1950,10 +2121,9 @@ struct train_params get_default_train_params() { params.n_ctx = 128; params.n_embd = 256; - params.n_mult = 256; params.n_head = 8; params.n_layer = 16; - params.n_rotmax = 64; + params.n_ff = 768; params.n_threads = 6; params.n_batch = 8; @@ -2010,10 +2180,9 @@ void train_print_usage(int /*argc*/, char ** argv, const struct train_params * p fprintf(stderr, " -s SEED, --seed SEED RNG seed (default: -1, use random seed for -1)\n"); fprintf(stderr, " -c N, --ctx N Context size used during training (default %d)\n", params->n_ctx); fprintf(stderr, " --embd N Embedding size used for new models (default %d)\n", params->n_embd); - fprintf(stderr, " --mult N Mult size used for new models, influences feedforward size. (default %d)\n", params->n_mult); + fprintf(stderr, " --ff N Feedforward size used for new models. (default %d)\n", params->n_ff); fprintf(stderr, " --head N Number of heads for new models (default %d)\n", params->n_head); fprintf(stderr, " --layer N Number of layers for new models (default %d)\n", params->n_layer); - fprintf(stderr, " --rotmax N Maximal number Rope dimensions for new models (default %d)\n", params->n_rotmax); fprintf(stderr, " -t N, --threads N Number of threads (default %d)\n", params->n_threads); fprintf(stderr, " -b N, --batch N Parallel batch size (default %d)\n", params->n_batch); fprintf(stderr, " -n N, --examples N Number of examples to train (default %d)\n", params->n_examples); @@ -2114,12 +2283,12 @@ bool train_params_parse(int argc, char ** argv, struct train_params * params) { break; } params->n_embd = std::stoi(argv[i]); - } else if (arg == "--mult") { + } else if (arg == "--ff") { if (++i >= argc) { invalid_param = true; break; } - params->n_mult = std::stoi(argv[i]); + params->n_ff = std::stoi(argv[i]); } else if (arg == "--head") { if (++i >= argc) { invalid_param = true; @@ -2132,12 +2301,6 @@ bool train_params_parse(int argc, char ** argv, struct train_params * params) { break; } params->n_layer = std::stoi(argv[i]); - } else if (arg == "--rotmax") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->n_rotmax = std::stoi(argv[i]); } else if (arg == "-t" || arg == "--threads") { if (++i >= argc) { invalid_param = true; @@ -2410,18 +2573,6 @@ int main(int argc, char ** argv) { struct llama_model * lmodel = llama_load_model_from_file(params.fn_vocab_model, llama_params); struct llama_context * lctx = llama_new_context_with_model(lmodel, llama_params); - struct llama_vocab vocab; - { - const int n_vocab = llama_n_vocab(lctx); - vocab.id_to_token.resize(n_vocab); - for (int i=0; i train_tokens; if (tokenize_file(lctx, params.fn_train_data, train_tokens) < 0) { @@ -2433,10 +2584,11 @@ int main(int argc, char ** argv) { model.hparams.n_vocab = llama_n_vocab(lctx); model.hparams.n_ctx = params.n_ctx; model.hparams.n_embd = params.n_embd; - model.hparams.n_mult = params.n_mult; model.hparams.n_head = params.n_head; model.hparams.n_layer = params.n_layer; - model.hparams.n_rot = std::min((uint32_t)params.n_rotmax, model.hparams.n_embd / model.hparams.n_head); + 
model.hparams.n_ff = params.n_ff; + // llama.cpp requires n_rot to be exactly n_embd / n_head + model.hparams.n_rot = model.hparams.n_embd / model.hparams.n_head; print_params(&model.hparams); @@ -2701,7 +2853,7 @@ int main(int argc, char ** argv) { } if (strlen(params.fn_model_out) > 0) { - save_as_llama_model(&vocab, &model, params.fn_model_out); + save_as_llama_model(params.fn_vocab_model, &model, params.fn_model_out); } { From 540798132b3b7b2186ceb4134cc578d8097e6b3d Mon Sep 17 00:00:00 2001 From: xaedes Date: Thu, 24 Aug 2023 21:57:16 +0200 Subject: [PATCH 067/100] implement loading/saving of checkpointing files using GGUF --- .../train-text-from-scratch.cpp | 846 +++++++----------- 1 file changed, 347 insertions(+), 499 deletions(-) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index 9b12991bfe9a0..e4304126de487 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -235,6 +235,84 @@ struct my_llama_model { uint32_t train_tokens = 0; }; + +// gguf constants +const char * LLM_KV_OPTIMIZER_TYPE = "optimizer.type"; +const char * LLM_KV_OPTIMIZER_TYPE_ADAM = "adam"; +const char * LLM_KV_OPTIMIZER_TYPE_LBFGS = "lbfgs"; +const char * LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT = "optimizer.convergence_past_count"; +const char * LLM_KV_OPTIMIZER_PARAMETER_COUNT_LOW = "optimizer.parameter_count.low"; +const char * LLM_KV_OPTIMIZER_PARAMETER_COUNT_HIGH = "optimizer.parameter_count.high"; +const char * LLM_KV_OPTIMIZER_ITERATION_COUNT = "optimizer.iteration_count"; +const char * LLM_KV_OPTIMIZER_ADAM_BEST_LOSS = "optimizer.adam.best_loss"; +const char * LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS = "optimizer.adam.previous_loss"; +const char * LLM_KV_OPTIMIZER_ADAM_NO_IMPROVEMENT_COUNT = "optimizer.adam.no_improvement_count"; +const char * LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT = "optimizer.lbfgs.approx_hessian_count"; +const char * LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS = "optimizer.lbfgs.best_loss"; +const char * LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP = "optimizer.lbfgs.line_search_step"; +const char * LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J = "optimizer.lbfgs.line_search_j"; +const char * LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K = "optimizer.lbfgs.line_search_k"; +const char * LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END = "optimizer.lbfgs.line_search_end"; +const char * LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT = "optimizer.lbfgs.no_improvement_count"; + +const char * LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS = "optimizer.adam.first_moments"; +const char * LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS = "optimizer.adam.second_moments"; +const char * LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES = "optimizer.adam.past_loss_values"; + +const char * LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS = "optimizer.lbfgs.current_parameters"; +const char * LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS = "optimizer.lbfgs.previous_parameters"; +const char * LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS = "optimizer.lbfgs.current_gradients"; +const char * LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS = "optimizer.lbfgs.previous_gradients"; +const char * LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION = "optimizer.lbfgs.search_direction"; +const char * LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES = "optimizer.lbfgs.past_loss_values"; +const char * LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA = "optimizer.lbfgs.memory_alpha"; +const char * 
LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS = "optimizer.lbfgs.memory_ys"; +const char * LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S = "optimizer.lbfgs.memory_s"; +const char * LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y = "optimizer.lbfgs.memory_y"; + +const char * LLM_KV_TRAINING_ITERATION_COUNT = "training.iteration_count"; +const char * LLM_KV_TRAINING_SAMPLE_COUNT = "training.sample_count"; +const char * LLM_KV_TRAINING_TOKEN_COUNT = "training.token_count"; + +// gguf constants (sync with gguf.py) + +const char * LLM_KV_GENERAL_ARCHITECTURE = "general.architecture"; +const char * LLM_KV_GENERAL_FILE_TYPE = "general.file_type"; + +const char * LLM_KV_CONTEXT_LENGTH = "%s.context_length"; +const char * LLM_KV_EMBEDDING_LENGTH = "%s.embedding_length"; +const char * LLM_KV_BLOCK_COUNT = "%s.block_count"; +const char * LLM_KV_FEED_FORWARD_LENGTH = "%s.feed_forward_length"; +const char * LLM_KV_ATTENTION_HEAD_COUNT = "%s.attention.head_count"; +const char * LLM_KV_ATTENTION_LAYERNORM_RMS_EPS = "%s.attention.layer_norm_rms_epsilon"; +const char * LLM_KV_ROPE_DIMENSION_COUNT = "%s.rope.dimension_count"; +const char * LLM_KV_ROPE_FREQ_BASE = "%s.rope.freq_base"; // TODO load in llama.cpp +const char * LLM_KV_ROPE_SCALE_LINEAR = "%s.rope.scale_linear"; + +const char * LLM_KV_TOKENIZER_MODEL = "tokenizer.ggml.model"; +const char * LLM_KV_TOKENIZER_LIST = "tokenizer.ggml.tokens"; +const char * LLM_KV_TOKENIZER_TOKEN_TYPE = "tokenizer.ggml.token_type"; +const char * LLM_KV_TOKENIZER_SCORES = "tokenizer.ggml.scores"; +const char * LLM_KV_TOKENIZER_MERGES = "tokenizer.ggml.merges"; +const char * LLM_KV_TOKENIZER_BOS_ID = "tokenizer.ggml.bos_token_id"; +const char * LLM_KV_TOKENIZER_EOS_ID = "tokenizer.ggml.eos_token_id"; +const char * LLM_KV_TOKENIZER_UNK_ID = "tokenizer.ggml.unknown_token_id"; +const char * LLM_KV_TOKENIZER_SEP_ID = "tokenizer.ggml.seperator_token_id"; +const char * LLM_KV_TOKENIZER_PAD_ID = "tokenizer.ggml.padding_token_id"; + +const char * LLM_TENSOR_TOKEN_EMBD = "token_embd"; +const char * LLM_TENSOR_OUTPUT_NORM = "output_norm"; +const char * LLM_TENSOR_OUTPUT = "output"; +const char * LLM_TENSOR_ATTN_NORM = "blk.%d.attn_norm"; +const char * LLM_TENSOR_ATTN_Q = "blk.%d.attn_q"; +const char * LLM_TENSOR_ATTN_K = "blk.%d.attn_k"; +const char * LLM_TENSOR_ATTN_V = "blk.%d.attn_v"; +const char * LLM_TENSOR_ATTN_OUT = "blk.%d.attn_output"; +const char * LLM_TENSOR_FFN_NORM = "blk.%d.ffn_norm"; +const char * LLM_TENSOR_FFN_GATE = "blk.%d.ffn_gate"; +const char * LLM_TENSOR_FFN_DOWN = "blk.%d.ffn_down"; +const char * LLM_TENSOR_FFN_UP = "blk.%d.ffn_up"; + void print_params(struct my_llama_hparams * params) { printf("%s: n_vocab: %d\n", __func__, params->n_vocab); printf("%s: n_ctx: %d\n", __func__, params->n_ctx); @@ -261,21 +339,6 @@ void init_model(struct my_llama_model * model) { const char * arch = "llama"; - // gguf constants (sync with gguf.py) - - const char * LLM_TENSOR_TOKEN_EMBD = "token_embd"; - const char * LLM_TENSOR_OUTPUT_NORM = "output_norm"; - const char * LLM_TENSOR_OUTPUT = "output"; - const char * LLM_TENSOR_ATTN_NORM = "blk.%d.attn_norm"; - const char * LLM_TENSOR_ATTN_Q = "blk.%d.attn_q"; - const char * LLM_TENSOR_ATTN_K = "blk.%d.attn_k"; - const char * LLM_TENSOR_ATTN_V = "blk.%d.attn_v"; - const char * LLM_TENSOR_ATTN_OUT = "blk.%d.attn_output"; - const char * LLM_TENSOR_FFN_NORM = "blk.%d.ffn_norm"; - const char * LLM_TENSOR_FFN_GATE = "blk.%d.ffn_gate"; - const char * LLM_TENSOR_FFN_DOWN = "blk.%d.ffn_down"; - const char * LLM_TENSOR_FFN_UP = "blk.%d.ffn_up"; - std::vector tn_buf; 
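// --- editor's hedged sketch, not part of the patch above: the GGUF key and tensor-name
// constants being introduced here are printf-style templates ("%s.context_length",
// "blk.%d.attn_q"), expanded with the architecture string or block index and, for
// tensors, suffixed with ".weight" (as the kv/tn/tni lambdas in the diff do). The
// helper names below (expand_kv, expand_tensor) are illustrative only.
// #include <cstdio>
// #include <string>
//
// static std::string expand_kv(const char * fmt, const char * arch) {
//     char buf[256];
//     std::snprintf(buf, sizeof(buf), fmt, arch);   // "%s.context_length" -> "llama.context_length"
//     return buf;
// }
//
// static std::string expand_tensor(const char * fmt, int block) {
//     char buf[256];
//     std::snprintf(buf, sizeof(buf), fmt, block);  // "blk.%d.attn_q" -> "blk.0.attn_q"
//     return std::string(buf) + ".weight";          // -> "blk.0.attn_q.weight"
// }
//
// int main() {
//     std::printf("%s\n", expand_kv("%s.context_length", "llama").c_str());
//     std::printf("%s\n", expand_tensor("blk.%d.attn_q", 0).c_str());
//     return 0;
// }
// --- end of editor's sketch ---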
tn_buf.resize(GGML_MAX_NAME); auto tn = [arch, &tn_buf](const char * key) -> const char * { @@ -1216,89 +1279,6 @@ static std::string format(const char * fmt, ...) { return std::string(buf.data(), size); } -struct llama_file { - // use FILE * so we don't have to re-open the file to mmap - FILE * fp; - size_t size; - - llama_file(const char * fname, const char * mode) { - fp = std::fopen(fname, mode); - if (fp == NULL) { - size = 0; - } else { - seek(0, SEEK_END); - size = tell(); - seek(0, SEEK_SET); - } - } - - size_t tell() const { -#ifdef _WIN32 - __int64 ret = _ftelli64(fp); -#else - long ret = std::ftell(fp); -#endif - GGML_ASSERT(ret != -1); // this really shouldn't fail - return (size_t) ret; - } - - void seek(size_t offset, int whence) { -#ifdef _WIN32 - int ret = _fseeki64(fp, (__int64) offset, whence); -#else - int ret = std::fseek(fp, (long) offset, whence); -#endif - GGML_ASSERT(ret == 0); // same - } - - void read_raw(void * ptr, size_t size) { - if (size == 0) { - return; - } - errno = 0; - std::size_t ret = std::fread(ptr, size, 1, fp); - if (ferror(fp)) { - throw std::runtime_error(format("read error: %s", strerror(errno))); - } - if (ret != 1) { - throw std::runtime_error(std::string("unexpectedly reached end of file")); - } - } - - std::uint32_t read_u32() { - std::uint32_t ret; - read_raw(&ret, sizeof(ret)); - return ret; - } - - std::string read_string(std::uint32_t len) { - std::vector chars(len); - read_raw(chars.data(), len); - return std::string(chars.data(), len); - } - - void write_raw(const void * ptr, size_t size) { - if (size == 0) { - return; - } - errno = 0; - size_t ret = std::fwrite(ptr, size, 1, fp); - if (ret != 1) { - throw std::runtime_error(format("write error: %s", strerror(errno))); - } - } - - void write_u32(std::uint32_t val) { - write_raw(&val, sizeof(val)); - } - - ~llama_file() { - if (fp) { - std::fclose(fp); - } - } -}; - int tokenize_file(struct llama_context * lctx, const char * filename, std::vector& out) { struct llama_file f(filename, "rb"); @@ -1474,371 +1454,6 @@ void set_logits_masked(struct ggml_tensor * logits, std::vector& mask, flo } } -// void write_tensor(struct llama_file * file, struct ggml_tensor * tensor) { -// if (tensor == NULL) { -// file->write_u32(0); -// file->write_u32(0); -// file->write_u32(GGML_TYPE_F32); -// file->seek((0-file->tell()) & 31, SEEK_CUR); -// return; -// } -// const char * name = ggml_get_name(tensor); -// uint32_t name_len = strlen(name); -// uint32_t nd = tensor->n_dims; -// uint32_t ne[4] = { (uint32_t)tensor->ne[0], -// (uint32_t)tensor->ne[1], -// (uint32_t)tensor->ne[2], -// (uint32_t)tensor->ne[3] }; -// file->write_u32(nd); -// file->write_u32(name_len); -// file->write_u32(tensor->type); -// file->write_raw(ne, sizeof(ne[0]) * nd); -// file->write_raw(name, name_len); -// file->seek((0-file->tell()) & 31, SEEK_CUR); -// file->write_raw(tensor->data, ggml_nbytes(tensor)); -// } - -// void read_tensor(struct llama_file * file, struct ggml_tensor * tensor) { -// int32_t nd = file->read_u32(); -// GGML_ASSERT(nd == tensor->n_dims); - -// uint32_t name_len = file->read_u32(); -// enum ggml_type type = (enum ggml_type) file->read_u32(); -// GGML_ASSERT(type == tensor->type); - -// uint32_t ne[4]; -// file->read_raw(ne, sizeof(ne[0]) * nd); -// for (int i=0; ine[i]); -// } - -// std::string name = file->read_string(name_len); -// GGML_ASSERT(strncmp(ggml_get_name(tensor), name.c_str(), sizeof(tensor->name)-1) == 0); - -// file->seek((0-file->tell()) & 31, SEEK_CUR); -// file->read_raw(tensor->data, 
ggml_nbytes(tensor)); -// } - -// void skip_tensor(struct llama_file * file) { -// int32_t nd = file->read_u32(); - -// uint32_t name_len = file->read_u32(); -// enum ggml_type type = (enum ggml_type) file->read_u32(); - -// uint32_t ne[4] = { 1, 1, 1, 1 }; - -// file->read_raw(ne, sizeof(ne[0]) * nd); - -// std::string name = file->read_string(name_len); - -// file->seek(-file->tell() & 31, SEEK_CUR); - -// size_t nelements = ne[0]*ne[1]*ne[2]*ne[3]; -// size_t nbytes = nelements*ggml_type_size(type)/ggml_blck_size(type); -// file->seek(nbytes, SEEK_CUR); -// } - -void write_opt_context(struct llama_file * file, struct ggml_opt_context * opt) { -#pragma message("TODO: implement file saving using gguf: write_opt_context") - // const uint32_t version = 1; - // GGML_ASSERT(opt->nx >= 0); - // GGML_ASSERT(opt->iter >= 0); - // file->write_u32(version); - // file->write_u32(opt->params.past); - // file->write_u32(opt->params.lbfgs.m); - // file->write_raw(&opt->nx, sizeof(opt->nx)); - // file->write_raw(&opt->iter, sizeof(opt->iter)); - // file->write_u32((uint32_t) opt->just_initialized); - // switch (opt->params.type) { - // case GGML_OPT_ADAM: - // { - // GGML_ASSERT(opt->adam.m != NULL); - // GGML_ASSERT(opt->adam.v != NULL); - // write_tensor(file, opt->adam.m); - // write_tensor(file, opt->adam.v); - // write_tensor(file, opt->adam.pf); - // file->write_raw(&opt->adam.fx_best, sizeof(opt->adam.fx_best)); - // file->write_raw(&opt->adam.fx_prev, sizeof(opt->adam.fx_prev)); - // file->write_raw(&opt->adam.n_no_improvement, sizeof(opt->adam.n_no_improvement)); - // } break; - // case GGML_OPT_LBFGS: - // { - // GGML_ASSERT(opt->lbfgs.x != NULL); - // write_tensor(file, opt->lbfgs.x); - // write_tensor(file, opt->lbfgs.xp); - // write_tensor(file, opt->lbfgs.g); - // write_tensor(file, opt->lbfgs.gp); - // write_tensor(file, opt->lbfgs.d); - // write_tensor(file, opt->lbfgs.pf); - // write_tensor(file, opt->lbfgs.lmal); - // write_tensor(file, opt->lbfgs.lmys); - // write_tensor(file, opt->lbfgs.lms); - // write_tensor(file, opt->lbfgs.lmy); - // file->write_raw(&opt->lbfgs.fx_best, sizeof(opt->lbfgs.fx_best)); - // file->write_raw(&opt->lbfgs.step, sizeof(opt->lbfgs.step)); - // file->write_raw(&opt->lbfgs.j, sizeof(opt->lbfgs.j)); - // file->write_raw(&opt->lbfgs.k, sizeof(opt->lbfgs.k)); - // file->write_raw(&opt->lbfgs.end, sizeof(opt->lbfgs.end)); - // file->write_raw(&opt->lbfgs.n_no_improvement, sizeof(opt->lbfgs.n_no_improvement)); - // } break; - // } -} - -// struct ggml_opt_params_v0 { -// enum ggml_opt_type type; -// int n_threads; -// int past; -// float delta; -// int max_no_improvement; -// bool print_forward_graph; -// bool print_backward_graph; -// struct { -// int n_iter; -// float sched; -// float decay; -// float alpha; -// float beta1; -// float beta2; -// float eps; -// float eps_f; -// float eps_g; -// } adam; -// struct { -// int m; -// int n_iter; -// int max_linesearch; -// float eps; -// float ftol; -// float wolfe; -// float min_step; -// float max_step; -// enum ggml_linesearch linesearch; -// } lbfgs; -// }; - -// void read_opt_context_v0(struct llama_file * file, struct ggml_context * ctx, struct ggml_opt_context * opt) { -// ggml_opt_params_v0 pv0; -// file->read_raw(&pv0, sizeof(pv0)); -// opt->params.past = pv0.past; -// opt->params.lbfgs.m = pv0.lbfgs.m; -// file->read_raw(&opt->nx, sizeof(opt->nx)); -// ggml_opt_init(ctx, opt, opt->params, opt->nx); - -// file->read_raw(&opt->iter, sizeof(opt->iter)); -// opt->just_initialized = (bool) file->read_u32(); - 
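// --- editor's hedged sketch, not part of the patch above: the legacy reader/writer
// being removed here aligns tensor data to 32-byte boundaries with expressions of the
// form `(0 - file->tell()) & 31` / `-file->tell() & 31`, i.e. the number of padding
// bytes needed to reach the next multiple of 32. A self-contained illustration of
// that bit trick (pad_to_32 is a hypothetical helper name):
// #include <cassert>
// #include <cstddef>
//
// static size_t pad_to_32(size_t offset) {
//     return (0 - offset) & 31u;   // 0 -> 0, 1 -> 31, 31 -> 1, 32 -> 0, ...
// }
//
// int main() {
//     assert(pad_to_32(0)  == 0);
//     assert(pad_to_32(1)  == 31);
//     assert(pad_to_32(31) == 1);
//     assert(pad_to_32(32) == 0);
//     assert((5 + pad_to_32(5)) % 32 == 0);   // offset + padding lands on a 32-byte boundary
//     return 0;
// }
// --- end of editor's sketch ---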
-// switch (opt->params.type) { -// case GGML_OPT_ADAM: -// { -// skip_tensor(file); -// skip_tensor(file); -// skip_tensor(file); -// read_tensor(file, opt->adam.m); -// read_tensor(file, opt->adam.v); -// skip_tensor(file); -// skip_tensor(file); -// if (opt->adam.pf) { read_tensor(file, opt->adam.pf); } -// file->read_raw(&opt->adam.fx_best, sizeof(opt->adam.fx_best)); -// file->read_raw(&opt->adam.fx_prev, sizeof(opt->adam.fx_prev)); -// file->read_raw(&opt->adam.n_no_improvement, sizeof(opt->adam.n_no_improvement)); -// } break; -// case GGML_OPT_LBFGS: -// { -// GGML_ASSERT(opt->lbfgs.x != NULL); -// read_tensor(file, opt->lbfgs.x); -// read_tensor(file, opt->lbfgs.xp); -// read_tensor(file, opt->lbfgs.g); -// read_tensor(file, opt->lbfgs.gp); -// read_tensor(file, opt->lbfgs.d); -// if (opt->lbfgs.pf) { read_tensor(file, opt->lbfgs.pf); } -// read_tensor(file, opt->lbfgs.lmal); -// read_tensor(file, opt->lbfgs.lmys); -// read_tensor(file, opt->lbfgs.lms); -// read_tensor(file, opt->lbfgs.lmy); -// file->read_raw(&opt->lbfgs.fx_best, sizeof(opt->lbfgs.fx_best)); -// file->read_raw(&opt->lbfgs.step, sizeof(opt->lbfgs.step)); -// file->read_raw(&opt->lbfgs.j, sizeof(opt->lbfgs.j)); -// file->read_raw(&opt->lbfgs.k, sizeof(opt->lbfgs.k)); -// file->read_raw(&opt->lbfgs.end, sizeof(opt->lbfgs.end)); -// file->read_raw(&opt->lbfgs.n_no_improvement, sizeof(opt->lbfgs.n_no_improvement)); -// } break; -// } -// } - -// void read_opt_context_v1(struct llama_file * file, struct ggml_context * ctx, struct ggml_opt_context * opt) { -// opt->params.past = (int) file->read_u32(); -// opt->params.lbfgs.m = (int) file->read_u32(); -// file->read_raw(&opt->nx, sizeof(opt->nx)); -// ggml_opt_init(ctx, opt, opt->params, opt->nx); - -// file->read_raw(&opt->iter, sizeof(opt->iter)); -// opt->just_initialized = (bool) file->read_u32(); - -// switch (opt->params.type) { -// case GGML_OPT_ADAM: -// { -// read_tensor(file, opt->adam.m); -// read_tensor(file, opt->adam.v); -// if (opt->adam.pf) { read_tensor(file, opt->adam.pf); } -// file->read_raw(&opt->adam.fx_best, sizeof(opt->adam.fx_best)); -// file->read_raw(&opt->adam.fx_prev, sizeof(opt->adam.fx_prev)); -// file->read_raw(&opt->adam.n_no_improvement, sizeof(opt->adam.n_no_improvement)); -// } break; -// case GGML_OPT_LBFGS: -// { -// GGML_ASSERT(opt->lbfgs.x != NULL); -// read_tensor(file, opt->lbfgs.x); -// read_tensor(file, opt->lbfgs.xp); -// read_tensor(file, opt->lbfgs.g); -// read_tensor(file, opt->lbfgs.gp); -// read_tensor(file, opt->lbfgs.d); -// if (opt->lbfgs.pf) { read_tensor(file, opt->lbfgs.pf); } -// read_tensor(file, opt->lbfgs.lmal); -// read_tensor(file, opt->lbfgs.lmys); -// read_tensor(file, opt->lbfgs.lms); -// read_tensor(file, opt->lbfgs.lmy); -// file->read_raw(&opt->lbfgs.fx_best, sizeof(opt->lbfgs.fx_best)); -// file->read_raw(&opt->lbfgs.step, sizeof(opt->lbfgs.step)); -// file->read_raw(&opt->lbfgs.j, sizeof(opt->lbfgs.j)); -// file->read_raw(&opt->lbfgs.k, sizeof(opt->lbfgs.k)); -// file->read_raw(&opt->lbfgs.end, sizeof(opt->lbfgs.end)); -// file->read_raw(&opt->lbfgs.n_no_improvement, sizeof(opt->lbfgs.n_no_improvement)); -// } break; -// } -// } - -void read_opt_context(struct llama_file * file, struct ggml_context * ctx, struct ggml_opt_context * opt) { -#pragma message("TODO: implement file loading using gguf: read_opt_context") - // uint32_t version = file->read_u32(); - // printf("%s: opt context version %u\n", __func__, version); - // switch (version) { - // case 0: - // { - // read_opt_context_v0(file, ctx, 
opt); - // } break; - // case 1: - // { - // read_opt_context_v1(file, ctx, opt); - // } break; - // default: - // { - // fprintf(stderr, "%s: unknown version %u\n", __func__, version); - // } - // } -} - -void save_checkpoint(struct my_llama_model * model, struct ggml_opt_context * opt, const char * filename) { -#pragma message("TODO: implement file saving using gguf: save_checkpoint") - // struct llama_file file(filename, "wb"); - // if (file.fp == NULL) { - // return; - // } - - // const uint32_t magic = 'ggcp'; - // const uint32_t version = 0; - - // file.write_u32(magic); - // file.write_u32(version); - // file.write_u32(model->train_its); - // file.write_u32(model->train_samples); - // file.write_u32(model->train_tokens); - // file.write_u32(model->hparams.n_vocab); - // file.write_u32(model->hparams.n_embd); - // // file.write_u32(model->hparams.n_mult); - // file.write_u32(model->hparams.n_head); - // file.write_u32(model->hparams.n_layer); - // file.write_u32(model->hparams.n_rot); - - // write_tensor(&file, model->tok_embeddings); - // write_tensor(&file, model->norm); - // write_tensor(&file, model->output); - - // for (uint32_t i = 0; i < model->hparams.n_layer; ++i) { - // auto & layer = model->layers[i]; - - // write_tensor(&file, layer.attention_norm); - // write_tensor(&file, layer.wq); - // write_tensor(&file, layer.wk); - // write_tensor(&file, layer.wv); - // write_tensor(&file, layer.wo); - // write_tensor(&file, layer.ffn_norm); - // write_tensor(&file, layer.w1); - // write_tensor(&file, layer.w2); - // write_tensor(&file, layer.w3); - // } - - // write_opt_context(&file, opt); -} - -bool load_checkpoint(struct my_llama_model * model, struct ggml_opt_context * opt, const char * filename, bool init) { -#pragma message("TODO: implement file loading using gguf: load_checkpoint") - return false; - // struct llama_file file(filename, "rb"); - - // uint32_t magic; - // uint32_t version; - - // uint32_t train_its = 0; - // uint32_t train_samples = 0; - // uint32_t train_tokens = 0; - - // if (file.fp) { - // printf("%s: Loading model from '%s'.\n", __func__, filename); - // magic = file.read_u32(); - // GGML_ASSERT(magic == 'ggcp'); - // version = file.read_u32(); - // GGML_ASSERT(version == 0); - // train_its = file.read_u32(); - // train_samples = file.read_u32(); - // train_tokens = file.read_u32(); - // model->hparams.n_vocab = file.read_u32(); - // model->hparams.n_embd = file.read_u32(); - // // model->hparams.n_mult = file.read_u32(); - // model->hparams.n_head = file.read_u32(); - // model->hparams.n_layer = file.read_u32(); - // model->hparams.n_rot = file.read_u32(); - // print_params(&model->hparams); - // } - - // if (init) { - // init_model(model); - // } - - // if (file.fp) { - // model->train_its = train_its; - // model->train_samples = train_samples; - // model->train_tokens = train_tokens; - // } - - // printf("%s: Training iterations: %u.\n", __func__, model->train_its); - // printf("%s: Training samples: %u.\n", __func__, model->train_samples); - // printf("%s: Training tokens: %u.\n", __func__, model->train_tokens); - - // if (file.fp) { - // read_tensor(&file, model->tok_embeddings); - // read_tensor(&file, model->norm); - // read_tensor(&file, model->output); - - // for (uint32_t i = 0; i < model->hparams.n_layer; ++i) { - // auto & layer = model->layers[i]; - - // read_tensor(&file, layer.attention_norm); - // read_tensor(&file, layer.wq); - // read_tensor(&file, layer.wk); - // read_tensor(&file, layer.wv); - // read_tensor(&file, layer.wo); - // 
read_tensor(&file, layer.ffn_norm); - // read_tensor(&file, layer.w1); - // read_tensor(&file, layer.w2); - // read_tensor(&file, layer.w3); - // } - - // read_opt_context(&file, model->ctx, opt); - // } - - // return (file.fp != NULL); -} - #define GGUF_GET_KEY(ctx, dst, func, type, req, key) \ { \ const std::string skey(key); \ @@ -1854,12 +1469,221 @@ bool load_checkpoint(struct my_llama_model * model, struct ggml_opt_context * op } \ } -void save_as_llama_model(const char * fn_vocab_model, struct my_llama_model * model, const char * filename) { - struct llama_file file(filename, "wb"); - if (file.fp == NULL) { + +bool are_same_layout(struct ggml_tensor * a, struct ggml_tensor * b) { + GGML_ASSERT(a != NULL); + GGML_ASSERT(b != NULL); + GGML_ASSERT(a->type == b->type); + GGML_ASSERT(ggml_are_same_shape(a, b)); + GGML_ASSERT(ggml_is_contiguous(a) && ggml_is_contiguous(b)); +} + +void read_tensor_by_name(struct ggml_tensor * dst, struct ggml_context * ctx, const char * name) { + if (dst == NULL) { return; } + struct ggml_tensor * t = ggml_get_tensor(f_ggml_ctx, LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS); + GGML_ASSERT(are_same_layout(dst, t)); + memcpy(dst->data, t->data, ggml_nbytes(t)); +} + +void load_opt_context_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct ggml_opt_context * opt) { + // NOTE: gguf_context must be initialized with f_ggml_ctx and no_alloc=false, otherwise tensor data can not be read + + GGUF_GET_KEY(fctx, opt->params.past, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT); + GGUF_GET_KEY(fctx, opt->iter, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_OPTIMIZER_ITERATION_COUNT); + + // gguf v1 only supports values with up to 32-bit precision + uint32_t nx[2] = { 0, 0 }; + GGUF_GET_KEY(fctx, nx[0], gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_OPTIMIZER_PARAMETER_COUNT_LOW); + GGUF_GET_KEY(fctx, nx[1], gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_OPTIMIZER_PARAMETER_COUNT_HIGH); + memcpy(&opt->nx, &nx[0], sizeof(opt->nx)); + // TODO read as 64-bit uint + + // don't call ggml_opt_init until optimizer type and optimizer specific parameters are know + + std::string opt_type; + GGUF_GET_KEY(fctx, opt_type, gguf_get_arr_str, GGUF_TYPE_STRING, true, LLM_KV_OPTIMIZER_TYPE); + if (opt_type == LLM_KV_OPTIMIZER_TYPE_ADAM) { + opt->params.type = GGML_OPT_ADAM; + + GGUF_GET_KEY(fctx, opt->adam.fx_best, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, LLM_KV_OPTIMIZER_ADAM_BEST_LOSS); + GGUF_GET_KEY(fctx, opt->adam.fx_prev, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS); + GGUF_GET_KEY(fctx, opt->adam.n_no_improvement, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_OPTIMIZER_ADAM_NO_IMPROVEMENT_COUNT); + + GGML_ASSERT(opt->ctx != NULL); + ggml_opt_init(opt->ctx, opt, opt->params, opt->nx); + + read_tensor_by_name(opt->adam.m, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS) + read_tensor_by_name(opt->adam.v, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS) + read_tensor_by_name(opt->adam.pf, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS) + } else if (opt_type == LLM_KV_OPTIMIZER_TYPE_LBFGS) { + opt->params.type = GGML_OPT_LBFGS; + + GGUF_GET_KEY(fctx, opt->params.lbfgs.m, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT); + GGUF_GET_KEY(fctx, opt->lbfgs.fx_best, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS); + GGUF_GET_KEY(fctx, opt->lbfgs.step, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, 
LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP); + GGUF_GET_KEY(fctx, opt->lbfgs.j, gguf_get_val_i32, GGUF_TYPE_INT32, true, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J); + GGUF_GET_KEY(fctx, opt->lbfgs.k, gguf_get_val_i32, GGUF_TYPE_INT32, true, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K); + GGUF_GET_KEY(fctx, opt->lbfgs.end, gguf_get_val_i32, GGUF_TYPE_INT32, true, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END); + GGUF_GET_KEY(fctx, opt->lbfgs.n_no_improvement, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT); + + GGML_ASSERT(opt->ctx != NULL); + ggml_opt_init(opt->ctx, opt, opt->params, opt->nx); + + read_tensor_by_name(opt->lbfgs.x, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS); + read_tensor_by_name(opt->lbfgs.xp, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS); + read_tensor_by_name(opt->lbfgs.g, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS); + read_tensor_by_name(opt->lbfgs.gp, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS); + read_tensor_by_name(opt->lbfgs.d, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION); + read_tensor_by_name(opt->lbfgs.pf, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES); + read_tensor_by_name(opt->lbfgs.lmal, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA); + read_tensor_by_name(opt->lbfgs.lmys, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS); + read_tensor_by_name(opt->lbfgs.lms, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S); + read_tensor_by_name(opt->lbfgs.lmy, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y); + } else { + throw std::runtime_error("unknown optimizer type\n"); + } +} + +void save_opt_context_gguf(struct gguf_context * fctx, struct ggml_opt_context * opt) { + gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT, opt->params.past); + // gguf v1 only supports values with up to 32-bit precision, + uint32_t nx[2] = { 0, 0 }; + memcpy(&nx[0], &opt->nx, sizeof(opt->nx)); + gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_PARAMETER_COUNT_LOW, nx[0]); + gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_PARAMETER_COUNT_HIGH, nx[1]); + // TODO set as 64-bit uint + + gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_ITERATION_COUNT, opt->iter); + + switch (opt->params.type) { + case GGML_OPT_ADAM: + { + gguf_set_val_str(fctx, LLM_KV_OPTIMIZER_TYPE, LLM_KV_OPTIMIZER_TYPE_ADAM); + gguf_set_val_f32(fctx, LLM_KV_OPTIMIZER_ADAM_BEST_LOSS, opt->adam.fx_best); + gguf_set_val_f32(fctx, LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS, opt->adam.fx_prev); + gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_ADAM_NO_IMPROVEMENT_COUNT, opt->adam.n_no_improvement); + + ggml_set_name(opt->adam.m, LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS); + ggml_set_name(opt->adam.v, LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS); + if (opt->adam.pf) { + ggml_set_name(pf, LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES); + } + + gguf_add_tensor(fctx, opt->adam.m); + gguf_add_tensor(fctx, opt->adam.v); + if (opt->adam.pf) { + gguf_add_tensor(fctx, opt->adam.pf); + } + } break; + case GGML_OPT_LBFGS: + { + gguf_set_val_str(fctx, LLM_KV_OPTIMIZER_TYPE, LLM_KV_OPTIMIZER_TYPE_LBFGS); + gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT, opt->params.lbfgs.m); + gguf_set_val_f32(fctx, LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS, opt->lbfgs.fx_best); + gguf_set_val_f32(fctx, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP, opt->lbfgs.step); + gguf_set_val_i32(fctx, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J, opt->lbfgs.j); + gguf_set_val_i32(fctx, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K, opt->lbfgs.k); + gguf_set_val_i32(fctx, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END, 
opt->lbfgs.end); + gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT, opt->lbfgs.n_no_improvement); + + ggml_set_name(opt->lbfgs.x, LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS); + ggml_set_name(opt->lbfgs.xp, LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS); + ggml_set_name(opt->lbfgs.g, LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS); + ggml_set_name(opt->lbfgs.gp, LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS); + ggml_set_name(opt->lbfgs.d, LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION); + if (opt->lbfgs.pf) { + ggml_set_name(pf, LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES); + } + ggml_set_name(opt->lbfgs.lmal, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA); + ggml_set_name(opt->lbfgs.lmys, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS); + ggml_set_name(opt->lbfgs.lms, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S); + ggml_set_name(opt->lbfgs.lmy, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y); + + gguf_add_tensor(fctx, opt->lbfgs.x); + gguf_add_tensor(fctx, opt->lbfgs.xp); + gguf_add_tensor(fctx, opt->lbfgs.g); + gguf_add_tensor(fctx, opt->lbfgs.gp); + gguf_add_tensor(fctx, opt->lbfgs.d); + if (opt->lbfgs.pf) { + gguf_add_tensor(fctx, opt->lbfgs.pf); + } + gguf_add_tensor(fctx, opt->lbfgs.lmal); + gguf_add_tensor(fctx, opt->lbfgs.lmys); + gguf_add_tensor(fctx, opt->lbfgs.lms); + gguf_add_tensor(fctx, opt->lbfgs.lmy); + } break; + } +} + +void load_llama_model_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct my_llama_model * model) { + // NOTE: gguf_context must be initialized with f_ggml_ctx and no_alloc=false, otherwise tensor data can not be read + std::string arch; + + std::vector keybuf; + keybuf.resize(512); + auto kv = [arch, &keybuf](const char * key) -> const char * { + snprintf(keybuf.data(), keybuf.size(), key, arch.c_str()); + return keybuf.data(); + }; + + std::vector tn_buf; + tn_buf.resize(GGML_MAX_NAME); + auto tn = [arch, &tn_buf](const char * key) -> const char * { + snprintf(tn_buf.data(), tn_buf.size(), "%s.weight", key); + return tn_buf.data(); + }; + auto tni = [arch, &tn_buf](const char * key, int bid) -> const char * { + snprintf(tn_buf.data(), tn_buf.size(), key, bid); + std::string s = tn_buf.data(); + snprintf(tn_buf.data(), tn_buf.size(), "%s.weight", s.c_str()); + return tn_buf.data(); + }; + + GGUF_GET_KEY(fctx, arch, gguf_get_val_str, GGUF_TYPE_STRING, true, LLM_KV_GENERAL_ARCHITECTURE); + GGML_ASSERT(arch == "llama"); + + uint32_t ftype_u; + GGUF_GET_KEY(fctx, ftype_u, gguf_get_val_u32, GGUF_TYPE_U32, true, LLM_KV_GENERAL_FILE_TYPE); + GGML_ASSERT((enum llama_ftype) ftype_u == LLAMA_FTYPE_ALL_F32); + + GGUF_GET_KEY(fctx, model->hparams.n_ctx, gguf_get_val_u32, GGUF_TYPE_U32, true, kv(LLM_KV_CONTEXT_LENGTH)); + GGUF_GET_KEY(fctx, model->hparams.n_embd, gguf_get_val_u32, GGUF_TYPE_U32, true, kv(LLM_KV_EMBEDDING_LENGTH)); + GGUF_GET_KEY(fctx, model->hparams.n_ff, gguf_get_val_u32, GGUF_TYPE_U32, true, kv(LLM_KV_FEED_FORWARD_LENGTH)); + GGUF_GET_KEY(fctx, model->hparams.n_head, gguf_get_val_u32, GGUF_TYPE_U32, true, kv(LLM_KV_ATTENTION_HEAD_COUNT)); + GGUF_GET_KEY(fctx, model->hparams.n_layer, gguf_get_val_u32, GGUF_TYPE_U32, true, kv(LLM_KV_BLOCK_COUNT)); + GGUF_GET_KEY(fctx, model->hparams.n_rot, gguf_get_val_u32, GGUF_TYPE_U32, true, kv(LLM_KV_ROPE_DIMENSION_COUNT)); + + float rope_freq_scale; + GGUF_GET_KEY(fctx, model->hparams.f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_F32, true, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS)); + GGUF_GET_KEY(fctx, model->hparams.rope_freq_base, gguf_get_val_f32, GGUF_TYPE_F32, true, kv(LLM_KV_ROPE_FREQ_BASE)); + 
GGUF_GET_KEY(fctx, rope_freq_scale, gguf_get_val_f32, GGUF_TYPE_F32, true, kv(LLM_KV_ROPE_SCALE_LINEAR)); + model->hparams.rope_freq_scale = 1.0f / rope_freq_scale; + + init_model(model); + + read_tensor_by_name(model->tok_embeddings, f_ggml_ctx, tn(LLM_TENSOR_TOKEN_EMBD)); + read_tensor_by_name(model->norm, f_ggml_ctx, tn(LLM_TENSOR_OUTPUT_NORM)); + read_tensor_by_name(model->output, f_ggml_ctx, tn(LLM_TENSOR_OUTPUT)); + + for (uint32_t i = 0; i < n_layer; ++i) { + auto & layer = model->layers[i]; + + read_tensor_by_name(layer.attention_norm, f_ggml_ctx, tni(LLM_TENSOR_ATTN_NORM, i)); + read_tensor_by_name(layer.wq, f_ggml_ctx, tni(LLM_TENSOR_ATTN_Q, i)); + read_tensor_by_name(layer.wk, f_ggml_ctx, tni(LLM_TENSOR_ATTN_K, i)); + read_tensor_by_name(layer.wv, f_ggml_ctx, tni(LLM_TENSOR_ATTN_V, i)); + read_tensor_by_name(layer.wo, f_ggml_ctx, tni(LLM_TENSOR_ATTN_OUT, i)); + read_tensor_by_name(layer.ffn_norm, f_ggml_ctx, tni(LLM_TENSOR_FFN_NORM, i)); + read_tensor_by_name(layer.w1, f_ggml_ctx, tni(LLM_TENSOR_FFN_GATE, i)); + read_tensor_by_name(layer.w2, f_ggml_ctx, tni(LLM_TENSOR_FFN_DOWN, i)); + read_tensor_by_name(layer.w3, f_ggml_ctx, tni(LLM_TENSOR_FFN_UP, i)); + } +} + +void save_llama_model_gguf(struct gguf_context * fctx, const char * fn_vocab_model, struct my_llama_model * model) { const char * arch = "llama"; enum llama_ftype ftype = LLAMA_FTYPE_ALL_F32; @@ -1870,34 +1694,6 @@ void save_as_llama_model(const char * fn_vocab_model, struct my_llama_model * mo return keybuf.data(); }; - // gguf constants (sync with gguf.py) - - const char * LLM_KV_GENERAL_ARCHITECTURE = "general.architecture"; - const char * LLM_KV_GENERAL_FILE_TYPE = "general.file_type"; - - const char * LLM_KV_CONTEXT_LENGTH = "%s.context_length"; - const char * LLM_KV_EMBEDDING_LENGTH = "%s.embedding_length"; - const char * LLM_KV_BLOCK_COUNT = "%s.block_count"; - const char * LLM_KV_FEED_FORWARD_LENGTH = "%s.feed_forward_length"; - const char * LLM_KV_ATTENTION_HEAD_COUNT = "%s.attention.head_count"; - const char * LLM_KV_ATTENTION_LAYERNORM_RMS_EPS = "%s.attention.layer_norm_rms_epsilon"; - const char * LLM_KV_ROPE_DIMENSION_COUNT = "%s.rope.dimension_count"; - const char * LLM_KV_ROPE_FREQ_BASE = "%s.rope.freq_base"; // TODO load in llama.cpp - const char * LLM_KV_ROPE_SCALE_LINEAR = "%s.rope.scale_linear"; - - const char * LLM_KV_TOKENIZER_MODEL = "tokenizer.ggml.model"; - const char * LLM_KV_TOKENIZER_LIST = "tokenizer.ggml.tokens"; - const char * LLM_KV_TOKENIZER_TOKEN_TYPE = "tokenizer.ggml.token_type"; - const char * LLM_KV_TOKENIZER_SCORES = "tokenizer.ggml.scores"; - const char * LLM_KV_TOKENIZER_MERGES = "tokenizer.ggml.merges"; - const char * LLM_KV_TOKENIZER_BOS_ID = "tokenizer.ggml.bos_token_id"; - const char * LLM_KV_TOKENIZER_EOS_ID = "tokenizer.ggml.eos_token_id"; - const char * LLM_KV_TOKENIZER_UNK_ID = "tokenizer.ggml.unknown_token_id"; - const char * LLM_KV_TOKENIZER_SEP_ID = "tokenizer.ggml.seperator_token_id"; - const char * LLM_KV_TOKENIZER_PAD_ID = "tokenizer.ggml.padding_token_id"; - - struct gguf_context * fctx = gguf_init_empty(); - // set arch gguf_set_val_str(fctx, LLM_KV_GENERAL_ARCHITECTURE, arch); gguf_set_val_u32(fctx, LLM_KV_GENERAL_FILE_TYPE, ftype); @@ -1910,9 +1706,9 @@ void save_as_llama_model(const char * fn_vocab_model, struct my_llama_model * mo gguf_set_val_u32(fctx, kv(LLM_KV_BLOCK_COUNT), model->hparams.n_layer ); gguf_set_val_u32(fctx, kv(LLM_KV_ROPE_DIMENSION_COUNT), model->hparams.n_rot ); - gguf_set_val_u32(fctx, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS), 
model->hparams.f_norm_rms_eps ); - gguf_set_val_u32(fctx, kv(LLM_KV_ROPE_FREQ_BASE), model->hparams.rope_freq_base ); // TODO load in llama.cpp - gguf_set_val_u32(fctx, kv(LLM_KV_ROPE_SCALE_LINEAR), 1.0f / model->hparams.rope_freq_scale ); + gguf_set_val_f32(fctx, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS), model->hparams.f_norm_rms_eps ); + gguf_set_val_f32(fctx, kv(LLM_KV_ROPE_FREQ_BASE), model->hparams.rope_freq_base ); // TODO load in llama.cpp + gguf_set_val_f32(fctx, kv(LLM_KV_ROPE_SCALE_LINEAR), 1.0f / model->hparams.rope_freq_scale ); // set vocab by copying from vocab_model gguf file { @@ -2027,6 +1823,58 @@ void save_as_llama_model(const char * fn_vocab_model, struct my_llama_model * mo gguf_add_tensor(fctx, layer.w2); gguf_add_tensor(fctx, layer.w3); } +} + +void save_llama_model_file(const char * filename, const char * fn_vocab_model, struct my_llama_model * model) { + struct gguf_context * fctx = gguf_init_empty(); + + save_llama_model_gguf(fctx, fn_vocab_model, model); + + // write file + const bool only_meta = false; + gguf_write_to_file(fctx, filename, only_meta); + gguf_free(fctx); +} + +void load_checkpoint_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct my_llama_model * model, struct ggml_opt_context * opt) { + load_llama_model_gguf(fctx, f_ggml_ctx, model); + + GGUF_GET_KEY(fctx, model->train_its, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_ITERATION_COUNT); + GGUF_GET_KEY(fctx, model->train_samples, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_SAMPLE_COUNT); + GGUF_GET_KEY(fctx, model->train_tokens, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_TOKEN_COUNT); + + load_opt_context_gguf(fctx, f_ggml_ctx, opt); +} + +void save_checkpoint_gguf(struct gguf_context * fctx, const char * fn_vocab_model, struct my_llama_model * model, struct ggml_opt_context * opt) { + save_llama_model_gguf(fctx, fn_vocab_model, model); + + gguf_set_val_u32(fctx, LLM_KV_TRAINING_ITERATION_COUNT, model->train_its); + gguf_set_val_u32(fctx, LLM_KV_TRAINING_SAMPLE_COUNT, model->train_samples); + gguf_set_val_u32(fctx, LLM_KV_TRAINING_TOKEN_COUNT, model->train_tokens); + + save_opt_context_gguf(fctx, opt); +} + +bool load_checkpoint_file(const char * filename, struct my_llama_model * model, struct ggml_opt_context * opt) { + struct ggml_context * f_ggml_ctx; + struct gguf_init_params params; + params.no_alloc = false; + params.ctx = &f_ggml_ctx; + struct gguf_context * fctx = gguf_init_from_file(filename, params); + if (fctx == NULL) { + return false; + } + + load_checkpoint_gguf(fctx, f_ggml_ctx, model, opt); + + return true; +} + +void save_checkpoint_file(const char * filename, const char * fn_vocab_model, struct my_llama_model * model, struct ggml_opt_context * opt) { + struct gguf_context * fctx = gguf_init_empty(); + + save_checkpoint_gguf(fctx, fn_vocab_model, model, opt); // write file const bool only_meta = false; @@ -2849,11 +2697,11 @@ int main(int argc, char ** argv) { printf("%s: total training time=%f seconds\n", __func__, dd); if (params.n_examples > 0) { - save_checkpoint(&model, opt, params.fn_checkpoint_out); + save_checkpoint_file(params.fn_checkpoint_out, params.fn_vocab_model, &model, opt); } if (strlen(params.fn_model_out) > 0) { - save_as_llama_model(params.fn_vocab_model, &model, params.fn_model_out); + save_llama_model_file(params.fn_model_out, params.fn_vocab_model, &model); } { From 6a20f7a2f0ef884e9092552d6998ff741c488957 Mon Sep 17 00:00:00 2001 From: xaedes Date: Fri, 25 Aug 2023 22:32:39 +0200 Subject: [PATCH 
068/100] bug fixes --- .../train-text-from-scratch.cpp | 77 +++++++++++++------ 1 file changed, 53 insertions(+), 24 deletions(-) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index e4304126de487..9fed3dd3593e0 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -235,7 +235,6 @@ struct my_llama_model { uint32_t train_tokens = 0; }; - // gguf constants const char * LLM_KV_OPTIMIZER_TYPE = "optimizer.type"; const char * LLM_KV_OPTIMIZER_TYPE_ADAM = "adam"; @@ -1280,13 +1279,43 @@ static std::string format(const char * fmt, ...) { } int tokenize_file(struct llama_context * lctx, const char * filename, std::vector& out) { - struct llama_file f(filename, "rb"); + FILE * fp = std::fopen(filename, "rb"); + if (fp == NULL) { + return 0; + } + +#ifdef _WIN32 + GGML_ASSERT(_fseeki64(fp, (__int64) 0, SEEK_END) == 0); +#else + GGML_ASSERT(std::fseek(fp, (long) 0, SEEK_END) == 0); +#endif + + size_t size = 0; +#ifdef _WIN32 + __int64 ret = _ftelli64(fp); + size = ret; +#else + long ret = std::ftell(fp); + size = ret; +#endif + +#ifdef _WIN32 + GGML_ASSERT(_fseeki64(fp, (__int64) 0, SEEK_SET) == 0); +#else + GGML_ASSERT(std::fseek(fp, (long) 0, SEEK_SET) == 0); +#endif std::vector buf; - buf.resize(f.size+1); + buf.resize(size+1); - f.read_raw(buf.data(), f.size); - buf[f.size] = '\0'; + if (std::fread(buf.data(), size, 1, fp) != 1) { + throw std::runtime_error(std::string("unexpectedly reached end of file")); + } + if (ferror(fp)) { + throw std::runtime_error(format("read error: %s", strerror(errno))); + } + + buf[size] = '\0'; int n_tokens = llama_tokenize(lctx, buf.data(), out.data(), out.size(), false); if (n_tokens < 0) { @@ -1482,7 +1511,7 @@ void read_tensor_by_name(struct ggml_tensor * dst, struct ggml_context * ctx, co if (dst == NULL) { return; } - struct ggml_tensor * t = ggml_get_tensor(f_ggml_ctx, LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS); + struct ggml_tensor * t = ggml_get_tensor(ctx, LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS); GGML_ASSERT(are_same_layout(dst, t)); memcpy(dst->data, t->data, ggml_nbytes(t)); } @@ -1503,7 +1532,7 @@ void load_opt_context_gguf(struct gguf_context * fctx, struct ggml_context * f_g // don't call ggml_opt_init until optimizer type and optimizer specific parameters are know std::string opt_type; - GGUF_GET_KEY(fctx, opt_type, gguf_get_arr_str, GGUF_TYPE_STRING, true, LLM_KV_OPTIMIZER_TYPE); + GGUF_GET_KEY(fctx, opt_type, gguf_get_val_str, GGUF_TYPE_STRING, true, LLM_KV_OPTIMIZER_TYPE); if (opt_type == LLM_KV_OPTIMIZER_TYPE_ADAM) { opt->params.type = GGML_OPT_ADAM; @@ -1514,9 +1543,9 @@ void load_opt_context_gguf(struct gguf_context * fctx, struct ggml_context * f_g GGML_ASSERT(opt->ctx != NULL); ggml_opt_init(opt->ctx, opt, opt->params, opt->nx); - read_tensor_by_name(opt->adam.m, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS) - read_tensor_by_name(opt->adam.v, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS) - read_tensor_by_name(opt->adam.pf, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS) + read_tensor_by_name(opt->adam.m, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS); + read_tensor_by_name(opt->adam.v, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS); + read_tensor_by_name(opt->adam.pf, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS); } else if (opt_type == LLM_KV_OPTIMIZER_TYPE_LBFGS) { opt->params.type = GGML_OPT_LBFGS; @@ -1569,7 +1598,7 @@ void 
save_opt_context_gguf(struct gguf_context * fctx, struct ggml_opt_context * ggml_set_name(opt->adam.m, LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS); ggml_set_name(opt->adam.v, LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS); if (opt->adam.pf) { - ggml_set_name(pf, LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES); + ggml_set_name(opt->adam.pf, LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES); } gguf_add_tensor(fctx, opt->adam.m); @@ -1595,7 +1624,7 @@ void save_opt_context_gguf(struct gguf_context * fctx, struct ggml_opt_context * ggml_set_name(opt->lbfgs.gp, LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS); ggml_set_name(opt->lbfgs.d, LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION); if (opt->lbfgs.pf) { - ggml_set_name(pf, LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES); + ggml_set_name(opt->lbfgs.pf, LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES); } ggml_set_name(opt->lbfgs.lmal, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA); ggml_set_name(opt->lbfgs.lmys, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS); @@ -1646,20 +1675,20 @@ void load_llama_model_gguf(struct gguf_context * fctx, struct ggml_context * f_g GGML_ASSERT(arch == "llama"); uint32_t ftype_u; - GGUF_GET_KEY(fctx, ftype_u, gguf_get_val_u32, GGUF_TYPE_U32, true, LLM_KV_GENERAL_FILE_TYPE); + GGUF_GET_KEY(fctx, ftype_u, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_GENERAL_FILE_TYPE); GGML_ASSERT((enum llama_ftype) ftype_u == LLAMA_FTYPE_ALL_F32); - GGUF_GET_KEY(fctx, model->hparams.n_ctx, gguf_get_val_u32, GGUF_TYPE_U32, true, kv(LLM_KV_CONTEXT_LENGTH)); - GGUF_GET_KEY(fctx, model->hparams.n_embd, gguf_get_val_u32, GGUF_TYPE_U32, true, kv(LLM_KV_EMBEDDING_LENGTH)); - GGUF_GET_KEY(fctx, model->hparams.n_ff, gguf_get_val_u32, GGUF_TYPE_U32, true, kv(LLM_KV_FEED_FORWARD_LENGTH)); - GGUF_GET_KEY(fctx, model->hparams.n_head, gguf_get_val_u32, GGUF_TYPE_U32, true, kv(LLM_KV_ATTENTION_HEAD_COUNT)); - GGUF_GET_KEY(fctx, model->hparams.n_layer, gguf_get_val_u32, GGUF_TYPE_U32, true, kv(LLM_KV_BLOCK_COUNT)); - GGUF_GET_KEY(fctx, model->hparams.n_rot, gguf_get_val_u32, GGUF_TYPE_U32, true, kv(LLM_KV_ROPE_DIMENSION_COUNT)); + GGUF_GET_KEY(fctx, model->hparams.n_ctx, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_CONTEXT_LENGTH)); + GGUF_GET_KEY(fctx, model->hparams.n_embd, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_EMBEDDING_LENGTH)); + GGUF_GET_KEY(fctx, model->hparams.n_ff, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_FEED_FORWARD_LENGTH)); + GGUF_GET_KEY(fctx, model->hparams.n_head, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_ATTENTION_HEAD_COUNT)); + GGUF_GET_KEY(fctx, model->hparams.n_layer, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_BLOCK_COUNT)); + GGUF_GET_KEY(fctx, model->hparams.n_rot, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_ROPE_DIMENSION_COUNT)); float rope_freq_scale; - GGUF_GET_KEY(fctx, model->hparams.f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_F32, true, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS)); - GGUF_GET_KEY(fctx, model->hparams.rope_freq_base, gguf_get_val_f32, GGUF_TYPE_F32, true, kv(LLM_KV_ROPE_FREQ_BASE)); - GGUF_GET_KEY(fctx, rope_freq_scale, gguf_get_val_f32, GGUF_TYPE_F32, true, kv(LLM_KV_ROPE_SCALE_LINEAR)); + GGUF_GET_KEY(fctx, model->hparams.f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS)); + GGUF_GET_KEY(fctx, model->hparams.rope_freq_base, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ROPE_FREQ_BASE)); + GGUF_GET_KEY(fctx, rope_freq_scale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ROPE_SCALE_LINEAR)); model->hparams.rope_freq_scale = 1.0f 
/ rope_freq_scale; init_model(model); @@ -1668,7 +1697,7 @@ void load_llama_model_gguf(struct gguf_context * fctx, struct ggml_context * f_g read_tensor_by_name(model->norm, f_ggml_ctx, tn(LLM_TENSOR_OUTPUT_NORM)); read_tensor_by_name(model->output, f_ggml_ctx, tn(LLM_TENSOR_OUTPUT)); - for (uint32_t i = 0; i < n_layer; ++i) { + for (uint32_t i = 0; i < model->hparams.n_layer; ++i) { auto & layer = model->layers[i]; read_tensor_by_name(layer.attention_norm, f_ggml_ctx, tni(LLM_TENSOR_ATTN_NORM, i)); @@ -2509,7 +2538,7 @@ int main(int argc, char ** argv) { opt->params = params.use_adam ? opt_params_adam : opt_params_lbfgs; printf("%s: init model\n", __func__); - bool existed = load_checkpoint(&model, opt, params.fn_checkpoint_in, true); + bool existed = load_checkpoint_file(params.fn_checkpoint_in, &model, opt); set_param_model(&model); opt->params = params.use_adam ? opt_params_adam : opt_params_lbfgs; From 167dd2dcec37caba576a43b0c0f9cbe71c7b88cb Mon Sep 17 00:00:00 2001 From: xaedes Date: Sat, 26 Aug 2023 21:04:01 +0200 Subject: [PATCH 069/100] add checkpoint file version for future compatibility --- .../train-text-from-scratch.cpp | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index 9fed3dd3593e0..c7345c3009b86 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -239,6 +239,7 @@ struct my_llama_model { const char * LLM_KV_OPTIMIZER_TYPE = "optimizer.type"; const char * LLM_KV_OPTIMIZER_TYPE_ADAM = "adam"; const char * LLM_KV_OPTIMIZER_TYPE_LBFGS = "lbfgs"; +const char * LLM_KV_OPTIMIZER_FILE_VERSION = "optimizer.file_version"; const char * LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT = "optimizer.convergence_past_count"; const char * LLM_KV_OPTIMIZER_PARAMETER_COUNT_LOW = "optimizer.parameter_count.low"; const char * LLM_KV_OPTIMIZER_PARAMETER_COUNT_HIGH = "optimizer.parameter_count.high"; @@ -269,6 +270,7 @@ const char * LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS = "optimizer.lbfgs.m const char * LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S = "optimizer.lbfgs.memory_s"; const char * LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y = "optimizer.lbfgs.memory_y"; +const char * LLM_KV_TRAINING_FILE_VERSION = "training.file_version"; const char * LLM_KV_TRAINING_ITERATION_COUNT = "training.iteration_count"; const char * LLM_KV_TRAINING_SAMPLE_COUNT = "training.sample_count"; const char * LLM_KV_TRAINING_TOKEN_COUNT = "training.token_count"; @@ -1519,6 +1521,10 @@ void read_tensor_by_name(struct ggml_tensor * dst, struct ggml_context * ctx, co void load_opt_context_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct ggml_opt_context * opt) { // NOTE: gguf_context must be initialized with f_ggml_ctx and no_alloc=false, otherwise tensor data can not be read + uint32_t file_version; + GGUF_GET_KEY(fctx, file_version, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_OPTIMIZER_FILE_VERSION); + GGML_ASSERT(file_version == 0); + GGUF_GET_KEY(fctx, opt->params.past, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT); GGUF_GET_KEY(fctx, opt->iter, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_OPTIMIZER_ITERATION_COUNT); @@ -1576,6 +1582,7 @@ void load_opt_context_gguf(struct gguf_context * fctx, struct ggml_context * f_g } void save_opt_context_gguf(struct gguf_context * fctx, struct ggml_opt_context * opt) { + gguf_set_val_u32(fctx, 
LLM_KV_OPTIMIZER_FILE_VERSION, 0); gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT, opt->params.past); // gguf v1 only supports values with up to 32-bit precision, @@ -1868,6 +1875,10 @@ void save_llama_model_file(const char * filename, const char * fn_vocab_model, s void load_checkpoint_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct my_llama_model * model, struct ggml_opt_context * opt) { load_llama_model_gguf(fctx, f_ggml_ctx, model); + uint32_t file_version; + GGUF_GET_KEY(fctx, file_version, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_FILE_VERSION); + GGML_ASSERT(file_version == 0); + GGUF_GET_KEY(fctx, model->train_its, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_ITERATION_COUNT); GGUF_GET_KEY(fctx, model->train_samples, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_SAMPLE_COUNT); GGUF_GET_KEY(fctx, model->train_tokens, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_TOKEN_COUNT); @@ -1878,6 +1889,7 @@ void load_checkpoint_gguf(struct gguf_context * fctx, struct ggml_context * f_gg void save_checkpoint_gguf(struct gguf_context * fctx, const char * fn_vocab_model, struct my_llama_model * model, struct ggml_opt_context * opt) { save_llama_model_gguf(fctx, fn_vocab_model, model); + gguf_set_val_u32(fctx, LLM_KV_TRAINING_FILE_VERSION, 0); gguf_set_val_u32(fctx, LLM_KV_TRAINING_ITERATION_COUNT, model->train_its); gguf_set_val_u32(fctx, LLM_KV_TRAINING_SAMPLE_COUNT, model->train_samples); gguf_set_val_u32(fctx, LLM_KV_TRAINING_TOKEN_COUNT, model->train_tokens); From 2978e0308603f6e35909962aefe34fa4adc7133b Mon Sep 17 00:00:00 2001 From: xaedes Date: Sat, 26 Aug 2023 21:04:14 +0200 Subject: [PATCH 070/100] update readme with gguf filenames --- examples/train-text-from-scratch/README.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/examples/train-text-from-scratch/README.md b/examples/train-text-from-scratch/README.md index 726ec47c0ce4f..001a6abf98bd4 100644 --- a/examples/train-text-from-scratch/README.md +++ b/examples/train-text-from-scratch/README.md @@ -8,15 +8,15 @@ wget https://raw.githubusercontent.com/brunoklein99/deep-learning-notes/master/s # train ./bin/train-text-from-scratch \ - --vocab-model ../models/ggml-vocab.bin \ + --vocab-model ../models/ggml-vocab-llama.gguf \ --ctx 64 --embd 256 --head 8 --layer 16 \ - --checkpoint-in chk-shakespeare-256x16.bin \ - --checkpoint-out chk-shakespeare-256x16.bin \ - --model-out ggml-shakespeare-256x16-f32.bin \ + --checkpoint-in chk-shakespeare-256x16.gguf \ + --checkpoint-out chk-shakespeare-256x16.gguf \ + --model-out ggml-shakespeare-256x16-f32.gguf \ --train-data "shakespeare.txt" \ -t 6 -b 16 -n 32 --seed 1 --adam-iter 16 \ --print-details-interval 0 --predict 16 --use-flash # predict -./bin/main -m ggml-shakespeare-256x16-f32.bin +./bin/main -m ggml-shakespeare-256x16-f32.gguf ``` From 0c494cc60ebef97a5ba847203f78fae4a96491ad Mon Sep 17 00:00:00 2001 From: xaedes Date: Sun, 27 Aug 2023 22:05:24 +0200 Subject: [PATCH 071/100] save & load opt->just_initialized value --- .../train-text-from-scratch.cpp | 26 +++++++++++++------ 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index c7345c3009b86..0a32cc248d219 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -244,6 +244,7 @@ const char * 
LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT = "optimizer.convergenc const char * LLM_KV_OPTIMIZER_PARAMETER_COUNT_LOW = "optimizer.parameter_count.low"; const char * LLM_KV_OPTIMIZER_PARAMETER_COUNT_HIGH = "optimizer.parameter_count.high"; const char * LLM_KV_OPTIMIZER_ITERATION_COUNT = "optimizer.iteration_count"; +const char * LLM_KV_OPTIMIZER_JUST_INITIALIZED = "optimizer.just_initialized"; const char * LLM_KV_OPTIMIZER_ADAM_BEST_LOSS = "optimizer.adam.best_loss"; const char * LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS = "optimizer.adam.previous_loss"; const char * LLM_KV_OPTIMIZER_ADAM_NO_IMPROVEMENT_COUNT = "optimizer.adam.no_improvement_count"; @@ -1527,6 +1528,7 @@ void load_opt_context_gguf(struct gguf_context * fctx, struct ggml_context * f_g GGUF_GET_KEY(fctx, opt->params.past, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT); GGUF_GET_KEY(fctx, opt->iter, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_OPTIMIZER_ITERATION_COUNT); + GGUF_GET_KEY(fctx, opt->just_initialized, gguf_get_val_bool, GGUF_TYPE_BOOL, true, LLM_KV_OPTIMIZER_JUST_INITIALIZED); // gguf v1 only supports values with up to 32-bit precision uint32_t nx[2] = { 0, 0 }; @@ -1587,12 +1589,14 @@ void save_opt_context_gguf(struct gguf_context * fctx, struct ggml_opt_context * // gguf v1 only supports values with up to 32-bit precision, uint32_t nx[2] = { 0, 0 }; - memcpy(&nx[0], &opt->nx, sizeof(opt->nx)); + nx[0] = opt->nx & 0xFFFFFFFF; + nx[1] = (opt->nx >> 32) & 0xFFFFFFFF; gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_PARAMETER_COUNT_LOW, nx[0]); gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_PARAMETER_COUNT_HIGH, nx[1]); // TODO set as 64-bit uint gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_ITERATION_COUNT, opt->iter); + gguf_set_val_bool(fctx, LLM_KV_OPTIMIZER_JUST_INITIALIZED, opt->just_initialized); switch (opt->params.type) { case GGML_OPT_ADAM: @@ -1685,18 +1689,24 @@ void load_llama_model_gguf(struct gguf_context * fctx, struct ggml_context * f_g GGUF_GET_KEY(fctx, ftype_u, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_GENERAL_FILE_TYPE); GGML_ASSERT((enum llama_ftype) ftype_u == LLAMA_FTYPE_ALL_F32); - GGUF_GET_KEY(fctx, model->hparams.n_ctx, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_CONTEXT_LENGTH)); + // n_ctx was not saved in earlier checkpoint file versions, so we make it optional here + GGUF_GET_KEY(fctx, model->hparams.n_ctx, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_CONTEXT_LENGTH)); + GGUF_GET_KEY(fctx, model->hparams.n_embd, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_EMBEDDING_LENGTH)); GGUF_GET_KEY(fctx, model->hparams.n_ff, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_FEED_FORWARD_LENGTH)); GGUF_GET_KEY(fctx, model->hparams.n_head, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_ATTENTION_HEAD_COUNT)); GGUF_GET_KEY(fctx, model->hparams.n_layer, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_BLOCK_COUNT)); - GGUF_GET_KEY(fctx, model->hparams.n_rot, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_ROPE_DIMENSION_COUNT)); - float rope_freq_scale; - GGUF_GET_KEY(fctx, model->hparams.f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS)); - GGUF_GET_KEY(fctx, model->hparams.rope_freq_base, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ROPE_FREQ_BASE)); - GGUF_GET_KEY(fctx, rope_freq_scale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ROPE_SCALE_LINEAR)); - model->hparams.rope_freq_scale = 1.0f / rope_freq_scale; + model->hparams.n_rot = model->hparams.n_embd / model->hparams.n_head; + 
GGUF_GET_KEY(fctx, model->hparams.n_rot, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_ROPE_DIMENSION_COUNT)); + + float rope_freq_scale = 1.0f; + GGUF_GET_KEY(fctx, model->hparams.f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS)); + GGUF_GET_KEY(fctx, model->hparams.rope_freq_base, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_FREQ_BASE)); + GGUF_GET_KEY(fctx, rope_freq_scale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALE_LINEAR)); + if (rope_freq_scale != 1.0f) { + model->hparams.rope_freq_scale = 1.0f / rope_freq_scale; + } init_model(model); From 3a91c975a6ea5ca5c561d9cffce168619030bd53 Mon Sep 17 00:00:00 2001 From: xaedes Date: Sun, 27 Aug 2023 22:05:36 +0200 Subject: [PATCH 072/100] add first draft for checkpoint conversion script --- .../convert-train-checkpoint-to-gguf.py | 448 ++++++++++++++++++ 1 file changed, 448 insertions(+) create mode 100644 examples/train-text-from-scratch/convert-train-checkpoint-to-gguf.py diff --git a/examples/train-text-from-scratch/convert-train-checkpoint-to-gguf.py b/examples/train-text-from-scratch/convert-train-checkpoint-to-gguf.py new file mode 100644 index 0000000000000..55a6667744cb9 --- /dev/null +++ b/examples/train-text-from-scratch/convert-train-checkpoint-to-gguf.py @@ -0,0 +1,448 @@ +#!/usr/bin/env python3 +# train-text-from-scratch checkpoint --> gguf conversion + +import argparse +import gguf +import os +import sys +from pathlib import Path + +class Tensor: + def __init__(self, dtype='f', ne=None): + if ne is None: + ne = [] + self.dtype = dtype + self.ne = ne + self.nbytes = 0 + if self.dtype == 'f': + self.nbytes = product(self.ne) * 4 + else: + raise ValueError(f"Unhandled data type '{self.dtype}'") + + def load(self, data, offset): + nd = struct.unpack(' 0: + offset += self.adam_pf.load(data, offset) + + self.adam_fx_best = struct.unpack(' 0: + offset += self.lbfgs_pf.load(data, offset) + offset += self.lbfgs_lmal.load(data, offset) + offset += self.lbfgs_lmys.load(data, offset) + offset += self.lbfgs_lms.load(data, offset) + offset += self.lbfgs_lmy.load(data, offset) + + self.lbfgs_fx_best = struct.unpack(' 0 else []]) + size_type_1 = 24 + sum([t.max_storage_size() for t in + [self.lbfgs_x, self.lbfgs_xp, self.lbfgs_g, + self.lbfgs_gp, self.lbfgs_d, self.lbfgs_pf, + self.lbfgs_lmal, self.lbfgs_lmys, + self.lbfgs_lms, self.lbfgs_lmy] + +[self.lbfgs_pf] if self.past > 0 else []]) + # due to alignment padding the size might not by exact + # but the difference in size for both types is significant, + # so we can just use whichever is closest + remaining = len(data) - offset + if abs(remaining - size_type_0) < abs(remaining - size_type_1): + self.type = 0 + else: + self.type = 1 + + if self.type == 0: + offset += self.adam_m.load(data, offset) + offset += self.adam_w.load(data, offset) + if self.past > 0: + offset += self.adam_pf.load(data,offset) + + self.adam_fx_best = struct.unpack(' 0: + offset += self.lbfgs_pf.load(data, offset) + offset += self.lbfgs_lmal.load(data, offset) + offset += self.lbfgs_lmys.load(data, offset) + offset += self.lbfgs_lms.load(data, offset) + offset += self.lbfgs_lmy.load(data, offset) + + self.lbfgs_fx_best = struct.unpack('> 32) & 0xffffffff) + gguf_writer.add_uint32(LLM_KV_OPTIMIZER_ITERATION_COUNT, self.iter) + gguf_writer.add_uint32(LLM_KV_OPTIMIZER_JUST_INITIALIZED, self.just_initialized) + + if self.type == 0: + gguf_writer.add_string(LLM_KV_OPTIMIZER_TYPE, LLM_KV_OPTIMIZER_TYPE_ADAM) + 
gguf_writer.add_float32(LLM_KV_OPTIMIZER_ADAM_BEST_LOSS, self.adam_fx_best) + gguf_writer.add_float32(LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS, self.adam_fx_prev) + gguf_writer.add_uint32(LLM_KV_OPTIMIZER_ADAM_NO_IMPROVEMENT_COUNT, self.adam_n_no_improvement) + + self.adam_m.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS) + self.adam_v.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS) + if self.past > 0: + self.adam_pf.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES) + + elif self.type == 1: + gguf_writer.add_string(LLM_KV_OPTIMIZER_TYPE, LLM_KV_OPTIMIZER_TYPE_LBFGS) + gguf_writer.add_uint32(LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT, self.lbfgs_m) + gguf_writer.add_float32(LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS, self.lbfgs_fx_best) + gguf_writer.add_float32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP, self.lbfgs_step) + gguf_writer.add_int32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J, self.lbfgs_j) + gguf_writer.add_int32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K, self.lbfgs_k) + gguf_writer.add_int32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END, self.lbfgs_end) + gguf_writer.add_uint32(LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT, self.lbfgs_n_no_improvement) + + self.lbfgs_x.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS) + self.lbfgs_xp.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS) + self.lbfgs_g.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS) + self.lbfgs_gp.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS) + self.lbfgs_d.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION) + if self.past > 0: + self.lbfgs_pf.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES) + self.lbfgs_lmal.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA) + self.lbfgs_lmys.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS) + self.lbfgs_lms.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S) + self.lbfgs_lmy.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y) + else: + raise ValueError('Unknown optimizer type') + +class ModelParams: + def __init__(self): + pass + + def load(self, data, offset): + self.n_vocab = struct.unpack(' Date: Sun, 27 Aug 2023 23:20:18 +0200 Subject: [PATCH 073/100] add gguf arch and ftype --- .../train-text-from-scratch/convert-train-checkpoint-to-gguf.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/examples/train-text-from-scratch/convert-train-checkpoint-to-gguf.py b/examples/train-text-from-scratch/convert-train-checkpoint-to-gguf.py index 55a6667744cb9..3313ac4ca8f70 100644 --- a/examples/train-text-from-scratch/convert-train-checkpoint-to-gguf.py +++ b/examples/train-text-from-scratch/convert-train-checkpoint-to-gguf.py @@ -412,6 +412,8 @@ def load(self, data, offset): return offset def save_gguf(self, gguf_writer): + gguf_writer.add_architecture() + gguf_writer.add_file_type(gguf.GGMLQuantizationType.F32) gguf_writer.add_uint32(LLM_KV_TRAINING_FILE_VERSION, 0) gguf_writer.add_uint32(LLM_KV_TRAINING_ITERATION_COUNT, self.train_its) gguf_writer.add_uint32(LLM_KV_TRAINING_SAMPLE_COUNT, self.train_samples) From 495a62a14286c5538d464a6c45d6528ddf9bbac5 Mon Sep 17 00:00:00 2001 From: xaedes Date: Sun, 27 Aug 2023 23:21:08 +0200 Subject: [PATCH 074/100] save opt parameter counter as uint64 --- .../convert-train-checkpoint-to-gguf.py | 3 +-- .../train-text-from-scratch.cpp | 22 +++++-------------- 2 files changed, 6 insertions(+), 19 deletions(-) 
diff --git a/examples/train-text-from-scratch/convert-train-checkpoint-to-gguf.py b/examples/train-text-from-scratch/convert-train-checkpoint-to-gguf.py index 3313ac4ca8f70..7c2e982d8cc42 100644 --- a/examples/train-text-from-scratch/convert-train-checkpoint-to-gguf.py +++ b/examples/train-text-from-scratch/convert-train-checkpoint-to-gguf.py @@ -253,8 +253,7 @@ def load(self, data, offset): def save_gguf(self, gguf_writer): gguf_writer.add_uint32(LLM_KV_OPTIMIZER_FILE_VERSION, 0) gguf_writer.add_uint32(LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT, self.past) - gguf_writer.add_uint32(LLM_KV_OPTIMIZER_PARAMETER_COUNT_LOW, self.nx & 0xffffffff) - gguf_writer.add_uint32(LLM_KV_OPTIMIZER_PARAMETER_COUNT_HIGH, (self.nx >> 32) & 0xffffffff) + gguf_writer.add_uint64(LLM_KV_OPTIMIZER_PARAMETER_COUNT, self.nx) gguf_writer.add_uint32(LLM_KV_OPTIMIZER_ITERATION_COUNT, self.iter) gguf_writer.add_uint32(LLM_KV_OPTIMIZER_JUST_INITIALIZED, self.just_initialized) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index 28a7d6811be8c..546a16c97a11c 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -241,8 +241,7 @@ const char * LLM_KV_OPTIMIZER_TYPE_ADAM = "adam"; const char * LLM_KV_OPTIMIZER_TYPE_LBFGS = "lbfgs"; const char * LLM_KV_OPTIMIZER_FILE_VERSION = "optimizer.file_version"; const char * LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT = "optimizer.convergence_past_count"; -const char * LLM_KV_OPTIMIZER_PARAMETER_COUNT_LOW = "optimizer.parameter_count.low"; -const char * LLM_KV_OPTIMIZER_PARAMETER_COUNT_HIGH = "optimizer.parameter_count.high"; +const char * LLM_KV_OPTIMIZER_PARAMETER_COUNT = "optimizer.parameter_count"; const char * LLM_KV_OPTIMIZER_ITERATION_COUNT = "optimizer.iteration_count"; const char * LLM_KV_OPTIMIZER_JUST_INITIALIZED = "optimizer.just_initialized"; const char * LLM_KV_OPTIMIZER_ADAM_BEST_LOSS = "optimizer.adam.best_loss"; @@ -1530,12 +1529,9 @@ void load_opt_context_gguf(struct gguf_context * fctx, struct ggml_context * f_g GGUF_GET_KEY(fctx, opt->iter, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_OPTIMIZER_ITERATION_COUNT); GGUF_GET_KEY(fctx, opt->just_initialized, gguf_get_val_bool, GGUF_TYPE_BOOL, true, LLM_KV_OPTIMIZER_JUST_INITIALIZED); - // gguf v1 only supports values with up to 32-bit precision - uint32_t nx[2] = { 0, 0 }; - GGUF_GET_KEY(fctx, nx[0], gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_OPTIMIZER_PARAMETER_COUNT_LOW); - GGUF_GET_KEY(fctx, nx[1], gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_OPTIMIZER_PARAMETER_COUNT_HIGH); - memcpy(&opt->nx, &nx[0], sizeof(opt->nx)); - // TODO read as 64-bit uint + uint64_t nx; + GGUF_GET_KEY(fctx, nx, gguf_get_val_u64, GGUF_TYPE_UINT64, true, LLM_KV_OPTIMIZER_PARAMETER_COUNT); + opt->nx = (size_t) nx; // don't call ggml_opt_init until optimizer type and optimizer specific parameters are know @@ -1586,15 +1582,7 @@ void load_opt_context_gguf(struct gguf_context * fctx, struct ggml_context * f_g void save_opt_context_gguf(struct gguf_context * fctx, struct ggml_opt_context * opt) { gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_FILE_VERSION, 0); gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT, opt->params.past); - - // gguf v1 only supports values with up to 32-bit precision, - uint32_t nx[2] = { 0, 0 }; - nx[0] = opt->nx & 0xFFFFFFFF; - nx[1] = (opt->nx >> 32) & 0xFFFFFFFF; - gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_PARAMETER_COUNT_LOW, 
nx[0]); - gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_PARAMETER_COUNT_HIGH, nx[1]); - // TODO set as 64-bit uint - + gguf_set_val_u64(fctx, LLM_KV_OPTIMIZER_PARAMETER_COUNT, (uint64_t) opt->nx); gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_ITERATION_COUNT, opt->iter); gguf_set_val_bool(fctx, LLM_KV_OPTIMIZER_JUST_INITIALIZED, opt->just_initialized); From ef899fbe890f0fa09c76981b093dc144f04704b3 Mon Sep 17 00:00:00 2001 From: xaedes Date: Sun, 27 Aug 2023 23:21:59 +0200 Subject: [PATCH 075/100] add gguf key and tensor names for optimizer and training --- .../convert-train-checkpoint-to-gguf.py | 40 +++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/examples/train-text-from-scratch/convert-train-checkpoint-to-gguf.py b/examples/train-text-from-scratch/convert-train-checkpoint-to-gguf.py index 7c2e982d8cc42..37d0244fb6583 100644 --- a/examples/train-text-from-scratch/convert-train-checkpoint-to-gguf.py +++ b/examples/train-text-from-scratch/convert-train-checkpoint-to-gguf.py @@ -7,6 +7,46 @@ import sys from pathlib import Path +# gguf constants +LLM_KV_OPTIMIZER_TYPE = "optimizer.type" +LLM_KV_OPTIMIZER_TYPE_ADAM = "adam" +LLM_KV_OPTIMIZER_TYPE_LBFGS = "lbfgs" +LLM_KV_OPTIMIZER_FILE_VERSION = "optimizer.file_version" +LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT = "optimizer.convergence_past_count" +LLM_KV_OPTIMIZER_PARAMETER_COUNT = "optimizer.parameter_count" +LLM_KV_OPTIMIZER_ITERATION_COUNT = "optimizer.iteration_count" +LLM_KV_OPTIMIZER_JUST_INITIALIZED = "optimizer.just_initialized" +LLM_KV_OPTIMIZER_ADAM_BEST_LOSS = "optimizer.adam.best_loss" +LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS = "optimizer.adam.previous_loss" +LLM_KV_OPTIMIZER_ADAM_NO_IMPROVEMENT_COUNT = "optimizer.adam.no_improvement_count" +LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT = "optimizer.lbfgs.approx_hessian_count" +LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS = "optimizer.lbfgs.best_loss" +LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP = "optimizer.lbfgs.line_search_step" +LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J = "optimizer.lbfgs.line_search_j" +LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K = "optimizer.lbfgs.line_search_k" +LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END = "optimizer.lbfgs.line_search_end" +LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT = "optimizer.lbfgs.no_improvement_count" + +LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS = "optimizer.adam.first_moments" +LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS = "optimizer.adam.second_moments" +LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES = "optimizer.adam.past_loss_values" + +LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS = "optimizer.lbfgs.current_parameters" +LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS = "optimizer.lbfgs.previous_parameters" +LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS = "optimizer.lbfgs.current_gradients" +LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS = "optimizer.lbfgs.previous_gradients" +LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION = "optimizer.lbfgs.search_direction" +LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES = "optimizer.lbfgs.past_loss_values" +LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA = "optimizer.lbfgs.memory_alpha" +LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS = "optimizer.lbfgs.memory_ys" +LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S = "optimizer.lbfgs.memory_s" +LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y = "optimizer.lbfgs.memory_y" + +LLM_KV_TRAINING_FILE_VERSION = "training.file_version" +LLM_KV_TRAINING_ITERATION_COUNT = "training.iteration_count" +LLM_KV_TRAINING_SAMPLE_COUNT = "training.sample_count" +LLM_KV_TRAINING_TOKEN_COUNT = "training.token_count" + class Tensor: def __init__(self, dtype='f', 
ne=None): if ne is None: From d71069c4fba085ea52b6362ac3c9634429b1a66f Mon Sep 17 00:00:00 2001 From: xaedes Date: Sun, 27 Aug 2023 23:25:41 +0200 Subject: [PATCH 076/100] add layer_norm_rms_eps to checkpoint convert script --- .../train-text-from-scratch/convert-train-checkpoint-to-gguf.py | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/train-text-from-scratch/convert-train-checkpoint-to-gguf.py b/examples/train-text-from-scratch/convert-train-checkpoint-to-gguf.py index 37d0244fb6583..d7ea4e6fe590a 100644 --- a/examples/train-text-from-scratch/convert-train-checkpoint-to-gguf.py +++ b/examples/train-text-from-scratch/convert-train-checkpoint-to-gguf.py @@ -453,6 +453,7 @@ def load(self, data, offset): def save_gguf(self, gguf_writer): gguf_writer.add_architecture() gguf_writer.add_file_type(gguf.GGMLQuantizationType.F32) + gguf_writer.add_layer_norm_rms_eps(1e-5) gguf_writer.add_uint32(LLM_KV_TRAINING_FILE_VERSION, 0) gguf_writer.add_uint32(LLM_KV_TRAINING_ITERATION_COUNT, self.train_its) gguf_writer.add_uint32(LLM_KV_TRAINING_SAMPLE_COUNT, self.train_samples) From 91a4ccaf96cc0c369f11c8dda83648aef4ba1cbf Mon Sep 17 00:00:00 2001 From: xaedes Date: Sun, 27 Aug 2023 23:32:49 +0200 Subject: [PATCH 077/100] use same GGUF_GET_KEY macro as in llama.cpp --- examples/train-text-from-scratch/train-text-from-scratch.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index 546a16c97a11c..88439cfe307fc 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -1492,11 +1492,11 @@ void set_logits_masked(struct ggml_tensor * logits, std::vector& mask, flo if (kid >= 0) { \ enum gguf_type ktype = gguf_get_kv_type(ctx, kid); \ if (ktype != (type)) { \ - throw std::runtime_error("key has wrong type"); \ + throw std::runtime_error(format("key %s has wrong type: %s", skey.c_str(), gguf_type_name(ktype))); \ } \ (dst) = func(ctx, kid); \ } else if (req) { \ - throw std::runtime_error("key not found in model"); \ + throw std::runtime_error(format("key not found in model: %s", skey.c_str())); \ } \ } From 0b2c85b0251bbde2e22e450746d3ebb1c7ee91ba Mon Sep 17 00:00:00 2001 From: xaedes Date: Sun, 27 Aug 2023 23:39:21 +0200 Subject: [PATCH 078/100] use norm_rms_eps, and rope parameters and command line options to set them --- .../train-text-from-scratch.cpp | 56 ++++++++++++++++--- 1 file changed, 49 insertions(+), 7 deletions(-) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index 88439cfe307fc..8f35fe2c95565 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -18,8 +18,6 @@ #pragma warning(disable: 4244 4267) // possible loss of data #endif -static const float rms_norm_eps = 1e-5f; - struct random_normal_distribution { std::mt19937 gen; std::normal_distribution rd; @@ -502,6 +500,7 @@ struct ggml_tensor * forward( const int n_layer = hparams.n_layer; const int n_head = hparams.n_head; const int n_rot = hparams.n_rot; + const float rms_norm_eps = hparams.f_norm_rms_eps; struct ggml_tensor * tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); memcpy(tokens->data, tokens_input->data, N*ggml_element_size(tokens)); @@ -959,6 +958,9 @@ struct ggml_tensor * llama_build_train_graphs( const int 
n_head = hparams.n_head; const int n_rot = hparams.n_rot; const int n_ff = hparams.n_ff; + const float f_norm_rms_eps = hparams.f_norm_rms_eps; + const float rope_freq_base = hparams.rope_freq_base; + const float rope_freq_scale = hparams.rope_freq_scale; const int rope_mode = 0; auto set_name = [](struct ggml_tensor * t, const char * n) { @@ -968,6 +970,14 @@ struct ggml_tensor * llama_build_train_graphs( } }; + // rope has so much parameters that we make a custom function for it + auto rope = [ctx, n_past, n_rot, rope_mode, n_ctx, rope_freq_base, rope_freq_scale] + (struct ggml_tensor * t) -> struct ggml_tensor * { + return ggml_rope_custom(ctx, + t, n_past, n_rot, rope_mode, n_ctx, + rope_freq_base, rope_freq_scale); + }; + set_name(tokens_input, "tokens_input"); set_name(targets, "targets"); @@ -990,15 +1000,15 @@ struct ggml_tensor * llama_build_train_graphs( for (int il = 0; il < n_layer; ++il) { struct my_llama_layer & layer = model->layers[il]; - struct ggml_tensor * t02 = ggml_rms_norm (ctx, cur, rms_norm_eps); set_name(t02, "t02"); assert_shape_2d(t02, n_embd, N*n_batch); + struct ggml_tensor * t02 = ggml_rms_norm (ctx, cur, f_norm_rms_eps); set_name(t02, "t02"); assert_shape_2d(t02, n_embd, N*n_batch); struct ggml_tensor * t03 = ggml_repeat (ctx, layer.attention_norm, t02); set_name(t03, "t03"); assert_shape_2d(t03, n_embd, N*n_batch); struct ggml_tensor * t04 = ggml_mul (ctx, t03, t02); set_name(t04, "t04"); assert_shape_2d(t04, n_embd, N*n_batch); struct ggml_tensor * t05 = ggml_mul_mat (ctx, layer.wq, t04); set_name(t05, "t05"); assert_shape_2d(t05, n_embd, N*n_batch); struct ggml_tensor * t06 = ggml_reshape_4d (ctx, t05, n_embd/n_head, n_head, N, n_batch); set_name(t06, "t06"); assert_shape_4d(t06, n_embd/n_head, n_head, N, n_batch); - struct ggml_tensor * t07 = ggml_rope_inplace (ctx, t06, n_past, n_rot, rope_mode, n_ctx); set_name(t07, "t07"); assert_shape_4d(t07, n_embd/n_head, n_head, N, n_batch); + struct ggml_tensor * t07 = rope (t06); set_name(t07, "t07"); assert_shape_4d(t07, n_embd/n_head, n_head, N, n_batch); struct ggml_tensor * t08 = ggml_mul_mat (ctx, layer.wk, t04); set_name(t08, "t08"); assert_shape_2d(t08, n_embd, N*n_batch); struct ggml_tensor * t09 = ggml_reshape_4d (ctx, t08, n_embd/n_head, n_head, N, n_batch); set_name(t09, "t09"); assert_shape_4d(t09, n_embd/n_head, n_head, N, n_batch); - struct ggml_tensor * t10 = ggml_rope_inplace (ctx, t09, n_past, n_rot, rope_mode, n_ctx); set_name(t10, "t10"); assert_shape_4d(t10, n_embd/n_head, n_head, N, n_batch); + struct ggml_tensor * t10 = rope (t09); set_name(t10, "t10"); assert_shape_4d(t10, n_embd/n_head, n_head, N, n_batch); struct ggml_tensor * t11 = ggml_mul_mat (ctx, t04, layer.wv); set_name(t11, "t11"); assert_shape_2d(t11, N*n_batch, n_embd); struct ggml_tensor * t12 = ggml_reshape_4d (ctx, t11, N, n_batch, n_embd/n_head, n_head); set_name(t12, "t12"); assert_shape_4d(t12, N, n_batch, n_embd/n_head, n_head); struct ggml_tensor * t13 = ggml_permute (ctx, t07, 0, 2, 1, 3); set_name(t13, "t13"); assert_shape_4d(t13, n_embd/n_head, N, n_head, n_batch); @@ -1019,7 +1029,7 @@ struct ggml_tensor * llama_build_train_graphs( struct ggml_tensor * t19 = ggml_reshape_2d (ctx, t18, n_embd, N*n_batch); set_name(t19, "t19"); assert_shape_2d(t19, n_embd, N*n_batch); struct ggml_tensor * t20 = ggml_mul_mat (ctx, layer.wo, t19); set_name(t20, "t20"); assert_shape_2d(t20, n_embd, N*n_batch); struct ggml_tensor * t21 = ggml_add (ctx, t20, cur); set_name(t21, "t21"); assert_shape_2d(t21, n_embd, N*n_batch); - struct 
ggml_tensor * t22 = ggml_rms_norm (ctx, t21, rms_norm_eps); set_name(t22, "t22"); assert_shape_2d(t22, n_embd, N*n_batch); + struct ggml_tensor * t22 = ggml_rms_norm (ctx, t21, f_norm_rms_eps); set_name(t22, "t22"); assert_shape_2d(t22, n_embd, N*n_batch); struct ggml_tensor * t23 = ggml_repeat (ctx, layer.ffn_norm, t22); set_name(t23, "t23"); assert_shape_2d(t23, n_embd, N*n_batch); struct ggml_tensor * t24 = ggml_mul (ctx, t23, t22); set_name(t24, "t24"); assert_shape_2d(t24, n_embd, N*n_batch); struct ggml_tensor * t25 = ggml_mul_mat (ctx, layer.w3, t24); set_name(t25, "t25"); assert_shape_2d(t25, n_ff, N*n_batch); @@ -1031,7 +1041,7 @@ struct ggml_tensor * llama_build_train_graphs( cur = t30; checkpoints.push_back(cur); } - struct ggml_tensor * t31 = ggml_rms_norm (ctx, cur, rms_norm_eps); set_name(t31, "t31"); assert_shape_2d(t31, n_embd, N*n_batch); + struct ggml_tensor * t31 = ggml_rms_norm (ctx, cur, f_norm_rms_eps); set_name(t31, "t31"); assert_shape_2d(t31, n_embd, N*n_batch); struct ggml_tensor * t32 = ggml_repeat (ctx, model->norm, t31); set_name(t32, "t32"); assert_shape_2d(t32, n_embd, N*n_batch); struct ggml_tensor * t33 = ggml_mul (ctx, t32, t31); set_name(t33, "t33"); assert_shape_2d(t33, n_embd, N*n_batch); struct ggml_tensor * t34 = ggml_mul_mat (ctx, model->output, t33); set_name(t34, "t34"); assert_shape_2d(t34, n_vocab, N*n_batch); @@ -1960,6 +1970,10 @@ struct train_params { int n_examples; int n_predict; + float f_norm_rms_eps; + float rope_freq_base; + float rope_freq_scale; + int print_info_interval; int print_details_interval; @@ -2017,6 +2031,10 @@ struct train_params get_default_train_params() { params.n_examples = 1; params.n_predict = 1024; + params.f_norm_rms_eps = 1e-5; + params.rope_freq_base = 10000.0f; + params.rope_freq_scale = 1.0f; + params.print_info_interval = 1; params.print_details_interval = 2; @@ -2070,6 +2088,9 @@ void train_print_usage(int /*argc*/, char ** argv, const struct train_params * p fprintf(stderr, " --ff N Feedforward size used for new models. 
(default %d)\n", params->n_ff); fprintf(stderr, " --head N Number of heads for new models (default %d)\n", params->n_head); fprintf(stderr, " --layer N Number of layers for new models (default %d)\n", params->n_layer); + fprintf(stderr, " --norm-rms-eps F RMS-Norm epsilon value (default %f)\n", params->f_norm_rms_eps); + fprintf(stderr, " --rope-freq-base F Frequency base for ROPE (default %f)\n", params->rope_freq_base); + fprintf(stderr, " --rope-freq-scale F Frequency scale for ROPE (default %f)\n", params->rope_freq_scale); fprintf(stderr, " -t N, --threads N Number of threads (default %d)\n", params->n_threads); fprintf(stderr, " -b N, --batch N Parallel batch size (default %d)\n", params->n_batch); fprintf(stderr, " -n N, --examples N Number of examples to train (default %d)\n", params->n_examples); @@ -2188,6 +2209,24 @@ bool train_params_parse(int argc, char ** argv, struct train_params * params) { break; } params->n_layer = std::stoi(argv[i]); + } else if (arg == "--norm-rms-eps") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->f_norm_rms_eps = std::stof(argv[i]); + } else if (arg == "--rope-freq-base") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->rope_freq_base = std::stof(argv[i]); + } else if (arg == "--rope-freq-scale") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->rope_freq_scale = std::stof(argv[i]); } else if (arg == "-t" || arg == "--threads") { if (++i >= argc) { invalid_param = true; @@ -2476,6 +2515,9 @@ int main(int argc, char ** argv) { model.hparams.n_ff = params.n_ff; // llama.cpp requires n_rot to be exactly n_embd / n_head model.hparams.n_rot = model.hparams.n_embd / model.hparams.n_head; + model.hparams.f_norm_rms_eps = params.f_norm_rms_eps; + model.hparams.rope_freq_base = params.rope_freq_base; + model.hparams.rope_freq_scale = params.rope_freq_scale; print_params(&model.hparams); From ca5b344fb14b3daaa0bcc7e949f85d6bcbac4f5f Mon Sep 17 00:00:00 2001 From: xaedes Date: Mon, 28 Aug 2023 01:46:37 +0200 Subject: [PATCH 079/100] fix memory corruption bug in gguf ctx->kv and ctx->infos was reallocated using not-aligned realloc, but freed with aligned free. to fix this a GGML_ALIGNED_REALLOC was added, but there is no posix_memalign_realloc function. so on non-windows and non-mingw32 platforms we fall back to aligned malloc, followed by copying and freeing the old data. 
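To make the fallback described above concrete, here is a minimal, self-contained sketch (this is illustrative only, not the ggml implementation; the `sketch_` names and the 16-byte alignment constant are assumptions for the example):

```c
#include <stdlib.h>
#include <string.h>

#define SKETCH_MEM_ALIGN 16  // stand-in for the real alignment constant

static void * sketch_aligned_malloc(size_t size) {
    void * ptr = NULL;
    if (posix_memalign(&ptr, SKETCH_MEM_ALIGN, size) != 0) {
        return NULL;
    }
    return ptr;
}

// There is no posix_memalign_realloc, and plain realloc() may hand back a
// block that must not be released with an aligned free. So the fallback
// allocates a new aligned block, copies the old contents (which is why the
// caller has to pass old_size), and frees the old block.
static void * sketch_aligned_realloc(void * ptr, size_t old_size, size_t size) {
    void * result = sketch_aligned_malloc(size);
    if (result != NULL && ptr != NULL) {
        memcpy(result, ptr, old_size < size ? old_size : size);
    }
    free(ptr);  // memory from posix_memalign is released with free()
    return result;
}
```

Growing an array with this helper would look like `kv = sketch_aligned_realloc(kv, n * sizeof(*kv), (n + 1) * sizeof(*kv));`, which mirrors the call sites changed in the diff below.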
--- ggml.c | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/ggml.c b/ggml.c index a6324c4b47c28..cfc8798b8793f 100644 --- a/ggml.c +++ b/ggml.c @@ -194,8 +194,9 @@ typedef void * thread_ret_t; // #if defined(_MSC_VER) || defined(__MINGW32__) -#define GGML_ALIGNED_MALLOC(size) _aligned_malloc(size, GGML_MEM_ALIGN) -#define GGML_ALIGNED_FREE(ptr) _aligned_free(ptr) +#define GGML_ALIGNED_MALLOC(size) _aligned_malloc(size, GGML_MEM_ALIGN) +#define GGML_ALIGNED_REALLOC(ptr, old_size, size) _aligned_realloc(ptr, size, GGML_MEM_ALIGN) +#define GGML_ALIGNED_FREE(ptr) _aligned_free(ptr) #else inline static void * ggml_aligned_malloc(size_t size) { void * aligned_memory = NULL; @@ -220,8 +221,16 @@ inline static void * ggml_aligned_malloc(size_t size) { } return aligned_memory; } -#define GGML_ALIGNED_MALLOC(size) ggml_aligned_malloc(size) -#define GGML_ALIGNED_FREE(ptr) free(ptr) +inline static void * ggml_aligned_realloc(void * ptr, size_t old_size, size_t size) { + // There is no posix_memalign_realloc function + void * result = ggml_aligned_malloc(size); + memcpy(result, ptr, old_size); + free(ptr); + return result; +} +#define GGML_ALIGNED_MALLOC(size) ggml_aligned_malloc(size) +#define GGML_ALIGNED_REALLOC(ptr, old_size, size) ggml_aligned_realloc(size) +#define GGML_ALIGNED_FREE(ptr) free(ptr) #endif #define UNUSED GGML_UNUSED @@ -20073,7 +20082,7 @@ static int gguf_get_or_add_key(struct gguf_context * ctx, const char * key) { const int n_kv = gguf_get_n_kv(ctx); - ctx->kv = realloc(ctx->kv, (n_kv + 1) * sizeof(struct gguf_kv)); + ctx->kv = GGML_ALIGNED_REALLOC(ctx->kv, n_kv * sizeof(struct gguf_kv), (n_kv + 1) * sizeof(struct gguf_kv)); ctx->kv[n_kv].key.n = strlen(key); ctx->kv[n_kv].key.data = strdup(key); ctx->header.n_kv++; @@ -20230,7 +20239,7 @@ void gguf_add_tensor( struct gguf_context * ctx, const struct ggml_tensor * tensor) { const int idx = ctx->header.n_tensors; - ctx->infos = realloc(ctx->infos, (idx + 1)*sizeof(struct gguf_tensor_info)); + ctx->infos = GGML_ALIGNED_REALLOC(ctx->infos, idx*sizeof(struct gguf_tensor_info), (idx + 1)*sizeof(struct gguf_tensor_info)); ctx->infos[idx].name.n = strlen(tensor->name); ctx->infos[idx].name.data = strdup(tensor->name); From 5d94997a09b30cc8365a3399c81eae209e729a64 Mon Sep 17 00:00:00 2001 From: xaedes Date: Mon, 28 Aug 2023 01:46:53 +0200 Subject: [PATCH 080/100] add gguf example cmake file --- examples/gguf/CMakeLists.txt | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 examples/gguf/CMakeLists.txt diff --git a/examples/gguf/CMakeLists.txt b/examples/gguf/CMakeLists.txt new file mode 100644 index 0000000000000..7d1806af3ebfc --- /dev/null +++ b/examples/gguf/CMakeLists.txt @@ -0,0 +1,5 @@ +set(TARGET gguf) +add_executable(${TARGET} gguf.cpp) +install(TARGETS ${TARGET} RUNTIME) +target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PRIVATE cxx_std_11) From 76d2794e11f481aef9c920f32357b952bb815120 Mon Sep 17 00:00:00 2001 From: xaedes Date: Mon, 28 Aug 2023 01:47:31 +0200 Subject: [PATCH 081/100] bug fixes in tokenize_file --- examples/train-text-from-scratch/train-text-from-scratch.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index 8f35fe2c95565..52495a6b330d9 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ 
b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -1319,6 +1319,7 @@ int tokenize_file(struct llama_context * lctx, const char * filename, std::vecto std::vector<char> buf; buf.resize(size+1); + out.resize(size+1); if (std::fread(buf.data(), size, 1, fp) != 1) { throw std::runtime_error(std::string("unexpectedly reached end of file")); @@ -1332,8 +1333,10 @@ int tokenize_file(struct llama_context * lctx, const char * filename, std::vecto int n_tokens = llama_tokenize(lctx, buf.data(), out.data(), out.size(), false); if (n_tokens < 0) { out.resize(-n_tokens); - llama_tokenize(lctx, buf.data(), out.data(), out.size(), false); + n_tokens = llama_tokenize(lctx, buf.data(), out.data(), out.size(), false); } + GGML_ASSERT(n_tokens >= 0); + out.resize(n_tokens); bool verify = false; if (verify) { From 4882ff0c595b38199070022a1fd6efcc86cda820 Mon Sep 17 00:00:00 2001 From: xaedes Date: Mon, 28 Aug 2023 01:47:45 +0200 Subject: [PATCH 082/100] bug fixes in load_llama_model_gguf --- .../train-text-from-scratch/train-text-from-scratch.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index 52495a6b330d9..5d5e486332027 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -1665,18 +1665,18 @@ void load_llama_model_gguf(struct gguf_context * fctx, struct ggml_context * f_g std::vector<char> keybuf; keybuf.resize(512); - auto kv = [arch, &keybuf](const char * key) -> const char * { + auto kv = [&arch, &keybuf](const char * key) -> const char * { snprintf(keybuf.data(), keybuf.size(), key, arch.c_str()); return keybuf.data(); }; std::vector<char> tn_buf; tn_buf.resize(GGML_MAX_NAME); - auto tn = [arch, &tn_buf](const char * key) -> const char * { + auto tn = [&arch, &tn_buf](const char * key) -> const char * { snprintf(tn_buf.data(), tn_buf.size(), "%s.weight", key); return tn_buf.data(); }; - auto tni = [arch, &tn_buf](const char * key, int bid) -> const char * { + auto tni = [&arch, &tn_buf](const char * key, int bid) -> const char * { snprintf(tn_buf.data(), tn_buf.size(), key, bid); std::string s = tn_buf.data(); snprintf(tn_buf.data(), tn_buf.size(), "%s.weight", s.c_str()); From 152cfaac3643c976321695eef97a54f1d837b81b Mon Sep 17 00:00:00 2001 From: xaedes Date: Mon, 28 Aug 2023 01:48:21 +0200 Subject: [PATCH 083/100] bug fix: init model when no checkpoint was loaded --- examples/train-text-from-scratch/train-text-from-scratch.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index 5d5e486332027..18683f18f2c6b 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -2594,6 +2594,9 @@ int main(int argc, char ** argv) { printf("%s: init model\n", __func__); bool existed = load_checkpoint_file(params.fn_checkpoint_in, &model, opt); + if (!existed) { + init_model(&model); + } set_param_model(&model); opt->params = params.use_adam ?
opt_params_adam : opt_params_lbfgs; From 1f83343498e7630cec9ddaf2cea5fc3352233c28 Mon Sep 17 00:00:00 2001 From: xaedes Date: Mon, 28 Aug 2023 02:02:05 +0200 Subject: [PATCH 084/100] bug fix in read_tensor_by_name --- examples/train-text-from-scratch/train-text-from-scratch.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index 18683f18f2c6b..4f0abca5acaac 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -1526,7 +1526,7 @@ void read_tensor_by_name(struct ggml_tensor * dst, struct ggml_context * ctx, co if (dst == NULL) { return; } - struct ggml_tensor * t = ggml_get_tensor(ctx, LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS); + struct ggml_tensor * t = ggml_get_tensor(ctx, name); GGML_ASSERT(are_same_layout(dst, t)); memcpy(dst->data, t->data, ggml_nbytes(t)); } From 3d8d88404986ce07c8e8e6382b638e5b1dacbcde Mon Sep 17 00:00:00 2001 From: xaedes Date: Mon, 28 Aug 2023 15:07:00 +0200 Subject: [PATCH 085/100] bug fix in load_opt_context_gguf --- examples/train-text-from-scratch/train-text-from-scratch.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index 4f0abca5acaac..ade3a64ea8dfa 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -1561,8 +1561,8 @@ void load_opt_context_gguf(struct gguf_context * fctx, struct ggml_context * f_g ggml_opt_init(opt->ctx, opt, opt->params, opt->nx); read_tensor_by_name(opt->adam.m, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS); - read_tensor_by_name(opt->adam.v, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS); - read_tensor_by_name(opt->adam.pf, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS); + read_tensor_by_name(opt->adam.v, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS); + read_tensor_by_name(opt->adam.pf, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES); } else if (opt_type == LLM_KV_OPTIMIZER_TYPE_LBFGS) { opt->params.type = GGML_OPT_LBFGS; From e86b3e3257c614e7eac8c2ffe03d007976845708 Mon Sep 17 00:00:00 2001 From: xaedes Date: Mon, 28 Aug 2023 15:26:44 +0200 Subject: [PATCH 086/100] avoid printing lots of spaced on the unusual case that loss gets nan --- examples/train-text-from-scratch/train-text-from-scratch.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index ade3a64ea8dfa..a18f8f828522e 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -2457,7 +2457,7 @@ void opt_callback(void * vdata, float * sched) { float min_sched = params->adam_min_alpha / params->adam_alpha; *sched = min_sched + *sched * (1.0f - min_sched); - int impr_plot = -(int)(1 + (opt->loss_before - opt->loss_after) * 10.0f + 0.5f); + int impr_plot = std::isnan(opt->loss_after) ? 
0 : -(int)(1 + (opt->loss_before - opt->loss_after) * 10.0f + 0.5f); printf("%s: iter=%*d, sched=%f loss0=%f loss=%f | improvement: %*d>\n", __func__, 6, opt->iter, *sched, opt->loss_before, opt->loss_after, impr_plot, (int)0); if (data->shuffle_countdown < n_batch) { From daa0b6c6a4af4ec97a2ad317c97249ff7e5295fa Mon Sep 17 00:00:00 2001 From: xaedes Date: Mon, 28 Aug 2023 15:27:26 +0200 Subject: [PATCH 087/100] set name of tensors with empty name from what was read from gguf --- examples/train-text-from-scratch/train-text-from-scratch.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index a18f8f828522e..6d25edd78099c 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -1529,6 +1529,10 @@ void read_tensor_by_name(struct ggml_tensor * dst, struct ggml_context * ctx, co struct ggml_tensor * t = ggml_get_tensor(ctx, name); GGML_ASSERT(are_same_layout(dst, t)); memcpy(dst->data, t->data, ggml_nbytes(t)); + + if (strlen(ggml_get_name(dst)) == 0) { + ggml_set_name(dst, name); + } } void load_opt_context_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct ggml_opt_context * opt) { From f97f92bce575f108710835fbeef0667d09e86961 Mon Sep 17 00:00:00 2001 From: xaedes Date: Mon, 28 Aug 2023 15:28:19 +0200 Subject: [PATCH 088/100] remove trailing whitespace --- .../train-text-from-scratch/train-text-from-scratch.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index 6d25edd78099c..770e1a1c2dc9a 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -1294,7 +1294,7 @@ int tokenize_file(struct llama_context * lctx, const char * filename, std::vecto FILE * fp = std::fopen(filename, "rb"); if (fp == NULL) { return 0; - } + } #ifdef _WIN32 GGML_ASSERT(_fseeki64(fp, (__int64) 0, SEEK_END) == 0); @@ -1666,7 +1666,7 @@ void save_opt_context_gguf(struct gguf_context * fctx, struct ggml_opt_context * void load_llama_model_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct my_llama_model * model) { // NOTE: gguf_context must be initialized with f_ggml_ctx and no_alloc=false, otherwise tensor data can not be read std::string arch; - + std::vector keybuf; keybuf.resize(512); auto kv = [&arch, &keybuf](const char * key) -> const char * { @@ -1701,10 +1701,10 @@ void load_llama_model_gguf(struct gguf_context * fctx, struct ggml_context * f_g GGUF_GET_KEY(fctx, model->hparams.n_ff, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_FEED_FORWARD_LENGTH)); GGUF_GET_KEY(fctx, model->hparams.n_head, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_ATTENTION_HEAD_COUNT)); GGUF_GET_KEY(fctx, model->hparams.n_layer, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_BLOCK_COUNT)); - + model->hparams.n_rot = model->hparams.n_embd / model->hparams.n_head; GGUF_GET_KEY(fctx, model->hparams.n_rot, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_ROPE_DIMENSION_COUNT)); - + float rope_freq_scale = 1.0f; GGUF_GET_KEY(fctx, model->hparams.f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS)); GGUF_GET_KEY(fctx, model->hparams.rope_freq_base, gguf_get_val_f32, 
GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_FREQ_BASE)); From c690c203628b8c23b2ad663de4244507c85d0ccf Mon Sep 17 00:00:00 2001 From: xaedes Date: Mon, 28 Aug 2023 16:09:53 +0200 Subject: [PATCH 089/100] print data checksums before saving and after loading to verify correctness --- .../train-text-from-scratch.cpp | 87 +++++++++++++++++++ 1 file changed, 87 insertions(+) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index 770e1a1c2dc9a..9db0f1afaa834 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -18,6 +18,53 @@ #pragma warning(disable: 4244 4267) // possible loss of data #endif +uint32_t compute_data_checksum(struct ggml_tensor * tensor) { + const int n3 = (tensor->n_dims >= 3) ? tensor->ne[3] : 1; + const int n2 = (tensor->n_dims >= 2) ? tensor->ne[2] : 1; + const int n1 = (tensor->n_dims >= 1) ? tensor->ne[1] : 1; + const int n0 = (tensor->n_dims >= 0) ? tensor->ne[0] : 1; + const size_t nb0 = tensor->nb[0]; + const size_t nb1 = tensor->nb[1]; + const size_t nb2 = tensor->nb[2]; + const size_t nb3 = tensor->nb[3]; + const size_t nb = ggml_element_size(tensor); + uint32_t result = 0; + for (int i3 = 0; i3 < n3; ++i3) { + for (int i2 = 0; i2 < n2; ++i2) { + for (int i1 = 0; i1 < n1; ++i1) { + for (int i0 = 0; i0 < n0; ++i0) { + char * ptr = ((char *) tensor->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3); + uint32_t val; + memcpy(&val, ptr, nb); + result = result ^ val; + result = (((result << 1u) | ((result >> 31u) & 0x1u)) + 1u) & 0xffffffffu; + } + } + } + } + return result; +} + +void print_data_checksum(struct ggml_tensor * tensor) { + uint32_t chk = compute_data_checksum(tensor); + printf("%s: chk=[%08x] data=[%p] name=%s\n", __func__, chk, tensor->data, ggml_get_name(tensor)); +} + +void print_data_checksums(struct ggml_cgraph * g) { + for (int i = 0; i < g->n_nodes; ++i) { + struct ggml_tensor * node = g->nodes[i]; + for (int j = 0; j < GGML_MAX_SRC; ++j) { + if (node->src[j]) { + struct ggml_tensor * src = node->src[j]; + uint32_t chk = compute_data_checksum(src); + printf("%s: node[%3d]->src[%d] chk=[%08x] data=[%p] op=%s name=%s\n", __func__, i, j, chk, src->data, ggml_op_name(src->op), ggml_get_name(src)); + } + } + uint32_t chk = compute_data_checksum(node); + printf("%s: node[%3d] chk=[%08x] data=[%p] op=%s name=%s\n", __func__, i, chk, node->data, ggml_op_name(node->op), ggml_get_name(node)); + } +} + struct random_normal_distribution { std::mt19937 gen; std::normal_distribution<float> rd; @@ -1567,6 +1614,12 @@ void load_opt_context_gguf(struct gguf_context * fctx, struct ggml_context * f_g read_tensor_by_name(opt->adam.m, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS); read_tensor_by_name(opt->adam.v, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS); read_tensor_by_name(opt->adam.pf, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES); + + print_data_checksum(opt->adam.m); + print_data_checksum(opt->adam.v); + if (opt->adam.pf) { + print_data_checksum(opt->adam.pf); + } } else if (opt_type == LLM_KV_OPTIMIZER_TYPE_LBFGS) { opt->params.type = GGML_OPT_LBFGS; @@ -1617,6 +1670,12 @@ void save_opt_context_gguf(struct gguf_context * fctx, struct ggml_opt_context * ggml_set_name(opt->adam.pf, LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES); } + print_data_checksum(opt->adam.m); + print_data_checksum(opt->adam.v); + if (opt->adam.pf) { + print_data_checksum(opt->adam.pf); + } + gguf_add_tensor(fctx, opt->adam.m); gguf_add_tensor(fctx,
opt->adam.v); if (opt->adam.pf) { @@ -1719,6 +1778,10 @@ void load_llama_model_gguf(struct gguf_context * fctx, struct ggml_context * f_g read_tensor_by_name(model->norm, f_ggml_ctx, tn(LLM_TENSOR_OUTPUT_NORM)); read_tensor_by_name(model->output, f_ggml_ctx, tn(LLM_TENSOR_OUTPUT)); + print_data_checksum(model->tok_embeddings); + print_data_checksum(model->norm); + print_data_checksum(model->output); + for (uint32_t i = 0; i < model->hparams.n_layer; ++i) { auto & layer = model->layers[i]; @@ -1731,6 +1794,16 @@ void load_llama_model_gguf(struct gguf_context * fctx, struct ggml_context * f_g read_tensor_by_name(layer.w1, f_ggml_ctx, tni(LLM_TENSOR_FFN_GATE, i)); read_tensor_by_name(layer.w2, f_ggml_ctx, tni(LLM_TENSOR_FFN_DOWN, i)); read_tensor_by_name(layer.w3, f_ggml_ctx, tni(LLM_TENSOR_FFN_UP, i)); + + print_data_checksum(layer.attention_norm); + print_data_checksum(layer.wq); + print_data_checksum(layer.wk); + print_data_checksum(layer.wv); + print_data_checksum(layer.wo); + print_data_checksum(layer.ffn_norm); + print_data_checksum(layer.w1); + print_data_checksum(layer.w2); + print_data_checksum(layer.w3); } } @@ -1857,6 +1930,10 @@ void save_llama_model_gguf(struct gguf_context * fctx, const char * fn_vocab_mod gguf_free(vctx); } + print_data_checksum(model->tok_embeddings); + print_data_checksum(model->norm); + print_data_checksum(model->output); + // add tensors gguf_add_tensor(fctx, model->tok_embeddings); gguf_add_tensor(fctx, model->norm); @@ -1864,6 +1941,16 @@ void save_llama_model_gguf(struct gguf_context * fctx, const char * fn_vocab_mod for (uint32_t i = 0; i < model->hparams.n_layer; ++i) { auto & layer = model->layers[i]; + print_data_checksum(layer.attention_norm); + print_data_checksum(layer.wq); + print_data_checksum(layer.wk); + print_data_checksum(layer.wv); + print_data_checksum(layer.wo); + print_data_checksum(layer.ffn_norm); + print_data_checksum(layer.w1); + print_data_checksum(layer.w2); + print_data_checksum(layer.w3); + gguf_add_tensor(fctx, layer.attention_norm); gguf_add_tensor(fctx, layer.wq); gguf_add_tensor(fctx, layer.wk); From 5f27ade48e38087f87c8d0fe7872cc2e1b864311 Mon Sep 17 00:00:00 2001 From: xaedes Date: Mon, 28 Aug 2023 17:57:10 +0200 Subject: [PATCH 090/100] bug fixes for convert-train-checkpoint-to-gguf --- .../convert-train-checkpoint-to-gguf.py | 180 +++++++++--------- 1 file changed, 91 insertions(+), 89 deletions(-) diff --git a/examples/train-text-from-scratch/convert-train-checkpoint-to-gguf.py b/examples/train-text-from-scratch/convert-train-checkpoint-to-gguf.py index d7ea4e6fe590a..a69a9687d5494 100644 --- a/examples/train-text-from-scratch/convert-train-checkpoint-to-gguf.py +++ b/examples/train-text-from-scratch/convert-train-checkpoint-to-gguf.py @@ -4,7 +4,9 @@ import argparse import gguf import os +import struct import sys +import numpy as np from pathlib import Path # gguf constants @@ -55,7 +57,10 @@ def __init__(self, dtype='f', ne=None): self.ne = ne self.nbytes = 0 if self.dtype == 'f': - self.nbytes = product(self.ne) * 4 + if len(self.ne) == 0: + self.nbytes = 0 + else: + self.nbytes = int(np.product(self.ne)) * 4 else: raise ValueError(f"Unhandled data type '{self.dtype}'") @@ -67,7 +72,7 @@ def load(self, data, offset): assert(nd == len(self.ne)) ne = [] for d in range(nd): - n = struct.unpack(' 0 else []) self.lbfgs_x = Tensor('f', [self.nx]) self.lbfgs_xp = Tensor('f', [self.nx]) self.lbfgs_g = Tensor('f', [self.nx]) self.lbfgs_gp = Tensor('f', [self.nx]) self.lbfgs_d = Tensor('f', [self.nx]) - self.lbfgs_pf = 
Tensor('f', [self.past]) + self.lbfgs_pf = Tensor('f', [self.past] if self.past > 0 else []) self.lbfgs_lmal = Tensor('f', [self.lbfgs_m]) self.lbfgs_lmys = Tensor('f', [self.lbfgs_m]) self.lbfgs_lms = Tensor('f', [self.nx, self.lbfgs_m]) @@ -174,32 +179,30 @@ def load(self, data, offset): mh = Tensor('f', [self.nx]) mv = Tensor('f', [self.nx]) - offset += x.load(data, offset) - offset += g.load(data, offset) - offset += g2.load(data, offset) - offset += self.adam_m.load(data, offset) - offset += self.adam_v.load(data, offset) - offset += mh.load(data, offset) - offset += vh.load(data, offset) - if self.past > 0: - offset += self.adam_pf.load(data, offset) + offset = x.load(data, offset) + offset = g.load(data, offset) + offset = g2.load(data, offset) + offset = self.adam_m.load(data, offset) + offset = self.adam_v.load(data, offset) + offset = mh.load(data, offset) + offset = vh.load(data, offset) + offset = self.adam_pf.load(data, offset) self.adam_fx_best = struct.unpack(' 0: - offset += self.lbfgs_pf.load(data, offset) - offset += self.lbfgs_lmal.load(data, offset) - offset += self.lbfgs_lmys.load(data, offset) - offset += self.lbfgs_lms.load(data, offset) - offset += self.lbfgs_lmy.load(data, offset) + offset = self.lbfgs_x.load(data, offset) + offset = self.lbfgs_xp.load(data, offset) + offset = self.lbfgs_g.load(data, offset) + offset = self.lbfgs_gp.load(data, offset) + offset = self.lbfgs_d.load(data, offset) + offset = self.lbfgs_pf.load(data, offset) + offset = self.lbfgs_lmal.load(data, offset) + offset = self.lbfgs_lmys.load(data, offset) + offset = self.lbfgs_lms.load(data, offset) + offset = self.lbfgs_lmy.load(data, offset) self.lbfgs_fx_best = struct.unpack(' 0 else []) self.lbfgs_x = Tensor('f', [self.nx]) self.lbfgs_xp = Tensor('f', [self.nx]) self.lbfgs_g = Tensor('f', [self.nx]) self.lbfgs_gp = Tensor('f', [self.nx]) self.lbfgs_d = Tensor('f', [self.nx]) - self.lbfgs_pf = Tensor('f', [self.past]) + self.lbfgs_pf = Tensor('f', [self.past] if self.past > 0 else []) self.lbfgs_lmal = Tensor('f', [self.lbfgs_m]) self.lbfgs_lmys = Tensor('f', [self.lbfgs_m]) self.lbfgs_lms = Tensor('f', [self.nx, self.lbfgs_m]) @@ -237,14 +240,14 @@ def load(self, data, offset): # forgot to save type in version 1: # guess self.type from number of remaining bytes size_type_0 = 12 + sum([t.max_storage_size() for t in - [self.adam_m, self.adam_w] - +[self.adam_pf] if self.past > 0 else []]) + [self.adam_m, self.adam_v] + +([self.adam_pf] if (self.past > 0) else [])]) size_type_1 = 24 + sum([t.max_storage_size() for t in [self.lbfgs_x, self.lbfgs_xp, self.lbfgs_g, self.lbfgs_gp, self.lbfgs_d, self.lbfgs_pf, self.lbfgs_lmal, self.lbfgs_lmys, self.lbfgs_lms, self.lbfgs_lmy] - +[self.lbfgs_pf] if self.past > 0 else []]) + +([self.lbfgs_pf] if (self.past > 0) else [])]) # due to alignment padding the size might not by exact # but the difference in size for both types is significant, # so we can just use whichever is closest @@ -255,28 +258,25 @@ def load(self, data, offset): self.type = 1 if self.type == 0: - offset += self.adam_m.load(data, offset) - offset += self.adam_w.load(data, offset) - if self.past > 0: - offset += self.adam_pf.load(data,offset) + offset = self.adam_m.load(data, offset) + offset = self.adam_v.load(data, offset) + offset = self.adam_pf.load(data,offset) self.adam_fx_best = struct.unpack(' 0: - offset += self.lbfgs_pf.load(data, offset) - offset += self.lbfgs_lmal.load(data, offset) - offset += self.lbfgs_lmys.load(data, offset) - offset += self.lbfgs_lms.load(data, offset) - 
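# Note on the change below: Tensor.load() returns the updated absolute offset
# rather than a byte count, so the old `offset += t.load(data, offset)` pattern
# advanced the read cursor twice. The fix assigns the returned value directly,
# as in `offset = self.lbfgs_x.load(data, offset)`, keeping the cursor correct
# for every field that follows.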
offset += self.lbfgs_lmy.load(data, offset) + offset = self.lbfgs_x.load(data, offset) + offset = self.lbfgs_xp.load(data, offset) + offset = self.lbfgs_g.load(data, offset) + offset = self.lbfgs_gp.load(data, offset) + offset = self.lbfgs_d.load(data, offset) + offset = self.lbfgs_pf.load(data, offset) + offset = self.lbfgs_lmal.load(data, offset) + offset = self.lbfgs_lmys.load(data, offset) + offset = self.lbfgs_lms.load(data, offset) + offset = self.lbfgs_lmy.load(data, offset) self.lbfgs_fx_best = struct.unpack(' Date: Mon, 28 Aug 2023 18:17:51 +0200 Subject: [PATCH 091/100] temporarily add code to write old checkpoint files used to verify that old checkpoint files are correctly converted to gguf --- .../train-text-from-scratch.cpp | 324 ++++++++++++++++++ 1 file changed, 324 insertions(+) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index 9db0f1afaa834..b62c19540326f 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -2025,6 +2025,321 @@ void save_checkpoint_file(const char * filename, const char * fn_vocab_model, st gguf_free(fctx); } +struct llama_file { + // use FILE * so we don't have to re-open the file to mmap + FILE * fp; + size_t size; + + llama_file(const char * fname, const char * mode) { + fp = std::fopen(fname, mode); + if (fp == NULL) { + size = 0; + } else { + seek(0, SEEK_END); + size = tell(); + seek(0, SEEK_SET); + } + } + + size_t tell() const { +#ifdef _WIN32 + __int64 ret = _ftelli64(fp); +#else + long ret = std::ftell(fp); +#endif + GGML_ASSERT(ret != -1); // this really shouldn't fail + return (size_t) ret; + } + + void seek(size_t offset, int whence) { +#ifdef _WIN32 + int ret = _fseeki64(fp, (__int64) offset, whence); +#else + int ret = std::fseek(fp, (long) offset, whence); +#endif + GGML_ASSERT(ret == 0); // same + } + + void read_raw(void * ptr, size_t size) { + if (size == 0) { + return; + } + errno = 0; + std::size_t ret = std::fread(ptr, size, 1, fp); + if (ferror(fp)) { + throw std::runtime_error(format("read error: %s", strerror(errno))); + } + if (ret != 1) { + throw std::runtime_error(std::string("unexpectedly reached end of file")); + } + } + + std::uint32_t read_u32() { + std::uint32_t ret; + read_raw(&ret, sizeof(ret)); + return ret; + } + + std::string read_string(std::uint32_t len) { + std::vector chars(len); + read_raw(chars.data(), len); + return std::string(chars.data(), len); + } + + void write_raw(const void * ptr, size_t size) { + if (size == 0) { + return; + } + errno = 0; + size_t ret = std::fwrite(ptr, size, 1, fp); + if (ret != 1) { + throw std::runtime_error(format("write error: %s", strerror(errno))); + } + } + + void write_u32(std::uint32_t val) { + write_raw(&val, sizeof(val)); + } + + ~llama_file() { + if (fp) { + std::fclose(fp); + } + } +}; + +void write_tensor(struct llama_file * file, struct ggml_tensor * tensor) { + if (tensor == NULL) { + file->write_u32(0); + file->write_u32(0); + file->write_u32(GGML_TYPE_F32); + file->seek((0-file->tell()) & 31, SEEK_CUR); + printf("%s: write tensor name='%s' data offset='%zu' nbytes='%zu'\n", + __func__, "(empty tensor)", file->tell(), (size_t) 0); + return; + } + const char * name = ggml_get_name(tensor); + uint32_t name_len = strlen(name); + uint32_t nd = tensor->n_dims; + uint32_t ne[4] = { (uint32_t)tensor->ne[0], + (uint32_t)tensor->ne[1], + (uint32_t)tensor->ne[2], + (uint32_t)tensor->ne[3] }; + 
printf("%s: write tensor name='%s' begin offset='%zu'\n", + __func__, name, file->tell()); + file->write_u32(nd); + file->write_u32(name_len); + file->write_u32(tensor->type); + file->write_raw(ne, sizeof(ne[0]) * nd); + file->write_raw(name, name_len); + file->seek((0-file->tell()) & 31, SEEK_CUR); + printf("%s: write tensor name='%s' data offset='%zu' nbytes='%zu'\n", + __func__, name, file->tell(), ggml_nbytes(tensor)); + file->write_raw(tensor->data, ggml_nbytes(tensor)); +} + +struct ggml_opt_params_v0 { + enum ggml_opt_type type; + int n_threads; + int past; + float delta; + int max_no_improvement; + bool print_forward_graph; + bool print_backward_graph; + struct { + int n_iter; + float sched; + float decay; + float alpha; + float beta1; + float beta2; + float eps; + float eps_f; + float eps_g; + } adam; + struct { + int m; + int n_iter; + int max_linesearch; + float eps; + float ftol; + float wolfe; + float min_step; + float max_step; + enum ggml_linesearch linesearch; + } lbfgs; +}; + +void write_opt_context_v0(struct llama_file * file, struct ggml_opt_context * opt) { + const uint32_t version = 0; + GGML_ASSERT(opt->nx >= 0); + GGML_ASSERT(opt->iter >= 0); + file->write_u32(version); + ggml_opt_params_v0 params_v0; + params_v0.type = opt->params.type; + params_v0.n_threads = opt->params.n_threads; + params_v0.past = opt->params.past; + params_v0.delta = opt->params.delta; + params_v0.max_no_improvement = opt->params.max_no_improvement; + params_v0.print_forward_graph = opt->params.print_forward_graph; + params_v0.print_backward_graph = opt->params.print_backward_graph; + params_v0.adam.n_iter = opt->params.adam.n_iter; + params_v0.adam.sched = opt->params.adam.sched; + params_v0.adam.decay = opt->params.adam.decay; + params_v0.adam.alpha = opt->params.adam.alpha; + params_v0.adam.beta1 = opt->params.adam.beta1; + params_v0.adam.beta2 = opt->params.adam.beta2; + params_v0.adam.eps = opt->params.adam.eps; + params_v0.adam.eps_f = opt->params.adam.eps_f; + params_v0.adam.eps_g = opt->params.adam.eps_g; + params_v0.lbfgs.m = opt->params.lbfgs.m; + params_v0.lbfgs.n_iter = opt->params.lbfgs.n_iter; + params_v0.lbfgs.max_linesearch = opt->params.lbfgs.max_linesearch; + params_v0.lbfgs.eps = opt->params.lbfgs.eps; + params_v0.lbfgs.ftol = opt->params.lbfgs.ftol; + params_v0.lbfgs.wolfe = opt->params.lbfgs.wolfe; + params_v0.lbfgs.min_step = opt->params.lbfgs.min_step; + params_v0.lbfgs.max_step = opt->params.lbfgs.max_step; + file->write_raw(¶ms_v0, sizeof(params_v0)); + file->write_raw(&opt->nx, sizeof(opt->nx)); + file->write_raw(&opt->iter, sizeof(opt->iter)); + file->write_u32((uint32_t) opt->just_initialized); + switch (opt->params.type) { + case GGML_OPT_ADAM: + { + struct ggml_tensor * adam_x = ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, opt->nx); + struct ggml_tensor * adam_g1 = ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, opt->nx); + struct ggml_tensor * adam_g2 = ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, opt->nx); + struct ggml_tensor * adam_mh = ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, opt->nx); + struct ggml_tensor * adam_vh = ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, opt->nx); + write_tensor(file, adam_x); + write_tensor(file, adam_g1); + write_tensor(file, adam_g2); + write_tensor(file, opt->adam.m); + write_tensor(file, opt->adam.v); + write_tensor(file, adam_mh); + write_tensor(file, adam_vh); + write_tensor(file, opt->adam.pf); + file->write_raw(&opt->adam.fx_best, sizeof(opt->adam.fx_best)); + file->write_raw(&opt->adam.fx_prev, sizeof(opt->adam.fx_prev)); + 
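// note: adam_x, adam_g1, adam_g2, adam_mh and adam_vh above are freshly
// allocated placeholders; their values are discarded when the file is read
// back, and they exist only so the output matches the old version-0 layout,
// which stored all seven full-size Adam tensors. Only adam.m, adam.v and
// adam.pf carry real optimizer state.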
file->write_raw(&opt->adam.n_no_improvement, sizeof(opt->adam.n_no_improvement)); + } break; + case GGML_OPT_LBFGS: + { + write_tensor(file, opt->lbfgs.x); + write_tensor(file, opt->lbfgs.xp); + write_tensor(file, opt->lbfgs.g); + write_tensor(file, opt->lbfgs.gp); + write_tensor(file, opt->lbfgs.d); + write_tensor(file, opt->lbfgs.pf); + write_tensor(file, opt->lbfgs.lmal); + write_tensor(file, opt->lbfgs.lmys); + write_tensor(file, opt->lbfgs.lms); + write_tensor(file, opt->lbfgs.lmy); + file->write_raw(&opt->lbfgs.fx_best, sizeof(opt->lbfgs.fx_best)); + file->write_raw(&opt->lbfgs.step, sizeof(opt->lbfgs.step)); + file->write_raw(&opt->lbfgs.j, sizeof(opt->lbfgs.j)); + file->write_raw(&opt->lbfgs.k, sizeof(opt->lbfgs.k)); + file->write_raw(&opt->lbfgs.end, sizeof(opt->lbfgs.end)); + file->write_raw(&opt->lbfgs.n_no_improvement, sizeof(opt->lbfgs.n_no_improvement)); + } break; + } +} + +void write_opt_context_v1(struct llama_file * file, struct ggml_opt_context * opt) { + const uint32_t version = 1; + GGML_ASSERT(opt->nx >= 0); + GGML_ASSERT(opt->iter >= 0); + file->write_u32(version); + file->write_u32(opt->params.past); + file->write_u32(opt->params.lbfgs.m); + file->write_raw(&opt->nx, sizeof(opt->nx)); + file->write_raw(&opt->iter, sizeof(opt->iter)); + file->write_u32((uint32_t) opt->just_initialized); + switch (opt->params.type) { + case GGML_OPT_ADAM: + { + GGML_ASSERT(opt->adam.m != NULL); + GGML_ASSERT(opt->adam.v != NULL); + write_tensor(file, opt->adam.m); + write_tensor(file, opt->adam.v); + write_tensor(file, opt->adam.pf); + file->write_raw(&opt->adam.fx_best, sizeof(opt->adam.fx_best)); + file->write_raw(&opt->adam.fx_prev, sizeof(opt->adam.fx_prev)); + file->write_raw(&opt->adam.n_no_improvement, sizeof(opt->adam.n_no_improvement)); + } break; + case GGML_OPT_LBFGS: + { + GGML_ASSERT(opt->lbfgs.x != NULL); + write_tensor(file, opt->lbfgs.x); + write_tensor(file, opt->lbfgs.xp); + write_tensor(file, opt->lbfgs.g); + write_tensor(file, opt->lbfgs.gp); + write_tensor(file, opt->lbfgs.d); + write_tensor(file, opt->lbfgs.pf); + write_tensor(file, opt->lbfgs.lmal); + write_tensor(file, opt->lbfgs.lmys); + write_tensor(file, opt->lbfgs.lms); + write_tensor(file, opt->lbfgs.lmy); + file->write_raw(&opt->lbfgs.fx_best, sizeof(opt->lbfgs.fx_best)); + file->write_raw(&opt->lbfgs.step, sizeof(opt->lbfgs.step)); + file->write_raw(&opt->lbfgs.j, sizeof(opt->lbfgs.j)); + file->write_raw(&opt->lbfgs.k, sizeof(opt->lbfgs.k)); + file->write_raw(&opt->lbfgs.end, sizeof(opt->lbfgs.end)); + file->write_raw(&opt->lbfgs.n_no_improvement, sizeof(opt->lbfgs.n_no_improvement)); + } break; + } +} + +void save_checkpoint(struct my_llama_model * model, struct ggml_opt_context * opt, const char * filename, int opt_version) { + struct llama_file file(filename, "wb"); + if (file.fp == NULL) { + return; + } + + const uint32_t magic = 'ggcp'; + const uint32_t version = 0; + + file.write_u32(magic); + file.write_u32(version); + file.write_u32(model->train_its); + file.write_u32(model->train_samples); + file.write_u32(model->train_tokens); + file.write_u32(model->hparams.n_vocab); + file.write_u32(model->hparams.n_embd); + file.write_u32(/*model->hparams.n_mult*/ 256); + file.write_u32(model->hparams.n_head); + file.write_u32(model->hparams.n_layer); + file.write_u32(model->hparams.n_rot); + + write_tensor(&file, model->tok_embeddings); + write_tensor(&file, model->norm); + write_tensor(&file, model->output); + + for (uint32_t i = 0; i < model->hparams.n_layer; ++i) { + auto & layer = model->layers[i]; + + 
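// note: write_tensor stores name, shape and 32-byte aligned data for each
// tensor, but the existing readers consume the tensors strictly in sequence,
// so the order of the writes below must stay in sync with the code that reads
// these old-format files back.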
write_tensor(&file, layer.attention_norm); + write_tensor(&file, layer.wq); + write_tensor(&file, layer.wk); + write_tensor(&file, layer.wv); + write_tensor(&file, layer.wo); + write_tensor(&file, layer.ffn_norm); + write_tensor(&file, layer.w1); + write_tensor(&file, layer.w2); + write_tensor(&file, layer.w3); + } + + if (opt_version == 0) { + write_opt_context_v0(&file, opt); + } else { + write_opt_context_v1(&file, opt); + } + + printf("%s: all written offset='%zu'\n", + __func__, file.tell()); + +} float cosine_decay(const int decay_steps, const float minimum, int step) { if (step > decay_steps) { step = decay_steps; @@ -2875,6 +3190,15 @@ int main(int argc, char ** argv) { printf("%s: total training time=%f seconds\n", __func__, dd); if (params.n_examples > 0) { + for (int opt_version = 0; opt_version < 2; ++opt_version) { + std::string fn_checkpoint_out_old = ( + std::string(params.fn_checkpoint_out) + + std::string(".") + + std::to_string(opt_version) + + std::string(".old.bin")); + save_checkpoint(&model, opt, fn_checkpoint_out_old.c_str(), opt_version); + } + save_checkpoint_file(params.fn_checkpoint_out, params.fn_vocab_model, &model, opt); } From 31c093c2ccc9b602755817569750bfe196a446f4 Mon Sep 17 00:00:00 2001 From: xaedes Date: Mon, 28 Aug 2023 18:33:00 +0200 Subject: [PATCH 092/100] bug fixes for convert-train-checkpoint-to-gguf.py loading checkpoints with opt_version=0 --- .../convert-train-checkpoint-to-gguf.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/train-text-from-scratch/convert-train-checkpoint-to-gguf.py b/examples/train-text-from-scratch/convert-train-checkpoint-to-gguf.py index a69a9687d5494..03e6d62450f79 100644 --- a/examples/train-text-from-scratch/convert-train-checkpoint-to-gguf.py +++ b/examples/train-text-from-scratch/convert-train-checkpoint-to-gguf.py @@ -116,8 +116,8 @@ def load(self, data, offset): self.n_threads = struct.unpack(' Date: Mon, 28 Aug 2023 18:38:52 +0200 Subject: [PATCH 093/100] remove code used to verify correctness of checkpoint file conversion --- .../train-text-from-scratch.cpp | 410 ------------------ 1 file changed, 410 deletions(-) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index b62c19540326f..aafdbff747d8f 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -18,53 +18,6 @@ #pragma warning(disable: 4244 4267) // possible loss of data #endif -uint32_t compute_data_checksum(struct ggml_tensor * tensor) { - const int n3 = (tensor->n_dims >= 3) ? tensor->ne[3] : 1; - const int n2 = (tensor->n_dims >= 2) ? tensor->ne[2] : 1; - const int n1 = (tensor->n_dims >= 1) ? tensor->ne[1] : 1; - const int n0 = (tensor->n_dims >= 0) ? 
tensor->ne[0] : 1; - const size_t nb0 = tensor->nb[0]; - const size_t nb1 = tensor->nb[1]; - const size_t nb2 = tensor->nb[2]; - const size_t nb3 = tensor->nb[3]; - const size_t nb = ggml_element_size(tensor); - uint32_t result = 0; - for (int i3 = 0; i3 < n3; ++i3) { - for (int i2 = 0; i2 < n2; ++i2) { - for (int i1 = 0; i1 < n1; ++i1) { - for (int i0 = 0; i0 < n0; ++i0) { - char * ptr = ((char *) tensor->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3); - uint32_t val; - memcpy(&val, ptr, nb); - result = result ^ val; - result = (((result << 1u) | ((result >> 31u) & 0x1u)) + 1u) & 0xffffffffu; - } - } - } - } - return result; -} - -void print_data_checksum(struct ggml_tensor * tensor) { - uint32_t chk = compute_data_checksum(tensor); - printf("%s: chk=[%08x] data=[%p] name=%s\n", __func__, chk, tensor->data, ggml_get_name(tensor)); -} - -void print_data_checksums(struct ggml_cgraph * g) { - for (int i = 0; i < g->n_nodes; ++i) { - struct ggml_tensor * node = g->nodes[i]; - for (int j = 0; jsrc[j]) { - struct ggml_tensor * src = node->src[j]; - uint32_t chk = compute_data_checksum(src); - printf("%s: node[%3d]->src[%d] chk=[%08x] data=[%p] op=%s name=%s\n", __func__, i, j, chk, src->data, ggml_op_name(src->op), ggml_get_name(src)); - } - } - uint32_t chk = compute_data_checksum(node); - printf("%s: node[%3d] chk=[%08x] data=[%p] op=%s name=%s\n", __func__, i, chk, node->data, ggml_op_name(node->op), ggml_get_name(node)); - } -} - struct random_normal_distribution { std::mt19937 gen; std::normal_distribution rd; @@ -1614,12 +1567,6 @@ void load_opt_context_gguf(struct gguf_context * fctx, struct ggml_context * f_g read_tensor_by_name(opt->adam.m, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS); read_tensor_by_name(opt->adam.v, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS); read_tensor_by_name(opt->adam.pf, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES); - - print_data_checksum(opt->adam.m); - print_data_checksum(opt->adam.v); - if (opt->adam.pf) { - print_data_checksum(opt->adam.pf); - } } else if (opt_type == LLM_KV_OPTIMIZER_TYPE_LBFGS) { opt->params.type = GGML_OPT_LBFGS; @@ -1670,12 +1617,6 @@ void save_opt_context_gguf(struct gguf_context * fctx, struct ggml_opt_context * ggml_set_name(opt->adam.pf, LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES); } - print_data_checksum(opt->adam.m); - print_data_checksum(opt->adam.v); - if (opt->adam.pf) { - print_data_checksum(opt->adam.pf); - } - gguf_add_tensor(fctx, opt->adam.m); gguf_add_tensor(fctx, opt->adam.v); if (opt->adam.pf) { @@ -1778,10 +1719,6 @@ void load_llama_model_gguf(struct gguf_context * fctx, struct ggml_context * f_g read_tensor_by_name(model->norm, f_ggml_ctx, tn(LLM_TENSOR_OUTPUT_NORM)); read_tensor_by_name(model->output, f_ggml_ctx, tn(LLM_TENSOR_OUTPUT)); - print_data_checksum(model->tok_embeddings); - print_data_checksum(model->norm); - print_data_checksum(model->output); - for (uint32_t i = 0; i < model->hparams.n_layer; ++i) { auto & layer = model->layers[i]; @@ -1794,16 +1731,6 @@ void load_llama_model_gguf(struct gguf_context * fctx, struct ggml_context * f_g read_tensor_by_name(layer.w1, f_ggml_ctx, tni(LLM_TENSOR_FFN_GATE, i)); read_tensor_by_name(layer.w2, f_ggml_ctx, tni(LLM_TENSOR_FFN_DOWN, i)); read_tensor_by_name(layer.w3, f_ggml_ctx, tni(LLM_TENSOR_FFN_UP, i)); - - print_data_checksum(layer.attention_norm); - print_data_checksum(layer.wq); - print_data_checksum(layer.wk); - print_data_checksum(layer.wv); - print_data_checksum(layer.wo); - print_data_checksum(layer.ffn_norm); - 
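// note on the checksums being removed here: compute_data_checksum walked every
// element via the nb[] strides, XOR-ed each value into a 32-bit accumulator,
// then rotated the accumulator left by one bit and added 1, so the result is
// sensitive to element order as well as content. Printing it before saving and
// after loading made silent corruption in the checkpoint round trip visible;
// with the conversion checked, the debug output is dropped again.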
print_data_checksum(layer.w1); - print_data_checksum(layer.w2); - print_data_checksum(layer.w3); } } @@ -1930,10 +1857,6 @@ void save_llama_model_gguf(struct gguf_context * fctx, const char * fn_vocab_mod gguf_free(vctx); } - print_data_checksum(model->tok_embeddings); - print_data_checksum(model->norm); - print_data_checksum(model->output); - // add tensors gguf_add_tensor(fctx, model->tok_embeddings); gguf_add_tensor(fctx, model->norm); @@ -1941,15 +1864,6 @@ void save_llama_model_gguf(struct gguf_context * fctx, const char * fn_vocab_mod for (uint32_t i = 0; i < model->hparams.n_layer; ++i) { auto & layer = model->layers[i]; - print_data_checksum(layer.attention_norm); - print_data_checksum(layer.wq); - print_data_checksum(layer.wk); - print_data_checksum(layer.wv); - print_data_checksum(layer.wo); - print_data_checksum(layer.ffn_norm); - print_data_checksum(layer.w1); - print_data_checksum(layer.w2); - print_data_checksum(layer.w3); gguf_add_tensor(fctx, layer.attention_norm); gguf_add_tensor(fctx, layer.wq); @@ -2025,321 +1939,6 @@ void save_checkpoint_file(const char * filename, const char * fn_vocab_model, st gguf_free(fctx); } -struct llama_file { - // use FILE * so we don't have to re-open the file to mmap - FILE * fp; - size_t size; - - llama_file(const char * fname, const char * mode) { - fp = std::fopen(fname, mode); - if (fp == NULL) { - size = 0; - } else { - seek(0, SEEK_END); - size = tell(); - seek(0, SEEK_SET); - } - } - - size_t tell() const { -#ifdef _WIN32 - __int64 ret = _ftelli64(fp); -#else - long ret = std::ftell(fp); -#endif - GGML_ASSERT(ret != -1); // this really shouldn't fail - return (size_t) ret; - } - - void seek(size_t offset, int whence) { -#ifdef _WIN32 - int ret = _fseeki64(fp, (__int64) offset, whence); -#else - int ret = std::fseek(fp, (long) offset, whence); -#endif - GGML_ASSERT(ret == 0); // same - } - - void read_raw(void * ptr, size_t size) { - if (size == 0) { - return; - } - errno = 0; - std::size_t ret = std::fread(ptr, size, 1, fp); - if (ferror(fp)) { - throw std::runtime_error(format("read error: %s", strerror(errno))); - } - if (ret != 1) { - throw std::runtime_error(std::string("unexpectedly reached end of file")); - } - } - - std::uint32_t read_u32() { - std::uint32_t ret; - read_raw(&ret, sizeof(ret)); - return ret; - } - - std::string read_string(std::uint32_t len) { - std::vector chars(len); - read_raw(chars.data(), len); - return std::string(chars.data(), len); - } - - void write_raw(const void * ptr, size_t size) { - if (size == 0) { - return; - } - errno = 0; - size_t ret = std::fwrite(ptr, size, 1, fp); - if (ret != 1) { - throw std::runtime_error(format("write error: %s", strerror(errno))); - } - } - - void write_u32(std::uint32_t val) { - write_raw(&val, sizeof(val)); - } - - ~llama_file() { - if (fp) { - std::fclose(fp); - } - } -}; - -void write_tensor(struct llama_file * file, struct ggml_tensor * tensor) { - if (tensor == NULL) { - file->write_u32(0); - file->write_u32(0); - file->write_u32(GGML_TYPE_F32); - file->seek((0-file->tell()) & 31, SEEK_CUR); - printf("%s: write tensor name='%s' data offset='%zu' nbytes='%zu'\n", - __func__, "(empty tensor)", file->tell(), (size_t) 0); - return; - } - const char * name = ggml_get_name(tensor); - uint32_t name_len = strlen(name); - uint32_t nd = tensor->n_dims; - uint32_t ne[4] = { (uint32_t)tensor->ne[0], - (uint32_t)tensor->ne[1], - (uint32_t)tensor->ne[2], - (uint32_t)tensor->ne[3] }; - printf("%s: write tensor name='%s' begin offset='%zu'\n", - __func__, name, file->tell()); - 
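// note: the (0 - file->tell()) & 31 seeks in this helper pad the stream out to
// the next 32-byte boundary; for example a position of 37 yields (0 - 37) & 31
// == 27 padding bytes, so the tensor data starts at offset 64. The helper was
// added two patches earlier only to emit old-format files for verifying the
// converter, so it is deleted here together with the rest of that scaffolding.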
file->write_u32(nd); - file->write_u32(name_len); - file->write_u32(tensor->type); - file->write_raw(ne, sizeof(ne[0]) * nd); - file->write_raw(name, name_len); - file->seek((0-file->tell()) & 31, SEEK_CUR); - printf("%s: write tensor name='%s' data offset='%zu' nbytes='%zu'\n", - __func__, name, file->tell(), ggml_nbytes(tensor)); - file->write_raw(tensor->data, ggml_nbytes(tensor)); -} - -struct ggml_opt_params_v0 { - enum ggml_opt_type type; - int n_threads; - int past; - float delta; - int max_no_improvement; - bool print_forward_graph; - bool print_backward_graph; - struct { - int n_iter; - float sched; - float decay; - float alpha; - float beta1; - float beta2; - float eps; - float eps_f; - float eps_g; - } adam; - struct { - int m; - int n_iter; - int max_linesearch; - float eps; - float ftol; - float wolfe; - float min_step; - float max_step; - enum ggml_linesearch linesearch; - } lbfgs; -}; - -void write_opt_context_v0(struct llama_file * file, struct ggml_opt_context * opt) { - const uint32_t version = 0; - GGML_ASSERT(opt->nx >= 0); - GGML_ASSERT(opt->iter >= 0); - file->write_u32(version); - ggml_opt_params_v0 params_v0; - params_v0.type = opt->params.type; - params_v0.n_threads = opt->params.n_threads; - params_v0.past = opt->params.past; - params_v0.delta = opt->params.delta; - params_v0.max_no_improvement = opt->params.max_no_improvement; - params_v0.print_forward_graph = opt->params.print_forward_graph; - params_v0.print_backward_graph = opt->params.print_backward_graph; - params_v0.adam.n_iter = opt->params.adam.n_iter; - params_v0.adam.sched = opt->params.adam.sched; - params_v0.adam.decay = opt->params.adam.decay; - params_v0.adam.alpha = opt->params.adam.alpha; - params_v0.adam.beta1 = opt->params.adam.beta1; - params_v0.adam.beta2 = opt->params.adam.beta2; - params_v0.adam.eps = opt->params.adam.eps; - params_v0.adam.eps_f = opt->params.adam.eps_f; - params_v0.adam.eps_g = opt->params.adam.eps_g; - params_v0.lbfgs.m = opt->params.lbfgs.m; - params_v0.lbfgs.n_iter = opt->params.lbfgs.n_iter; - params_v0.lbfgs.max_linesearch = opt->params.lbfgs.max_linesearch; - params_v0.lbfgs.eps = opt->params.lbfgs.eps; - params_v0.lbfgs.ftol = opt->params.lbfgs.ftol; - params_v0.lbfgs.wolfe = opt->params.lbfgs.wolfe; - params_v0.lbfgs.min_step = opt->params.lbfgs.min_step; - params_v0.lbfgs.max_step = opt->params.lbfgs.max_step; - file->write_raw(¶ms_v0, sizeof(params_v0)); - file->write_raw(&opt->nx, sizeof(opt->nx)); - file->write_raw(&opt->iter, sizeof(opt->iter)); - file->write_u32((uint32_t) opt->just_initialized); - switch (opt->params.type) { - case GGML_OPT_ADAM: - { - struct ggml_tensor * adam_x = ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, opt->nx); - struct ggml_tensor * adam_g1 = ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, opt->nx); - struct ggml_tensor * adam_g2 = ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, opt->nx); - struct ggml_tensor * adam_mh = ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, opt->nx); - struct ggml_tensor * adam_vh = ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, opt->nx); - write_tensor(file, adam_x); - write_tensor(file, adam_g1); - write_tensor(file, adam_g2); - write_tensor(file, opt->adam.m); - write_tensor(file, opt->adam.v); - write_tensor(file, adam_mh); - write_tensor(file, adam_vh); - write_tensor(file, opt->adam.pf); - file->write_raw(&opt->adam.fx_best, sizeof(opt->adam.fx_best)); - file->write_raw(&opt->adam.fx_prev, sizeof(opt->adam.fx_prev)); - file->write_raw(&opt->adam.n_no_improvement, sizeof(opt->adam.n_no_improvement)); - } break; - 
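// note: the two removed writers differ only in the header and the Adam branch:
// version 0 serializes the whole ggml_opt_params struct and writes five
// placeholder tensors around adam.m and adam.v, while version 1 stores only
// past and lbfgs.m in the header plus the m, v and pf tensors; the LBFGS
// branch is identical in both versions.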
case GGML_OPT_LBFGS: - { - write_tensor(file, opt->lbfgs.x); - write_tensor(file, opt->lbfgs.xp); - write_tensor(file, opt->lbfgs.g); - write_tensor(file, opt->lbfgs.gp); - write_tensor(file, opt->lbfgs.d); - write_tensor(file, opt->lbfgs.pf); - write_tensor(file, opt->lbfgs.lmal); - write_tensor(file, opt->lbfgs.lmys); - write_tensor(file, opt->lbfgs.lms); - write_tensor(file, opt->lbfgs.lmy); - file->write_raw(&opt->lbfgs.fx_best, sizeof(opt->lbfgs.fx_best)); - file->write_raw(&opt->lbfgs.step, sizeof(opt->lbfgs.step)); - file->write_raw(&opt->lbfgs.j, sizeof(opt->lbfgs.j)); - file->write_raw(&opt->lbfgs.k, sizeof(opt->lbfgs.k)); - file->write_raw(&opt->lbfgs.end, sizeof(opt->lbfgs.end)); - file->write_raw(&opt->lbfgs.n_no_improvement, sizeof(opt->lbfgs.n_no_improvement)); - } break; - } -} - -void write_opt_context_v1(struct llama_file * file, struct ggml_opt_context * opt) { - const uint32_t version = 1; - GGML_ASSERT(opt->nx >= 0); - GGML_ASSERT(opt->iter >= 0); - file->write_u32(version); - file->write_u32(opt->params.past); - file->write_u32(opt->params.lbfgs.m); - file->write_raw(&opt->nx, sizeof(opt->nx)); - file->write_raw(&opt->iter, sizeof(opt->iter)); - file->write_u32((uint32_t) opt->just_initialized); - switch (opt->params.type) { - case GGML_OPT_ADAM: - { - GGML_ASSERT(opt->adam.m != NULL); - GGML_ASSERT(opt->adam.v != NULL); - write_tensor(file, opt->adam.m); - write_tensor(file, opt->adam.v); - write_tensor(file, opt->adam.pf); - file->write_raw(&opt->adam.fx_best, sizeof(opt->adam.fx_best)); - file->write_raw(&opt->adam.fx_prev, sizeof(opt->adam.fx_prev)); - file->write_raw(&opt->adam.n_no_improvement, sizeof(opt->adam.n_no_improvement)); - } break; - case GGML_OPT_LBFGS: - { - GGML_ASSERT(opt->lbfgs.x != NULL); - write_tensor(file, opt->lbfgs.x); - write_tensor(file, opt->lbfgs.xp); - write_tensor(file, opt->lbfgs.g); - write_tensor(file, opt->lbfgs.gp); - write_tensor(file, opt->lbfgs.d); - write_tensor(file, opt->lbfgs.pf); - write_tensor(file, opt->lbfgs.lmal); - write_tensor(file, opt->lbfgs.lmys); - write_tensor(file, opt->lbfgs.lms); - write_tensor(file, opt->lbfgs.lmy); - file->write_raw(&opt->lbfgs.fx_best, sizeof(opt->lbfgs.fx_best)); - file->write_raw(&opt->lbfgs.step, sizeof(opt->lbfgs.step)); - file->write_raw(&opt->lbfgs.j, sizeof(opt->lbfgs.j)); - file->write_raw(&opt->lbfgs.k, sizeof(opt->lbfgs.k)); - file->write_raw(&opt->lbfgs.end, sizeof(opt->lbfgs.end)); - file->write_raw(&opt->lbfgs.n_no_improvement, sizeof(opt->lbfgs.n_no_improvement)); - } break; - } -} - -void save_checkpoint(struct my_llama_model * model, struct ggml_opt_context * opt, const char * filename, int opt_version) { - struct llama_file file(filename, "wb"); - if (file.fp == NULL) { - return; - } - - const uint32_t magic = 'ggcp'; - const uint32_t version = 0; - - file.write_u32(magic); - file.write_u32(version); - file.write_u32(model->train_its); - file.write_u32(model->train_samples); - file.write_u32(model->train_tokens); - file.write_u32(model->hparams.n_vocab); - file.write_u32(model->hparams.n_embd); - file.write_u32(/*model->hparams.n_mult*/ 256); - file.write_u32(model->hparams.n_head); - file.write_u32(model->hparams.n_layer); - file.write_u32(model->hparams.n_rot); - - write_tensor(&file, model->tok_embeddings); - write_tensor(&file, model->norm); - write_tensor(&file, model->output); - - for (uint32_t i = 0; i < model->hparams.n_layer; ++i) { - auto & layer = model->layers[i]; - - write_tensor(&file, layer.attention_norm); - write_tensor(&file, layer.wq); - 
write_tensor(&file, layer.wk); - write_tensor(&file, layer.wv); - write_tensor(&file, layer.wo); - write_tensor(&file, layer.ffn_norm); - write_tensor(&file, layer.w1); - write_tensor(&file, layer.w2); - write_tensor(&file, layer.w3); - } - - if (opt_version == 0) { - write_opt_context_v0(&file, opt); - } else { - write_opt_context_v1(&file, opt); - } - - printf("%s: all written offset='%zu'\n", - __func__, file.tell()); - -} float cosine_decay(const int decay_steps, const float minimum, int step) { if (step > decay_steps) { step = decay_steps; @@ -3190,15 +2789,6 @@ int main(int argc, char ** argv) { printf("%s: total training time=%f seconds\n", __func__, dd); if (params.n_examples > 0) { - for (int opt_version = 0; opt_version < 2; ++opt_version) { - std::string fn_checkpoint_out_old = ( - std::string(params.fn_checkpoint_out) - + std::string(".") - + std::to_string(opt_version) - + std::string(".old.bin")); - save_checkpoint(&model, opt, fn_checkpoint_out_old.c_str(), opt_version); - } - save_checkpoint_file(params.fn_checkpoint_out, params.fn_vocab_model, &model, opt); } From 3155019b53ceb7c233026b81b3a52d462c6e9921 Mon Sep 17 00:00:00 2001 From: xaedes Date: Mon, 28 Aug 2023 18:39:50 +0200 Subject: [PATCH 094/100] remove trailing whitespace --- .../convert-train-checkpoint-to-gguf.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/examples/train-text-from-scratch/convert-train-checkpoint-to-gguf.py b/examples/train-text-from-scratch/convert-train-checkpoint-to-gguf.py index 03e6d62450f79..9d3b78e71b029 100644 --- a/examples/train-text-from-scratch/convert-train-checkpoint-to-gguf.py +++ b/examples/train-text-from-scratch/convert-train-checkpoint-to-gguf.py @@ -102,8 +102,8 @@ def max_storage_size(self): def save_gguf(self, gguf_writer, name): gguf_writer.add_tensor( - name=name, - tensor=self.data, + name=name, + tensor=self.data, raw_shape=np.array(list(reversed(self.ne))), raw_dtype=gguf.GGMLQuantizationType.F32) @@ -239,17 +239,17 @@ def load(self, data, offset): # forgot to save type in version 1: # guess self.type from number of remaining bytes - size_type_0 = 12 + sum([t.max_storage_size() for t in - [self.adam_m, self.adam_v] + size_type_0 = 12 + sum([t.max_storage_size() for t in + [self.adam_m, self.adam_v] +([self.adam_pf] if (self.past > 0) else [])]) - size_type_1 = 24 + sum([t.max_storage_size() for t in + size_type_1 = 24 + sum([t.max_storage_size() for t in [self.lbfgs_x, self.lbfgs_xp, self.lbfgs_g, self.lbfgs_gp, self.lbfgs_d, self.lbfgs_pf, self.lbfgs_lmal, self.lbfgs_lmys, self.lbfgs_lms, self.lbfgs_lmy] +([self.lbfgs_pf] if (self.past > 0) else [])]) # due to alignment padding the size might not by exact - # but the difference in size for both types is significant, + # but the difference in size for both types is significant, # so we can just use whichever is closest remaining = len(data) - offset if abs(remaining - size_type_0) < abs(remaining - size_type_1): @@ -348,7 +348,7 @@ def load(self, data, offset): def get_n_ff(self): # struct my_llama_model::get_n_ff in train-text-from-scratch.cpp commit 3b5515bbe0e2224425986ba24f1f5d84aa38dce9 return ((2*(4*self.n_embd)//3 + self.n_mult - 1)//self.n_mult)*self.n_mult - + def save_gguf(self, gguf_writer): # self.n_vocab not saved gguf_writer.add_embedding_length(self.n_embd) From 3e7dfd08c4fc18c48cf70be7d2e61a8e132ea60a Mon Sep 17 00:00:00 2001 From: xaedes Date: Mon, 28 Aug 2023 19:08:11 +0200 Subject: [PATCH 095/100] remove prediction related code use main for prediction, it is better 
optimized --- .../train-text-from-scratch.cpp | 589 +----------------- 1 file changed, 1 insertion(+), 588 deletions(-) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index aafdbff747d8f..c9bba95c7ad76 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -62,17 +62,6 @@ float frand_uniform(struct random_uniform_distribution * rnd) { return rnd->rd(rnd->gen); } -void ggml_graph_compute_helper(std::vector & buf, ggml_cgraph * graph, int n_threads) { - struct ggml_cplan plan = ggml_graph_plan(graph, n_threads); - - if (plan.work_size > 0) { - buf.resize(plan.work_size); - plan.work_data = buf.data(); - } - - ggml_graph_compute(graph, &plan); -} - struct ggml_tensor * randomize_tensor_normal(struct ggml_tensor * tensor, struct random_normal_distribution * rnd) { float scale = 1.0f; // xavier switch (tensor->n_dims) { @@ -205,17 +194,6 @@ struct my_llama_layer { struct ggml_tensor * w3; }; -struct my_llama_kv_cache { - struct ggml_context * ctx = NULL; - - struct ggml_tensor * k; - struct ggml_tensor * v; - - // llama_ctx_buffer buf; - - int n; // number of tokens currently in the cache -}; - struct my_llama_model { struct ggml_context * ctx = NULL; @@ -446,268 +424,6 @@ void randomize_model(struct my_llama_model * model, int seed, float mean, float } } -bool init_kv_cache(struct my_llama_kv_cache* cache, struct my_llama_model * model, int n_batch) { - const auto & hparams = model->hparams; - - const uint32_t n_ctx = hparams.n_ctx; - const uint32_t n_embd = hparams.n_embd; - const uint32_t n_layer = hparams.n_layer; - - const int64_t n_mem = n_layer*n_ctx*n_batch; - const int64_t n_elements = n_embd*n_mem; - - // cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB); - - // struct ggml_init_params params; - // params.mem_size = cache.buf.size; - // params.mem_buffer = cache.buf.addr; - // params.no_alloc = false; - if (!cache->ctx) { - struct ggml_init_params params; - params.mem_size = 2u*n_elements*ggml_type_size(GGML_TYPE_F32) + 2u*1024*1024; - params.mem_buffer = NULL; - params.no_alloc = false; - - cache->ctx = ggml_init(params); - - if (!cache->ctx) { - fprintf(stderr, "%s: failed to allocate memory for kv cache\n", __func__); - return false; - } - } - - cache->k = ggml_new_tensor_1d(cache->ctx, GGML_TYPE_F32, n_elements); - cache->v = ggml_new_tensor_1d(cache->ctx, GGML_TYPE_F32, n_elements); - - return true; -} - -struct ggml_tensor * forward( - struct my_llama_model * model, - struct my_llama_kv_cache * cache, - struct ggml_context * ctx0, - struct ggml_cgraph * gf, - struct ggml_tensor * tokens_input, - const int n_tokens, - const int n_past) { - - const int N = n_tokens; - - struct my_llama_kv_cache& kv_self = *cache; - const auto & hparams = model->hparams; - const int n_ctx = hparams.n_ctx; - const int n_embd = hparams.n_embd; - const int n_layer = hparams.n_layer; - const int n_head = hparams.n_head; - const int n_rot = hparams.n_rot; - const float rms_norm_eps = hparams.f_norm_rms_eps; - - struct ggml_tensor * tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); - memcpy(tokens->data, tokens_input->data, N*ggml_element_size(tokens)); - - struct ggml_tensor * kc = kv_self.k; - struct ggml_tensor * vc = kv_self.v; - - // inpL shape [n_embd,N,1,1] - struct ggml_tensor * inpL = ggml_get_rows(ctx0, model->tok_embeddings, tokens); - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * inpSA 
= inpL; - - struct ggml_tensor * cur; - - // lctx.use_buf(ctx0, 0); - - // norm - { - // cur shape [n_embd,N,1,1] - cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps); - - // cur = attention_norm*cur - cur = ggml_mul(ctx0, - ggml_repeat(ctx0, model->layers[il].attention_norm, cur), - cur); - } - - // self-attention - { - // compute Q and K and RoPE them - // wq shape [n_embd, n_embd, 1, 1] - // wk shape [n_embd, n_embd, 1, 1] - // Qcur shape [n_embd/n_head, n_head, N, 1] - // Kcur shape [n_embd/n_head, n_head, N, 1] - struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0, n_ctx); - struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0, n_ctx); - - // store key and value to memory - { - // compute the transposed [N, n_embd] V matrix - // wv shape [n_embd, n_embd, 1, 1] - // Vcur shape [n_embd, N, 1, 1] - struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wv, cur), n_embd, N))); - - // kv_self.k shape [n_embd * n_ctx * n_layer, 1] - // kv_self.v shape [n_embd * n_ctx * n_layer, 1] - // k shape [n_embd * N, 1] == kv_self.k[:,n_past:n_past+N,il,0] - // v shape [N, n_embd, 1, 1] == kv_self.v[:,n_past:n_past+N,il,0] - - /* { - struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past)); - struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd, - ( n_ctx)*ggml_element_size(kv_self.v), - (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v)); - - // important: storing RoPE-ed version of K in the KV cache! 
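// note: this hand-written forward() pass and the kv-cache and sampling helpers
// removed in this patch duplicated llama.cpp's regular inference path inside
// the training example; per the commit message, prediction is now left to the
// main example, which is better optimized.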
- ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); - ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); - } //*/ - - kc = ggml_set_1d_inplace(ctx0, kc, ggml_reshape_1d(ctx0, Kcur, n_embd*N), (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past)); - vc = ggml_set_2d_inplace(ctx0, vc, Vcur, ( n_ctx)*ggml_element_size(kv_self.v), - (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v)); - } - - // Qcur shape [n_embd/n_head, n_head, N, 1] - // Q shape [n_embd/n_head, N, n_head, 1] - struct ggml_tensor * Q = - ggml_permute(ctx0, - Qcur, - 0, 2, 1, 3); - - // kv_self.k shape [n_embd * n_ctx * n_layer, 1] - // K shape [n_embd/n_head, n_past + N, n_head, 1] - struct ggml_tensor * K = - ggml_permute(ctx0, - ggml_reshape_3d(ctx0, - ggml_view_1d(ctx0, kc, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kc)*n_embd), - n_embd/n_head, n_head, n_past + N), - 0, 2, 1, 3); - - // K * Q - // KQ shape [n_past + N, N, n_head, 1] - struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); - - // KQ_scaled = KQ / sqrt(n_embd/n_head) - // KQ_scaled shape [n_past + N, N, n_head, 1] - struct ggml_tensor * KQ_scaled = - ggml_scale(ctx0, - KQ, - ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head))); - - // KQ_masked = mask_past(KQ_scaled) - // KQ_masked shape [n_past + N, N, n_head, 1] - struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past); - - // KQ = soft_max(KQ_masked) - // KQ_soft_max shape [n_past + N, N, n_head, 1] - struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked); - - // split cached V into n_head heads - //// V shape [n_past + N, n_embd/n_head, n_head, 1] - // V shape [n_past + N, n_embd/n_head, n_head, 1] == kv_self.v[:,:(n_past+N),il,1] - struct ggml_tensor * V = - ggml_view_3d(ctx0, vc, - n_past + N, n_embd/n_head, n_head, - n_ctx*ggml_element_size(vc), - n_ctx*ggml_element_size(vc)*n_embd/n_head, - il*n_ctx*ggml_element_size(vc)*n_embd); - - // KQV shape [n_embd/n_head, N, n_head, 1] - struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); - - // KQV_merged = KQV.permute(0, 2, 1, 3) - // KQV_merged shape [n_embd/n_head, n_head, N, 1] - struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - // KQV_merged shape - - // cur = KQV_merged.contiguous().view(n_embd, N) - // cur shape [n_embd,N,1,1] - cur = ggml_reshape_2d(ctx0, ggml_cont(ctx0, KQV_merged), n_embd, N); - // cur = ggml_cpy(ctx0, - // KQV_merged, - // ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N)); - - // projection (no bias) - // cur shape [n_embd,N,1,1] - cur = ggml_mul_mat(ctx0, - model->layers[il].wo, - cur); - } - - // lctx.use_buf(ctx0, 1); - - // inpFF shape [n_embd,N,1,1] - struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA); - - // feed-forward network - { - // norm - { - // cur shape [n_embd,N,1,1] - cur = ggml_rms_norm(ctx0, inpFF, rms_norm_eps); - - // cur = ffn_norm*cur - // cur shape [n_embd,N,1,1] - cur = ggml_mul(ctx0, - ggml_repeat(ctx0, model->layers[il].ffn_norm, cur), - cur); - } - - // tmp shape [n_ff,N,1,1] - struct ggml_tensor * tmp = ggml_mul_mat(ctx0, - model->layers[il].w3, - cur); - - // cur shape [n_ff,N,1,1] - cur = ggml_mul_mat(ctx0, - model->layers[il].w1, - cur); - - // SILU activation - // cur shape [n_ff,N,1,1] - cur = ggml_silu(ctx0, cur); - - // cur shape [n_ff,N,1,1] - cur = ggml_mul(ctx0, cur, tmp); - - // cur shape [n_embd,N,1,1] - cur = ggml_mul_mat(ctx0, - model->layers[il].w2, - cur); - } - - // cur shape [n_embd,N,1,1] - cur = ggml_add(ctx0, cur, inpFF); - - // input for next layer - // 
inpL shape [n_embd,N,1,1] - inpL = cur; - } - - // norm - { - - // inpL shape [n_embd,N,1,1] - inpL = ggml_rms_norm(ctx0, inpL, rms_norm_eps); - - // inpL = norm*inpL - // inpL shape [n_embd,N,1,1] - inpL = ggml_mul(ctx0, - ggml_repeat(ctx0, model->norm, inpL), - inpL); - - //embeddings = inpL; - } - - // lm_head - // inpL shape [n_vocab,N,1,1] - inpL = ggml_mul_mat(ctx0, model->output, inpL); - - // run the computation - ggml_build_forward_expand(gf, inpL); - - return inpL; -} - void assert_shape_1d(struct ggml_tensor * tensor, int64_t ne0) { GGML_ASSERT(tensor->n_dims == 1); GGML_ASSERT(tensor->ne[0] == ne0); @@ -1157,42 +873,6 @@ void print_matrix(struct ggml_tensor * probs) { } } - -void print_token(struct llama_context * ctx, llama_token token) { - printf("%s", llama_token_to_piece(ctx, token).c_str()); -} - -void print_tokens(struct llama_context* ctx, struct ggml_tensor * tokens) { - for (int i=0; ine[0]; ++i) { - int token = ggml_get_i32_1d(tokens, i); - print_token(ctx, token); - } -} - -void print_tokens_batch(struct llama_context* ctx, struct ggml_tensor * tokens) { - for (int i1=0; i1ne[1]; ++i1) { - //int num_newline = 0; - for (int i0=0; i0ne[0]; ++i0) { - int token = get_i32_2d(tokens, i0, i1); - print_token(ctx, token); - // bool isnl = (token == llama_token_nl()); - // if (isnl) { - // ++num_newline; - // } - // if (isnl) { - // if (num_newline < 2) { - // print_token(ctx, token); - // } else { - // printf("\\n"); - // } - // } else { - // print_token(ctx, token); - // } - } - printf("\n--\n"); - } -} - void get_example_targets(struct llama_context * lctx, const int * train_samples, size_t n_train_samples, const llama_token * train_data, size_t n_train_data, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * target_logits, struct ggml_tensor * target_probs) { int n_tokens = tokens_input->ne[0]; int n_vocab = target_logits->ne[0]; @@ -1249,26 +929,6 @@ void get_example_targets_batch(struct llama_context * lctx, const int * train_sa } -void lshift_examples(struct ggml_tensor * tokens_input, struct ggml_tensor * target_logits, struct ggml_tensor * target_probs, int n_shift) { - int n_tokens = tokens_input->ne[0]; - int n_vocab = target_logits->ne[0]; - for (int i=0; i candidates; - llama_token_data_array candidates_p; - -}; - -void init_sampler(struct my_llama_sampler * sampler, struct llama_context * ctx) { - sampler->ctx = ctx; - sampler->n_vocab = llama_n_vocab(sampler->ctx); - sampler->n_ctx = llama_n_ctx(sampler->ctx); - sampler->mirostat_mu = 2.0f * sampler->params.mirostat_tau; -} - -llama_token sample(struct my_llama_sampler * sampler, float * logits, const llama_token * last_tokens, int n_last_tokens) { - GGML_ASSERT(sampler->ctx != NULL); - - struct llama_context * ctx = sampler->ctx; - - sampler->candidates.resize(sampler->n_vocab); - for (llama_token token_id = 0; token_id < sampler->n_vocab; ++token_id) { - sampler->candidates[token_id].id = token_id; - sampler->candidates[token_id].logit = logits[token_id]; - sampler->candidates[token_id].p = 0.0; - } - - llama_token_data_array * candidates_p = & sampler->candidates_p; - - candidates_p->data = sampler->candidates.data(); - candidates_p->size = sampler->candidates.size(); - candidates_p->sorted = false; - - const auto params = sampler->params; - - // Apply penalties - const float nl_logit = logits[llama_token_nl(ctx)]; - - const int n_last = std::min(std::min(n_last_tokens, params.repeat_last_n), sampler->n_ctx); - - llama_sample_repetition_penalty( - ctx, - candidates_p, - last_tokens + 
n_last_tokens - n_last, - n_last, - params.repeat_penalty); - llama_sample_frequency_and_presence_penalties( - ctx, - candidates_p, - last_tokens + n_last_tokens - n_last, - n_last, - params.frequency_penalty, - params.presence_penalty); - - if (!params.penalize_nl) { - logits[llama_token_nl(ctx)] = nl_logit; - } - - llama_token token = 0; - if (params.temp <= 0) { - // Greedy sampling - token = llama_sample_token_greedy(ctx, candidates_p); - } else { - if (params.mirostat == 1) { - int mirostat_m = 100; - llama_sample_temperature(ctx, candidates_p, params.temp); - token = llama_sample_token_mirostat(ctx, candidates_p, params.mirostat_tau, params.mirostat_eta, mirostat_m, &sampler->mirostat_mu); - } else if (params.mirostat == 2) { - llama_sample_temperature(ctx, candidates_p, params.temp); - token = llama_sample_token_mirostat_v2(ctx, candidates_p, params.mirostat_tau, params.mirostat_eta, &sampler->mirostat_mu); - } else { - // Temperature sampling - llama_sample_top_k (ctx, candidates_p, params.top_k, 1); - llama_sample_tail_free (ctx, candidates_p, params.tfs_z, 1); - llama_sample_typical (ctx, candidates_p, params.typical_p, 1); - - llama_sample_top_p (ctx, candidates_p, params.top_p, 1); - llama_sample_temperature (ctx, candidates_p, params.temp); - token = llama_sample_token(ctx, candidates_p); - } - } - return token; -} - -void set_logits_masked(struct ggml_tensor * logits, std::vector& mask, float value) { - GGML_ASSERT(logits->ne[0] == (int64_t) mask.size()); - for (int i2 = 0; i2 < logits->ne[2]; ++i2) { - for (int i1 = 0; i1 < logits->ne[1]; ++i1) { - for (int i0 = 0; i0 < logits->ne[0]; ++i0) { - if (!mask[i0]) continue; - float * ptr = (float *) ((char *) logits->data + i2*logits->nb[2] + i1*logits->nb[1] + i0*logits->nb[0]); - *ptr = value; - } - } - } -} - #define GGUF_GET_KEY(ctx, dst, func, type, req, key) \ { \ const std::string skey(key); \ @@ -1976,14 +1517,12 @@ struct train_params { int n_threads; int n_batch; int n_examples; - int n_predict; float f_norm_rms_eps; float rope_freq_base; float rope_freq_scale; int print_info_interval; - int print_details_interval; bool samples_start_after_nl; bool use_adam; @@ -2037,14 +1576,12 @@ struct train_params get_default_train_params() { params.n_threads = 6; params.n_batch = 8; params.n_examples = 1; - params.n_predict = 1024; params.f_norm_rms_eps = 1e-5; params.rope_freq_base = 10000.0f; params.rope_freq_scale = 1.0f; params.print_info_interval = 1; - params.print_details_interval = 2; params.samples_start_after_nl = false; params.use_adam = true; @@ -2102,9 +1639,7 @@ void train_print_usage(int /*argc*/, char ** argv, const struct train_params * p fprintf(stderr, " -t N, --threads N Number of threads (default %d)\n", params->n_threads); fprintf(stderr, " -b N, --batch N Parallel batch size (default %d)\n", params->n_batch); fprintf(stderr, " -n N, --examples N Number of examples to train (default %d)\n", params->n_examples); - fprintf(stderr, " --predict N Number of tokens to generate after training (default %d)\n", params->n_predict); fprintf(stderr, " --print-info-interval N Print infos during training each N examples (default %d)\n", params->print_info_interval); - fprintf(stderr, " --print-details-interval N Print details during training each N examples (default %d)\n", params->print_details_interval); fprintf(stderr, " --samples-after-nl Training samples start after newlines. (default %s)\n", params->samples_start_after_nl ? 
"on" : "off"); fprintf(stderr, " --use-lbfgs Use LBFGS optimizer instead of default Adam\n"); fprintf(stderr, " --use-adam Use Adam optimizer (default)\n"); @@ -2253,24 +1788,12 @@ bool train_params_parse(int argc, char ** argv, struct train_params * params) { break; } params->n_examples = std::stoi(argv[i]); - } else if (arg == "--predict") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->n_predict = std::stoi(argv[i]); } else if (arg == "--print-info-interval") { if (++i >= argc) { invalid_param = true; break; } params->print_info_interval = std::stoi(argv[i]); - } else if (arg == "--print-details-interval") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->print_details_interval = std::stoi(argv[i]); } else if (arg == "--samples-after-nl") { params->samples_start_after_nl = true; } else if (arg == "--use-lbfgs") { @@ -2547,19 +2070,12 @@ int main(int argc, char ** argv) { } printf("%s: number of unique tokens: %d\n", __func__, n_unique_tokens); - struct my_llama_kv_cache kv_self; - - struct ggml_init_params lcparams; lcparams.mem_size = 1024ll*1024ll*1024ll*((size_t) params.mem_model_gb); lcparams.mem_buffer = NULL; lcparams.no_alloc = false; model.ctx = ggml_init(lcparams); - kv_self.ctx = model.ctx; - - my_llama_sampler sampler; - int n_tokens = model.hparams.n_ctx; int n_vocab = model.hparams.n_vocab; @@ -2614,11 +2130,7 @@ int main(int argc, char ** argv) { randomize_model(&model, params.seed, 0.0f, 1.0f, -1.0f, +1.0f); } - init_kv_cache(&kv_self, &model, 1); - // init_kv_cache(&kv_self, &model, n_batch); - init_sampler(&sampler, lctx); - - printf("used_mem model+cache: %zu bytes\n", ggml_used_mem(model.ctx)); + printf("used_mem model: %zu bytes\n", ggml_used_mem(model.ctx)); // ggml_print_tensor_objects(model.ctx); // TODO: use std::vector intead of "new" @@ -2647,8 +2159,6 @@ int main(int argc, char ** argv) { GGML_ASSERT(train_samples[i]+n_tokens-1 < (int) train_tokens.size()); } - std::vector work_buffer; - printf("%s: begin training\n", __func__); struct opt_callback_data opt_cb_data; @@ -2755,31 +2265,6 @@ int main(int argc, char ** argv) { printf("used_mem_after_opt: %zu bytes\n", used_mem_after_opt); } - if (params.print_details_interval > 0 && ex % params.print_details_interval == 0) { - // set_logits_masked(logits, token_notavail, -1e9); - for (int i=0; idata + i*logits->nb[2] + k*logits->nb[1]), - (llama_token *) ((char *) tokens_input->data + i*tokens_input->nb[1]), - k); - * ((int32_t *) ((char *) after_opt_best_samples->data + i*after_opt_best_samples->nb[1] + k*after_opt_best_samples->nb[0])) = token; - } - } - - // printf("probabilities after optimization:\n"); - // print_matrix(after_opt_probs); - printf("Example:\n---\n"); - print_tokens_batch(lctx, tokens_input); - printf("\n---\n"); - - // printf("best samples after optimization:\n---\n"); - printf("samples after optimization:\n---\n"); - print_tokens_batch(lctx, after_opt_best_samples); - printf("\n---\n"); - } - ggml_free(ctx0); } @@ -2796,78 +2281,6 @@ int main(int argc, char ** argv) { save_llama_model_file(params.fn_model_out, params.fn_vocab_model, &model); } - { - int n_gen = params.n_predict; - int sample_ctx = n_tokens - n_tokens/8; - - // use defaults from common.h - sampler.params.top_k = 40; - sampler.params.top_p = 0.95f; - sampler.params.tfs_z = 1.00f; - sampler.params.typical_p = 1.00f; - sampler.params.temp = 0.8f; - sampler.params.repeat_penalty = 1.1f; - sampler.params.repeat_last_n = 64; - sampler.params.frequency_penalty = 0.0f; - 
sampler.params.presence_penalty = 0.0f; - sampler.params.mirostat = 0; - sampler.params.mirostat_tau = 5.00f; - sampler.params.mirostat_eta = 0.10f; - init_sampler(&sampler, lctx); - - printf("[Prediction context]\n"); - - struct ggml_tensor * tokens_input = ggml_new_tensor_1d(model.ctx, GGML_TYPE_I32, n_tokens); - struct ggml_tensor * target_logits = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, n_vocab, n_tokens); - struct ggml_tensor * target_probs = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, n_vocab, n_tokens); - - get_example_targets(lctx, train_samples.data(), train_samples.size(), train_tokens.data(), train_tokens.size(), rand()%train_samples.size(), tokens_input, target_logits, target_probs); - for (int i=sample_ctx; idata + (sample_ctx-1)*logits->nb[1]), - (llama_token *) tokens_input->data, - sample_ctx-1); - //int token = ggml_get_i32_1d(best_samples, sample_ctx-1); - - // print_row(probs, sample_at); - print_token(lctx, token); - - lshift_examples(tokens_input, target_logits, target_probs, 1); - ggml_set_i32_1d(tokens_input, 0, 0); - ggml_set_i32_1d(tokens_input, sample_ctx-1, token); - - ggml_free(ctx0); - } - } - if (alloc) { ggml_allocr_free(alloc); } From 17ab46dffcfde2464f2e8e2d08982b58cfc783b8 Mon Sep 17 00:00:00 2001 From: xaedes Date: Mon, 28 Aug 2023 19:13:20 +0200 Subject: [PATCH 096/100] update train-text-from-scratch README.md --- examples/train-text-from-scratch/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/train-text-from-scratch/README.md b/examples/train-text-from-scratch/README.md index 001a6abf98bd4..f4ffcd9876c0c 100644 --- a/examples/train-text-from-scratch/README.md +++ b/examples/train-text-from-scratch/README.md @@ -14,8 +14,8 @@ wget https://raw.githubusercontent.com/brunoklein99/deep-learning-notes/master/s --checkpoint-out chk-shakespeare-256x16.gguf \ --model-out ggml-shakespeare-256x16-f32.gguf \ --train-data "shakespeare.txt" \ - -t 6 -b 16 -n 32 --seed 1 --adam-iter 16 \ - --print-details-interval 0 --predict 16 --use-flash + -t 6 -b 16 --seed 1 --adam-iter 256 \ + --no-checkpointing # predict ./bin/main -m ggml-shakespeare-256x16-f32.gguf From a925e9304aeda8010ea012e5d4afb98056135e61 Mon Sep 17 00:00:00 2001 From: xaedes Date: Mon, 28 Aug 2023 19:16:27 +0200 Subject: [PATCH 097/100] fix non-windows GGML_ALIGNED_REALLOC --- ggml.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml.c b/ggml.c index 11810408450cc..e7ab22eb3a605 100644 --- a/ggml.c +++ b/ggml.c @@ -223,7 +223,7 @@ inline static void * ggml_aligned_realloc(void * ptr, size_t old_size, size_t si return result; } #define GGML_ALIGNED_MALLOC(size) ggml_aligned_malloc(size) -#define GGML_ALIGNED_REALLOC(ptr, old_size, size) ggml_aligned_realloc(size) +#define GGML_ALIGNED_REALLOC(ptr, old_size, size) ggml_aligned_realloc(ptr, old_size, size) #define GGML_ALIGNED_FREE(ptr) free(ptr) #endif From 440d221c6262bdffe7c51363763e084db8476cc6 Mon Sep 17 00:00:00 2001 From: xaedes Date: Mon, 28 Aug 2023 19:17:47 +0200 Subject: [PATCH 098/100] add missing blank line at end of file --- .../train-text-from-scratch/convert-train-checkpoint-to-gguf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/train-text-from-scratch/convert-train-checkpoint-to-gguf.py b/examples/train-text-from-scratch/convert-train-checkpoint-to-gguf.py index 9d3b78e71b029..01b3ee92a5a0c 100644 --- a/examples/train-text-from-scratch/convert-train-checkpoint-to-gguf.py +++ b/examples/train-text-from-scratch/convert-train-checkpoint-to-gguf.py 
@@ -489,4 +489,4 @@ def main(): gguf_writer.close() if __name__ == '__main__': - main() \ No newline at end of file + main() From f6828cba9ef57c2b6fad0e5d9b0800cf8d05c1c2 Mon Sep 17 00:00:00 2001 From: xaedes Date: Mon, 28 Aug 2023 20:21:03 +0200 Subject: [PATCH 099/100] remove GGML_ALIGNED_REALLOC and use normal malloc/realloc/free for gguf ctx->kv & ctx->infos --- ggml.c | 21 ++++++--------------- 1 file changed, 6 insertions(+), 15 deletions(-) diff --git a/ggml.c b/ggml.c index e7ab22eb3a605..8dc37433eddf2 100644 --- a/ggml.c +++ b/ggml.c @@ -189,7 +189,6 @@ typedef void * thread_ret_t; #if defined(_MSC_VER) || defined(__MINGW32__) #define GGML_ALIGNED_MALLOC(size) _aligned_malloc(size, GGML_MEM_ALIGN) -#define GGML_ALIGNED_REALLOC(ptr, old_size, size) _aligned_realloc(ptr, size, GGML_MEM_ALIGN) #define GGML_ALIGNED_FREE(ptr) _aligned_free(ptr) #else inline static void * ggml_aligned_malloc(size_t size) { @@ -215,15 +214,7 @@ inline static void * ggml_aligned_malloc(size_t size) { } return aligned_memory; } -inline static void * ggml_aligned_realloc(void * ptr, size_t old_size, size_t size) { - // There is no posix_memalign_realloc function - void * result = ggml_aligned_malloc(size); - memcpy(result, ptr, old_size); - free(ptr); - return result; -} #define GGML_ALIGNED_MALLOC(size) ggml_aligned_malloc(size) -#define GGML_ALIGNED_REALLOC(ptr, old_size, size) ggml_aligned_realloc(ptr, old_size, size) #define GGML_ALIGNED_FREE(ptr) free(ptr) #endif @@ -19624,7 +19615,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p // read the kv pairs { - ctx->kv = GGML_ALIGNED_MALLOC(ctx->header.n_kv * sizeof(struct gguf_kv)); + ctx->kv = malloc(ctx->header.n_kv * sizeof(struct gguf_kv)); for (uint32_t i = 0; i < ctx->header.n_kv; ++i) { struct gguf_kv * kv = &ctx->kv[i]; @@ -19707,7 +19698,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p // read the tensor infos { - ctx->infos = GGML_ALIGNED_MALLOC(ctx->header.n_tensors * sizeof(struct gguf_tensor_info)); + ctx->infos = malloc(ctx->header.n_tensors * sizeof(struct gguf_tensor_info)); for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) { struct gguf_tensor_info * info = &ctx->infos[i]; @@ -19908,7 +19899,7 @@ void gguf_free(struct gguf_context * ctx) { } } - GGML_ALIGNED_FREE(ctx->kv); + free(ctx->kv); } if (ctx->infos) { @@ -19920,7 +19911,7 @@ void gguf_free(struct gguf_context * ctx) { } } - GGML_ALIGNED_FREE(ctx->infos); + free(ctx->infos); } GGML_ALIGNED_FREE(ctx); @@ -20077,7 +20068,7 @@ static int gguf_get_or_add_key(struct gguf_context * ctx, const char * key) { const int n_kv = gguf_get_n_kv(ctx); - ctx->kv = GGML_ALIGNED_REALLOC(ctx->kv, n_kv * sizeof(struct gguf_kv), (n_kv + 1) * sizeof(struct gguf_kv)); + ctx->kv = realloc(ctx->kv, (n_kv + 1) * sizeof(struct gguf_kv)); ctx->kv[n_kv].key.n = strlen(key); ctx->kv[n_kv].key.data = strdup(key); ctx->header.n_kv++; @@ -20234,7 +20225,7 @@ void gguf_add_tensor( struct gguf_context * ctx, const struct ggml_tensor * tensor) { const int idx = ctx->header.n_tensors; - ctx->infos = GGML_ALIGNED_REALLOC(ctx->infos, idx*sizeof(struct gguf_tensor_info), (idx + 1)*sizeof(struct gguf_tensor_info)); + ctx->infos = realloc(ctx->infos, (idx + 1)*sizeof(struct gguf_tensor_info)); ctx->infos[idx].name.n = strlen(tensor->name); ctx->infos[idx].name.data = strdup(tensor->name); From 93535a460a6850f639e81151f955d0799244c5de Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 28 Aug 2023 22:26:10 +0300 Subject: [PATCH 100/100] train : 
fix compile warnings --- common/common.cpp | 5 +-- .../convert-llama2c-to-ggml.cpp | 1 - .../train-text-from-scratch.cpp | 31 ++++++++++--------- ggml.c | 16 +++++----- llama.cpp | 9 +++--- 5 files changed, 33 insertions(+), 29 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 4a0d43c13ece9..90fe2e84e331a 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -15,6 +15,7 @@ #include #include #include +#include #if defined(__APPLE__) && defined(__MACH__) #include @@ -938,8 +939,8 @@ std::string get_sortable_timestamp() { const int64_t ns = std::chrono::duration_cast( current_time.time_since_epoch() % 1000000000).count(); - char timestamp_ns[10]; - snprintf(timestamp_ns, 11, "%09ld", ns); + char timestamp_ns[11]; + snprintf(timestamp_ns, 11, "%09" PRId64, ns); return std::string(timestamp_no_ns) + "." + std::string(timestamp_ns); } diff --git a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp index 51d90ea6a7e73..e9e070b1fa321 100644 --- a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +++ b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp @@ -681,7 +681,6 @@ void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * mod // for rms-att-weight int row_length = model->hparams.n_embd; - const auto & hparams = model->hparams; int n_ff = model->hparams.n_ff; for (uint32_t i = 0; i < model->hparams.n_layer; ++i){ diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index c9bba95c7ad76..6fe85d419618f 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -314,15 +314,13 @@ void init_model(struct my_llama_model * model) { model->train_samples = 0; model->train_tokens = 0; - const char * arch = "llama"; - std::vector tn_buf; tn_buf.resize(GGML_MAX_NAME); - auto tn = [arch, &tn_buf](const char * key) -> const char * { + auto tn = [&tn_buf](const char * key) -> const char * { snprintf(tn_buf.data(), tn_buf.size(), "%s.weight", key); return tn_buf.data(); }; - auto tni = [arch, &tn_buf](const char * key, int bid) -> const char * { + auto tni = [&tn_buf](const char * key, int bid) -> const char * { snprintf(tn_buf.data(), tn_buf.size(), key, bid); std::string s = tn_buf.data(); snprintf(tn_buf.data(), tn_buf.size(), "%s.weight", s.c_str()); @@ -470,7 +468,7 @@ static size_t hash_find(void * hash_table[], void * p) { } static bool hash_insert(void * hash_table[], void * p) { - size_t h = hash(p); + //size_t h = hash(p); size_t i = hash_find(hash_table, p); GGML_ASSERT(i < GGML_GRAPH_HASHTABLE_SIZE); // assert that not full @@ -494,7 +492,7 @@ struct hash_map { void * keys[GGML_GRAPH_HASHTABLE_SIZE]; void * vals[GGML_GRAPH_HASHTABLE_SIZE]; }; -static const size_t HASH_MAP_SIZE = sizeof(struct hash_map); +//static const size_t HASH_MAP_SIZE = sizeof(struct hash_map); struct hash_map * new_hash_map() { struct hash_map * result = new struct hash_map; @@ -677,7 +675,6 @@ struct ggml_tensor * llama_build_train_graphs( const float f_norm_rms_eps = hparams.f_norm_rms_eps; const float rope_freq_base = hparams.rope_freq_base; const float rope_freq_scale = hparams.rope_freq_scale; - const int rope_mode = 0; auto set_name = [](struct ggml_tensor * t, const char * n) { ggml_set_name(t, n); @@ -687,8 +684,12 @@ struct ggml_tensor * llama_build_train_graphs( }; // rope has so much parameters that we make a 
custom function for it - auto rope = [ctx, n_past, n_rot, rope_mode, n_ctx, rope_freq_base, rope_freq_scale] + auto rope = [ctx, n_rot, n_ctx, rope_freq_base, rope_freq_scale] (struct ggml_tensor * t) -> struct ggml_tensor * { + // not capturing these, to silcence warnings + const int n_past = 0; + const int rope_mode = 0; + return ggml_rope_custom(ctx, t, n_past, n_rot, rope_mode, n_ctx, rope_freq_base, rope_freq_scale); @@ -803,14 +804,14 @@ struct ggml_tensor * llama_build_train_graphs( } // allocating checkpoints in one block to reduce memory fragmentation // note: they will be freed in reverse order - for (int i = 0; i < checkpoints.size(); ++i) { + for (int i = 0; i < (int) checkpoints.size(); ++i) { if (checkpoints[i]->data == NULL && !ggml_is_view(checkpoints[i])) { ggml_allocr_alloc(alloc, checkpoints[i]); } } - int n_leafs_after = gb->n_leafs; - int n_nodes_after = gb->n_nodes; + //int n_leafs_after = gb->n_leafs; + //int n_nodes_after = gb->n_nodes; ggml_allocr_alloc_graph(alloc, gb); @@ -1061,6 +1062,8 @@ bool are_same_layout(struct ggml_tensor * a, struct ggml_tensor * b) { GGML_ASSERT(a->type == b->type); GGML_ASSERT(ggml_are_same_shape(a, b)); GGML_ASSERT(ggml_is_contiguous(a) && ggml_is_contiguous(b)); + + return true; } void read_tensor_by_name(struct ggml_tensor * dst, struct ggml_context * ctx, const char * name) { @@ -1217,11 +1220,11 @@ void load_llama_model_gguf(struct gguf_context * fctx, struct ggml_context * f_g std::vector tn_buf; tn_buf.resize(GGML_MAX_NAME); - auto tn = [&arch, &tn_buf](const char * key) -> const char * { + auto tn = [&tn_buf](const char * key) -> const char * { snprintf(tn_buf.data(), tn_buf.size(), "%s.weight", key); return tn_buf.data(); }; - auto tni = [&arch, &tn_buf](const char * key, int bid) -> const char * { + auto tni = [&tn_buf](const char * key, int bid) -> const char * { snprintf(tn_buf.data(), tn_buf.size(), key, bid); std::string s = tn_buf.data(); snprintf(tn_buf.data(), tn_buf.size(), "%s.weight", s.c_str()); @@ -2194,7 +2197,7 @@ int main(int argc, char ** argv) { ggml_set_no_alloc(ctx0, false); // don't use alloc for input tensors, so we can safely fill them with data - struct ggml_tensor * after_opt_best_samples = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_tokens, n_batch); + //struct ggml_tensor * after_opt_best_samples = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_tokens, n_batch); //struct ggml_tensor * after_opt_probs = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_vocab, n_tokens, n_batch); struct ggml_tensor * tokens_input = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_tokens, n_batch); struct ggml_tensor * target_logits = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_vocab, n_tokens, n_batch); diff --git a/ggml.c b/ggml.c index 8dc37433eddf2..9a787863d0e5a 100644 --- a/ggml.c +++ b/ggml.c @@ -9448,6 +9448,8 @@ static void ggml_compute_forward_div_f32( #ifdef GGML_USE_ACCELERATE + UNUSED(ggml_vec_div_f32); + vDSP_vdiv( (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11), 1, (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01), 1, @@ -13936,7 +13938,7 @@ static void ggml_compute_forward_flash_attn_f32( vvexpf(S, S, &Mup); ggml_vec_sum_f32(Mup, &sum, S); #else - uint16_t scvt[GGML_SOFT_MAX_UNROLL]; + uint16_t scvt[GGML_SOFT_MAX_UNROLL]; UNUSED(scvt); ggml_float sump[GGML_SOFT_MAX_UNROLL] = { 0.0 }; for (int i = 0; i < Mup; i += GGML_SOFT_MAX_UNROLL) { @@ -14530,7 +14532,7 @@ static void ggml_compute_forward_flash_attn_back_f32( vvexpf(SM, SM, &Mup); ggml_vec_sum_f32(Mup, &sum, SM); #else - uint16_t 
scvt[GGML_SOFT_MAX_UNROLL]; + uint16_t scvt[GGML_SOFT_MAX_UNROLL]; UNUSED(scvt); ggml_float sump[GGML_SOFT_MAX_UNROLL] = { 0.0 }; for (int i = 0; i < Mup; i += GGML_SOFT_MAX_UNROLL) { @@ -15330,7 +15332,7 @@ static void ggml_compute_forward_cross_entropy_loss_f32( float max = -INFINITY; ggml_vec_max_f32(nc, &max, s0); - uint16_t scvt; + uint16_t scvt; UNUSED(scvt); for (int i = 0; i < nc; i++) { if (s0[i] == -INFINITY) { st[i] = 0.0f; @@ -15410,7 +15412,7 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32( return; } - const double eps = 1e-9f; + const double eps = 1e-9; // TODO: handle transposed/permuted matrices const int64_t nc = src0->ne[0]; @@ -15444,7 +15446,7 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32( float max = -INFINITY; ggml_vec_max_f32(nc, &max, s0); - uint16_t scvt; + uint16_t scvt; UNUSED(scvt); for (int i = 0; i < nc; i++) { if (s0[i] == -INFINITY) { ds0[i] = 0.0f; @@ -18495,7 +18497,7 @@ static enum ggml_opt_result ggml_opt_adam( const int64_t ne = ggml_nelements(ps[p]); for (int64_t j = 0; j < ne; ++j) { float g = ggml_get_f32_1d(ps[p]->grad, j); - sum += g*g; + sum += (ggml_float)(g*g); } } ggml_float norm = sqrt(sum); @@ -18508,7 +18510,7 @@ static enum ggml_opt_result ggml_opt_adam( int64_t i = 0; for (int p = 0; p < np; ++p) { const int64_t ne = ggml_nelements(ps[p]); - const float p_decay = ((ps[p]->n_dims >= decay_min_ndim) ? decay : 0.0) * sched; + const float p_decay = ((ps[p]->n_dims >= decay_min_ndim) ? decay : 0.0f) * sched; for (int64_t j = 0; j < ne; ++j) { float x = ggml_get_f32_1d(ps[p], j); float g = ggml_get_f32_1d(ps[p]->grad, j)*gnorm; diff --git a/llama.cpp b/llama.cpp index 11697ee65c2a2..7cb468538ef74 100644 --- a/llama.cpp +++ b/llama.cpp @@ -6248,7 +6248,6 @@ const char * llama_print_system_info(void) { } void llama_dump_timing_info_yaml(FILE * stream, const llama_context * ctx) { - fprintf(stream, "\n"); fprintf(stream, "###########\n"); fprintf(stream, "# Timings #\n"); @@ -6264,10 +6263,10 @@ void llama_dump_timing_info_yaml(FILE * stream, const llama_context * ctx) { fprintf(stream, "n_eval: %d # number of tokens generated (excluding the first one)\n", ctx->n_eval); fprintf(stream, "n_p_eval: %d # number of tokens processed in batches at the beginning\n", ctx->n_p_eval); fprintf(stream, "n_sample: %d # number of sampled tokens\n", ctx->n_sample); - fprintf(stream, "t_eval_us: %ld # total microseconds spent generating tokens\n", ctx->t_eval_us); - fprintf(stream, "t_load_us: %ld # total microseconds spent loading the model\n", ctx->t_load_us); - fprintf(stream, "t_p_eval_us: %ld # total microseconds spent prompt processing\n", ctx->t_p_eval_us); - fprintf(stream, "t_sample_us: %ld # total microseconds spent sampling\n", ctx->t_sample_us); + fprintf(stream, "t_eval_us: %" PRId64 " # total microseconds spent generating tokens\n", ctx->t_eval_us); + fprintf(stream, "t_load_us: %" PRId64 " # total microseconds spent loading the model\n", ctx->t_load_us); + fprintf(stream, "t_p_eval_us: %" PRId64 " # total microseconds spent prompt processing\n", ctx->t_p_eval_us); + fprintf(stream, "t_sample_us: %" PRId64 " # total microseconds spent sampling\n", ctx->t_sample_us); fprintf(stream, "ts_eval: %.2f # tokens / second during generation\n", 1.0e6 * ctx->n_eval / ctx->t_eval_us); fprintf(stream, "ts_p_eval: %.2f # tokens / second during prompt processing\n",