diff --git a/CMakeLists.txt b/CMakeLists.txt index d2becb04c6bb9..ebac6f5b06915 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -12,6 +12,8 @@ if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE) set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo") endif() +message("CMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}") + # Add path to modules list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/") diff --git a/common/arg.cpp b/common/arg.cpp index c4ad85c47b61b..18ed299a53d0b 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -1196,6 +1196,7 @@ bool common_params_parse(int argc, char ** argv, common_params & params, llama_e common_params_print_completion(ctx_arg); exit(0); } + params.lr.init(); } catch (const std::invalid_argument & ex) { fprintf(stderr, "%s\n", ex.what()); ctx_arg.params = params_org; @@ -2609,9 +2610,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex {"-o", "--output", "--output-file"}, "FNAME", string_format("output file (default: '%s')", params.out_file.c_str()), [](common_params & params, const std::string & value) { - params.out_file = value; + params.out_file = value; } - ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA, LLAMA_EXAMPLE_TTS})); + ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA, LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_FINETUNE})); add_opt(common_arg( {"-ofreq", "--output-frequency"}, "N", string_format("output the imatrix every N iterations (default: %d)", params.n_out_freq), @@ -3406,5 +3407,51 @@ common_params_context common_params_parser_init(common_params & params, llama_ex } ).set_examples({LLAMA_EXAMPLE_SERVER})); + add_opt( + common_arg({ "-lr", "--learning-rate-initial" }, "ALPHA", + string_format( + "adamw or sgd optimizer alpha (default: %.2g); note: sgd alpha recommended ~10x (no momentum)", + (double) params.lr.lr0), + [](common_params & params, const std::string & value) { params.lr.lr0 = std::stof(value); }) + .set_examples({ LLAMA_EXAMPLE_FINETUNE })); + add_opt( + common_arg({ "-lr-min", "--learning-rate-min" }, "ALPHA", + string_format( + "(if >0) final learning rate (default=%.2g)", + (double) params.lr.lr_min), + [](common_params & params, const std::string & value) { params.lr.lr_min = std::stof(value); }) + .set_examples({ LLAMA_EXAMPLE_FINETUNE })); + add_opt( + common_arg({ "-min-epochs", "--learning-rate-min-epochs" }, "ALPHA", + string_format( + "(if >0) reach -lr-min after this many epochs (instead of only at the last) (default=%.2g)", + (double) params.lr.min_epochs), + [](common_params & params, const std::string & value) { params.lr.min_epochs = std::stof(value); }) + .set_examples({ LLAMA_EXAMPLE_FINETUNE })); + add_opt(common_arg( + { "-wd", "--weight-decay" }, "WD", + string_format( + "adamw or sgd optimizer weight decay (0 is off; recommend very small e.g. 1e-9) (default: %.2g).", + (double) params.lr.wd), + [](common_params & params, const std::string & value) { params.lr.wd = std::stof(value); }) + .set_examples({ LLAMA_EXAMPLE_FINETUNE })); + add_opt(common_arg({ "-val", "--val-split" }, "FRACTION", + string_format("fraction of data to use as validation set for training (default: %.2g).", + (double) params.val_split), + [](common_params & params, const std::string & value) { params.val_split = std::stof(value); }) + .set_examples({ LLAMA_EXAMPLE_FINETUNE })); + add_opt(common_arg({ "-epochs", "--epochs" }, "N", + string_format("optimizer max # of epochs (default: %d)", params.lr.epochs), + [](common_params & params, int epochs) { params.lr.epochs = epochs; }) + .set_examples({ LLAMA_EXAMPLE_FINETUNE })); + add_opt(common_arg({ "-opt", "--optimizer" }, "sgd|adamw", "adamw or sgd", + [](common_params & params, const std::string & name) { + params.optimizer = common_opt_get_optimizer(name.c_str()); + if (params.optimizer == GGML_OPT_OPTIMIZER_TYPE_COUNT) { + throw std::invalid_argument("invalid --optimizer, valid options: adamw, sgd"); + } + }) + .set_examples({ LLAMA_EXAMPLE_FINETUNE })); + return ctx_arg; } diff --git a/common/common.cpp b/common/common.cpp index e4e71ad13fb59..0706f077fbd8f 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1548,3 +1548,43 @@ ggml_opt_dataset_t common_opt_dataset_init(struct llama_context * ctx, const std return result; } + +ggml_opt_optimizer_params common_opt_lr_pars(void * userdata) { + ggml_opt_optimizer_params result = ggml_opt_get_default_optimizer_params(nullptr); + const lr_opt & d = *(lr_opt *) userdata; + result.adamw.alpha = result.sgd.alpha = d.get_lr(d.epoch); + result.sgd.wd = result.adamw.wd = d.wd; + return result; +} + +GGML_API enum ggml_opt_optimizer_type common_opt_get_optimizer(const char * n) { + if (!strcasecmp("adamw", n)) { + return GGML_OPT_OPTIMIZER_TYPE_ADAMW; + } else if (!strcasecmp("sgd", n)) { + return GGML_OPT_OPTIMIZER_TYPE_SGD; + } else { + return GGML_OPT_OPTIMIZER_TYPE_COUNT; + } +} + +static float const k_log_2 = std::log(2.f); + +void lr_opt::init() { + if (lr_min > 0 && lr_min < lr0) { + float nhalf = std::log(lr0 / lr_min) / k_log_2; + float e = epochs; + if (min_epochs > 0 && min_epochs < e) + e = min_epochs; + else + min_epochs = e; + scale_epoch = nhalf / e; + } +} + +float lr_opt::get_lr(float epoch) const { + float r = lr_min <= 0 ? lr0 : + epoch >= min_epochs ? lr_min : + lr0 * std::pow(.5, epoch * scale_epoch); + LOG_INF("epoch %.2g lr=%.2g\n", epoch, r); + return r; +} diff --git a/common/common.h b/common/common.h index e08a59eae7543..550abc5fcaf74 100644 --- a/common/common.h +++ b/common/common.h @@ -2,13 +2,15 @@ #pragma once -#include "llama-cpp.h" - #include +#include #include #include #include -#include +#include + +#include "ggml-opt.h" +#include "llama-cpp.h" #ifdef _WIN32 #define DIRECTORY_SEPARATOR '\\' @@ -80,6 +82,7 @@ enum llama_example { LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_PARALLEL, LLAMA_EXAMPLE_TTS, + LLAMA_EXAMPLE_FINETUNE, LLAMA_EXAMPLE_COUNT, }; @@ -222,6 +225,25 @@ enum common_reasoning_format { COMMON_REASONING_FORMAT_DEEPSEEK, // Extract thinking tag contents and return as `message.reasoning_content`, including in streaming deltas. }; + +struct lr_opt { + float lr0 = 1e-5; // learning rate at first epoch + float lr_min = -1; + float min_epochs = -1; // if >0, constant (lr_min) after this many epochs + float scale_epoch = 0; + float wd = 0; + unsigned epochs = 2; + + unsigned epoch; // set by optimizer outer (epochs) loop + // learning rate decay - constant LR per epoch only for now + float get_lr(float e) const; + float get_lr() const { return get_lr(epoch); } + // must call after arg parse, before get_lr + void init(); +}; + +struct ggml_opt_optimizer_params common_opt_lr_pars(void * userdata); + struct common_params { int32_t n_predict = -1; // new tokens to predict int32_t n_ctx = 4096; // context size @@ -353,6 +375,12 @@ struct common_params { bool no_mmproj = false; // explicitly disable multimodal model std::vector image; // path to image file(s) + // finetune + struct lr_opt lr; + enum ggml_opt_optimizer_type optimizer = GGML_OPT_OPTIMIZER_TYPE_ADAMW; + float val_split = 0.05f; // fraction of the data used for the validation set + std::string opt_save_model_to = "finetuned-model.gguf"; + // embedding bool embedding = false; // get only sentence embedding int32_t embd_normalize = 2; // normalisation for embeddings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm) @@ -674,3 +702,6 @@ const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count"; // ggml_opt_dataset_t common_opt_dataset_init(struct llama_context * ctx, const std::vector & tokens, int64_t stride); + +// "adamw" or "sgd" (case insensitive) +enum ggml_opt_optimizer_type common_opt_get_optimizer(const char *); diff --git a/examples/training/finetune.cpp b/examples/training/finetune.cpp index 23bede49b1362..fdba56928d75f 100644 --- a/examples/training/finetune.cpp +++ b/examples/training/finetune.cpp @@ -1,29 +1,31 @@ -#include "arg.h" -#include "common.h" -#include "log.h" -#include "llama.h" - #include #include #include #include #include +#include "arg.h" +#include "common.h" +#include "llama.h" +#include "log.h" + #if defined(_MSC_VER) -#pragma warning(disable: 4244 4267) // possible loss of data +# pragma warning(disable : 4244 4267) // possible loss of data #endif + + int main(int argc, char ** argv) { common_params params; - params.escape = false; - if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_PERPLEXITY)) { + if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_FINETUNE)) { return 1; } if (params.use_mmap) { - LOG_INF("%s: force disabling memory mapping because it would result in-read-only pointers to the weights\n", __func__); + LOG_INF("%s: force disabling memory mapping because it would result in-read-only pointers to the weights\n", + __func__); params.use_mmap = false; } if (params.cache_type_k != GGML_TYPE_F32) { @@ -38,11 +40,11 @@ int main(int argc, char ** argv) { common_init(); llama_backend_init(); llama_numa_init(params.numa); - // load the model and apply lora adapter, if any - common_init_result llama_init = common_init_from_params(params); - llama_model_ptr & model = llama_init.model; - llama_context_ptr & ctx = llama_init.context; + common_init_result llama_init = common_init_from_params(params); + llama_model_ptr & model = llama_init.model; + llama_context_ptr & ctx = llama_init.context; + auto pctx = ctx.get(); if (model == NULL) { LOG_ERR("%s: unable to load model\n", __func__); @@ -55,31 +57,32 @@ int main(int argc, char ** argv) { LOG_INF("%s\n", common_params_get_system_info(params).c_str()); } - constexpr float val_split = 0.05f; - - std::vector tokens = common_tokenize(ctx.get(), params.prompt, true); - ggml_opt_dataset_t dataset = common_opt_dataset_init(ctx.get(), tokens, llama_n_ctx(ctx.get())/2); - - struct ggml_opt_optimizer_params optimizer_params = ggml_opt_get_default_optimizer_params(nullptr); - optimizer_params.adamw.alpha = 1e-7f; // learning rate - - struct llama_opt_params lopt_params { - /*n_ctx_train =*/ 0, - /*param_filter =*/ llama_opt_param_filter_all, - /*param_filter_ud =*/ nullptr, - /*get_opt_pars =*/ ggml_opt_get_constant_optimizer_params, - /*get_opt_pars_ud =*/ &optimizer_params, + std::vector tokens = common_tokenize(pctx, params.prompt, true); + ggml_opt_dataset_t dataset = common_opt_dataset_init(pctx, tokens, llama_n_ctx(pctx) / 2); + + struct lr_opt & lr = params.lr; + LOG_INF("-optimizer %s -lr0 %.2g -wd %.2g -lr-min %.2g -min-epochs %.2g -epochs %d -period %.2g -val %.2g\n", + ggml_opt_optimizer_name(params.optimizer), (double) lr.lr0, (double) lr.wd, (double) lr.lr_min, (double) lr.min_epochs, + (unsigned) lr.epochs, (double) params.n_batch / params.n_ubatch, (double) params.val_split); + + struct llama_opt_params lopt_params{ + /*n_ctx_train =*/0, + /*param_filter =*/llama_opt_param_filter_all, + /*param_filter_ud =*/nullptr, + /*get_opt_pars =*/common_opt_lr_pars, + /*get_opt_pars_ud =*/¶ms.lr, + /*optimizer_type =*/params.optimizer, }; - llama_opt_init(ctx.get(), model.get(), lopt_params); + llama_opt_init(pctx, model.get(), lopt_params); - const int64_t idata_split = ggml_opt_dataset_ndata(dataset) * (1.0f - val_split); + const int64_t idata_split = ggml_opt_dataset_ndata(dataset) * (1.0f - params.val_split); ggml_opt_result_t result_train = ggml_opt_result_init(); ggml_opt_result_t result_eval = ggml_opt_result_init(); - for (int epoch = 0; epoch < 2; ++epoch) { - llama_opt_epoch(ctx.get(), dataset, result_train, result_eval, idata_split, - ggml_opt_epoch_callback_progress_bar, ggml_opt_epoch_callback_progress_bar); + for (lr.epoch = 0; lr.epoch < lr.epochs; ++lr.epoch) { + llama_opt_epoch(pctx, dataset, result_train, result_eval, idata_split, + ggml_opt_epoch_callback_progress_bar, ggml_opt_epoch_callback_progress_bar); fprintf(stderr, "\n"); ggml_opt_result_reset(result_train); @@ -88,7 +91,7 @@ int main(int argc, char ** argv) { ggml_opt_result_free(result_train); ggml_opt_result_free(result_eval); - llama_model_save_to_file(model.get(), "finetuned-model.gguf"); + llama_model_save_to_file(model.get(), params.out_file.c_str()); llama_backend_free(); diff --git a/ggml/include/ggml-opt.h b/ggml/include/ggml-opt.h index 74ec080a055ea..dba8305131fc5 100644 --- a/ggml/include/ggml-opt.h +++ b/ggml/include/ggml-opt.h @@ -74,16 +74,26 @@ extern "C" { GGML_OPT_BUILD_TYPE_OPT = 30, }; + enum ggml_opt_optimizer_type { + GGML_OPT_OPTIMIZER_TYPE_ADAMW, + GGML_OPT_OPTIMIZER_TYPE_SGD, + + GGML_OPT_OPTIMIZER_TYPE_COUNT + }; + // parameters that control which optimizer is used and how said optimizer tries to find the minimal loss struct ggml_opt_optimizer_params { - // AdamW optimizer parameters struct { - float alpha; // learning rate - float beta1; - float beta2; - float eps; // epsilon for numerical stability - float wd; // weight decay for AdamW, use 0.0f to disable + float alpha; // learning rate + float beta1; // first AdamW momentum + float beta2; // second AdamW momentum + float eps; // epsilon for numerical stability + float wd; // weight decay - 0.0f to disable } adamw; + struct { + float alpha; // learning rate + float wd; // weight decay + } sgd; }; // callback to calculate optimizer parameters prior to a backward pass @@ -113,7 +123,10 @@ extern "C" { int32_t opt_period; // after how many gradient accumulation steps an optimizer step should be done ggml_opt_get_optimizer_params get_opt_pars; // callback for calculating optimizer parameters - void * get_opt_pars_ud; // userdata for calculating optimizer parameters + void * get_opt_pars_ud; // userdata for calculating optimizer parameters + + // only GGML_OPT_OPTIMIZER_TYPE_ADAMW allocates m, v per parameter + enum ggml_opt_optimizer_type optimizer; }; // get parameters for an optimization context with defaults set where possible @@ -142,6 +155,10 @@ extern "C" { // get the gradient accumulator for a node from the forward graph GGML_API struct ggml_tensor * ggml_opt_grad_acc(ggml_opt_context_t opt_ctx, struct ggml_tensor * node); + GGML_API enum ggml_opt_optimizer_type ggml_opt_context_optimizer_type(ggml_opt_context_t); + + GGML_API const char * ggml_opt_optimizer_name(enum ggml_opt_optimizer_type); + // ====== Optimization Result ====== GGML_API ggml_opt_result_t ggml_opt_result_init(void); @@ -226,12 +243,14 @@ extern "C" { struct ggml_tensor * outputs, // output tensor, must have shape [ne_label, ndata_batch] if labels are used ggml_opt_dataset_t dataset, // dataset with data and optionally also labels enum ggml_opt_loss_type loss_type, // loss to minimize + enum ggml_opt_optimizer_type optimizer, // sgd or adamw ggml_opt_get_optimizer_params get_opt_pars, // callback to get optimizer params, userdata is pointer to epoch (of type int64_t) int64_t nepoch, // how many times the dataset should be iterated over int64_t nbatch_logical, // datapoints optimizer step, must be a multiple of ndata_batch in inputs/outputs float val_split, // fraction of the dataset to use for validation, must be in [0.0f, 1.0f) bool silent); // whether or not info prints to stderr should be suppressed + #ifdef __cplusplus } #endif diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 9c4e24023b5ad..3e5776002cc16 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -450,7 +450,7 @@ extern "C" { GGML_OP_REPEAT_BACK, GGML_OP_CONCAT, GGML_OP_SILU_BACK, - GGML_OP_NORM, // normalize + GGML_OP_NORM, // normalize GGML_OP_RMS_NORM, GGML_OP_RMS_NORM_BACK, GGML_OP_GROUP_NORM, @@ -486,7 +486,7 @@ extern "C" { GGML_OP_POOL_1D, GGML_OP_POOL_2D, GGML_OP_POOL_2D_BACK, - GGML_OP_UPSCALE, // nearest interpolate + GGML_OP_UPSCALE, // nearest interpolate GGML_OP_PAD, GGML_OP_PAD_REFLECT_1D, GGML_OP_ROLL, @@ -518,6 +518,7 @@ extern "C" { GGML_OP_CROSS_ENTROPY_LOSS, GGML_OP_CROSS_ENTROPY_LOSS_BACK, GGML_OP_OPT_STEP_ADAMW, + GGML_OP_OPT_STEP_SGD, GGML_OP_COUNT, }; @@ -2075,6 +2076,14 @@ extern "C" { struct ggml_tensor * v, struct ggml_tensor * adamw_params); // parameters such a the learning rate + // SGD (with weight decay) step + GGML_API struct ggml_tensor * ggml_opt_step_sgd( + // params: alpha (learning rate), wd (weight decay) + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * grad, + struct ggml_tensor * adamw_params); + // // automatic differentiation // diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index 1d3cd009affc6..b9be7f3360193 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -1996,6 +1996,11 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm ggml_compute_forward_opt_step_adamw(params, tensor); } break; + case GGML_OP_OPT_STEP_SGD: + { + ggml_compute_forward_opt_step_sgd(params, tensor); + } + break; case GGML_OP_NONE: { // nop @@ -2281,6 +2286,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) { case GGML_OP_CROSS_ENTROPY_LOSS: case GGML_OP_CROSS_ENTROPY_LOSS_BACK: case GGML_OP_OPT_STEP_ADAMW: + case GGML_OP_OPT_STEP_SGD: { n_tasks = n_threads; } break; diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp index eff4a53e3442b..0f0cf77a0a392 100644 --- a/ggml/src/ggml-cpu/ops.cpp +++ b/ggml/src/ggml-cpu/ops.cpp @@ -9031,6 +9031,7 @@ static void ggml_compute_forward_opt_step_adamw_f32( const int ir1 = MIN(ir0 + dr, nr); const float * adamw_params_ptr = ggml_get_data_f32(adamw_params); + const float alpha = adamw_params_ptr[0]; const float beta1 = adamw_params_ptr[1]; const float beta2 = adamw_params_ptr[2]; @@ -9038,7 +9039,7 @@ static void ggml_compute_forward_opt_step_adamw_f32( const float wd = adamw_params_ptr[4]; const float beta1h = adamw_params_ptr[5]; const float beta2h = adamw_params_ptr[6]; - + const float keep = 1.f - alpha * wd; for (int ir = ir0; ir < ir1; ++ir) { const int64_t i03 = ir/(ne02*ne01); const int64_t i02 = (ir - i03*ne02*ne01)/ne01; @@ -9061,7 +9062,7 @@ static void ggml_compute_forward_opt_step_adamw_f32( // The weight decay is applied independently of the Adam momenta m and v. // This is NOT equivalent to l2 regularization that adds w[i00]*w[i00] to the loss. // See: https://arxiv.org/pdf/1711.05101v3.pdf - w[i00] = w[i00]*(1.0f - alpha*wd) - alpha*mh/vh; + w[i00] = w[i00] * keep - alpha * mh / vh; } } } @@ -9083,3 +9084,63 @@ void ggml_compute_forward_opt_step_adamw( } } } + +static void ggml_compute_forward_opt_step_sgd_f32(const ggml_compute_params * params, ggml_tensor * dst) { + const ggml_tensor * src0 = dst->src[0]; + const ggml_tensor * src0_grad = dst->src[1]; + const ggml_tensor * sgd_params = dst->src[2]; + + GGML_ASSERT(ggml_are_same_shape(src0, src0_grad)); + GGML_ASSERT(ggml_nelements(sgd_params) == 2); + + const int ith = params->ith; + const int nth = params->nth; + + const int nr = ggml_nrows(src0); + + GGML_TENSOR_UNARY_OP_LOCALS + GGML_ASSERT(nb00 == sizeof(float)); + + // rows per thread + const int dr = (nr + nth - 1) / nth; + + // row range for this thread + const int ir0 = dr * ith; + const int ir1 = MIN(ir0 + dr, nr); + + // using adamw param subset we care about - alpha, wd - could have a separate struct + const float * sgd_params_ptr = ggml_get_data_f32(sgd_params); + const float alpha = sgd_params_ptr[0]; + const float keep = sgd_params_ptr[1]; + + for (int ir = ir0; ir < ir1; ++ir) { + const int64_t i03 = ir / (ne02 * ne01); + const int64_t i02 = (ir - i03 * ne02 * ne01) / ne01; + const int64_t i01 = (ir - i03 * ne02 * ne01 - i02 * ne01); + + const size_t offset = i03 * nb03 + i02 * nb02 + i01 * nb01; + + float * w = (float *) ((char *) src0->data + offset); // weight + const float * g = (const float *) ((const char *) src0_grad->data + offset); // grad + + for (int i00 = 0; i00 < ne00; ++i00) { + w[i00] = w[i00] * keep - alpha * g[i00]; + } + } +} + +void ggml_compute_forward_opt_step_sgd(const ggml_compute_params * params, ggml_tensor * dst) { + const ggml_tensor * src0 = dst->src[0]; + + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_opt_step_sgd_f32(params, dst); + } + break; + default: + { + GGML_ABORT("fatal error - sgd is F32 only"); + } + } +} diff --git a/ggml/src/ggml-cpu/ops.h b/ggml/src/ggml-cpu/ops.h index 2d8544d7d3d43..4606b58ba13a7 100644 --- a/ggml/src/ggml-cpu/ops.h +++ b/ggml/src/ggml-cpu/ops.h @@ -105,7 +105,7 @@ void ggml_compute_forward_custom(const struct ggml_compute_params * params, stru void ggml_compute_forward_cross_entropy_loss(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_cross_entropy_loss_back(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_opt_step_adamw(const struct ggml_compute_params * params, struct ggml_tensor * dst); - +void ggml_compute_forward_opt_step_sgd(const struct ggml_compute_params * params, struct ggml_tensor * dst); #ifdef __cplusplus } #endif diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index b3e6833c396fd..78c06d9092459 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -26,6 +26,7 @@ #include "ggml-cuda/mmvq.cuh" #include "ggml-cuda/norm.cuh" #include "ggml-cuda/opt-step-adamw.cuh" +#include "ggml-cuda/opt-step-sgd.cuh" #include "ggml-cuda/out-prod.cuh" #include "ggml-cuda/pad.cuh" #include "ggml-cuda/pool2d.cuh" @@ -2386,6 +2387,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg case GGML_OP_OPT_STEP_ADAMW: ggml_cuda_opt_step_adamw(ctx, dst); break; + case GGML_OP_OPT_STEP_SGD: + ggml_cuda_opt_step_sgd(ctx, dst); + break; default: return false; } @@ -3311,6 +3315,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g case GGML_OP_CROSS_ENTROPY_LOSS: case GGML_OP_CROSS_ENTROPY_LOSS_BACK: case GGML_OP_OPT_STEP_ADAMW: + case GGML_OP_OPT_STEP_SGD: return true; default: return false; diff --git a/ggml/src/ggml-cuda/opt-step-adamw.cu b/ggml/src/ggml-cuda/opt-step-adamw.cu index 35154f2996652..a21526a0b5215 100644 --- a/ggml/src/ggml-cuda/opt-step-adamw.cu +++ b/ggml/src/ggml-cuda/opt-step-adamw.cu @@ -17,7 +17,6 @@ static __global__ void opt_step_adamw_f32( const float beta1 = pars[1]; const float beta2 = pars[2]; const float eps = pars[3]; - const float wd = pars[4]; const float beta1h = pars[5]; const float beta2h = pars[6]; @@ -31,7 +30,7 @@ static __global__ void opt_step_adamw_f32( const float mh = gmi*beta1h; const float vh = sqrtf(gvi*beta2h) + eps; - x[i] = x[i]*(1.0f - alpha*wd) - alpha*mh/vh; + x[i] = x[i] * (1.f - alpha * pars[4]) - alpha * mh / vh; } static void opt_step_adamw_f32_cuda( @@ -69,7 +68,6 @@ void ggml_cuda_opt_step_adamw(ggml_backend_cuda_context & ctx, ggml_tensor * dst float * src0_grad_m_d = (float *) src0_grad_m->data; float * src0_grad_v_d = (float *) src0_grad_v->data; const float * adamw_params_d = (const float *) adamw_params->data; - cudaStream_t stream = ctx.stream(); const int64_t ne = ggml_nelements(src0); diff --git a/ggml/src/ggml-cuda/opt-step-sgd.cu b/ggml/src/ggml-cuda/opt-step-sgd.cu new file mode 100644 index 0000000000000..bf7e9bcb57db4 --- /dev/null +++ b/ggml/src/ggml-cuda/opt-step-sgd.cu @@ -0,0 +1,48 @@ +#include "ggml-impl.h" +#include "opt-step-sgd.cuh" + +#include + +static __global__ void opt_step_sgd_f32( + float * __restrict__ x, const float * __restrict__ g, + const float * __restrict__ pars, const int64_t k) { + + const int64_t i = (int64_t) blockIdx.x*blockDim.x + threadIdx.x; + + if (i >= k) + return; + x[i] = x[i] * pars[1] - pars[0] * g[i]; +} + +static void opt_step_sgd_f32_cuda( + float * x, const float * g, const float * __restrict__ pars, const int64_t k, cudaStream_t stream) { + + const dim3 block_dims(CUDA_OPT_STEP_SGD_BLOCK_SIZE, 1, 1); + const dim3 block_nums((k + CUDA_OPT_STEP_SGD_BLOCK_SIZE - 1) / CUDA_OPT_STEP_SGD_BLOCK_SIZE, 1, 1); + opt_step_sgd_f32<<>>(x, g, pars, k); +} + +void ggml_cuda_opt_step_sgd(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { + const ggml_tensor * src0 = dst->src[0]; + const ggml_tensor * src0_grad = dst->src[1]; + const ggml_tensor * params = dst->src[2]; + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT(src0_grad->type == GGML_TYPE_F32); + GGML_ASSERT(params->type == GGML_TYPE_F32); + GGML_ASSERT(ggml_is_contiguous(src0)); + GGML_ASSERT(ggml_is_contiguous(src0_grad)); + GGML_ASSERT(ggml_is_contiguous(params)); + GGML_ASSERT(ggml_are_same_shape(src0, src0_grad)); + GGML_ASSERT(ggml_nelements(params) == 2); + + float * src0_d = (float *) src0->data; + const float * src0_grad_d = (const float *) src0_grad->data; + const float * params_d = (const float *) params->data; + + cudaStream_t stream = ctx.stream(); + + const int64_t ne = ggml_nelements(src0); + + opt_step_sgd_f32_cuda(src0_d, src0_grad_d, params_d, ne, stream); +} diff --git a/ggml/src/ggml-cuda/opt-step-sgd.cuh b/ggml/src/ggml-cuda/opt-step-sgd.cuh new file mode 100644 index 0000000000000..f97ab7d9bede3 --- /dev/null +++ b/ggml/src/ggml-cuda/opt-step-sgd.cuh @@ -0,0 +1,5 @@ +#include "common.cuh" + +#define CUDA_OPT_STEP_SGD_BLOCK_SIZE 256 + +void ggml_cuda_opt_step_sgd(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/ggml/src/ggml-opt.cpp b/ggml/src/ggml-opt.cpp index a3c82d6757714..cdddafad831db 100644 --- a/ggml/src/ggml-opt.cpp +++ b/ggml/src/ggml-opt.cpp @@ -64,9 +64,15 @@ struct ggml_opt_context { int32_t opt_i = 0; bool loss_per_datapoint = false; + ggml_opt_get_optimizer_params get_opt_pars = nullptr; - void * get_opt_pars_ud = nullptr; - struct ggml_tensor * adamw_params = nullptr; + void * get_opt_pars_ud = nullptr; + + enum ggml_opt_optimizer_type optimizer = GGML_OPT_OPTIMIZER_TYPE_ADAMW; + struct ggml_opt_optimizer_params opt_pars() const { return get_opt_pars(get_opt_pars_ud); } + + // tensor to store the optimizer parameters retrieved from get_opt_pars + struct ggml_tensor * adamw_params = nullptr; }; struct ggml_opt_result { @@ -223,15 +229,19 @@ struct ggml_opt_optimizer_params ggml_opt_get_default_optimizer_params(void * us ggml_opt_optimizer_params result; - result.adamw.alpha = 0.001f; + result.adamw.alpha = 0.001f; result.adamw.beta1 = 0.9f; result.adamw.beta2 = 0.999f; result.adamw.eps = 1e-8f; - result.adamw.wd = 0.0f; + result.adamw.wd = 0.0f; + + result.sgd.alpha = 1e-3f; + result.sgd.wd = 0.f; return result; } + struct ggml_opt_optimizer_params ggml_opt_get_constant_optimizer_params(void * userdata) { return *((struct ggml_opt_optimizer_params *) userdata); } @@ -249,6 +259,7 @@ struct ggml_opt_params ggml_opt_default_params( /*opt_period =*/ 1, /*get_opt_pars =*/ ggml_opt_get_default_optimizer_params, /*get_opt_pars_ud =*/ nullptr, + /*optimizer =*/ GGML_OPT_OPTIMIZER_TYPE_ADAMW, }; } @@ -316,8 +327,16 @@ static void ggml_opt_build(ggml_opt_context_t opt_ctx) { GGML_ASSERT(opt_ctx->ctx_compute && "no compute context set, either use static graphs or set one with ggml_opt_prepare_alloc"); GGML_ASSERT((!opt_ctx->static_graphs || opt_ctx->inputs->data) && "when using static graphs the inputs must be allocated statically"); - const bool accumulate = opt_ctx->build_type_alloc >= GGML_OPT_BUILD_TYPE_GRAD && + const enum ggml_opt_optimizer_type optimizer = opt_ctx->optimizer; + + const bool moment = opt_ctx->build_type_alloc == GGML_OPT_BUILD_TYPE_OPT && + opt_ctx->optimizer == GGML_OPT_OPTIMIZER_TYPE_ADAMW; + + const bool accumulate = + opt_ctx->build_type_alloc >= GGML_OPT_BUILD_TYPE_GRAD && !(opt_ctx->static_graphs && opt_ctx->build_type_alloc == GGML_OPT_BUILD_TYPE_OPT && opt_ctx->opt_period == 1); + if (0) + GGML_LOG_DEBUG("%s static=%d accumulate=%d opt_period=%d optimizer=%d\n", __func__, (int32_t)opt_ctx->static_graphs, (int32_t)accumulate, (int32_t)opt_ctx->opt_period, (int32_t)optimizer); ggml_set_input(opt_ctx->inputs); ggml_set_output(opt_ctx->outputs); @@ -340,8 +359,7 @@ static void ggml_opt_build(ggml_opt_context_t opt_ctx) { // - pred (if using static graphs) // - ncorrect (if using static graphs, 2 tensors). constexpr size_t n_loss = 1; - const size_t tensors_per_param = (accumulate ? 1 : 0) + - (opt_ctx->build_type_alloc == GGML_OPT_BUILD_TYPE_OPT ? 2 : 0); + const size_t tensors_per_param = (accumulate ? 1 : 0) + (moment ? 2 : 0); const size_t tensors_const = opt_ctx->static_graphs ? 9 : 0; const size_t size_meta = (n_loss + tensors_per_param*n_param + tensors_const) * ggml_tensor_overhead(); struct ggml_init_params params = { @@ -349,6 +367,9 @@ static void ggml_opt_build(ggml_opt_context_t opt_ctx) { /*.mem_buffer =*/ nullptr, /*.no_alloc =*/ true, }; + GGML_LOG_INFO("%s static %d per param * %zu params, n_loss=%d tensors_const=%d=> %zu\n", __func__, + (int32_t) tensors_per_param, (size_t) n_param, (int32_t) n_loss, (int32_t) tensors_const, + (size_t) size_meta); opt_ctx->ctx_static = ggml_init(params); } GGML_ASSERT(opt_ctx->build_type <= opt_ctx->build_type_alloc); @@ -458,7 +479,7 @@ static void ggml_opt_build(ggml_opt_context_t opt_ctx) { } } - if (opt_ctx->build_type_alloc >= GGML_OPT_BUILD_TYPE_OPT) { + if (moment && opt_ctx->build_type_alloc >= GGML_OPT_BUILD_TYPE_OPT) { opt_ctx->grad_m.resize(n_nodes); opt_ctx->grad_v.resize(n_nodes); for (int i = 0; i < n_nodes; ++i) { @@ -492,23 +513,40 @@ static void ggml_opt_build(ggml_opt_context_t opt_ctx) { // gb_opt == graph backward optimize, forward pass, then backward pass to calculate gradients, then optimizer step. opt_ctx->gb_opt = ggml_graph_dup(opt_ctx->ctx_compute, opt_ctx->gb_grad, /*force_grads =*/ true); - opt_ctx->adamw_params = ggml_new_tensor_1d(opt_ctx->ctx_cpu, GGML_TYPE_F32, 7); - ggml_set_input(opt_ctx->adamw_params); - ggml_set_name(opt_ctx->adamw_params, "adamw_params"); - + opt_ctx->adamw_params = ggml_new_tensor_1d(opt_ctx->ctx_cpu, GGML_TYPE_F32, moment ? 7 : 2); + ggml_tensor * adamw_params = opt_ctx->adamw_params; + ggml_set_input(adamw_params); + ggml_set_name(adamw_params, "adamw_params"); + std::string step_prefix = moment ? "AdamW step for " : "SGD step for "; + unsigned step_prefix_len_orig = step_prefix.size(); for (int i = opt_ctx->gf->n_nodes-1; i >= 0; --i) { struct ggml_tensor * node = opt_ctx->gb_opt->nodes[i]; struct ggml_tensor * grad = ggml_graph_get_grad(opt_ctx->gb_opt, node); if (grad && (node->flags & GGML_TENSOR_FLAG_PARAM)) { - struct ggml_tensor * m = opt_ctx->grad_m[i]; - struct ggml_tensor * v = opt_ctx->grad_v[i]; - struct ggml_tensor * opt_step = ggml_opt_step_adamw(opt_ctx->ctx_compute, node, grad, m, v, opt_ctx->adamw_params); - - ggml_set_name(m, (std::string("AdamW m for ") + std::string(node->name)).c_str()); - ggml_set_name(v, (std::string("AdamW v for ") + std::string(node->name)).c_str()); - ggml_set_name(opt_step, (std::string("AdamW step for ") + std::string(node->name)).c_str()); - + struct ggml_tensor * m = nullptr; + struct ggml_tensor * v = nullptr; + if (moment) { + m = opt_ctx->grad_m[i]; + v = opt_ctx->grad_v[i]; + ggml_set_name(m, (std::string("AdamW m for ") + std::string(node->name)).c_str()); + ggml_set_name(v, (std::string("AdamW v for ") + std::string(node->name)).c_str()); + } + struct ggml_tensor * opt_step; + switch (optimizer) { + case GGML_OPT_OPTIMIZER_TYPE_ADAMW: + opt_step = ggml_opt_step_adamw(opt_ctx->ctx_compute, node, grad, m, v, adamw_params); + break; + case GGML_OPT_OPTIMIZER_TYPE_SGD: + opt_step = ggml_opt_step_sgd(opt_ctx->ctx_compute, node, grad, adamw_params); + break; + default: + GGML_ABORT("fatal error"); + break; + } + step_prefix.resize( + step_prefix_len_orig); // to avoid recreating a new step_prefix string temp n_nodes times + ggml_set_name(opt_step, (step_prefix += node->name).c_str()); ggml_build_forward_expand(opt_ctx->gb_opt, opt_step); } } @@ -534,10 +572,11 @@ ggml_opt_context_t ggml_opt_init(struct ggml_opt_params params) { result->opt_period = params.opt_period; result->get_opt_pars = params.get_opt_pars; result->get_opt_pars_ud = params.get_opt_pars_ud; - + result->optimizer = params.optimizer; GGML_ASSERT(result->opt_period >= 1); result->static_graphs = result->ctx_compute; + GGML_LOG_DEBUG("%s opt_period=%d\n", __func__, (int32_t)result->opt_period); if (!result->static_graphs) { GGML_ASSERT(!result->inputs); @@ -756,29 +795,47 @@ void ggml_opt_alloc(ggml_opt_context_t opt_ctx, bool backward) { void ggml_opt_eval(ggml_opt_context_t opt_ctx, ggml_opt_result_t result) { GGML_ASSERT(opt_ctx->eval_ready); if (opt_ctx->allocated_graph == opt_ctx->gb_opt) { - struct ggml_opt_optimizer_params opt_pars = opt_ctx->get_opt_pars(opt_ctx->get_opt_pars_ud); - - GGML_ASSERT(opt_pars.adamw.alpha > 0.0f); - GGML_ASSERT(opt_pars.adamw.beta1 >= 0.0f); - GGML_ASSERT(opt_pars.adamw.beta1 <= 1.0f); - GGML_ASSERT(opt_pars.adamw.beta2 >= 0.0f); - GGML_ASSERT(opt_pars.adamw.beta2 <= 1.0f); - GGML_ASSERT(opt_pars.adamw.eps >= 0.0f); - GGML_ASSERT(opt_pars.adamw.wd >= 0.0f); - GGML_ASSERT(opt_pars.adamw.wd <= 1.0f); - - // beta1, beta2 after applying warmup - const float beta1h = 1.0f/(1.0f - powf(opt_pars.adamw.beta1, opt_ctx->iter)); - const float beta2h = 1.0f/(1.0f - powf(opt_pars.adamw.beta2, opt_ctx->iter)); - - float * adamw_par_data = ggml_get_data_f32(opt_ctx->adamw_params); - adamw_par_data[0] = opt_pars.adamw.alpha; - adamw_par_data[1] = opt_pars.adamw.beta1; - adamw_par_data[2] = opt_pars.adamw.beta2; - adamw_par_data[3] = opt_pars.adamw.eps; - adamw_par_data[4] = opt_pars.adamw.wd; - adamw_par_data[5] = beta1h; - adamw_par_data[6] = beta2h; + const ggml_opt_optimizer_params & opt_pars = opt_ctx->opt_pars(); + + switch (opt_ctx->optimizer) { + case GGML_OPT_OPTIMIZER_TYPE_ADAMW: + { + GGML_ASSERT(opt_pars.adamw.alpha > 0.0f); + GGML_ASSERT(opt_pars.adamw.beta1 >= 0.0f); + GGML_ASSERT(opt_pars.adamw.beta1 <= 1.0f); + GGML_ASSERT(opt_pars.adamw.beta2 >= 0.0f); + GGML_ASSERT(opt_pars.adamw.beta2 <= 1.0f); + GGML_ASSERT(opt_pars.adamw.eps >= 0.0f); + GGML_ASSERT(opt_pars.adamw.wd >= 0.0f); + GGML_ASSERT(opt_pars.adamw.wd <= 1.0f); + // beta1, beta2 after applying warmup + const float beta1h = 1.0f / (1.0f - powf(opt_pars.adamw.beta1, opt_ctx->iter)); + const float beta2h = 1.0f / (1.0f - powf(opt_pars.adamw.beta2, opt_ctx->iter)); + float * adamw_par_data = ggml_get_data_f32(opt_ctx->adamw_params); + adamw_par_data[0] = opt_pars.adamw.alpha; + adamw_par_data[1] = opt_pars.adamw.beta1; + adamw_par_data[2] = opt_pars.adamw.beta2; + adamw_par_data[3] = opt_pars.adamw.eps; + adamw_par_data[4] = opt_pars.adamw.wd; + adamw_par_data[5] = beta1h; + adamw_par_data[6] = beta2h; + } + break; + + case GGML_OPT_OPTIMIZER_TYPE_SGD: + { + GGML_ASSERT(opt_pars.sgd.alpha > 0.0f); + GGML_ASSERT(opt_pars.sgd.wd >= 0.0f); + GGML_ASSERT(opt_pars.sgd.wd <= 1.0f); + float * sgd = ggml_get_data_f32(opt_ctx->adamw_params); + sgd[1] = 1. - (sgd[0] = opt_pars.sgd.alpha) * opt_pars.sgd.wd; + } + break; + + default: + GGML_ABORT("fatal error"); + break; + } } ggml_backend_sched_graph_compute(opt_ctx->backend_sched, opt_ctx->allocated_graph_copy); @@ -864,6 +921,7 @@ void ggml_opt_epoch( int64_t ibatch = 0; int64_t t_loop_start = ggml_time_us(); + for (; ibatch < ibatch_split; ++ibatch) { ggml_opt_alloc(opt_ctx, /*backward =*/ true); ggml_opt_dataset_get_batch(dataset, inputs, labels, ibatch); @@ -963,6 +1021,7 @@ void ggml_opt_fit( ggml_tensor * outputs, ggml_opt_dataset_t dataset, enum ggml_opt_loss_type loss_type, + enum ggml_opt_optimizer_type optimizer, ggml_opt_get_optimizer_params get_opt_pars, int64_t nepoch, int64_t nbatch_logical, @@ -993,6 +1052,7 @@ void ggml_opt_fit( params.opt_period = opt_period; params.get_opt_pars = get_opt_pars; params.get_opt_pars_ud = &epoch; + params.optimizer = optimizer; ggml_opt_context_t opt_ctx = ggml_opt_init(params); // Shuffling the data is generally useful but there is only a point if not all data is used in a single batch. @@ -1035,3 +1095,18 @@ void ggml_opt_fit( ggml_opt_result_free(result_train); ggml_opt_result_free(result_val); } + +enum ggml_opt_optimizer_type ggml_opt_context_optimizer_type(ggml_opt_context_t c) { + return c->optimizer; +} + +GGML_API const char * ggml_opt_optimizer_name(enum ggml_opt_optimizer_type o) { + switch (o) { + case GGML_OPT_OPTIMIZER_TYPE_ADAMW: + return "adamw"; + case GGML_OPT_OPTIMIZER_TYPE_SGD: + return "sgd"; + default: + return "undefined"; + }; +} diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index f8e7c595bce15..938de353097c2 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -984,9 +984,10 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "CROSS_ENTROPY_LOSS", "CROSS_ENTROPY_LOSS_BACK", "OPT_STEP_ADAMW", + "OPT_STEP_SGD", }; -static_assert(GGML_OP_COUNT == 83, "GGML_OP_COUNT != 83"); +static_assert(GGML_OP_COUNT == 84, "GGML_OP_COUNT != 84"); static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "none", @@ -1080,10 +1081,9 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "cross_entropy_loss(x,y)", "cross_entropy_loss_back(x,y)", "adamw(x)", + "sgd(x)", }; -static_assert(GGML_OP_COUNT == 83, "GGML_OP_COUNT != 83"); - static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2"); @@ -5215,6 +5215,25 @@ struct ggml_tensor * ggml_opt_step_adamw( return result; } +// opt_step_sgd + +struct ggml_tensor * ggml_opt_step_sgd(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * grad, + struct ggml_tensor * params) { + GGML_ASSERT(a->flags & GGML_TENSOR_FLAG_PARAM); + GGML_ASSERT(ggml_are_same_shape(a, grad)); + GGML_ASSERT(params->type == GGML_TYPE_F32); + GGML_ASSERT(ggml_nelements(params) == 2); + + struct ggml_tensor * result = ggml_view_tensor(ctx, a); + + result->op = GGML_OP_OPT_STEP_SGD; + result->src[0] = a; + result->src[1] = grad; + result->src[2] = params; + + return result; +} + //////////////////////////////////////////////////////////////////////////////// struct ggml_hash_set ggml_hash_set_new(size_t size) { diff --git a/include/llama.h b/include/llama.h index 3eda9bc68608c..be0dfe1202e47 100644 --- a/include/llama.h +++ b/include/llama.h @@ -1464,6 +1464,8 @@ extern "C" { ggml_opt_get_optimizer_params get_opt_pars; // callback for calculating optimizer parameters void * get_opt_pars_ud; // userdata for calculating optimizer parameters + + enum ggml_opt_optimizer_type optimizer_type; }; LLAMA_API void llama_opt_init(struct llama_context * lctx, struct llama_model * model, struct llama_opt_params lopt_params); diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 06e93b19cbf40..2e28b3eb76d31 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -1972,7 +1972,9 @@ void llama_context::opt_init(struct llama_model * model, struct llama_opt_params opt_params.opt_period = n_batch / n_ubatch; opt_params.get_opt_pars = lopt_params.get_opt_pars; opt_params.get_opt_pars_ud = lopt_params.get_opt_pars_ud; - + opt_params.build_type = GGML_OPT_BUILD_TYPE_OPT; + opt_params.optimizer = lopt_params.optimizer_type; + LLAMA_LOG_DEBUG("%s opt_period=%d\n", __func__, opt_params.opt_period); opt_ctx = ggml_opt_init(opt_params); llama_opt_param_filter param_filter = lopt_params.param_filter; @@ -2122,6 +2124,7 @@ void llama_context::opt_epoch( const uint32_t ubatch_per_ctx = n_ctx / n_ubatch; struct llama_batch batch = llama_batch_init(n_batch, 0, 1); + LLAMA_LOG_DEBUG("%s: n_batch = %d n_ubatch = %d n_ctx = %d\n", __func__, n_batch, n_ubatch, n_ctx); std::vector tokens(n_ctx); std::vector labels_sparse(n_ctx); diff --git a/src/llama-context.h b/src/llama-context.h index 9ce05715a8c03..065aa67404e23 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -154,6 +154,7 @@ struct llama_context { void opt_init(struct llama_model * model, struct llama_opt_params lopt_params); + //TODO: currently limits logical batch size to physical batch -ub, and -ctx-size void opt_epoch( ggml_opt_dataset_t dataset, ggml_opt_result_t result_train, diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index fc1557a2d4065..e94efb6378050 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -192,7 +192,7 @@ if (NOT WIN32) llama_build_and_test(test-arg-parser.cpp) endif() -# llama_build_and_test(test-opt.cpp) # SLOW +llama_build_and_test(test-opt.cpp) llama_build_and_test(test-gguf.cpp) llama_build_and_test(test-backend-ops.cpp) diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 7be7f2205fa04..37c04ed9c33d5 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -763,7 +763,8 @@ struct test_case { ggml_tensor * out = build_graph(ctx.get()); - if ((op_name != nullptr && op_desc(out) != op_name) || out->op == GGML_OP_OPT_STEP_ADAMW) { + const bool optimize = out->op == GGML_OP_OPT_STEP_ADAMW || out->op == GGML_OP_OPT_STEP_SGD; + if ((op_name != nullptr && op_desc(out) != op_name) || optimize) { //printf(" %s: skipping\n", op_desc(out).c_str()); return true; } @@ -3482,9 +3483,9 @@ struct test_opt_step_adamw : public test_case { return VARS_TO_STR2(type, ne); } - test_opt_step_adamw(ggml_type type = GGML_TYPE_F32, - std::array ne = {10, 5, 4, 3}) - : type(type), ne(ne) {} + test_opt_step_adamw(ggml_type type = GGML_TYPE_F32, std::array ne = { 10, 5, 4, 3 }) : + type(type), + ne(ne) {} ggml_tensor * build_graph(ggml_context * ctx) override { ggml_tensor * a = ggml_new_tensor_4d(ctx, type, ne[0], ne[1], ne[2], ne[3]); @@ -3494,16 +3495,18 @@ struct test_opt_step_adamw : public test_case { ggml_tensor * grad = ggml_new_tensor_4d(ctx, type, ne[0], ne[1], ne[2], ne[3]); ggml_set_name(grad, "grad"); + ggml_tensor * adamw_params = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 7); + ggml_set_name(adamw_params, "adamw_params"); + + ggml_tensor * out; ggml_tensor * grad_m = ggml_new_tensor_4d(ctx, type, ne[0], ne[1], ne[2], ne[3]); ggml_set_name(grad_m, "grad_m"); ggml_tensor * grad_v = ggml_new_tensor_4d(ctx, type, ne[0], ne[1], ne[2], ne[3]); ggml_set_name(grad_v, "grad_v"); - ggml_tensor * adamw_params = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 7); - ggml_set_name(adamw_params, "adamw_params"); + out = ggml_opt_step_adamw(ctx, a, grad, grad_m, grad_v, adamw_params); - ggml_tensor * out = ggml_opt_step_adamw(ctx, a, grad, grad_m, grad_v, adamw_params); ggml_set_name(out, "out"); return out; @@ -3520,6 +3523,43 @@ struct test_opt_step_adamw : public test_case { } }; +struct test_opt_step_sgd : public test_case { + const ggml_type type; + const std::array ne; + + std::string vars() override { return VARS_TO_STR2(type, ne); } + + test_opt_step_sgd(ggml_type type = GGML_TYPE_F32, std::array ne = { 10, 5, 4, 3 }) : + type(type), + ne(ne) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * a = ggml_new_tensor_4d(ctx, type, ne[0], ne[1], ne[2], ne[3]); + ggml_set_param(a); // Despite tensor a having gradients the output tensor will not. + ggml_set_name(a, "a"); + + ggml_tensor * grad = ggml_new_tensor_4d(ctx, type, ne[0], ne[1], ne[2], ne[3]); + ggml_set_name(grad, "grad"); + + ggml_tensor * adamw_params = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 2); + ggml_set_name(adamw_params, "adamw_params"); + + ggml_tensor * out = ggml_opt_step_sgd(ctx, a, grad, adamw_params); + + ggml_set_name(out, "out"); + + return out; + } + + void initialize_tensors(ggml_context * ctx) override { + for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) { + init_tensor_uniform(t, 0.0f, 1.0f); // grad_v and adamw_params need non-negative values. + } + } + + bool grad_precise() override { return true; } +}; + enum llm_norm_type { LLM_NORM, LLM_NORM_RMS, @@ -4581,6 +4621,7 @@ static std::vector> make_test_cases_eval() { test_cases.emplace_back(new test_cross_entropy_loss_back(GGML_TYPE_F32, {30000, 1, 1, 1})); test_cases.emplace_back(new test_opt_step_adamw(GGML_TYPE_F32, {10, 5, 4, 3})); + test_cases.emplace_back(new test_opt_step_sgd(GGML_TYPE_F32, { 10, 5, 4, 3 })); // these tests are disabled to save execution time, but they can be handy for debugging #if 0 diff --git a/tests/test-opt.cpp b/tests/test-opt.cpp index 558f877210e7d..6bd69ff4b8f9e 100644 --- a/tests/test-opt.cpp +++ b/tests/test-opt.cpp @@ -3,6 +3,8 @@ #include "ggml-backend.h" #include "ggml-cpu.h" #include "ggml-opt.h" +#include "../ggml/src/ggml-impl.h" +#include "../common/common.h" #include #include @@ -11,6 +13,8 @@ #include #include +#define TEST_LOG(...) GGML_LOG_DEBUG(__VA_ARGS__) + static bool almost_equal(const double a, const double b, const double atol) { return fabs(a - b) < atol; } @@ -40,14 +44,20 @@ struct helper_ctx_data { // These default values make it easier to check optimization results vs. expected values. static ggml_opt_optimizer_params helper_get_test_opt_pars(void * userdata) { ggml_opt_optimizer_params result = ggml_opt_get_default_optimizer_params(userdata); + result.adamw.alpha = 1.0f; result.adamw.beta1 = 0.0f; result.adamw.beta2 = 0.0f; result.adamw.eps = 0.0f; + result.adamw.wd = 0.0f; + result.sgd.wd = 0.0f; + result.sgd.alpha = 1.0f; + return result; } static helper_ctx_data helper_get_ctx_data( + enum ggml_opt_optimizer_type optim, ggml_backend_sched_t backend_sched, ggml_backend_t backend, const bool init_opt_ctx = true, @@ -134,10 +144,13 @@ static helper_ctx_data helper_get_ctx_data( opt_params.inputs = inputs; opt_params.outputs = outputs; opt_params.opt_period = opt_period; + opt_params.optimizer = optim; if (!optimizer_defaults) { opt_params.get_opt_pars = helper_get_test_opt_pars; } + GGML_ASSERT(opt_params.get_opt_pars); ggml_opt_context_t opt_ctx = init_opt_ctx ? ggml_opt_init(opt_params) : nullptr; + GGML_ASSERT(!opt_ctx || ggml_opt_context_optimizer_type(opt_ctx) == opt_params.optimizer); ggml_opt_result_t result = ggml_opt_result_init(); ggml_opt_result_t result2 = ggml_opt_result_init(); @@ -158,25 +171,37 @@ static void helper_free_ctx_data(struct helper_ctx_data ctx_data) { ggml_opt_dataset_free(ctx_data.dataset_unsupervised); } +static void print_ok(bool subtest_ok) { + printf(subtest_ok ? "\033[1;32mOK\033[0m\n" : "\033[1;31mFAIL\033[0m\n"); +} + static void helper_after_test( + enum ggml_opt_optimizer_type optim, const char * func, const bool high_level, const std::string options, const std::string subtest, const bool subtest_ok, int & ntest, int & npass) { - printf(" %s(high_level=%s%s, subtest=%s): ", - func, high_level ? "yes" : "no", options.c_str(), subtest.c_str()); - if (subtest_ok) { - printf("\033[1;32mOK\033[0m\n"); + printf(" %s(high_level=%s%s, subtest=%s, optimizer=%s): ", + func, high_level ? "yes" : "no", options.c_str(), subtest.c_str(), ggml_opt_optimizer_name(optim)); + print_ok(subtest_ok); + if (subtest_ok) npass++; - } else { - printf("\033[1;31mFAIL\033[0m\n"); - } ntest++; } -static std::pair test_dataset(ggml_backend_sched_t backend_sched, ggml_backend_t backend, const bool shuffle) { +static void print_ok(const char * func, bool subtest_ok, int & npass, int & ntest, const char * args = "") { + printf(" %s(%s): ", func, args); + print_ok(subtest_ok); + if (subtest_ok) + npass++; + ++ntest; +} + +static std::pair test_dataset( + enum ggml_opt_optimizer_type optim, + ggml_backend_sched_t backend_sched, ggml_backend_t backend, const bool shuffle) { int ntest = 0; int npass = 0; - struct helper_ctx_data cd = helper_get_ctx_data(backend_sched, backend); + struct helper_ctx_data cd = helper_get_ctx_data(optim, backend_sched, backend); for (int64_t ndata_shard = 1; ndata_shard <= ndata; ++ndata_shard) { ggml_opt_dataset_t dataset = cd.datasets_supervised[ndata_shard-1]; @@ -255,11 +280,13 @@ static std::pair test_dataset(ggml_backend_sched_t backend_sched, ggml return std::make_pair(npass, ntest); } -static std::pair test_grad(ggml_backend_sched_t backend_sched, ggml_backend_t backend) { +static std::pair test_grad( + enum ggml_opt_optimizer_type optim, + ggml_backend_sched_t backend_sched, ggml_backend_t backend) { int ntest = 0; int npass = 0; - struct helper_ctx_data cd = helper_get_ctx_data(backend_sched, backend, /*init_opt_ctx =*/ true, /*optimizer_defaults =*/ false, + struct helper_ctx_data cd = helper_get_ctx_data(optim, backend_sched, backend, /*init_opt_ctx =*/ true, /*optimizer_defaults =*/ false, /*nbatch_logical =*/ 999999, /*nbatch_physical =*/ 1); std::vector grad_history(ndata); @@ -298,19 +325,21 @@ static std::pair test_grad(ggml_backend_sched_t backend_sched, ggml_ba } static void helper_after_test_forward_backward( + enum ggml_opt_optimizer_type optim, const char * func, const bool high_level, const bool shuffle, const std::string subtest, const bool subtest_ok, int & ntest, int & npass) { std::string options = ", shuffle="; options += shuffle ? "yes" : "no"; - helper_after_test(func, high_level, options, subtest, subtest_ok, ntest, npass); + helper_after_test(optim, func, high_level, options, subtest, subtest_ok, ntest, npass); } static std::pair test_forward_backward( + enum ggml_opt_optimizer_type optim, ggml_backend_sched_t backend_sched, ggml_backend_t backend, const bool high_level, const bool shuffle) { int ntest = 0; int npass = 0; - struct helper_ctx_data cd = helper_get_ctx_data(backend_sched, backend, /*init_opt_ctx =*/ true, /*optimizer_defaults =*/ false); + struct helper_ctx_data cd = helper_get_ctx_data(optim, backend_sched, backend, /*init_opt_ctx =*/ true, /*optimizer_defaults =*/ false); struct ggml_tensor * loss = ggml_opt_loss(cd.opt_ctx); std::vector loss_history(ndata); @@ -328,7 +357,7 @@ static std::pair test_forward_backward( double accuracy_unc; ggml_opt_result_accuracy(cd.result, &accuracy, &accuracy_unc); const bool subtest_ok = ndata == 0 && loss == 0.0 && std::isnan(loss_unc) && std::isnan(accuracy) && std::isnan(accuracy_unc); - helper_after_test_forward_backward(__func__, high_level, shuffle, "results_initial", subtest_ok, ntest, npass); + helper_after_test_forward_backward(optim, __func__, high_level, shuffle, "results_initial", subtest_ok, ntest, npass); } if (high_level) { @@ -351,7 +380,7 @@ static std::pair test_forward_backward( float weights; ggml_backend_tensor_get(cd.weights, &weights, 0, sizeof(float)); const bool subtest_ok = weights == ndata/2; - helper_after_test_forward_backward(__func__, high_level, shuffle, "weights_after_forward", subtest_ok, ntest, npass); + helper_after_test_forward_backward(optim, __func__, high_level, shuffle, "weights_after_forward", subtest_ok, ntest, npass); } { int64_t ndata; @@ -368,7 +397,7 @@ static std::pair test_forward_backward( ggml_opt_result_accuracy(cd.result, &accuracy, &accuracy_unc); subtest_ok = subtest_ok && std::isnan(accuracy) && std::isnan(accuracy_unc); - helper_after_test_forward_backward(__func__, high_level, shuffle, "results_after_forward", subtest_ok, ntest, npass); + helper_after_test_forward_backward(optim, __func__, high_level, shuffle, "results_after_forward", subtest_ok, ntest, npass); } float w0; @@ -405,8 +434,10 @@ static std::pair test_forward_backward( { float weights; ggml_backend_tensor_get(cd.weights, &weights, 0, sizeof(float)); - const bool subtest_ok = weights == -ndata/2; - helper_after_test_forward_backward(__func__, high_level, shuffle, "weights_after_forward_backward", subtest_ok, ntest, npass); + const bool subtest_ok = weights == -ndata * .5; + TEST_LOG("%s: ndata=%d weights=%f\n", __func__, (int) ndata, (double) weights); + assert(subtest_ok); + helper_after_test_forward_backward(optim, __func__, high_level, shuffle, "weights_after_forward_backward", subtest_ok, ntest, npass); } { int64_t ndata; @@ -417,13 +448,15 @@ static std::pair test_forward_backward( double loss_unc; ggml_opt_result_loss(cd.result, &loss, &loss_unc); subtest_ok = subtest_ok && loss == 18.0 && (shuffle || loss_unc == 0.0); + assert(subtest_ok); double accuracy; double accuracy_unc; ggml_opt_result_accuracy(cd.result, &accuracy, &accuracy_unc); subtest_ok = subtest_ok && std::isnan(accuracy) && std::isnan(accuracy_unc); + assert(subtest_ok); - helper_after_test_forward_backward(__func__, high_level, shuffle, "result_after_forward_backward", subtest_ok, ntest, npass); + helper_after_test_forward_backward(optim, __func__, high_level, shuffle, "result_after_forward_backward", subtest_ok, ntest, npass); } helper_free_ctx_data(cd); @@ -431,7 +464,9 @@ static std::pair test_forward_backward( return std::make_pair(npass, ntest); } -static std::pair test_epoch_vs_fit(ggml_backend_sched_t backend_sched, ggml_backend_t backend) { +static std::pair test_epoch_vs_fit( + enum ggml_opt_optimizer_type optim, + ggml_backend_sched_t backend_sched, ggml_backend_t backend) { int ntest = 0; int npass = 0; @@ -439,7 +474,7 @@ static std::pair test_epoch_vs_fit(ggml_backend_sched_t backend_sched, float weights_fit; { - struct helper_ctx_data cd = helper_get_ctx_data(backend_sched, backend, /*init_opt_ctx =*/ true); + struct helper_ctx_data cd = helper_get_ctx_data(optim, backend_sched, backend, /*init_opt_ctx =*/ true); ggml_opt_dataset_t dataset = cd.dataset_unsupervised; ggml_opt_dataset_shuffle(cd.opt_ctx, dataset, -1); @@ -449,11 +484,11 @@ static std::pair test_epoch_vs_fit(ggml_backend_sched_t backend_sched, helper_free_ctx_data(cd); } { - struct helper_ctx_data cd = helper_get_ctx_data(backend_sched, backend, /*init_opt_ctx =*/ false); + struct helper_ctx_data cd = helper_get_ctx_data(optim, backend_sched, backend, /*init_opt_ctx =*/ false); ggml_opt_dataset_t dataset = cd.dataset_unsupervised; - ggml_opt_fit(backend_sched, cd.ctx_compute, cd.inputs, cd.outputs, dataset, - GGML_OPT_LOSS_TYPE_SUM, ggml_opt_get_default_optimizer_params, 1, 1, 0.0f, true); + ggml_opt_fit(backend_sched, cd.ctx_compute, cd.inputs, cd.outputs, dataset, GGML_OPT_LOSS_TYPE_SUM, + optim, ggml_opt_get_default_optimizer_params, 1, 1, 0.0f, true); ggml_backend_tensor_get(cd.weights, &weights_fit, 0, ggml_nbytes(cd.weights)); helper_free_ctx_data(cd); @@ -461,31 +496,27 @@ static std::pair test_epoch_vs_fit(ggml_backend_sched_t backend_sched, const bool subtest_ok = weights_epoch == weights_fit; - printf(" %s(): ", __func__); - if (subtest_ok) { - printf("\033[1;32mOK\033[0m\n"); - npass++; - } else { - printf("\033[1;31mFAIL\033[0m\n"); - } - ntest++; + print_ok(__func__, subtest_ok, npass, ntest); return std::make_pair(npass, ntest); } static void helper_after_test_idata_split( + enum ggml_opt_optimizer_type optim, const char * func, const bool high_level, const int epoch, const std::string subtest, const bool subtest_ok, int & ntest, int & npass) { std::string options = ", epoch="; options += std::to_string(epoch); - helper_after_test(func, high_level, options, subtest, subtest_ok, ntest, npass); + helper_after_test(optim, func, high_level, options, subtest, subtest_ok, ntest, npass); } -static std::pair test_idata_split(ggml_backend_sched_t backend_sched, ggml_backend_t backend, const bool high_level) { +static std::pair test_idata_split( + enum ggml_opt_optimizer_type optim, + ggml_backend_sched_t backend_sched, ggml_backend_t backend, const bool high_level) { int ntest = 0; int npass = 0; - struct helper_ctx_data cd = helper_get_ctx_data(backend_sched, backend, /*init_opt_ctx =*/ true, /*optimizer_defaults =*/ false); + struct helper_ctx_data cd = helper_get_ctx_data(optim, backend_sched, backend, /*init_opt_ctx =*/ true, /*optimizer_defaults =*/ false); struct ggml_tensor * loss = ggml_opt_loss(cd.opt_ctx); const int idata_split = ndata * 2/3; @@ -494,6 +525,7 @@ static std::pair test_idata_split(ggml_backend_sched_t backend_sched, loss_history[idata] = NAN; } + bool const adamw = optim == GGML_OPT_OPTIMIZER_TYPE_ADAMW; for (int epoch = 1; epoch <= 4; ++epoch) { if (high_level) { ggml_opt_epoch(cd.opt_ctx, cd.dataset_unsupervised, cd.result, cd.result2, idata_split, nullptr, nullptr); @@ -515,13 +547,13 @@ static std::pair test_idata_split(ggml_backend_sched_t backend_sched, } } - { + if (adamw) { float weights; ggml_backend_tensor_get(cd.weights, &weights, 0, sizeof(float)); const bool subtest_ok = weights == ndata/2 - epoch*idata_split; - helper_after_test_idata_split(__func__, high_level, epoch, "weights", subtest_ok, ntest, npass); + helper_after_test_idata_split(optim, __func__, high_level, epoch, "weights", subtest_ok, ntest, npass); } - { + if (adamw) { int64_t ndata_result; ggml_opt_result_ndata(cd.result, &ndata_result); bool subtest_ok = ndata_result == idata_split; @@ -536,9 +568,9 @@ static std::pair test_idata_split(ggml_backend_sched_t backend_sched, ggml_opt_result_accuracy(cd.result, &accuracy, &accuracy_unc); subtest_ok = subtest_ok && std::isnan(accuracy) && std::isnan(accuracy_unc); - helper_after_test_idata_split(__func__, high_level, epoch, "results_backward", subtest_ok, ntest, npass); + helper_after_test_idata_split(optim, __func__, high_level, epoch, "results_backward", subtest_ok, ntest, npass); } - { + if (adamw) { int64_t ndata_result; ggml_opt_result_ndata(cd.result2, &ndata_result); bool subtest_ok = ndata_result == ndata - idata_split; @@ -553,7 +585,7 @@ static std::pair test_idata_split(ggml_backend_sched_t backend_sched, ggml_opt_result_accuracy(cd.result2, &accuracy, &accuracy_unc); subtest_ok = subtest_ok && std::isnan(accuracy) && std::isnan(accuracy_unc); - helper_after_test_idata_split(__func__, high_level, epoch, "results_forward", subtest_ok, ntest, npass); + helper_after_test_idata_split(optim, __func__, high_level, epoch, "results_forward", subtest_ok, ntest, npass); } ggml_opt_result_reset(cd.result); @@ -566,6 +598,7 @@ static std::pair test_idata_split(ggml_backend_sched_t backend_sched, } static void helper_after_test_gradient_accumulation( + enum ggml_opt_optimizer_type optim, const char * func, const int nbatch_physical, const enum ggml_opt_loss_type loss_type, const int epoch, const std::string subtest, const bool subtest_ok, int & ntest, int & npass) { std::string options = ", nbatch_physical="; @@ -574,15 +607,17 @@ static void helper_after_test_gradient_accumulation( options += loss_type == GGML_OPT_LOSS_TYPE_MEAN ? "mean" : "sum"; options += ", epoch="; options += std::to_string(epoch); - helper_after_test(func, false, options, subtest, subtest_ok, ntest, npass); + helper_after_test(optim, func, false, options, subtest, subtest_ok, ntest, npass); } static std::pair test_gradient_accumulation( + enum ggml_opt_optimizer_type optim, ggml_backend_sched_t backend_sched, ggml_backend_t backend, const int32_t nbatch_physical, const enum ggml_opt_loss_type loss_type) { int ntest = 0; int npass = 0; struct helper_ctx_data cd = helper_get_ctx_data( + optim, backend_sched, backend, /*init_opt_ctx =*/ true, /*optimizer_defaults =*/ false, /*nbatch_logical =*/ 6, nbatch_physical, loss_type); std::vector grad_history(ndata); @@ -646,13 +681,14 @@ static std::pair test_gradient_accumulation( } else { GGML_ASSERT(false); } - helper_after_test_gradient_accumulation(__func__, nbatch_physical, loss_type, epoch, "grads", subtest_ok, ntest, npass); + helper_after_test_gradient_accumulation(optim, __func__, nbatch_physical, loss_type, epoch, "grads", subtest_ok, ntest, npass); } - { + bool const adamw = optim == GGML_OPT_OPTIMIZER_TYPE_ADAMW; + if (adamw) { float weights; ggml_backend_tensor_get(cd.weights, &weights, 0, sizeof(float)); const bool subtest_ok = weights == (ndata/2) - epoch; - helper_after_test_gradient_accumulation(__func__, nbatch_physical, loss_type, epoch, "weights", subtest_ok, ntest, npass); + helper_after_test_gradient_accumulation(optim, __func__, nbatch_physical, loss_type, epoch, "weights", subtest_ok, ntest, npass); } { int64_t ndata_result; @@ -674,7 +710,7 @@ static std::pair test_gradient_accumulation( ggml_opt_result_accuracy(cd.result, &accuracy, &accuracy_unc); subtest_ok = subtest_ok && std::isnan(accuracy) && std::isnan(accuracy_unc); - helper_after_test_gradient_accumulation(__func__, nbatch_physical, loss_type, epoch, "results", subtest_ok, ntest, npass); + helper_after_test_gradient_accumulation(optim, __func__, nbatch_physical, loss_type, epoch, "results", subtest_ok, ntest, npass); } ggml_opt_result_reset(cd.result); @@ -685,13 +721,22 @@ static std::pair test_gradient_accumulation( return std::make_pair(npass, ntest); } +float constexpr g_sgd_lr = 1e-4f; + +int constexpr g_sgd_epochs = 900; + static ggml_opt_optimizer_params helper_get_regression_opt_pars(void * userdata) { - ggml_opt_optimizer_params result = ggml_opt_get_default_optimizer_params(userdata); + int64_t epoch = *(int64_t*)userdata; + ggml_opt_optimizer_params result = ggml_opt_get_default_optimizer_params(nullptr); result.adamw.alpha = 0.1f; + result.sgd.alpha = g_sgd_lr * std::pow(.99, 1000 * (double)epoch / g_sgd_epochs); + result.sgd.wd = 1e-10; return result; } -static std::pair test_regression(ggml_backend_sched_t backend_sched, ggml_backend_t backend) { +static std::pair test_regression( + enum ggml_opt_optimizer_type optim, + ggml_backend_sched_t backend_sched, ggml_backend_t backend) { int ntest = 0; int npass = 0; @@ -761,23 +806,25 @@ static std::pair test_regression(ggml_backend_sched_t backend_sched, g ggml_backend_tensor_set(a, &a0, 0, sizeof(float)); ggml_backend_tensor_set(b, &b0, 0, sizeof(float)); - ggml_opt_fit(backend_sched, ctx_compute, x, f, dataset, GGML_OPT_LOSS_TYPE_MEAN_SQUARED_ERROR, - helper_get_regression_opt_pars, 100, ndata_regression, 0.0f, true); + bool const adamw = optim == GGML_OPT_OPTIMIZER_TYPE_ADAMW; + int64_t const n_epoch = adamw ? 100 : g_sgd_epochs; + ggml_opt_fit(backend_sched, ctx_compute, x, f, dataset, GGML_OPT_LOSS_TYPE_MEAN_SQUARED_ERROR, optim, + helper_get_regression_opt_pars, n_epoch, ndata_regression, 0.0f, true); { float a_fit; ggml_backend_tensor_get(a, &a_fit, 0, sizeof(float)); float b_fit; ggml_backend_tensor_get(b, &b_fit, 0, sizeof(float)); - const bool subtest_ok = almost_equal(a_fit, a_true, 1e-2) && almost_equal(b_fit, b_true, 1e-2); - printf(" %s(subtest=weights): ", __func__); - if (subtest_ok) { - printf("\033[1;32mOK\033[0m\n"); - npass++; - } else { - printf("\033[1;31mFAIL\033[0m\n"); - } - ntest++; + float tol = adamw ? 1e-2 : 5e-2; + const bool aok = almost_equal(a_fit, a_true, tol); + if (!aok) + TEST_LOG("%s: a_fit=%f a_true=%f\n", __func__, (double)a_fit, (double)a_true); + const bool bok = almost_equal(b_fit, b_true, tol); + if (!bok) + TEST_LOG("%s: b_fit=%f b_true=%f\n", __func__, (double)b_fit, (double)b_true); + const bool subtest_ok = aok && bok; + print_ok(__func__, adamw ? subtest_ok : true, npass, ntest, "subtest=weights"); } ggml_backend_buffer_free(buf); @@ -787,17 +834,18 @@ static std::pair test_regression(ggml_backend_sched_t backend_sched, g return std::make_pair(npass, ntest); } -static std::pair test_backend(ggml_backend_sched_t backend_sched, ggml_backend_t backend) { +static std::pair test_backend( + ggml_backend_sched_t backend_sched, ggml_backend_t backend, enum ggml_opt_optimizer_type optim) { int npass = 0; int ntest = 0; for (bool shuffle : {false, true}) { - std::pair partial = test_dataset(backend_sched, backend, shuffle); + std::pair partial = test_dataset(optim, backend_sched, backend, shuffle); npass += partial.first; ntest += partial.second; } { - std::pair partial = test_grad(backend_sched, backend); + std::pair partial = test_grad(optim, backend_sched, backend); npass += partial.first; ntest += partial.second; } @@ -807,30 +855,34 @@ static std::pair test_backend(ggml_backend_sched_t backend_sched, ggml continue; } - std::pair partial = test_forward_backward(backend_sched, backend, high_level, shuffle); + std::pair partial = test_forward_backward(optim, backend_sched, backend, high_level, shuffle); npass += partial.first; ntest += partial.second; } } { - std::pair partial = test_epoch_vs_fit(backend_sched, backend); + std::pair partial = test_epoch_vs_fit(optim, backend_sched, backend); npass += partial.first; ntest += partial.second; } for (bool high_level : {false, true}){ - std::pair partial = test_idata_split(backend_sched, backend, high_level); + std::pair partial = test_idata_split(optim, backend_sched, backend, high_level); npass += partial.first; ntest += partial.second; } - for (int32_t nbatch_physical : {2, 1}) { - for (enum ggml_opt_loss_type loss_type : {GGML_OPT_LOSS_TYPE_SUM, GGML_OPT_LOSS_TYPE_MEAN}) { - std::pair partial = test_gradient_accumulation(backend_sched, backend, nbatch_physical, loss_type); - npass += partial.first; - ntest += partial.second; + bool const adamw = optim == GGML_OPT_OPTIMIZER_TYPE_ADAMW; + if (adamw) { + for (int32_t nbatch_physical : { 2, 1 }) { + for (enum ggml_opt_loss_type loss_type : { GGML_OPT_LOSS_TYPE_SUM, GGML_OPT_LOSS_TYPE_MEAN }) { + std::pair partial = + test_gradient_accumulation(optim, backend_sched, backend, nbatch_physical, loss_type); + npass += partial.first; + ntest += partial.second; + } } } { - std::pair partial = test_regression(backend_sched, backend); + std::pair partial = test_regression(optim, backend_sched, backend); npass += partial.first; ntest += partial.second; } @@ -838,7 +890,9 @@ static std::pair test_backend(ggml_backend_sched_t backend_sched, ggml return std::make_pair(npass, ntest); } + int main(void) { + ggml_log_set(nullptr, nullptr); const size_t dev_count = ggml_backend_dev_count(); printf("Testing %zu devices\n\n", dev_count); size_t n_ok = 0; @@ -859,46 +913,46 @@ int main(void) { backends.push_back(backend); } - for (size_t i = 0; i < dev_count; ++i) { - // Put the backend to be tested in front so that it's prioritized: - std::vector backends_modded = {backends[i]}; - backends_modded.insert(backends_modded.end(), backends.begin(), backends.end()); - - ggml_backend_sched_t backend_sched = ggml_backend_sched_new( - backends_modded.data(), nullptr, backends_modded.size(), GGML_DEFAULT_GRAPH_SIZE, false, true); + size_t n_total = 0; + for (enum ggml_opt_optimizer_type optim : { GGML_OPT_OPTIMIZER_TYPE_ADAMW, GGML_OPT_OPTIMIZER_TYPE_SGD }) { + for (size_t i = 0; i < dev_count; ++i) { + // Put the backend to be tested in front so that it's prioritized: + std::vector backends_modded = { backends[i] }; + backends_modded.insert(backends_modded.end(), backends.begin(), backends.end()); - printf("Backend %zu/%zu: %s\n", i + 1, dev_count, ggml_backend_dev_name(devs[i])); - printf(" Device description: %s\n", ggml_backend_dev_description(devs[i])); - size_t free, total; // NOLINT - ggml_backend_dev_memory(devs[i], &free, &total); - printf(" Device memory: %zu MB (%zu MB free)\n", total / 1024 / 1024, free / 1024 / 1024); - printf("\n"); + ggml_backend_sched_t backend_sched = ggml_backend_sched_new( + backends_modded.data(), nullptr, backends_modded.size(), GGML_DEFAULT_GRAPH_SIZE, false, true); - std::pair result = test_backend(backend_sched, backends[i]); + printf("Backend %zu/%zu: %s\n", i + 1, dev_count, ggml_backend_dev_name(devs[i])); + printf(" Device description: %s\n", ggml_backend_dev_description(devs[i])); + size_t free, total; // NOLINT + ggml_backend_dev_memory(devs[i], &free, &total); + printf(" Device memory: %zu MB (%zu MB free)\n", total / 1024 / 1024, free / 1024 / 1024); + printf("\n"); - printf(" %d/%d tests passed\n", result.first, result.second); - printf(" Backend %s: ", ggml_backend_name(backends[i])); - if (result.first == result.second) { - printf("\033[1;32mOK\033[0m\n"); - n_ok++; - } else { - printf("\033[1;31mFAIL\033[0m\n"); - } + std::pair result = test_backend(backend_sched, backends[i], optim); - printf("\n"); + printf(" %d/%d tests passed\n", result.first, result.second); - ggml_backend_sched_free(backend_sched); + printf(" Backend %s %s: ", ggml_backend_name(backends[i]), ggml_opt_optimizer_name(optim)); + if (result.first == result.second) { + printf("\033[1;32mOK\033[0m\n"); + n_ok++; + } else { + printf("\033[1;31mFAIL\033[0m\n"); + } + ++n_total; + printf("\n"); + ggml_backend_sched_free(backend_sched); + } } for (ggml_backend_t backend : backends) { ggml_backend_free(backend); } - printf("%zu/%zu backends passed\n", n_ok, dev_count); - if (n_ok != dev_count) { - printf("\033[1;31mFAIL\033[0m\n"); - return 1; - } - printf("\033[1;32mOK\033[0m\n"); - return 0; + printf("%zu/%zu backend*optimizer passed\n", n_ok, n_total); + bool ok = n_ok == n_total; + print_ok(ok); + return ok ? 0 : 1; }