Commit e752031

finetune.cpp command-line arg
Add a learning rate (AdamW alpha) command-line argument for ggml-opt, and an optimizer enum defaulting to AdamW, including a string->id mapping, in preparation for SGD support. These live in common args as a set of optimizer options active only for the new FINETUNE example (the previous finetune.cpp PERPLEXITY options, which we're told were unused/accidental, are dropped). Perhaps breaking with precedent, the ggml_opt_optimizer_params struct is included directly in the args; if desired, we can instead add just the learning rate and optimizer type to a struct independent of ggml-opt.h, as proposed in #13835.
1 parent e0e3aa2 commit e752031
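With these flags in place, a rough usage sketch for the finetune example might look like the following (the model and data file names are placeholders, and -m/-f are the usual common args, not part of this commit):

    llama-finetune -m model.gguf -f train.txt -lr 1e-7 --optimizer adamw

Passing --optimizer sgd is rejected with "TODO: implement SGD" for now, and any other optimizer name is reported as invalid.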

File tree

common/arg.cpp
common/common.h
examples/training/finetune.cpp
ggml/include/ggml-opt.h
ggml/src/ggml-opt.cpp

5 files changed: +63 −5 lines

common/arg.cpp

Lines changed: 24 additions & 2 deletions

@@ -1095,6 +1095,7 @@ static void common_params_print_completion(common_params_context & ctx_arg) {
         "llama-embedding",
         "llama-eval-callback",
         "llama-export-lora",
+        "llama-finetune",
         "llama-gen-docs",
         "llama-gguf",
         "llama-gguf-hash",
@@ -1239,6 +1240,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     sampler_type_names.pop_back();
 
 
+    params.optimize = ggml_opt_get_default_optimizer_params(NULL);
+    params.optimize.adamw.alpha = 1e-8; // default 1e-3 is much too high for LLAMA_EXAMPLE_FINETUNE
+
     /**
      * filter options by example
      * rules:
@@ -1472,14 +1476,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params) {
             params.ctx_shift = false;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY}).set_env("LLAMA_ARG_NO_CONTEXT_SHIFT"));
+    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_FINETUNE}).set_env("LLAMA_ARG_NO_CONTEXT_SHIFT"));
     add_opt(common_arg(
         {"--chunks"}, "N",
         string_format("max number of chunks to process (default: %d, -1 = all)", params.n_chunks),
         [](common_params & params, int value) {
             params.n_chunks = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_RETRIEVAL}));
+    ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_FINETUNE, LLAMA_EXAMPLE_RETRIEVAL}));
     add_opt(common_arg(
         {"-fa", "--flash-attn"},
         string_format("enable Flash Attention (default: %s)", params.flash_attn ? "enabled" : "disabled"),
@@ -2181,6 +2185,24 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.ppl_output_type = value;
         }
     ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
+    add_opt(common_arg(
+        {"-lr", "-alpha", "--alpha", "--learning-rate"}, "ALPHA",
+        string_format("adamw optimizer alpha (default: %.1f)", (double)params.optimize.adamw.alpha),
+        [](common_params & params, const std::string & value) {
+            params.optimize.adamw.alpha = std::stof(value);
+        }
+    ).set_examples({LLAMA_EXAMPLE_FINETUNE}));
+    add_opt(common_arg(
+        {"-opt", "--optimizer"}, "sgd|adamw",
+        "adamw or //TODO:sgd",
+        [](common_params & params, std::string const& name) {
+            params.optimize.optimizer = named_ggml_opt_optimizer(name.c_str());
+            if (params.optimize.optimizer == GGML_OPT_OPTIMIZER_COUNT)
+                throw std::invalid_argument("invalid --optimizer N (try 0)");
+            else if (params.optimize.optimizer == GGML_OPT_OPTIMIZER_SGD)
+                throw std::invalid_argument("TODO: implement SGD");
+        }
+    ).set_examples({LLAMA_EXAMPLE_FINETUNE}));
     add_opt(common_arg(
         {"-dt", "--defrag-thold"}, "N",
         string_format("KV cache defragmentation threshold (default: %.1f, < 0 - disabled)", (double)params.defrag_thold),

common/common.h

Lines changed: 4 additions & 0 deletions

@@ -3,6 +3,7 @@
 #pragma once
 
 #include "llama-cpp.h"
+#include "ggml-opt.h"
 
 #include <set>
 #include <string>
@@ -80,6 +81,7 @@ enum llama_example {
     LLAMA_EXAMPLE_LOOKUP,
     LLAMA_EXAMPLE_PARALLEL,
     LLAMA_EXAMPLE_TTS,
+    LLAMA_EXAMPLE_FINETUNE,
 
     LLAMA_EXAMPLE_COUNT,
 };
@@ -349,6 +351,8 @@ struct common_params {
     bool no_mmproj = false; // explicitly disable multimodal model
     std::vector<std::string> image; // path to image file(s)
 
+    // finetune
+    struct ggml_opt_optimizer_params optimize;
     // embedding
     bool embedding = false; // get only sentence embedding
     int32_t embd_normalize = 2; // normalisation for embeddings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)

examples/training/finetune.cpp

Lines changed: 3 additions & 3 deletions

@@ -18,7 +18,7 @@ int main(int argc, char ** argv) {
 
     params.escape = false;
 
-    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_PERPLEXITY)) {
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_FINETUNE)) {
         return 1;
     }
 
@@ -60,8 +60,8 @@ int main(int argc, char ** argv) {
     std::vector<llama_token> tokens = common_tokenize(ctx.get(), params.prompt, true);
     ggml_opt_dataset_t dataset = common_opt_dataset_init(ctx.get(), tokens, llama_n_ctx(ctx.get())/2);
 
-    struct ggml_opt_optimizer_params optimizer_params = ggml_opt_get_default_optimizer_params(nullptr);
-    optimizer_params.adamw.alpha = 1e-7f; // learning rate
+    struct ggml_opt_optimizer_params &optimizer_params = params.optimize;
+    LOG_INF("-optimizer %d -lr: %.1f", optimizer_params.optimizer, (double)optimizer_params.adamw.alpha);
 
     struct llama_opt_params lopt_params {
         /*n_ctx_train =*/ 0,

ggml/include/ggml-opt.h

Lines changed: 12 additions & 0 deletions

@@ -74,6 +74,17 @@ extern "C" {
         GGML_OPT_BUILD_TYPE_OPT = 30,
     };
 
+    enum ggml_opt_optimizer {
+        GGML_OPT_OPTIMIZER_ADAMW,
+        GGML_OPT_OPTIMIZER_SGD,
+
+        GGML_OPT_OPTIMIZER_COUNT
+    };
+
+    // "adamw" or "sgd" (case insensitive)
+    GGML_API char const* ggml_opt_optimizer_name (enum ggml_opt_optimizer);
+    GGML_API enum ggml_opt_optimizer named_ggml_opt_optimizer(char const*);
+
     // parameters that control which optimizer is used and how said optimizer tries to find the minimal loss
     struct ggml_opt_optimizer_params {
         // AdamW optimizer parameters
@@ -84,6 +95,7 @@ extern "C" {
             float eps; // epsilon for numerical stability
             float wd;  // weight decay for AdamW, use 0.0f to disable
         } adamw;
+        enum ggml_opt_optimizer optimizer;
     };
 
     // callback to calculate optimizer parameters prior to a backward pass

ggml/src/ggml-opt.cpp

Lines changed: 20 additions & 0 deletions

@@ -228,10 +228,30 @@ struct ggml_opt_optimizer_params ggml_opt_get_default_optimizer_params(void * us
     result.adamw.beta2 = 0.999f;
     result.adamw.eps   = 1e-8f;
     result.adamw.wd    = 0.0f;
+    result.optimizer   = GGML_OPT_OPTIMIZER_ADAMW;
 
     return result;
 }
 
+GGML_API char const* ggml_opt_optimizer_name (enum ggml_opt_optimizer o) {
+    switch (o) {
+        case GGML_OPT_OPTIMIZER_ADAMW:
+            return "adamw";
+        case GGML_OPT_OPTIMIZER_SGD:
+            return "sgd";
+        default:
+            return "undefined";
+    };
+}
+
+
+GGML_API enum ggml_opt_optimizer named_ggml_opt_optimizer(char const* n) {
+    if (!strcasecmp("adamw", n)) return GGML_OPT_OPTIMIZER_ADAMW;
+    else if (!strcasecmp("sgd", n)) return GGML_OPT_OPTIMIZER_SGD;
+    else return GGML_OPT_OPTIMIZER_COUNT;
+}
+
+
 struct ggml_opt_optimizer_params ggml_opt_get_constant_optimizer_params(void * userdata) {
     return *((struct ggml_opt_optimizer_params *) userdata);
 }
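
Not part of the commit, but as a minimal sketch of how the two new helpers round-trip (assuming only the declarations added to ggml-opt.h above):

    #include <cstdio>
    #include "ggml-opt.h"

    int main() {
        // parse a user-supplied optimizer name; matching is case insensitive
        enum ggml_opt_optimizer opt = named_ggml_opt_optimizer("AdamW");
        if (opt == GGML_OPT_OPTIMIZER_COUNT) {
            fprintf(stderr, "unknown optimizer\n");
            return 1;
        }
        // map the enum back to its canonical lowercase name
        printf("optimizer: %s\n", ggml_opt_optimizer_name(opt)); // prints "optimizer: adamw"
        return 0;
    }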
