fix(ignite): making state saveable, param_groups modifieable (#1235)

* ope): Brief description fix(ignite): making state saveable, param_groups modifieable It was impossible to save the state because of: * #1233 * undefined tensors were part of the state such as max_exp_avg_sq for adam with amsgrad = FALSE. We now keep them as 0-sized tensors as undefined tensors are not serializeable. (The reason we keep them at all is that it simplifies the saving and loading of state dicts easier) This PR also improves the tests by removing an unnecessary call to `torch_manual_seed()` that made the tests deterministic * cleanup previous commit * trigger ci * tests: fix ignite sgd test
mlverse · Jan 16, 2025 · 37db774 · 37db774
1 parent b31a5fc
commit 37db774
Show file tree

Hide file tree

Showing 4 changed files with 97 additions and 24 deletions.
diff --git a/R/ignite.R b/R/ignite.R
@@ -117,7 +117,8 @@ OptimizerIgnite <- R6::R6Class(
     #' The parameter groups of the optimizer.
     param_groups = function(rhs) {
       if (!missing(rhs)) {
-        prev_param_groups <- self$state_dict()$param_groups
+        prev_param_groups <- self$param_groups
+        all_params = unlist(lapply(prev_param_groups, function(x) x$params))
         if (!is.list(rhs) && length(rhs) == length(prev_param_groups)) {
           value_error("Parameter groups must be a list of the same length as the number of parameter groups.")
         }
@@ -128,8 +129,16 @@ OptimizerIgnite <- R6::R6Class(
             value_error("Parameter groups must have names {paste0(names(prev_param_group), collapse = ', ')} but got {paste0(names(new_param_group), collapse = ', ')}.")
           }
 
-          if (!identical(prev_param_group$params, new_param_group$params)) {
-            value_error("Cannot change the indices of the parameter group, use `$add_param_group()` to add a new parameter group.")
+          param_cmp_value = if (is.integer(new_param_group$params)) {
+            all_params[new_param_group$params]
+          } else {
+            new_param_group$params
+          }
+
+          if (!identical(prev_param_group$params, param_cmp_value)) {
+            print(prev_param_group$params)
+            print(new_param_group$params)
+            value_error("Cannot change the parameter groups, use `$add_param_group()` to add a new parameter group.")
           }
 
           private$.set_param_group_options(self$ptr, rhs)
@@ -367,7 +376,6 @@ is_permutation <- function(vec1, vec2) {
   if (length(vec1) != length(vec2)) {
     return(FALSE)
   }
-
   # Check if sorted elements are the same
   identical(sort(vec1), sort(vec2))
 }

diff --git a/src/lantern/src/Ignite.cpp b/src/lantern/src/Ignite.cpp
@@ -1,5 +1,6 @@
 #include <ostream>
 #include <torch/optim/adamw.h>
+#include <torch/types.h>
 #define LANTERN_BUILD
 #include <torch/torch.h>
 #include "lantern/lantern.h"
@@ -113,7 +114,7 @@ void* _ignite_adagrad_get_states(void* optim) {
         auto base_state = state_it->second.get();
         auto adagrad_state = static_cast<torch::optim::AdagradParamState*>(base_state);
         tensors.push_back(adagrad_state->sum().clone());
-        tensors.push_back(torch::scalar_tensor(adagrad_state->step(), torch::kLong));
+        tensors.push_back(torch::tensor({adagrad_state->step()}, torch::kLong));
       }
     }
   }
@@ -201,9 +202,9 @@ void* _ignite_adam_get_states(void* optim) {
         if (adam_state->max_exp_avg_sq().defined()) {
           tensors.push_back(adam_state->max_exp_avg_sq().clone());
         } else {
-          tensors.push_back(torch::Tensor());
+          tensors.push_back(torch::empty(0, torch::kFloat32));
         }
-        tensors.push_back(torch::scalar_tensor(adam_state->step(), torch::kLong));
+        tensors.push_back(torch::tensor({adam_state->step()}, torch::kLong));
       }
     }
   }
@@ -226,7 +227,7 @@ void _ignite_adam_set_states(void* optim, void* params,void* states_) {
     auto* current_state = static_cast<torch::optim::AdamParamState*>(state_it->second.get());
     current_state->exp_avg(states[i]);
     current_state->exp_avg_sq(states[i + 1]);
-    if (states[i + 2].defined()) {
+    if (states[i + 2].numel() != 0) {
       current_state->max_exp_avg_sq(states[i + 2]);
     }
     auto step = states[i + 3];
@@ -324,9 +325,9 @@ void* _ignite_adamw_get_states(void* optim) {
         if (adamw_state->max_exp_avg_sq().defined()) {
           tensors.push_back(adamw_state->max_exp_avg_sq().clone());
         } else {
-          tensors.push_back(torch::Tensor());
+          tensors.push_back(torch::empty(0, torch::kFloat32));
         }
-        tensors.push_back(torch::scalar_tensor(adamw_state->step(), torch::kLong));
+        tensors.push_back(torch::tensor({adamw_state->step()}, torch::kLong));
       }
     }
   }
@@ -422,15 +423,15 @@ void* _ignite_rmsprop_get_states(void* optim) {
         if (rmsprop_state->grad_avg().defined()) {
           tensors.push_back(rmsprop_state->grad_avg().clone());
         } else {
-          tensors.push_back(torch::Tensor());
+          tensors.push_back(torch::empty(0, torch::kFloat32));
         }
         tensors.push_back(rmsprop_state->square_avg().clone());
         if (rmsprop_state->momentum_buffer().defined()) {
           tensors.push_back(rmsprop_state->momentum_buffer().clone());
         } else {
-          tensors.push_back(torch::Tensor());
+          tensors.push_back(torch::empty(0, torch::kFloat32));
         }
-        tensors.push_back(torch::scalar_tensor(rmsprop_state->step(), torch::kLong));
+        tensors.push_back(torch::tensor({rmsprop_state->step()}, torch::kLong));
       }
     }
   }
@@ -451,9 +452,13 @@ void _ignite_rmsprop_set_states(void* optim, void* params, void* states_) {
       state_it = opt->state().find(param.unsafeGetTensorImpl());
     }
     auto* current_state = static_cast<torch::optim::RMSpropParamState*>(state_it->second.get());
-    current_state->grad_avg(states[i]);
+    if (states[i].numel() != 0) {
+      current_state->grad_avg(states[i]);
+    }
     current_state->square_avg(states[i + 1]);
-    current_state->momentum_buffer(states[i + 2]);
+    if (states[i + 2].numel() != 0) {
+      current_state->momentum_buffer(states[i + 2]);
+    }
     auto step = states[i + 3];
     current_state->step(step.item<int64_t>());
     i += 4;
@@ -519,7 +524,7 @@ void* _ignite_sgd_get_states(void* optim) {
         if (sgd_state->momentum_buffer().defined()) {
           tensors.push_back(sgd_state->momentum_buffer().clone());
         } else {
-          tensors.push_back(torch::Tensor());
+          tensors.push_back(torch::empty(0, torch::kFloat32));
         }
       }
     }

diff --git a/tests/testthat/helper-ignite.R b/tests/testthat/helper-ignite.R
@@ -75,14 +75,12 @@ expect_state_dict_works <- function(optimizer_fn, ...) {
     }
     replicate(2, s())
     if (load) {
-      o$load_state_dict(o$state_dict())
+      o$load_state_dict(torch_load(torch_serialize(o$state_dict())))
     }
     replicate(2, s())
     return(n$parameters)
   }
-  torch_manual_seed(123)
   w1 <- f(load = TRUE)
-  torch_manual_seed(123)
   w2 <- f(load = FALSE)
   expect_equal(w1, w2)
 }

diff --git a/tests/testthat/test-ignite.R b/tests/testthat/test-ignite.R
@@ -30,10 +30,10 @@ test_that("un-optimized parameters and state dict", {
   states = sd$state
   expect_equal(names(states), "1")
   # all parameters are included in the state dict even when they don't have a state.
-  expect_false(cpp_tensor_is_undefined(states[[1]]$exp_avg))
-  expect_false(cpp_tensor_is_undefined(states[[1]]$exp_avg_sq))
-  expect_true(cpp_tensor_is_undefined(states[[1]]$max_exp_avg_sq))
-  expect_false(cpp_tensor_is_undefined(states[[1]]$step))
+  expect_false(is.null(states[[1]]$exp_avg))
+  expect_false(is.null(states[[1]]$exp_avg_sq))
+  expect_false(is.null(states[[1]]$max_exp_avg_sq))
+  expect_false(is.null(states[[1]]$step))
   opt$load_state_dict(sd)
   x1 = unlist(states)
   x2 = unlist(opt$state_dict()$state)
@@ -58,6 +58,12 @@ test_that("adam", {
   expect_ignite_can_change_param_groups(optim_ignite_adam)
   expect_ignite_can_add_param_group(optim_ignite_adam)
   do.call(expect_state_dict_works, c(list(optim_ignite_adam), defaults))
+  # can save adam even when one of the tensors in the state is undefined in C++
+  defaults$amsgrad <- FALSE
+  o <- do.call(make_ignite_adam, defaults)
+  prev <- o$state_dict()
+  o$load_state_dict(torch_load(torch_serialize(o$state_dict())))
+  expect_equal(prev, o$state_dict())
 })
 
 test_that("adamw", {
@@ -73,6 +79,13 @@ test_that("adamw", {
   expect_ignite_can_change_param_groups(optim_ignite_adamw)
   expect_ignite_can_add_param_group(optim_ignite_adamw)
   do.call(expect_state_dict_works, c(list(optim_ignite_adamw), defaults))
+
+  # can save adamw even when one of the tensors in the state is undefined in C++
+  defaults$amsgrad <- FALSE
+  o <- do.call(make_ignite_adamw, defaults)
+  prev <- o$state_dict()
+  o$load_state_dict(torch_load(torch_serialize(o$state_dict())))
+  expect_equal(prev, o$state_dict())
 })
 
 test_that("sgd", {
@@ -87,6 +100,13 @@ test_that("sgd", {
   expect_ignite_can_change_param_groups(optim_ignite_sgd, lr = 0.1)
   expect_ignite_can_add_param_group(optim_ignite_sgd)
   do.call(expect_state_dict_works, c(list(optim_ignite_sgd), defaults))
+  o$load_state_dict(torch_load(torch_serialize(o$state_dict())))
+
+  # saving of state dict
+  o <- do.call(make_ignite_sgd, defaults)
+  prev <- o$state_dict()
+  o$load_state_dict(torch_load(torch_serialize(o$state_dict())))
+  expect_equal(prev, o$state_dict())
 })
 
 test_that("rmsprop", {
@@ -102,6 +122,11 @@ test_that("rmsprop", {
   expect_ignite_can_change_param_groups(optim_ignite_rmsprop)
   expect_ignite_can_add_param_group(optim_ignite_rmsprop)
   do.call(expect_state_dict_works, c(list(optim_ignite_rmsprop), defaults))
+
+  o <- do.call(make_ignite_rmsprop, defaults)
+  prev <- o$state_dict()
+  o$load_state_dict(torch_load(torch_serialize(o$state_dict())))
+  expect_equal(prev, o$state_dict())
 })
 
 test_that("adagrad", {
@@ -117,6 +142,11 @@ test_that("adagrad", {
   expect_ignite_can_change_param_groups(optim_ignite_adagrad)
   expect_ignite_can_add_param_group(optim_ignite_adagrad)
   do.call(expect_state_dict_works, c(list(optim_ignite_adagrad), defaults))
+
+  o <- do.call(make_ignite_adagrad, defaults)
+  prev <- o$state_dict()
+  o$load_state_dict(torch_load(torch_serialize(o$state_dict())))
+  expect_equal(prev, o$state_dict())
 })
 
 test_that("base class: can initialize optimizer with different options per param group", {
@@ -160,6 +190,17 @@ test_that("base class: params must have length > 1", {
   expect_error(optim_ignite_adamw(list()), "must have length")
 })
 
+test_that("base class: can change values of param_groups", {
+  o = optim_ignite_adamw(list(torch_tensor(1, requires_grad = TRUE)), lr = 0.1)
+  o$param_groups[[1]]$lr = 1
+  expect_equal(o$param_groups[[1]]$lr, 1)
+  o$param_groups[[1]]$amsgrad = FALSE
+  expect_true(!o$param_groups[[1]]$amsgrad)
+  o$param_groups[[1]]$amsgrad = TRUE
+  expect_false(!o$param_groups[[1]]$amsgrad)
+})
+
+
 test_that("base class: error handling when loading state dict", {
   o = make_ignite_adamw()
   expect_error(o$load_state_dict(list()), "must be a list with elements")
@@ -174,7 +215,28 @@ test_that("base class: error handling when loading state dict", {
   expect_error(o$load_state_dict(sd3), "but got params, weight_decay")
 })
 
-test_that("deep cloning not possible", {
+test_that("base class: deep cloning not possible", {
   o = make_ignite_adamw(steps = 0)
   expect_error(o$clone(deep = TRUE), "OptimizerIgnite cannot be deep cloned")
 })
+
+test_that("base class: changing the learning rate has an effect", {
+  n1 = nn_linear(1, 1)
+  n2 = n1$clone(deep = TRUE)
+  o1 = optim_sgd(n1$parameters, lr = 0.1)
+  o2 = optim_sgd(n2$parameters, lr = 0.1)
+
+  s = function(n, o) {
+    o$zero_grad()
+    ((n(torch_tensor(1)) - torch_tensor(1))^2)$backward()
+    o$step()
+  }
+
+  s(n1, o1)
+  s(n2, o2)
+  expect_true(torch_equal(n1$parameters[[1]], n2$parameters[[1]]) && torch_equal(n1$parameters[[2]], n2$parameters[[2]]))
+  o1$param_groups[[1]]$lr = 0.2
+  s(n1, o1)
+  s(n2, o2)
+  expect_false(torch_equal(n1$parameters[[1]], n2$parameters[[1]]) && torch_equal(n1$parameters[[2]], n2$parameters[[2]]))
+})