diff --git a/CHANGELOG.md b/CHANGELOG.md
index 25829b1db..d13ba4e53 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -11,6 +11,13 @@ Note that Sockeye has checks in place to not translate with an old model that wa
 
 Each version section may have have subsections for: _Added_, _Changed_, _Removed_, _Deprecated_, and _Fixed_.
 
+## [2.3.8]
+
+### Fixed
+
+- Fix problem identified in issue #925 that caused learning rate
+  warmup to fail in some instances when doing continued training
+
 ## [2.3.7]
 
 ### Changed
diff --git a/sockeye/__init__.py b/sockeye/__init__.py
index 94d1a6150..17b45bf0d 100644
--- a/sockeye/__init__.py
+++ b/sockeye/__init__.py
@@ -11,4 +11,4 @@
 # express or implied. See the License for the specific language governing
 # permissions and limitations under the License.
 
-__version__ = '2.3.7'
+__version__ = '2.3.8'
diff --git a/sockeye/training.py b/sockeye/training.py
index 1a9d5e707..4ef8972e1 100644
--- a/sockeye/training.py
+++ b/sockeye/training.py
@@ -582,6 +582,8 @@ def _load_lr_scheduler(self, fname):
         with open(fname, "rb") as fp:
             self.trainer.optimizer.lr_scheduler = pickle.load(fp)
         logger.info("Loaded '%s' from '%s'", self.trainer.optimizer.lr_scheduler, fname)
+        self.trainer.optimizer.begin_num_update = self.state.updates
+        self.trainer.optimizer.num_update = self.state.updates
 
     def _save_training_state(self, train_iter: data_io.BaseParallelSampleIter):
         """
@@ -603,10 +605,6 @@ def _save_training_state(self, train_iter: data_io.BaseParallelSampleIter):
         opt_state_fname = os.path.join(training_state_dirname, C.OPT_STATES_LAST)
         self._save_trainer_states(opt_state_fname)
 
-        # (2.5) lr_scheduler
-        lr_scheduler_fname = os.path.join(training_state_dirname, C.LR_SCHEDULER_LAST)
-        self._save_lr_scheduler(lr_scheduler_fname)
-
         # (3) Data iterator
         train_iter.save_state(os.path.join(training_state_dirname, C.BUCKET_ITER_STATE_NAME))
 
@@ -621,6 +619,10 @@ def _save_training_state(self, train_iter: data_io.BaseParallelSampleIter):
         # (5) Training state
         self.state.save(os.path.join(training_state_dirname, C.TRAINING_STATE_NAME))
 
+        # (5.5) lr_scheduler
+        lr_scheduler_fname = os.path.join(training_state_dirname, C.LR_SCHEDULER_LAST)
+        self._save_lr_scheduler(lr_scheduler_fname)
+
         # (6) AMP loss scaler state
         if self.using_amp:
             with open(os.path.join(training_state_dirname, C.AMP_LOSS_SCALER_STATE_NAME), "wb") as fp:
@@ -658,10 +660,6 @@ def _load_training_state(self, train_iter: data_io.BaseParallelSampleIter):
         opt_state_fname = os.path.join(self.training_state_dirname, C.OPT_STATES_LAST)
         self._load_trainer_states(opt_state_fname)
 
-        # (2.5) lr_scheduler
-        lr_scheduler_fname = os.path.join(self.training_state_dirname, C.LR_SCHEDULER_LAST)
-        self._load_lr_scheduler(lr_scheduler_fname)
-
         # (3) Data Iterator
         train_iter.load_state(os.path.join(self.training_state_dirname, C.BUCKET_ITER_STATE_NAME))
 
@@ -676,6 +674,10 @@ def _load_training_state(self, train_iter: data_io.BaseParallelSampleIter):
         # (5) Training state
         self.state = TrainState.load(os.path.join(self.training_state_dirname, C.TRAINING_STATE_NAME))
 
+        # (5.5) lr_scheduler
+        lr_scheduler_fname = os.path.join(self.training_state_dirname, C.LR_SCHEDULER_LAST)
+        self._load_lr_scheduler(lr_scheduler_fname)
+
         # (6) AMP loss scaler state
         if self.using_amp:
             # Load loss scaler state
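
Note on the fix (illustrative commentary, not part of the patch): moving the lr_scheduler save/load to step (5.5) means `self.state.updates` has already been restored from the saved training state by the time `_load_lr_scheduler` runs, so the optimizer's update counters can be synced to the true number of completed updates. The sketch below is a minimal, hypothetical example in plain Python (not Sockeye or MXNet code; `warmup_lr`, `base_lr`, `warmup_steps`, and the numbers are made up) showing why a warmup schedule misbehaves when the update counter silently restarts at zero on continued training.

```python
# Illustrative sketch only: effect of the optimizer update counter on LR warmup
# when resuming training. All names and values here are hypothetical.

def warmup_lr(num_update: int, base_lr: float = 0.0002, warmup_steps: int = 4000) -> float:
    """Linear warmup: LR ramps from 0 to base_lr over `warmup_steps` updates."""
    return base_lr * min(1.0, num_update / warmup_steps)

saved_updates = 10_000  # updates already performed before the checkpoint

# If the resumed optimizer starts counting from 0, the scheduler re-enters
# warmup and the learning rate collapses.
print(warmup_lr(num_update=1))              # ~5e-08: warmup restarts

# Restoring the counter to the saved update count (what the patch does via
# begin_num_update / num_update) keeps the schedule where it left off.
print(warmup_lr(num_update=saved_updates))  # 0.0002: warmup already finished
```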