Commit
fix warmup of learning rate so that it works properly w/ continued training (#926)

Co-authored-by: Steven Bradtke sjbradt <[email protected]>
tuglat and Steven Bradtke sjbradt authored Jan 8, 2021
1 parent c5ff7d9 commit b6d8d35
Showing 3 changed files with 18 additions and 9 deletions.
7 changes: 7 additions & 0 deletions CHANGELOG.md
@@ -11,6 +11,13 @@ Note that Sockeye has checks in place to not translate with an old model that wa

Each version section may have subsections for: _Added_, _Changed_, _Removed_, _Deprecated_, and _Fixed_.

## [2.3.8]

### Fixed

- Fix problem identified in issue #925 that caused learning rate
warmup to fail in some instances when doing continued training

## [2.3.7]

### Changed
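As context for the CHANGELOG entry above, here is a minimal, self-contained sketch (not code from this commit) of the failure mode in issue #925: a warmup schedule keyed on the optimizer's update counter restarts from zero when training is continued unless that counter is restored. The `linear_warmup` helper and the numbers are hypothetical; Sockeye's real schedulers live in sockeye/lr_scheduler.py.

```python
def linear_warmup(base_lr: float, warmup_steps: int, num_update: int) -> float:
    """Scale the learning rate linearly from 0 to base_lr over warmup_steps."""
    if num_update >= warmup_steps:
        return base_lr
    return base_lr * num_update / warmup_steps

base_lr, warmup_steps = 0.0002, 4000
updates_already_done = 10_000  # the checkpointed run had long finished warmup

# Resuming without restoring the optimizer's update counter: warmup restarts.
print(linear_warmup(base_lr, warmup_steps, num_update=0))                     # 0.0

# Resuming and restoring the counter (what the fix does via
# trainer.optimizer.begin_num_update / num_update): the full rate is kept.
print(linear_warmup(base_lr, warmup_steps, num_update=updates_already_done))  # 0.0002
```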
2 changes: 1 addition & 1 deletion sockeye/__init__.py
@@ -11,4 +11,4 @@
# express or implied. See the License for the specific language governing
# permissions and limitations under the License.

__version__ = '2.3.7'
__version__ = '2.3.8'
18 changes: 10 additions & 8 deletions sockeye/training.py
@@ -582,6 +582,8 @@ def _load_lr_scheduler(self, fname):
with open(fname, "rb") as fp:
self.trainer.optimizer.lr_scheduler = pickle.load(fp)
logger.info("Loaded '%s' from '%s'", self.trainer.optimizer.lr_scheduler, fname)
self.trainer.optimizer.begin_num_update = self.state.updates
self.trainer.optimizer.num_update = self.state.updates

def _save_training_state(self, train_iter: data_io.BaseParallelSampleIter):
"""
@@ -603,10 +605,6 @@ def _save_training_state(self, train_iter: data_io.BaseParallelSampleIter):
opt_state_fname = os.path.join(training_state_dirname, C.OPT_STATES_LAST)
self._save_trainer_states(opt_state_fname)

# (2.5) lr_scheduler
lr_scheduler_fname = os.path.join(training_state_dirname, C.LR_SCHEDULER_LAST)
self._save_lr_scheduler(lr_scheduler_fname)

# (3) Data iterator
train_iter.save_state(os.path.join(training_state_dirname, C.BUCKET_ITER_STATE_NAME))

@@ -621,6 +619,10 @@ def _save_training_state(self, train_iter: data_io.BaseParallelSampleIter):
# (5) Training state
self.state.save(os.path.join(training_state_dirname, C.TRAINING_STATE_NAME))

# (5.5) lr_scheduler
lr_scheduler_fname = os.path.join(training_state_dirname, C.LR_SCHEDULER_LAST)
self._save_lr_scheduler(lr_scheduler_fname)

# (6) AMP loss scaler state
if self.using_amp:
with open(os.path.join(training_state_dirname, C.AMP_LOSS_SCALER_STATE_NAME), "wb") as fp:
@@ -658,10 +660,6 @@ def _load_training_state(self, train_iter: data_io.BaseParallelSampleIter):
opt_state_fname = os.path.join(self.training_state_dirname, C.OPT_STATES_LAST)
self._load_trainer_states(opt_state_fname)

# (2.5) lr_scheduler
lr_scheduler_fname = os.path.join(self.training_state_dirname, C.LR_SCHEDULER_LAST)
self._load_lr_scheduler(lr_scheduler_fname)

# (3) Data Iterator
train_iter.load_state(os.path.join(self.training_state_dirname, C.BUCKET_ITER_STATE_NAME))

@@ -676,6 +674,10 @@ def _load_training_state(self, train_iter: data_io.BaseParallelSampleIter):
# (5) Training state
self.state = TrainState.load(os.path.join(self.training_state_dirname, C.TRAINING_STATE_NAME))

# (5.5) lr_scheduler
lr_scheduler_fname = os.path.join(self.training_state_dirname, C.LR_SCHEDULER_LAST)
self._load_lr_scheduler(lr_scheduler_fname)

# (6) AMP loss scaler state
if self.using_amp:
# Load loss scaler state
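A reading note on the training.py hunks above (an interpretation, not text from the commit): `_load_lr_scheduler` now seeds the optimizer's `begin_num_update` and `num_update` from `self.state.updates`, so loading (and, symmetrically, saving) the scheduler moves from step (2.5) to step (5.5), after the `TrainState` is handled in step (5) and `self.state.updates` is valid. Below is a rough sketch of the resumed-training order under those assumptions; the file names and the `resume` helper are simplified stand-ins for the C.* constants and Trainer methods in the diff.

```python
import os
import pickle

def resume(training_state_dir: str, trainer, train_state_cls):
    # (5) Load the training state first: it records how many updates are done.
    state = train_state_cls.load(os.path.join(training_state_dir, "training.state"))

    # (5.5) Only then unpickle the LR scheduler and seed the optimizer's
    # update counters, so a warmup schedule does not restart from update 0.
    with open(os.path.join(training_state_dir, "lr_scheduler.pkl"), "rb") as fp:
        trainer.optimizer.lr_scheduler = pickle.load(fp)
    trainer.optimizer.begin_num_update = state.updates
    trainer.optimizer.num_update = state.updates
    return state
```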
