
Ensure last checkpoint decoder results are written to metrics file when cleaning up training (#368)
fhieber authored Apr 21, 2018
1 parent dd7933d commit 12fab9b
Showing 3 changed files with 12 additions and 7 deletions.
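
In short: `fit()` previously called `self._cleanup(lr_decay_opt_states_reset)` without passing the checkpoint-decoder process manager, so a decoder still running for the final checkpoint was never waited on and its metrics never reached the metrics file (#367). The fix threads `process_manager` through to `_cleanup()`, which waits for the decoder process, collects its result, and writes the metrics file; a blocking `wait_to_finish()` call is correspondingly dropped from the per-checkpoint `_update_metrics()` path. A runnable sketch of the underlying pattern follows the sockeye/training.py diff below.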
CHANGELOG.md (4 changes: 4 additions & 0 deletions)
@@ -10,6 +10,10 @@ Note that Sockeye has checks in place to not translate with an old model that wa

 Each version section may have have subsections for: _Added_, _Changed_, _Removed_, _Deprecated_, and _Fixed_.
 
+## [1.18.5]
+### Fixed
+- Fixed a problem with trainer not waiting for the last checkpoint decoder (#367).
+
 ## [1.18.4]
 ### Added
 - Added options to control training length w.r.t number of updates/batches or number of samples:
sockeye/__init__.py (2 changes: 1 addition & 1 deletion)
@@ -11,4 +11,4 @@
 # express or implied. See the License for the specific language governing
 # permissions and limitations under the License.
 
-__version__ = '1.18.4'
+__version__ = '1.18.5'
sockeye/training.py (13 changes: 7 additions & 6 deletions)
@@ -630,7 +630,7 @@ def fit(self,

             tic = time.time()
 
-        self._cleanup(lr_decay_opt_states_reset)
+        self._cleanup(lr_decay_opt_states_reset, process_manager=process_manager)
         logger.info("Training finished. Best checkpoint: %d. Best validation %s: %.6f",
                     self.state.best_checkpoint, early_stopping_metric, self.state.best_metric)
         return self.state.best_metric
@@ -723,7 +723,6 @@ def _update_metrics(self,
                 checkpoint_metrics["%s-val" % name] = value
 
         if process_manager is not None:
-            process_manager.wait_to_finish()
             result = process_manager.collect_results()
             if result is not None:
                 decoded_checkpoint, decoder_metrics = result
@@ -749,12 +748,12 @@ def _cleanup(self, lr_decay_opt_states_reset: str, process_manager: Optional['De
         utils.cleanup_params_files(self.model.output_dir, self.max_params_files_to_keep,
                                    self.state.checkpoint, self.state.best_checkpoint)
         if process_manager is not None:
             process_manager.wait_to_finish()
             result = process_manager.collect_results()
             if result is not None:
                 decoded_checkpoint, decoder_metrics = result
                 self.state.metrics[decoded_checkpoint - 1].update(decoder_metrics)
                 self.tflogger.log_metrics(decoder_metrics, decoded_checkpoint)
+                utils.write_metrics_file(self.state.metrics, self.metrics_fname)
 
         final_training_state_dirname = os.path.join(self.model.output_dir, C.TRAINING_STATE_DIRNAME)
         if os.path.exists(final_training_state_dirname):
@@ -1139,6 +1138,7 @@ def collect_results(self) -> Optional[Tuple[int, Dict[str, float]]]:
             return None
         decoded_checkpoint, decoder_metrics = self.decoder_metric_queue.get()
         assert self.decoder_metric_queue.empty()
+        logger.info("Decoder-%d finished: %s", decoded_checkpoint, decoder_metrics)
         return decoded_checkpoint, decoder_metrics
 
     def wait_to_finish(self):
@@ -1147,14 +1147,15 @@ def wait_to_finish(self):
         if not self.decoder_process.is_alive():
             self.decoder_process = None
             return
-        logger.warning("Waiting for process %s to finish.", self.decoder_process.name)
+        name = self.decoder_process.name
+        logger.warning("Waiting for process %s to finish.", name)
         wait_start = time.time()
         self.decoder_process.join()
         self.decoder_process = None
         wait_time = int(time.time() - wait_start)
-        logger.warning("Had to wait %d seconds for the checkpoint decoder to finish. Consider increasing the "
+        logger.warning("Had to wait %d seconds for the Checkpoint %s to finish. Consider increasing the "
                        "checkpoint frequency (updates between checkpoints, see %s) or reducing the size of the "
-                       "validation samples that are decoded (see %s)." % (wait_time,
+                       "validation samples that are decoded (see %s)." % (wait_time, name,
                        C.TRAIN_ARGS_CHECKPOINT_FREQUENCY,
                        C.TRAIN_ARGS_MONITOR_BLEU))
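
The pattern behind the fix, reduced to its essentials: the checkpoint decoder runs in a child process and reports through a multiprocessing queue, and `collect_results()` only polls that queue, so the parent must join the decoder process before shutting down or a result that is still being computed is silently dropped. Below is a minimal, self-contained sketch of this pattern; the `Manager` class, `_decode` function, metric values, and sleep duration are illustrative stand-ins, not Sockeye's actual implementation.

    import multiprocessing as mp
    import time
    from typing import Dict, Optional, Tuple


    def _decode(checkpoint: int, queue) -> None:
        # Stand-in for the checkpoint decoder: pretend to decode, then report.
        time.sleep(1.0)
        queue.put((checkpoint, {"bleu-val": 30.0}))


    class Manager:
        # Illustrative stand-in for DecoderProcessManager.

        def __init__(self) -> None:
            self.queue = mp.Queue()
            self.process = None  # type: Optional[mp.Process]

        def start(self, checkpoint: int) -> None:
            self.process = mp.Process(target=_decode, args=(checkpoint, self.queue),
                                      name="Decoder-%d" % checkpoint)
            self.process.start()

        def wait_to_finish(self) -> None:
            # Block until the decoder process exits; this is the call that
            # _cleanup now makes before collecting the final result.
            if self.process is not None:
                self.process.join()
                self.process = None

        def collect_results(self) -> Optional[Tuple[int, Dict[str, float]]]:
            # Non-blocking: return a result only if one is already queued.
            if self.queue.empty():
                return None
            return self.queue.get()


    if __name__ == "__main__":
        manager = Manager()
        manager.start(checkpoint=7)
        manager.wait_to_finish()          # without the join, a still-running
        print(manager.collect_results())  # decoder's result would be lost

Sockeye's real manager additionally times the join and warns when the wait was long, as the `wait_to_finish` hunk above shows.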

