From f8b951a386b25a2b6d5698954261dcbf7dd03bdb Mon Sep 17 00:00:00 2001
From: T2T Team
Date: Thu, 5 Apr 2018 14:50:27 -0700
Subject: [PATCH 01/29] Branch wiki_history to wiki_gec to define GEC-specific
 problems for wiki data generation.

PiperOrigin-RevId: 191802181
---
 README.md                                      |   3 +-
 docs/cloud_tpu.md                              |   4 +-
 docs/distributed_training.md                   |   2 +-
 docs/index.md                                  |   1 -
 docs/walkthrough.md                            |   3 +-
 tensor2tensor/bin/t2t_avg_all.py               |   2 +-
 tensor2tensor/bin/t2t_bleu.py                  |   2 +-
 tensor2tensor/bin/t2t_decoder.py               |   2 +-
 tensor2tensor/bin/t2t_trainer.py               |   4 +-
 .../data_generators/algorithmic_math.py        |   6 +-
 tensor2tensor/data_generators/desc2code.py     |   6 +-
 .../data_generators/generator_utils.py         |   2 +-
 tensor2tensor/data_generators/gym.py           | 325 ++++++++----------
 .../data_generators/problem_hparams.py         |   2 +-
 tensor2tensor/data_generators/ptb.py           |   2 +-
 .../data_generators/speech_recognition.py      |   6 +-
 tensor2tensor/data_generators/text_encoder.py  |   2 +-
 tensor2tensor/data_generators/translate.py     |   2 +-
 .../data_generators/translate_enzh.py          |   2 +-
 tensor2tensor/data_generators/wiki.py          |   8 +-
 tensor2tensor/data_generators/wsj_parsing.py   |   6 +-
 tensor2tensor/layers/common_attention.py       |  60 ++--
 tensor2tensor/layers/common_attention_test.py  |   2 +-
 tensor2tensor/layers/common_hparams.py         |  10 +-
 .../layers/common_image_attention.py           |   4 +-
 tensor2tensor/layers/common_layers.py          |  14 +-
 tensor2tensor/layers/discretization.py         |   2 +-
 tensor2tensor/layers/modalities.py             |   1 +
 tensor2tensor/models/research/attention_lm.py  |   2 +-
 .../models/research/attention_lm_moe.py        |   4 +-
 .../models/research/basic_conv_gen.py          | 180 +---------
 tensor2tensor/models/research/rl.py            |   3 -
 .../models/research/transformer_moe.py         |   2 +-
 tensor2tensor/models/resnet.py                 |   2 +-
 tensor2tensor/models/slicenet.py               |   2 +-
 tensor2tensor/models/transformer.py            |  56 +--
 tensor2tensor/models/vanilla_gan.py            |   2 +-
 tensor2tensor/notebooks/hello_t2t-rl.ipynb     |  10 +-
 tensor2tensor/rl/README.md                     |  10 +-
 tensor2tensor/rl/collect.py                    |  17 +-
 tensor2tensor/rl/envs/atari_wrappers.py        | 139 ++++++++
 tensor2tensor/rl/envs/frame1.png               | Bin 729 -> 0 bytes
 tensor2tensor/rl/envs/frame2.png               | Bin 732 -> 0 bytes
 tensor2tensor/rl/envs/in_graph_batch_env.py    | 117 ++++++-
 tensor2tensor/rl/envs/py_func_batch_env.py     | 169 ---------
 tensor2tensor/rl/envs/simulated_batch_env.py   | 150 --------
 tensor2tensor/rl/envs/tf_atari_wrappers.py     | 163 ---------
 tensor2tensor/rl/envs/utils.py                 |  46 +--
 tensor2tensor/rl/model_rl_experiment.py        |  92 -----
 tensor2tensor/rl/ppo.py                        |  85 ++---
 tensor2tensor/rl/rl_trainer_lib.py             |  57 ++-
 tensor2tensor/rl/t2t_rl_trainer.py             |   2 +-
 tensor2tensor/utils/adafactor.py               |   6 +-
 tensor2tensor/utils/beam_search.py             |  30 +-
 tensor2tensor/utils/bleu_hook.py               |   4 +-
 tensor2tensor/utils/cloud_tpu.py               |   2 +-
 tensor2tensor/utils/data_reader.py             |   4 +-
 tensor2tensor/utils/decoding.py                |   4 +-
 tensor2tensor/utils/diet.py                    |   6 +-
 tensor2tensor/utils/expert_utils.py            |   8 +-
 tensor2tensor/utils/flags.py                   |   2 +-
 tensor2tensor/utils/metrics.py                 |   6 +-
 tensor2tensor/utils/rouge.py                   |   8 +-
 tensor2tensor/utils/t2t_model.py               |  22 +-
 tensor2tensor/utils/yellowfin.py               |  16 +-
 tensor2tensor/visualization/attention.py       |   2 +-
 tensor2tensor/visualization/visualization.py   |  10 +-
 67 files changed, 667 insertions(+), 1258 deletions(-)
 create mode 100644 tensor2tensor/rl/envs/atari_wrappers.py
 delete mode 100644 tensor2tensor/rl/envs/frame1.png
 delete mode 100644 tensor2tensor/rl/envs/frame2.png
 delete mode 100644 tensor2tensor/rl/envs/py_func_batch_env.py
 delete mode 100644 tensor2tensor/rl/envs/simulated_batch_env.py
 delete mode 100644 tensor2tensor/rl/envs/tf_atari_wrappers.py
 delete mode 100644 tensor2tensor/rl/model_rl_experiment.py

diff --git a/README.md b/README.md
index e8d46add5..12e05e936 100644
--- a/README.md
+++ b/README.md
@@ -143,7 +143,6 @@ There are a number of translation data-sets in T2T:
 * English-French: `--problems=translate_enfr_wmt32k`
 * English-Czech: `--problems=translate_encs_wmt32k`
 * English-Chinese: `--problems=translate_enzh_wmt32k`
-* English-Vietnamese: `--problems=translate_envi_iwslt32k`
 
 You can get translations in the other direction by appending `_rev` to the
 problem name, e.g., for German-English use
@@ -328,7 +327,7 @@ inference. Users can easily switch between problems, models, and
 hyperparameter sets by using the `--model`, `--problems`, and
 `--hparams_set` flags. Specific hyperparameters can be overridden with the
 `--hparams` flag. `--schedule` and related flags control local and
 distributed training/evaluation
-([distributed training documentation](https://github.com/tensorflow/tensor2tensor/tree/master/docs/distributed_training.md)).
+([distributed training documentation](https://github.com/tensorflow/tensor2tensor/tree/master/tensor2tensor/g3doc/distributed_training.md)).
 
 ## Adding your own components
diff --git a/docs/cloud_tpu.md b/docs/cloud_tpu.md
index d923ee02e..cfc0c0a96 100644
--- a/docs/cloud_tpu.md
+++ b/docs/cloud_tpu.md
@@ -1,7 +1,7 @@
 # Running on Cloud TPUs
 
 Tensor2Tensor supports running on Google Cloud Platforms TPUs, chips
-specialized for ML training. See the official tutorial for [running Transformer
+specialized for ML training. See the official tutorial for [running Transfomer
 on Cloud TPUs](https://cloud.google.com/tpu/docs/tutorials/transformer)
 or read on for more T2T models on TPUs.
 
@@ -14,7 +14,7 @@ Transformer:
 You can run the Transformer model on a number of problems,
 from translation through language modeling to sentiment analysis.
-See the official tutorial for [running Transformer
+See the official tutorial for [running Transfomer
 on Cloud TPUs](https://cloud.google.com/tpu/docs/tutorials/transformer)
 for some examples and try out your own problems.
 
diff --git a/docs/distributed_training.md b/docs/distributed_training.md
index 74ae0e536..95b499f87 100644
--- a/docs/distributed_training.md
+++ b/docs/distributed_training.md
@@ -68,7 +68,7 @@ For example:
 TF_CONFIG=$JOB_TF_CONFIG t2t-trainer $JOB_FLAGS --model=transformer ...
 ```
 
-Modify the `--worker_gpu` and `--ps_gpu` flags, which specify how many GPUs are
+Modify the `--worker_gpu` and `--ps_gpu` flags, which specify how many gpus are
 on each master and ps, respectively, as needed for your machine/cluster setup.
 
 ## Command-line flags for eval jobs
diff --git a/docs/index.md b/docs/index.md
index 060e10471..b7d0236c9 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -107,7 +107,6 @@ There are a number of translation data-sets in T2T:
 * English-French: `--problems=translate_enfr_wmt32k`
 * English-Czech: `--problems=translate_encs_wmt32k`
 * English-Chinese: `--problems=translate_enzh_wmt32k`
-* English-Vietnamese: `--problems=translate_envi_iwslt32k`
 
 You can get translations in the other direction by appending `_rev` to the
 problem name, e.g., for German-English use
diff --git a/docs/walkthrough.md b/docs/walkthrough.md
index e8d46add5..12e05e936 100644
--- a/docs/walkthrough.md
+++ b/docs/walkthrough.md
@@ -143,7 +143,6 @@ There are a number of translation data-sets in T2T:
 * English-French: `--problems=translate_enfr_wmt32k`
 * English-Czech: `--problems=translate_encs_wmt32k`
 * English-Chinese: `--problems=translate_enzh_wmt32k`
-* English-Vietnamese: `--problems=translate_envi_iwslt32k`
 
 You can get translations in the other direction by appending `_rev` to the
 problem name, e.g., for German-English use
@@ -328,7 +327,7 @@ inference. Users can easily switch between problems, models, and
 hyperparameter sets by using the `--model`, `--problems`, and
 `--hparams_set` flags. Specific hyperparameters can be overridden with the
 `--hparams` flag. `--schedule` and related flags control local and
 distributed training/evaluation
-([distributed training documentation](https://github.com/tensorflow/tensor2tensor/tree/master/docs/distributed_training.md)).
+([distributed training documentation](https://github.com/tensorflow/tensor2tensor/tree/master/tensor2tensor/g3doc/distributed_training.md)).
 
 ## Adding your own components
diff --git a/tensor2tensor/bin/t2t_avg_all.py b/tensor2tensor/bin/t2t_avg_all.py
index 694ab26ed..0b0aa266d 100644
--- a/tensor2tensor/bin/t2t_avg_all.py
+++ b/tensor2tensor/bin/t2t_avg_all.py
@@ -13,7 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-"""Script to continuously average last N checkpoints in a given directory.""" +"""Script to continously average last N checkpoints in a given directory.""" from __future__ import absolute_import from __future__ import division from __future__ import print_function diff --git a/tensor2tensor/bin/t2t_bleu.py b/tensor2tensor/bin/t2t_bleu.py index 74117454d..4eeb84eec 100644 --- a/tensor2tensor/bin/t2t_bleu.py +++ b/tensor2tensor/bin/t2t_bleu.py @@ -74,7 +74,7 @@ flags.DEFINE_string("translation", None, "Path to the MT system translation file") flags.DEFINE_string("translations_dir", None, - "Directory with translated files to be evaluated.") + "Directory with translated files to be evaulated.") flags.DEFINE_string("event_dir", None, "Where to store the event file.") flags.DEFINE_string("bleu_variant", "both", diff --git a/tensor2tensor/bin/t2t_decoder.py b/tensor2tensor/bin/t2t_decoder.py index 5705807ee..fd103a6a1 100644 --- a/tensor2tensor/bin/t2t_decoder.py +++ b/tensor2tensor/bin/t2t_decoder.py @@ -82,7 +82,7 @@ def create_decode_hparams(): def decode(estimator, hparams, decode_hp): if FLAGS.decode_interactive: - decoding.decode_interactively(estimator, hparams, decode_hp, checkpoint_path=FLAGS.checkpoint_path) + decoding.decode_interactively(estimator, hparams, decode_hp) elif FLAGS.decode_from_file: decoding.decode_from_file(estimator, FLAGS.decode_from_file, hparams, decode_hp, FLAGS.decode_to_file, diff --git a/tensor2tensor/bin/t2t_trainer.py b/tensor2tensor/bin/t2t_trainer.py index b82f7f6e4..7d8db041b 100644 --- a/tensor2tensor/bin/t2t_trainer.py +++ b/tensor2tensor/bin/t2t_trainer.py @@ -59,7 +59,7 @@ flags.DEFINE_bool("profile", False, "Profile performance?") # To maintain compatibility with some internal libs, we guard against these flag -# definitions possibly erring. Apologies for the ugliness. +# definitions possibly erroring. Apologies for the ugliness. try: flags.DEFINE_string("master", "", "Address of TensorFlow master.") flags.DEFINE_string("output_dir", "", "Base output directory for run.") @@ -178,7 +178,7 @@ def create_run_config(hp): save_ckpt_secs = FLAGS.save_checkpoints_secs or None if save_ckpt_secs: save_ckpt_steps = None - assert FLAGS.output_dir or FLAGS.checkpoint_path + assert FLAGS.output_dir return trainer_lib.create_run_config( model_dir=os.path.expanduser(FLAGS.output_dir), master=FLAGS.master, diff --git a/tensor2tensor/data_generators/algorithmic_math.py b/tensor2tensor/data_generators/algorithmic_math.py index 433525a7a..c3a028b12 100644 --- a/tensor2tensor/data_generators/algorithmic_math.py +++ b/tensor2tensor/data_generators/algorithmic_math.py @@ -181,7 +181,7 @@ def algebra_inverse_solve(left, right, var, solve_ops): right- Expression on the right side of the op. to_tree- The tree on the other side of the equal sign. The canceled out expression will be moved here. - new_from_tree- The resulting from_tree after the algebraic + new_from_tree- The resuling from_tree after the algebraic manipulation. new_to_tree- The resulting to_tree after the algebraic manipulation. @@ -355,7 +355,7 @@ def generate_calculus_integrate_sample(vlist, ops, min_depth, max_depth, # functions: Dict of special function names. Maps human readable string names to # single char names used in flist. # ops: Dict mapping op symbols (chars) to ExprOp instances. -# solve_ops: Encodes rules for how to algebraically cancel out each operation. See +# solve_ops: Encodes rules for how to algebraicly cancel out each operation. See # doc-string for `algebra_inverse_solve`. 
 # int_encoder: Function that maps a string to a list of tokens. Use this to
 #     encode an expression to feed into a model.
@@ -377,7 +377,7 @@ def math_dataset_init(alphabet_size=26, digits=None, functions=None):
 
   Args:
     alphabet_size: How many possible variables there are. Max 52.
-    digits: How many numerical digits to encode as tokens, "0" through
+    digits: How many numerical digits to encode as tokens, "0" throuh
         str(digits-1), or None to encode no digits.
     functions: Defines special functions. A dict mapping human readable string
         names, like "log", "exp", "sin", "cos", etc., to single chars. Each
diff --git a/tensor2tensor/data_generators/desc2code.py b/tensor2tensor/data_generators/desc2code.py
index 145279a84..1e72746fb 100644
--- a/tensor2tensor/data_generators/desc2code.py
+++ b/tensor2tensor/data_generators/desc2code.py
@@ -140,8 +140,8 @@ def generate_encoded_samples(self, data_dir, tmp_dir, dataset_split):
     samples = list(generator_samples(tmp_dir, self.pb_constants))
 
     # Split between train and dev
-    # Shuffle to get problems from diverse sources (CodeChef and CodeForces) and
-    # difficulties in each set.
+    # Suffle to get problems from diverse sources (CodeChef and CodeForces) and
+    # dificulties in each set.
     # Need to sort the samples first before shuffling (as walk() isn't
     # deterministic)
     samples.sort(key=lambda x: x.desc_file)  # in-place
@@ -289,7 +289,7 @@ def next_sample(subdir, dirs, files):  # pylint: disable=unused-argument
   for f in tf.gfile.Glob(code_pattern):
     with tf.gfile.GFile(f, mode="r") as target_file:
       # Hack to filter C++/Java files. In theory some python comments could
-      # make the file be considered as C++ but in practice the chance of
+      # make the file be concidered as C++ but in practice the chance of
       # getting a false negative is low.
       content = target_file.read()
       if not any(p in content for p in pb_cst.filter_patterns):
diff --git a/tensor2tensor/data_generators/generator_utils.py b/tensor2tensor/data_generators/generator_utils.py
index 3078f8dfe..a628252a5 100644
--- a/tensor2tensor/data_generators/generator_utils.py
+++ b/tensor2tensor/data_generators/generator_utils.py
@@ -227,7 +227,7 @@ def maybe_download(directory, filename, uri):
 
 def maybe_download_from_drive(directory, filename, url):
-  """Download filename from Google drive unless it's already in directory.
+  """Download filename from google drive unless it's already in directory.
 
   Args:
     directory: path to the directory that will be used.
diff --git a/tensor2tensor/data_generators/gym.py b/tensor2tensor/data_generators/gym.py
index 8c04de9fe..0cdfe0fa9 100644
--- a/tensor2tensor/data_generators/gym.py
+++ b/tensor2tensor/data_generators/gym.py
@@ -24,136 +24,76 @@
 # Dependency imports
 
 import gym
-import os
-from tensorflow.contrib.training import HParams
-from collections import deque
+import numpy as np
 
 from tensor2tensor.data_generators import generator_utils
 from tensor2tensor.data_generators import problem
+
 from tensor2tensor.models.research import rl
+from tensor2tensor.rl import rl_trainer_lib  # pylint: disable=unused-import
+from tensor2tensor.rl.envs import atari_wrappers
+
+from tensor2tensor.utils import metrics
 from tensor2tensor.utils import registry
-from tensor2tensor.rl.envs.utils import batch_env_factory
-from tensor2tensor.rl.envs.tf_atari_wrappers import MemoryWrapper, TimeLimitWrapper
-from tensor2tensor.rl.envs.tf_atari_wrappers import MaxAndSkipWrapper
-from tensor2tensor.rl.envs.tf_atari_wrappers import PongT2TGeneratorHackWrapper
-from tensor2tensor.rl import collect
 
 import tensorflow as tf
 
-def moviepy_editor():
-  """Access to moviepy to allow for import of this file without a moviepy install."""
-  try:
-    from moviepy import editor  # pylint: disable=g-import-not-at-top
-  except ImportError:
-    raise ImportError("pip install moviepy to record videos")
-  return editor
-
 flags = tf.flags
 FLAGS = flags.FLAGS
 
-flags.DEFINE_string("agent_policy_path", "", "File with model for pong")
+flags.DEFINE_string("model_path", "", "File with model for pong")
+
 
-@registry.register_problem
 class GymDiscreteProblem(problem.Problem):
   """Gym environment with discrete actions and rewards."""
 
   def __init__(self, *args, **kwargs):
     super(GymDiscreteProblem, self).__init__(*args, **kwargs)
-    self.num_channels = 3
-    self.history_size = 2
-
-    # defaults
-    self.environment_spec = lambda: gym.make("PongNoFrameskip-v4")
-    self.in_graph_wrappers = [(MaxAndSkipWrapper, {"skip": 4})]
-    self.collect_hparams = rl.atari_base()
-    self.num_steps = 1000
-    self.movies = True
-    self.movies_fps = 24
-    self.simulated_environment = None
-    self.warm_up = 70
-
-  def _setup(self):
-    # TODO: remove PongT2TGeneratorHackWrapper by writing a modality
-
-    in_graph_wrappers = [(PongT2TGeneratorHackWrapper, {"add_value": 2}),
-                         (MemoryWrapper, {})] + self.in_graph_wrappers
-    env_hparams = HParams(in_graph_wrappers=in_graph_wrappers,
-                          simulated_environment=self.simulated_environment)
-
-    generator_batch_env = \
-      batch_env_factory(self.environment_spec, env_hparams, num_agents=1, xvfb=False)
-
-    with tf.variable_scope("", reuse=tf.AUTO_REUSE):
-      policy_lambda = self.collect_hparams.network
-      policy_factory = tf.make_template(
-          "network",
-          functools.partial(policy_lambda, self.environment_spec().action_space, self.collect_hparams),
-          create_scope_now_=True,
-          unique_name_="network")
-
-    with tf.variable_scope("", reuse=tf.AUTO_REUSE):
-      sample_policy = lambda policy: 0*policy.sample()
-      # sample_policy = lambda policy: 0
-
-      self.collect_hparams.epoch_length = 10
-      _, self.collect_trigger_op = collect.define_collect(
-          policy_factory, generator_batch_env, self.collect_hparams,
-          eval_phase=False, policy_to_actions_lambda=sample_policy, scope="define_collect")
-
-    self.avilable_data_size_op = MemoryWrapper.singleton._speculum.size()
-    self.data_get_op = MemoryWrapper.singleton._speculum.dequeue()
-    self.history_buffer = deque(maxlen=self.history_size+1)
+    self._env = None
 
   def example_reading_spec(self, label_repr=None):
+
     data_fields = {
"targets_encoded": tf.FixedLenFeature((), tf.string), - "image/format": tf.FixedLenFeature((), tf.string), - "action": tf.FixedLenFeature([1], tf.int64), - "reward": tf.FixedLenFeature([1], tf.int64), - # "done": tf.FixedLenFeature([1], tf.int64) + "inputs": tf.FixedLenFeature([210, 160, 3], tf.int64), + "inputs_prev": tf.FixedLenFeature([210, 160, 3], tf.int64), + "targets": tf.FixedLenFeature([210, 160, 3], tf.int64), + "action": tf.FixedLenFeature([1], tf.int64), + "reward": tf.FixedLenFeature([1], tf.int64) } - for x in range(self.history_size): - data_fields["inputs_encoded_{}".format(x)] = tf.FixedLenFeature((), tf.string) - + return data_fields, None - data_items_to_decoders = { - "targets": - tf.contrib.slim.tfexample_decoder.Image( - image_key="targets_encoded", - format_key="image/format", - shape=[210, 160, 3], - channels=3), + def eval_metrics(self): + return [metrics.Metrics.ACC, metrics.Metrics.ACC_PER_SEQ, + metrics.Metrics.NEG_LOG_PERPLEXITY, metrics.Metrics.IMAGE_SUMMARY] - #Just do a pass through - "action":tf.contrib.slim.tfexample_decoder.Tensor(tensor_key="action"), - "reward":tf.contrib.slim.tfexample_decoder.Tensor(tensor_key="reward"), - } - - for x in range(self.history_size): - data_items_to_decoders["inputs_{}".format(x)] = tf.contrib.slim.tfexample_decoder.Image( - image_key="inputs_encoded_{}".format(x), - format_key="image/format", - shape=[210, 160, 3], - channels=3) + @property + def env_name(self): + # This is the name of the Gym environment for this problem. + raise NotImplementedError() - return data_fields, data_items_to_decoders + @property + def env(self): + if self._env is None: + self._env = gym.make(self.env_name) + return self._env - # def preprocess_example(self, example, mode, hparams): - # if not self._was_reversed: - # for x in range(self.history_size): - # input_name = "inputs_{}".format(x) - # example[input_name] = tf.image.per_image_standardization(example[input_name]) - # return example + @property + def num_channels(self): + return 3 @property def num_actions(self): - return 4 + raise NotImplementedError() @property def num_rewards(self): - return 2 + raise NotImplementedError() + + @property + def num_steps(self): + raise NotImplementedError() @property def num_shards(self): @@ -168,70 +108,35 @@ def get_action(self, observation=None): def hparams(self, defaults, unused_model_hparams): p = defaults - # hard coded +1 after "symbol" refers to the fact - # that 0 is a special symbol meaning padding - # when symbols are e.g. 
-    # when symbols are e.g. 0, 1, 2, 3 we
-    # shift them to 0, 1, 2, 3, 4
-    p.input_modality = {"action": ("symbol:identity", self.num_actions)}
-
-    for x in range(self.history_size):
-      p.input_modality["inputs_{}".format(x)] = ("image", 256)
-
-    p.target_modality = {"targets": ("image", 256),
-                         "reward": ("symbol", self.num_rewards+1),
-                         # "done": ("symbol", 2+1)
-                        }
-
+    p.input_modality = {"inputs": ("image", 256),
+                        "inputs_prev": ("image", 256),
+                        "reward": ("symbol", self.num_rewards),
+                        "action": ("symbol", self.num_actions)}
+    p.target_modality = ("image", 256)
     p.input_space_id = problem.SpaceID.IMAGE
     p.target_space_id = problem.SpaceID.IMAGE
 
-  def restore_networks(self, sess):
-    model_saver = tf.train.Saver(
-        tf.global_variables(".*network_parameters.*"))
-    if FLAGS.agent_policy_path:
-      model_saver.restore(sess, FLAGS.agent_policy_path)
 
   def generator(self, data_dir, tmp_dir):
-    self._setup()
-    clip_files = []
-    with tf.Session() as sess:
-      sess.run(tf.global_variables_initializer())
-      self.restore_networks(sess)
-
-      pieces_generated = 0
-      while pieces_generated<self.num_steps+self.warm_up:
-        if sess.run(self.avilable_data_size_op)>0:
-          observ, reward, action, done = sess.run(self.data_get_op)
-          self.history_buffer.append(observ)
-
-          if self.movies==True and pieces_generated>self.warm_up:
-            file_name = os.path.join(tmp_dir,'output_{}.png'.format(pieces_generated))
-            clip_files.append(file_name)
-            with open(file_name, 'wb') as f:
-              f.write(observ)
-
-          if len(self.history_buffer)==self.history_size+1:
-            pieces_generated += 1
-            ret_dict = {
-                "targets_encoded": [observ],
-                "image/format": ["png"],
-                "action": [int(action)],
-                # "done": [bool(done)],
-                "reward": [int(reward)],
-            }
-            for i, v in enumerate(list(self.history_buffer)[:-1]):
-              ret_dict["inputs_encoded_{}".format(i)] = [v]
-            if pieces_generated>self.warm_up:
-              yield ret_dict
-        else:
-          sess.run(self.collect_trigger_op)
-      if self.movies:
-        # print(clip_files)
-        clip = moviepy_editor().ImageSequenceClip(clip_files, fps=self.movies_fps)
-        clip.write_videofile(os.path.join(data_dir, 'output_{}.mp4'.format(self.name)),
-                             fps=self.movies_fps, codec='mpeg4')
-
+    self.env.reset()
+    action = self.get_action()
+    prev_observation, observation = None, None
+    for _ in range(self.num_steps):
+      prev_prev_observation = prev_observation
+      prev_observation = observation
+      observation, reward, done, _ = self.env.step(action)
+      action = self.get_action(observation)
+      if done:
+        self.env.reset()
+      def flatten(nparray):
+        flat1 = [x for sublist in nparray.tolist() for x in sublist]
+        return [x for sublist in flat1 for x in sublist]
+      if prev_prev_observation is not None:
+        yield {"inputs_prev": flatten(prev_prev_observation),
+               "inputs": flatten(prev_observation),
+               "action": [action],
+               "done": [done],
+               "reward": [int(reward)],
+               "targets": flatten(observation)}
 
   def generate_data(self, data_dir, tmp_dir, task_id=-1):
     train_paths = self.training_filepaths(
@@ -245,23 +150,93 @@ def generate_data(self, data_dir, tmp_dir, task_id=-1):
 
 @registry.register_problem
-class GymSimulatedDiscreteProblem(GymDiscreteProblem):
-  """Simulated gym environment with discrete actions and rewards."""
+class GymPongRandom5k(GymDiscreteProblem):
+  """Pong game, random actions."""
+
+  @property
+  def env_name(self):
+    return "PongDeterministic-v4"
+
+  @property
+  def num_actions(self):
+    return 4
+
+  @property
+  def num_rewards(self):
+    return 2
+
+  @property
+  def num_steps(self):
+    return 5000
+
+
+@registry.register_problem
+class GymPongTrajectoriesFromPolicy(GymDiscreteProblem):
+  """Pong game, loaded actions."""
 
   def __init__(self, *args, **kwargs):
-    super(GymSimulatedDiscreteProblem, self).__init__(*args, **kwargs)
-    #TODO: pull it outside
-    self.in_graph_wrappers = [(TimeLimitWrapper, {"timelimit": 150}),
-                              (MaxAndSkipWrapper, {"skip": 4})]
-    self.simulated_environment = True
-    self.movies_fps = 2
-
-  def restore_networks(self, sess):
-    super(GymSimulatedDiscreteProblem, self).restore_networks(sess)
-
-    #TODO: adjust regexp for different models
-    env_model_loader = tf.train.Saver(tf.global_variables(".*basic_conv_gen.*"))
-    sess = tf.get_default_session()
-
-    ckpts = tf.train.get_checkpoint_state(FLAGS.output_dir)
-    ckpt = ckpts.model_checkpoint_path
-    env_model_loader.restore(sess, ckpt)
+    super(GymPongTrajectoriesFromPolicy, self).__init__(*args, **kwargs)
+    self._env = None
+    self._last_policy_op = None
+    self._max_frame_pl = None
+    self._last_action = self.env.action_space.sample()
+    self._skip = 4
+    self._skip_step = 0
+    self._obs_buffer = np.zeros((2,) + self.env.observation_space.shape,
+                                dtype=np.uint8)
+
+  def generator(self, data_dir, tmp_dir):
+    env_spec = lambda: atari_wrappers.wrap_atari(  # pylint: disable=g-long-lambda
+        gym.make(self.env_name),
+        warp=False,
+        frame_skip=4,
+        frame_stack=False)
+    hparams = rl.atari_base()
+    with tf.variable_scope("train", reuse=tf.AUTO_REUSE):
+      policy_lambda = hparams.network
+      policy_factory = tf.make_template(
+          "network",
+          functools.partial(policy_lambda, env_spec().action_space, hparams))
+      self._max_frame_pl = tf.placeholder(
+          tf.float32, self.env.observation_space.shape)
+      actor_critic = policy_factory(tf.expand_dims(tf.expand_dims(
+          self._max_frame_pl, 0), 0))
+      policy = actor_critic.policy
+      self._last_policy_op = policy.mode()
+      with tf.Session() as sess:
+        model_saver = tf.train.Saver(
+            tf.global_variables(".*network_parameters.*"))
+        model_saver.restore(sess, FLAGS.model_path)
+        for item in super(GymPongTrajectoriesFromPolicy,
+                          self).generator(data_dir, tmp_dir):
+          yield item
+
+  # TODO(blazej0): For training of atari agents wrappers are usually used.
+  # Below we have a hacky solution which is a workaround to be used together
+  # with atari_wrappers.MaxAndSkipEnv.
+  def get_action(self, observation=None):
+    if self._skip_step == self._skip - 2: self._obs_buffer[0] = observation
+    if self._skip_step == self._skip - 1: self._obs_buffer[1] = observation
+    self._skip_step = (self._skip_step + 1) % self._skip
+    if self._skip_step == 0:
+      max_frame = self._obs_buffer.max(axis=0)
+      self._last_action = int(tf.get_default_session().run(
+          self._last_policy_op,
+          feed_dict={self._max_frame_pl: max_frame})[0, 0])
+    return self._last_action
+
+  @property
+  def env_name(self):
+    return "PongDeterministic-v4"
+
+  @property
+  def num_actions(self):
+    return 4
+
+  @property
+  def num_rewards(self):
+    return 2
+
+  @property
+  def num_steps(self):
+    return 5000
diff --git a/tensor2tensor/data_generators/problem_hparams.py b/tensor2tensor/data_generators/problem_hparams.py
index 262a0dc51..c69a78eb9 100644
--- a/tensor2tensor/data_generators/problem_hparams.py
+++ b/tensor2tensor/data_generators/problem_hparams.py
@@ -31,7 +31,7 @@
 import tensorflow as tf
 
-# TODO(rsepassi): Merge these problems with their data generators. Currently
+# TODO(rsepassi): Merge these problems with their data generators. Currenlty
 # they only implement the hparams.
diff --git a/tensor2tensor/data_generators/ptb.py b/tensor2tensor/data_generators/ptb.py
index 4ac3911b9..af455749d 100644
--- a/tensor2tensor/data_generators/ptb.py
+++ b/tensor2tensor/data_generators/ptb.py
@@ -58,7 +58,7 @@ def _build_vocab(filename, vocab_path, vocab_size):
   Args:
     filename: file to read list of words from.
     vocab_path: path where to save the vocabulary.
-    vocab_size: size of the vocabulary to generate.
+    vocab_size: size of the vocablulary to generate.
   """
   data = _read_words(filename)
   counter = collections.Counter(data)
diff --git a/tensor2tensor/data_generators/speech_recognition.py b/tensor2tensor/data_generators/speech_recognition.py
index 2777cd9cf..25cea7cc5 100644
--- a/tensor2tensor/data_generators/speech_recognition.py
+++ b/tensor2tensor/data_generators/speech_recognition.py
@@ -13,7 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-"""Common classes for automatic speech recognition (ASR) datasets.
+"""Common classes for automatic speech recogntion (ASR) datasets.
 
 The audio import uses sox to generate normalized waveforms, please install
 it as appropriate (e.g. using apt-get or yum).
@@ -85,7 +85,7 @@ def compute_mel_filterbank_features(
     sample_rate: sampling rate of the waveform
     dither: stddev of Gaussian noise added to waveform to prevent quantization
       artefacts
-    preemphasis: waveform high-pass filtering constant
+    preemphasis: waveform high-pass filtering costant
     frame_length: frame length in ms
     frame_step: frame_Step in ms
     fft_length: number of fft bins
@@ -182,7 +182,7 @@ def encode(self, s):
     """
     # Make sure that the data is a single channel, 16bit, 16kHz wave.
     # TODO(chorowski): the directory may not be writable, this should fallback
-    # to a temp path, and provide instructions for installing sox.
+    # to a temp path, and provide instructions for instaling sox.
     if not s.endswith(".wav"):
       out_filepath = s + ".wav"
       if not os.path.exists(out_filepath):
diff --git a/tensor2tensor/data_generators/text_encoder.py b/tensor2tensor/data_generators/text_encoder.py
index f80416fdd..a0059845a 100644
--- a/tensor2tensor/data_generators/text_encoder.py
+++ b/tensor2tensor/data_generators/text_encoder.py
@@ -348,7 +348,7 @@ def store_to_file(self, filename):
 def _escape_token(token, alphabet):
   """Escape away underscores and OOV characters and append '_'.
 
-  This allows the token to be expressed as the concatenation of a list
+  This allows the token to be experessed as the concatenation of a list
   of subtokens from the vocabulary. The underscore acts as a sentinel
   which allows us to invertibly concatenate multiple such lists.
diff --git a/tensor2tensor/data_generators/translate.py b/tensor2tensor/data_generators/translate.py
index 8d5cf808f..e0b9c6d3f 100644
--- a/tensor2tensor/data_generators/translate.py
+++ b/tensor2tensor/data_generators/translate.py
@@ -170,7 +170,7 @@ def is_generate_per_split(self):
 
   def get_or_create_vocab(self, data_dir, tmp_dir, force_get=False):
     """Get vocab for distill problems."""
-    # We assume that vocab file is present in data_dir directory where the
+    # We assume that voab file is present in data_dir, directory where the
     # data generated will be stored.
     vocab_filepath = os.path.join(data_dir, self.vocab_filename)
     encoder = text_encoder.SubwordTextEncoder(vocab_filepath)
diff --git a/tensor2tensor/data_generators/translate_enzh.py b/tensor2tensor/data_generators/translate_enzh.py
index 9e2f56e04..444fc9834 100644
--- a/tensor2tensor/data_generators/translate_enzh.py
+++ b/tensor2tensor/data_generators/translate_enzh.py
@@ -172,7 +172,7 @@ class TranslateEnzhWmt32k(translate.TranslateProblem):
 
   CWMT:
   - http://nlp.nju.edu.cn/cwmt-wmt/
-  - Website contains instructions for FTP server access.
+  - Website contrains instructions for FTP server access.
   - You'll need to download CASIA, CASICT, DATUM2015, DATUM2017,
       NEU datasets
diff --git a/tensor2tensor/data_generators/wiki.py b/tensor2tensor/data_generators/wiki.py
index 5222b5a62..80f1ed36d 100644
--- a/tensor2tensor/data_generators/wiki.py
+++ b/tensor2tensor/data_generators/wiki.py
@@ -177,7 +177,7 @@ def scramble(self, seq):
 
 @registry.register_problem
 class LanguagemodelWikiScrambleL128(LanguagemodelWikiScramble):
-  """Sequence length 128, 50% scrambled."""
+  """Sequence length 128, 50% scrambed."""
 
   @property
   def sequence_length(self):
@@ -190,7 +190,7 @@ def scramble_fraction(self):
 
 @registry.register_problem
 class LanguagemodelWikiScrambleL1k(LanguagemodelWikiScramble):
-  """Sequence length 1024, 50% scrambled."""
+  """Sequence length 1024, 50% scrambed."""
 
   @property
   def sequence_length(self):
@@ -209,7 +209,7 @@ class LanguagemodelWikiNorefV8kL1k(LanguagemodelWikiXmlV8kL1k):
 
   Special pages (non-articles) are dropped.
 
-  This more closely resembles plain text, though there are still some xml
+  This more closely resemples plain text, though there are still some xml
   elements, like tables.
 
   Each article is prefixed by a line containing the title and length in
@@ -228,7 +228,7 @@ def vocab_filename(self):
     return "vocab.wiki_noref.%d" % self.approx_vocab_size
 
   def filepath_to_unicode_strings(self, filepath):
-    """Overrides the base class to clean up the xml dump before tokenizing."""
+    """Overriddes the base class to clean up the xml dump before tokenizing."""
     dump = text_encoder.to_unicode_ignore_errors(tf.gfile.Open(filepath).read())
     pages = _dump_to_pages(dump)
    ret = u""
diff --git a/tensor2tensor/data_generators/wsj_parsing.py b/tensor2tensor/data_generators/wsj_parsing.py
index 867277de9..bef82eb1b 100644
--- a/tensor2tensor/data_generators/wsj_parsing.py
+++ b/tensor2tensor/data_generators/wsj_parsing.py
@@ -63,11 +63,11 @@ def token_generator(tree_path, source_token_vocab, target_token_vocab,
 
   This generator assumes the files at source_path and target_path have
   the same number of lines and yields dictionaries of "inputs" and "targets"
-  where inputs and targets are token ids from source and target lines
+  where inputs and targets are token ids from source and taret lines
   converted to integers using the token_map.
 
   Args:
-    tree_path: path to the file with WSJ format trees, one per line.
+    tree_path: path to the file with wsj format trees, one per line.
     source_token_vocab: GenericVocabulary object for source vocabulary.
     target_token_vocab: GenericVocabulary object for target vocabulary.
     eos: integer to append at the end of each sequence (default: None).
@@ -92,7 +92,7 @@ def parsing_token_generator(data_dir, tmp_dir, train, source_vocab_size,
   """Generator for parsing as a sequence-to-sequence task that uses tokens.
 
   This generator assumes the files parsing_{train,dev}.trees, which contain
-  trees in WSJ format.
+  trees in wsj format.
 
   Args:
     data_dir: path to the data directory.
diff --git a/tensor2tensor/layers/common_attention.py b/tensor2tensor/layers/common_attention.py
index ac8552191..0ccb72745 100644
--- a/tensor2tensor/layers/common_attention.py
+++ b/tensor2tensor/layers/common_attention.py
@@ -37,7 +37,7 @@
 from tensorflow.python.framework import function
 
-# Struct containing the sequences ids and order on a batch (are send to the
+# Struct conatining the sequences ids and order on a batch (are send to the
 # expert to allow them to compute the bias mask)
 BatchInfo = collections.namedtuple("BatchInfo", "coordinates, order")
 
@@ -57,9 +57,9 @@ def get_standardized_layers(hparams, dp=None, ps_devices=None):
 
   Args:
     hparams (tf.HParams): the model hparameters
-    dp (expert_utils.Parallelism): A data parallelism object. If not given,
+    dp (expert_utils.Parallelism): A data paralelism object. If not given,
       the dp calls are simply ignored.
-    ps_devices: a reference to model._ps_devices (only used by the MOE layer)
+    ps_devices: a reference to model._ps_devices (only used by the moe layer)
 
   Returns:
     dict[str:fct]: A dictionary containing the standardized functions
@@ -82,9 +82,9 @@ def register_layer(
       fct_in (fct): The function to register
       default_args (list): The default parameters to add to the function.
       default_kwargs (dict): The default parameters to add to the function.
-        Those arguments can be overwritten when calling the function.
-      use_dp (bool): Wrap the function call within a dataparallelism object if
-        dp is available. Some layers (like MOE) must be called without dp.
+        Those arguments can be overwriten when calling the function.
+      use_dp (bool): Wrap the function call within a dataparalellism object if
+        dp is available. Some layers (like moe) must be called without dp.
       recompute_grad (bool): If True, recompute the function during the
         backward pass to save memory
 
@@ -319,7 +319,7 @@ def add_standard_attention_hparams(hparams):
   hparams.add_hparam("attention_red_nonlinearity", "none")
 
   # Fully connected layers flags
-  # To be more consistent, should use filter_size to also control the MOE
+  # To be more concistent, should use filter_size to also controle the moe
   # size if moe_hidden_sizes not set
   hparams.add_hparam("filter_size", 2048)
   hparams.add_hparam("relu_dropout", 0.0)
@@ -400,7 +400,7 @@ def get_timing_signal_1d(length,
   memory inputs to attention.
 
   The use of relative position is possible because sin(x+y) and cos(x+y) can be
-  expressed in terms of y, sin(x) and cos(x).
+  experessed in terms of y, sin(x) and cos(x).
 
   In particular, we use a geometric sequence of timescales starting with
   min_timescale and ending with max_timescale. The number of different
@@ -445,7 +445,7 @@ def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4):
   memory inputs to attention.
 
   The use of relative position is possible because sin(x+y) and cos(x+y) can be
-  expressed in terms of y, sin(x) and cos(x).
+  experessed in terms of y, sin(x) and cos(x).
 
   In particular, we use a geometric sequence of timescales starting with
   min_timescale and ending with max_timescale. The number of different
@@ -512,7 +512,7 @@ def add_timing_signal_nd(x, min_timescale=1.0, max_timescale=1.0e4):
   memory inputs to attention.
 
   The use of relative position is possible because sin(a+b) and cos(a+b) can be
-  expressed in terms of b, sin(a) and cos(a).
+  experessed in terms of b, sin(a) and cos(a).
 
   x is a Tensor with n "positional" dimensions, e.g.
   one dimension for a sequence or two dimensions for an image
@@ -862,7 +862,7 @@ def to_float(bc):
   bc_v = tf.expand_dims(to_float(batch_coordinates_q), 1)
   bc_h = tf.expand_dims(to_float(batch_coordinates_k), 0)
   bias_batch = bc_h - bc_v  # Broadcast to create [length_q, length_k] mask
-  # Threshold non zeros to 1.0
+  # Theshold non zeros to 1.0
   bias_batch = condition_fn(bias_batch)
   bias_batch *= -1e9  # Set non zeros to -infinity
   return bias_batch
@@ -877,7 +877,7 @@ def to_float(bc):
 # Mask similar to upper triangular mask, but allow dispatching
 attention_bias_future = functools.partial(
     attention_bias_batch,
-    # Elems can attend to themselves (otherwise would use bias_batch + 1.0)
+    # Elems can attend to themself (otherwise would use bias_batch + 1.0)
     # No tf.abs to consider the order
     # tf.maximum and tf.minimum to threshold the values
     condition_fn=lambda bias: tf.maximum(0.0, tf.minimum(1.0, bias)),
@@ -1060,7 +1060,7 @@ def grouped_attention_multihead(query_antecedent,
   memory_target_density indicates the average how many groups in which
   a key-value pair should participate.
 
-  We use auxiliary losses to ensure that each group contains roughly
+  We use auxialiary losses to ensure that each group contains roughly
   the same number of queries and the same number of key-value pairs.
   If for a given sequence, the actual number of queries/pairs sent
   to an expert exceeds this target by a factor of more than
@@ -1316,7 +1316,7 @@ def dot_product_attention(q,
     name: an optional string
     make_image_summary: True if you want an image summary.
     save_weights_to: an optional dictionary to capture attention weights
-      for visualization; the weights tensor will be appended there under
+      for vizualization; the weights tensor will be appended there under
      a string key created from the variable scope (including name).
     dropout_broadcast_dims: an optional list of integers less than 4
       specifying in which dimensions to broadcast the dropout decisions.
@@ -1378,7 +1378,7 @@ def _relative_attention_inner(x, y, z, transpose):
     x: Tensor with shape [batch_size, heads, length, length or depth].
     y: Tensor with shape [batch_size, heads, length, depth].
     z: Tensor with shape [length, length, depth].
-    transpose: Whether to transpose inner matrices of y and z. Should be true if
+    transpose: Whether to tranpose inner matrices of y and z. Should be true if
       last dimension of x is depth, not length.
 
   Returns:
@@ -1422,7 +1422,7 @@ def dot_product_attention_relative(q,
     k: a Tensor with shape [batch, heads, length, depth].
     v: a Tensor with shape [batch, heads, length, depth].
     bias: bias Tensor.
-    max_relative_position: an integer specifying the maximum distance between
+    max_relative_position: an integer specifying the maxmimum distance between
      inputs that unique position embeddings should be learned for.
     dropout_rate: a floating point number.
     image_shapes: optional tuple of integer scalars.
@@ -2141,7 +2141,7 @@ def gather_indices_2d(x, block_shape, block_stride):
 
 def make_2d_block_raster_mask(query_shape, memory_flange):
-  """creates a mask for 2d block raster scan.
+  """creates a mask for 2d block raster scany.
 
   The query mask can look to the left, top left, top, and top right, but
   not to the right. Inside the query, we have the standard raster scan
@@ -2476,7 +2476,7 @@ def multihead_attention(query_antecedent,
     kv_padding: One of "VALID", "SAME" or "LEFT". Default is "VALID": no
       padding.
     cache: dict containing Tensors which are the results of previous
-      attentions, used for fast decoding. Expects the dict to contain two
Expects the dict to contain two + attentions, used for fast decoding. Expects the dict to contrain two keys ('k' and 'v'), for the initial call the values for these keys should be empty Tensors of the appropriate shape. 'k' [batch_size, 0, key_channels] @@ -2487,7 +2487,7 @@ def multihead_attention(query_antecedent, at. name: an optional string. save_weights_to: an optional dictionary to capture attention weights - for visualization; the weights tensor will be appended there under + for vizualization; the weights tensor will be appended there under a string key created from the variable scope (including name). make_image_summary: Whether to make an attention image summary. dropout_broadcast_dims: an optional list of integers less than 4 @@ -2509,7 +2509,7 @@ def multihead_attention(query_antecedent, [batch_size, length_q, hidden_dim] unless the cache dict is provided in which case only the last memory position is calculated and the output shape is [batch_size, 1, hidden_dim] - Optionally returns an additional loss parameters (ex: load balance loss for + Optionaly returns an additional loss parameters (ex: load balance loss for the experts) returned by the attention_type function. Raises: @@ -2661,7 +2661,7 @@ def ffn_self_attention_layer(x, We use self-attention to do feedforward computations. We apply this function positionwise where for each position, we linearly transform the output to have depth filter_depth, and break up the result depth-wise into num_parts - contiguous parts. The parts self-attend, we concatenate the results + contiguous parts. The parts self-attentd, we concatenate the results depth-wise, and we linearly transform to a depth of output_depth. The goal is to get multiplicative interactions between components of a representation. @@ -2764,7 +2764,7 @@ def parameter_attention(x, x, total_key_depth, use_bias=False, name="q_transform") if dropout_rate: # This is a cheaper form of attention dropout where we use to use - # the same dropout decisions across batch elements and query positions, + # the same dropout decisions across batch elemets and query positions, # but different decisions across heads and memory positions. v = tf.nn.dropout( v, 1.0 - dropout_rate, noise_shape=[num_heads, memory_rows, 1]) @@ -2787,7 +2787,7 @@ def parameter_attention(x, @expert_utils.add_name_scope() def coordinate_tensor(shape, axis): - """Return a tensor with given shape containing coordinate along given axis. + """Return a tensor with given shape containing coordinte along given axis. Args: shape: a Tensor representing the shape of the output Tensor @@ -2879,7 +2879,7 @@ def add_or_set_if(prev_bias, new_bias, condition): def mask_and_call_attention(x): """Function applied once for each sequence of the batch.""" - # Mask to prevent sequences of attending to the future + # Mask to prevent sequences of attenting to the future length = common_layers.shape_list(x)[1] # x has shape [1, length,...] bias_past = tf.reshape( attention_bias_lower_triangle(length), [length, length]) @@ -2972,7 +2972,7 @@ def expert_dot_product(q, k, v, info_q, info_k): """Perform dot product on a subset of the sequence. Can add a mask to the attention to prevent sequences to attend to each other - and to prevent attention to the future. + and to prevent attention to the futur. 
 
   Args:
     q (tf.Tensor): Queries of shape [length_expert_q, depth_k]
@@ -3201,7 +3201,7 @@ def flatten_batch(x):
   gates_q = tf.stack(list_gates_q)
   gates_k = tf.stack(list_gates_k)
 
-  # Process each head separately
+  # Process each head separatly
   v_out = map_fn_switch(
       lambda args: dot_product_single_head(bi=bi, *args),
       elems=(q, k, v, gates_q, gates_k),
@@ -3436,7 +3436,7 @@ def conv_elems_1d(x, factor, out_depth=None):
   Merge/restore/compress factors positions of dim depth of the input into a
   single position of dim out_depth.
 
-  This is basically just a strided convolution without overlap
+  This is basically just a strided convolution without overlapp
   between each strides. The original length has to be divided by factor.
 
@@ -3481,7 +3481,7 @@ def local_reduction_attention(x, block_length, multihead_params):
   def dot_product_self_local_attention_flattened(q, k, v):
     """Strided block local self-attention.
 
-    No overlap between the blocks.
+    No overlapp between the blocks.
 
     Args:
       q (tf.Tensor): shape [batch, heads, length, depth_k]
@@ -3562,7 +3562,7 @@ def multihead_self_attention_reduced(
 
   Args:
     x (tf.Tensor): float32 of shape [batch, length, depth]
-    memory_antecedent (tf.Tensor): Unsupported for now
+    memory_antecedent (tf.Tensor): Unsuported for now
     bias (tf.Tensor): Ignored
     factor (int): compression factor for the memory sequence
     multihead_params (dict): parameters for multihead attention
@@ -3584,7 +3584,7 @@ def multihead_self_attention_reduced(
 
   depth = x.get_shape().as_list()[-1]
 
-  # Could try to have some overlap between the blocks but that would
+  # Could try to have some overlapp between the blocks but that would
   # create conv artifacts, would make it difficult to not attend to the future
   # within one group and the padding should be handled specially.
 
diff --git a/tensor2tensor/layers/common_attention_test.py b/tensor2tensor/layers/common_attention_test.py
index 61ff5a6d5..bb84a515d 100644
--- a/tensor2tensor/layers/common_attention_test.py
+++ b/tensor2tensor/layers/common_attention_test.py
@@ -403,7 +403,7 @@ def testDotProductAttentionRelative(self):
     self.assertEqual(res.shape, (5, 7, 12, 32))
 
   def testBiasBatchCoordinates(self):
-    """Testing the batch coordinates mask."""
+    """Testing the batch cooridnates mask."""
     q = tf.constant([0, 0, 1, 1, 1, 1, 2, 2, 2], dtype=tf.int32)
     q = tf.expand_dims(q, axis=-1)
 
diff --git a/tensor2tensor/layers/common_hparams.py b/tensor2tensor/layers/common_hparams.py
index 3b2d5f802..ea0e93fbd 100644
--- a/tensor2tensor/layers/common_hparams.py
+++ b/tensor2tensor/layers/common_hparams.py
@@ -102,13 +102,13 @@ def basic_params1():
       moe_loss_coef=1e-2,
       # Sequences of operations to perform on layer input and layer output.
       # Used by common_layers.layer_preprocess, common_layers.layer_postprocess
-      # Each character represents an operation:
+      # Each character repsesnts an operation:
       #   none: no preprocessing
       #   d: apply dropout
       #   n: apply normalization (see norm_type and norm_epsilon)
       #   a: add layer input (residual connection - only during postprocess)
       # The special string "none" is used instead of the empty string
-      # to indicate no pre/postprocessing, since the empty string causes
+      # to indicate no pre/postprocesisng, since the empty string causes
       # trouble for hyperparameter tuning.
       # TODO(noam): The current settings ("", "dan") are the published version
      # of the transformer. ("n", "da") seems better for harder-to-learn
@@ -174,13 +174,13 @@ def basic_params1():
       # The maximum length of "input" sequence.
       # Sequences longer than this value will be truncated. 0 or negative values
       # mean there is no maximum or truncation.
-      # You can change this behavior by overriding preprocess_example() method
+      # You can change this behavior by overridding preprocess_example() method
       # in your problem class.
       max_input_seq_length=0,
       # The maximum length of "target" sequence.
       # Sequences longer than this value will be truncated. 0 or negative values
       # mean there is no maximum or truncation.
-      # You can change this behavior by overriding preprocess_example() method
+      # You can change this behavior by overridding preprocess_example() method
       # in your problem class.
       max_target_seq_length=0,
       # if nonzero, we split the target sequences on example read.
@@ -220,7 +220,7 @@ def basic_params1():
       scheduled_sampling_warmup_steps=50000,
       scheduled_sampling_gold_mixin_prob=0.5,
       # This setting controls whether to copy variables around in a daisy chain
-      # (if true) or leave their placement to TensorFlow. It only affects multi
+      # (if true) or leave their placement to Tensorflow. It only affects multi
       # device training and mostly should be turned on for performance. One
       # exception are recurrent models: with dynamic loops it must be off.
       daisy_chain_variables=True,
diff --git a/tensor2tensor/layers/common_image_attention.py b/tensor2tensor/layers/common_image_attention.py
index f1849c050..23730c0d6 100644
--- a/tensor2tensor/layers/common_image_attention.py
+++ b/tensor2tensor/layers/common_image_attention.py
@@ -541,8 +541,8 @@ def prepare_decoder(targets, hparams):
     assert hparams.img_len*channels % hparams.query_shape[1] == 0
     assert hparams.img_len % hparams.query_shape[0] == 0
     total_block_width = hparams.img_len*channels
-    # Decoding is in block raster scan order. We divide the image into
-    # hparams.query_shape blocks and then decode each block in raster scan.
+    # Decoding is in block rastor scan order. We divide the image into
+    # hparams.query_shape blocks and then decode each block in rastor scan.
     # To make that compatible with our inference pipeline, pad the target so
     # that rows is a multiple of query_shape and columns is a multiple of
     # hparams.img_len*channels
diff --git a/tensor2tensor/layers/common_layers.py b/tensor2tensor/layers/common_layers.py
index 10488c961..5dc088234 100644
--- a/tensor2tensor/layers/common_layers.py
+++ b/tensor2tensor/layers/common_layers.py
@@ -1201,7 +1201,7 @@ def add_timing_signal(x, min_timescale=1, max_timescale=1e4, num_timescales=16):
   and the target of the attention.
 
   The use of relative position is possible because sin(x+y) and cos(x+y) can be
-  expressed in terms of y, sin(x) and cos(x).
+  experessed in terms of y, sin(x) and cos(x).
 
   In particular, we use a geometric sequence of timescales starting with
   min_timescale and ending with max_timescale. For each timescale, we
@@ -1698,7 +1698,7 @@ def padded_cross_entropy(logits,
     label_smoothing: a floating point `Scalar`.
     weights_fn: A function from labels to weights.
     reduce_sum: a Boolean, whether to sum at the end or not.
-    gaussian: If true, use a Gaussian distribution for label smoothing
+    gaussian: If true, use a gaussian distribution for label smoothing
 
   Returns:
     loss_numerator: a `Scalar`. Sum of losses.
@@ -1747,9 +1747,9 @@ def smoothing_cross_entropy(logits,
     labels: Tensor of size [batch_size, ?, ?, ?]
     vocab_size: Tensor representing the size of the vocabulary.
     confidence: Used to determine on and off values for label smoothing.
-      If `gaussian` is true, `confidence` is the variance to the Gaussian
+      If `gaussian` is true, `confidence` is the variance to the gaussian
       distribution.
-    gaussian: Uses a Gaussian distribution for label smoothing
+    gaussian: Uses a gaussian distribution for label smoothing
 
   Returns:
 
@@ -1935,7 +1935,7 @@ def ravanbakhsh_set_layer(layer_size,
                           name=None):
   """Layer from Deep Sets paper: https://arxiv.org/abs/1611.04500 .
 
-  More parameter-efficient version of a linear-set-layer with context.
+  More parameter-efficient verstion of a linear-set-layer with context.
 
   Args:
     layer_size: Dimension to transform the input vectors to.
@@ -2661,7 +2661,7 @@ def grad_fn(inputs, variables, outputs, output_grads):
       grad_vars = [tf.cast(grad_var, tf.bfloat16) for grad_var in grad_vars]
     if is_on_tpu():
       # TODO(noam): remove this hack once XLA does the right thing.
-      # Force the gradients on the inputs to be computed before the variables
+      # Force the gradinets on the inputs to be computed before the variables
       # are updated. This saves memory by preventing XLA from making an extra
       # copy of the variables.
       grad_vars = force_dependency(grad_vars, grad_inputs)
@@ -2703,7 +2703,7 @@ def dense(x, units, **kwargs):
   fn = lambda x: tf.layers.dense(x, units, **kwargs)
   if is_on_tpu():
     # TODO(noam): remove this hack once XLA does the right thing.
-    # Forces the gradients on the inputs to be computed before the variables
+    # Forces the gradinets on the inputs to be computed before the variables
     # are updated. This saves memory by preventing XLA from making an extra
     # copy of the variables.
     return _recompute_grad(fn, [x])
diff --git a/tensor2tensor/layers/discretization.py b/tensor2tensor/layers/discretization.py
index 52f236f98..c1596d89d 100644
--- a/tensor2tensor/layers/discretization.py
+++ b/tensor2tensor/layers/discretization.py
@@ -591,7 +591,7 @@ def discrete_bottleneck(x,
         x_means += x_means_res
         x_means_hot.append(x_means_hot_res)
 
-      # Get the discrete latent representation
+      # Get the discrete latent represenation
       x_means_hot = tf.stack(x_means_hot, axis=1)
       x_means_idx = tf.argmax(x_means_hot, axis=-1)
 
diff --git a/tensor2tensor/layers/modalities.py b/tensor2tensor/layers/modalities.py
index 1d735d9c5..992ea5b95 100644
--- a/tensor2tensor/layers/modalities.py
+++ b/tensor2tensor/layers/modalities.py
@@ -100,6 +100,7 @@ def bottom_simple(self, x, name, reuse):
       x = tf.squeeze(x, axis=3)
     while len(x.get_shape()) < 3:
       x = tf.expand_dims(x, axis=-1)
+    var = self._get_weights()
     x = common_layers.dropout_no_scaling(
         x, 1.0 - self._model_hparams.symbol_dropout)
diff --git a/tensor2tensor/models/research/attention_lm.py b/tensor2tensor/models/research/attention_lm.py
index cbc45c4e7..30277d6f3 100644
--- a/tensor2tensor/models/research/attention_lm.py
+++ b/tensor2tensor/models/research/attention_lm.py
@@ -70,7 +70,7 @@ def attention_lm_prepare_decoder(targets, hparams):
   Returns:
     decoder_input: a Tensor, bottom of decoder stack
     decoder_self_attention_bias: a Tensor, containing large negative values
-    to implement masked attention and possibly biases for diagonal alignments
+    to implement masked attention and possibly baises for diagonal alignments
   """
   if hparams.prepend_mode == "prepend_inputs_full_attention":
     decoder_self_attention_bias = (
diff --git a/tensor2tensor/models/research/attention_lm_moe.py b/tensor2tensor/models/research/attention_lm_moe.py
index 49ca3d20f..ea65496cb 100644
--- a/tensor2tensor/models/research/attention_lm_moe.py
+++ b/tensor2tensor/models/research/attention_lm_moe.py
@@ -163,7 +163,7 @@ def _diet_expert(x):
   def print_shape(x, suffix, debug=False):
     # To help debugging, print the input/output shapes at inference and eval
     # Inference for long sequences can take a long time, so that's help to
-    # see the progression of the generation
+    # see the progession of the generation
     if not debug and hparams.mode == ModeKeys.TRAIN:
       return x
     return tf.Print(x, [tf.shape(x)], "shape_x_{}".format(suffix))
@@ -368,7 +368,7 @@ def attention_lm_moe_prepare_decoder(targets, hparams):
   Returns:
     decoder_input: a Tensor, bottom of decoder stack
     decoder_self_attention_bias: a Tensor, containing large negative values
-    to implement masked attention and possibly biases for diagonal alignments
+    to implement masked attention and possibly baises for diagonal alignments
     pad_remover (expert_utils.PadRemover): an util object to remove padding
   """
   targets_pad_mask = common_attention.embedding_to_padding(targets)
diff --git a/tensor2tensor/models/research/basic_conv_gen.py b/tensor2tensor/models/research/basic_conv_gen.py
index b3260e2d3..144042896 100644
--- a/tensor2tensor/models/research/basic_conv_gen.py
+++ b/tensor2tensor/models/research/basic_conv_gen.py
@@ -41,7 +41,7 @@ def body(self, features):
     cur_frame = tf.to_float(features["inputs"])
     prev_frame = tf.to_float(features["inputs_prev"])
     x = tf.concat([cur_frame, prev_frame], axis=-1)
-    for _ in range(hparams.num_compress_steps):
+    for _ in xrange(hparams.num_compress_steps):
       x = tf.layers.conv2d(x, filters, kernel2, activation=common_layers.belu,
                            strides=(2, 2), padding="SAME")
       x = common_layers.layer_norm(x)
@@ -52,7 +52,7 @@ def body(self, features):
     x = tf.concat([x, action + zeros], axis=-1)
 
     # Run a stack of convolutions.
-    for i in range(hparams.num_hidden_layers):
+    for i in xrange(hparams.num_hidden_layers):
       with tf.variable_scope("layer%d" % i):
         y = tf.layers.conv2d(x, filters, kernel1, activation=common_layers.belu,
                              strides=(1, 1), padding="SAME")
         if i == 0:
           x = y
         else:
           x = common_layers.layer_norm(x + y)
     # Up-convolve.
-    for _ in range(hparams.num_compress_steps):
+    for _ in xrange(hparams.num_compress_steps):
       filters //= 2
       x = tf.layers.conv2d_transpose(
           x, filters, kernel2, activation=common_layers.belu,
@@ -102,179 +102,5 @@ def basic_conv():
 def basic_conv_small():
   """Small conv model."""
   hparams = common_hparams.basic_params1()
-  hparams.kernel_sizes = [(3,3), (5,5)]
-  hparams.filter_numbers = [32, 3*256]
-  hparams.batch_size = 2
-  hparams.add_hparam("per_image_standardization", False)
   hparams.hidden_size = 32
   return hparams
-
-@registry.register_hparams
-def basic_conv_small_per_image_standardization():
-  """Small conv model."""
-  hparams = common_hparams.basic_params1()
-  hparams.kernel_sizes = [(3,3), (5,5)]
-  hparams.filter_numbers = [32, 3*256]
-  hparams.batch_size = 2
-  hparams.add_hparam("per_image_standardization", True)
-
-  return hparams
-
-
-@registry.register_hparams
-def basic_conv_small_small_lr():
-  """Small conv model."""
-  hparams = common_hparams.basic_params1()
-  hparams.batch_size = 2
-
-  hparams.learning_rate = 0.0001
-  return hparams
-
-
-@registry.register_model
-class StaticBasicConvGen(t2t_model.T2TModel):
-
-  def body(self, features):
-    filters = self.hparams.hidden_size
-    cur_frame = features["inputs_0"]
-    prev_frame = features["inputs_1"]
-    if self.hparams.per_image_standardization:
-      cur_frame = tf.map_fn(lambda frame: tf.image.per_image_standardization(frame), cur_frame)
-      prev_frame = tf.map_fn(lambda frame: tf.image.per_image_standardization(frame), prev_frame)
-
-    action = common_layers.embedding(tf.to_int64(features["action"]),
-                                     10, filters)
-    action = tf.reshape(action, [-1, 1, 1, filters])
-
-    frames = tf.concat([cur_frame, prev_frame], axis=3)
-    h1 = tf.layers.conv2d(frames, filters, kernel_size=(3, 3), padding="SAME")
-    h2 = tf.layers.conv2d(tf.nn.relu(h1 + action), filters,
-                          kernel_size=(5, 5), padding="SAME")
-    res = tf.layers.conv2d(tf.nn.relu(h2 + action), 3 * 256,
-                           kernel_size=(3, 3), padding="SAME")
-    reward_pred_h1 = tf.reduce_mean(res, axis=[1, 2])
-    reward_pred = tf.layers.dense(reward_pred_h1, 2, name="reward")
-    # reward_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
-    #     labels=tf.to_int32(features["reward"]), logits=reward_pred)
-    # reward_loss = tf.reduce_mean(reward_loss)
-    x = tf.layers.flatten(h2)
-    # l = tf.shape(res)[1]
-    # w = tf.shape(res)[2]
-    l = 210
-    w = 160
-    res = tf.reshape(res, [-1, l, w, 768])
-    return {"targets": res, "reward": x}
-
-@registry.register_model
-class ResidualBasicConvGen(t2t_model.T2TModel):
-
-  def body(self, features):
-    filters = 38
-    num_hidden_layers = self.hparams.num_hidden_layers
-    #TODO: possibly make embeding of inputs_0 and inputs_1
-    cur_frame = features["inputs_0"]
-    prev_frame = features["inputs_1"]
-
-    if self.hparams.per_image_standardization:
-      cur_frame = tf.map_fn(lambda frame: tf.image.per_image_standardization(frame), cur_frame)
-      prev_frame = tf.map_fn(lambda frame: tf.image.per_image_standardization(frame), prev_frame)
-
-    # prev_frame = tf.Print(prev_frame, [prev_frame], "prev frame = ", summarize=200)
-    action_embedding_size = 32
-    action_space_size = 10
-    kernel = (3, 3)
-    # Gather all inputs.
- action = common_layers.embedding(tf.to_int64(features["action"]), - action_space_size, action_embedding_size) - action = tf.reshape(action, [-1, 1, 1, action_embedding_size]) - #broadcast to the shape compatibile with pictures - action += tf.expand_dims(tf.zeros_like(cur_frame[..., 0]), -1) - frames = tf.concat([cur_frame, prev_frame, action], axis=3) - # x = tf.layers.conv2d(frames, filters, kernel, activation=tf.nn.relu, - # strides=(2, 2), padding="SAME") - # Run a stack of convolutions. - x = frames - for _ in range(num_hidden_layers): - y = tf.layers.conv2d(x, filters, kernel, activation=tf.nn.relu, - strides=(1, 1), padding="SAME") - x = common_layers.layer_norm(x + y) - # Up-convolve. - # x = tf.layers.conv2d_transpose( - # frames, filters, kernel, activation=tf.nn.relu, - # strides=(1, 1), padding="SAME") - # Output size is 3 * 256 for 3-channel color space. - res = tf.layers.conv2d(x, 3 * 256, kernel, padding="SAME") - x = tf.layers.flatten(x) - - # TODO: pm->pm: add done - res_done = tf.layers.dense(x, 2) - - return {"targets":res, "reward": x} - - -@registry.register_model -class MichiganBasicConvGen(t2t_model.T2TModel): - - def body(self, features): - from tensor2tensor.layers.common_layers import shape_list - def standardize_images(x): - """Image standardization on batches (tf.image.per_image_standardization).""" - with tf.name_scope("standardize_images", [x]): - x = tf.to_float(x) - x_mean = tf.reduce_mean(x, axis=[1, 2, 3], keep_dims=True) - x_variance = tf.reduce_mean( - tf.square(x - x_mean), axis=[1, 2, 3], keep_dims=True) - x_shape = shape_list(x) - num_pixels = tf.to_float(x_shape[1] * x_shape[2] * 3) - x = (x - x_mean) / tf.maximum(tf.sqrt(x_variance), tf.rsqrt(num_pixels)) - # TODO(lukaszkaiser): remove hack below, needed for greedy decoding for now. - if x.shape and len(x.shape) == 4 and x.shape[3] == 1: - x = tf.concat([x, x, x], axis=3) # Not used, just a dead tf.cond branch. 
- x.set_shape([None, None, None, 3]) - return x - - def deconv2d(cur, i, kernel_size, output_filters, activation=tf.nn.relu): - from tensor2tensor.layers.common_layers import conv - thicker = conv( - cur, - output_filters * 4, kernel_size, - padding="SAME", - activation=activation, - name="deconv2d" + str(i)) - return tf.depth_to_space(thicker, 2) - - # - # cur_frame = features["inputs_0"] - # prev_frame = features["inputs_1"] - - cur_frame = standardize_images(features["inputs_0"]) - prev_frame = standardize_images(features["inputs_1"]) - # action = common_layers.embedding(tf.to_int64(features["action"]), - # 10, filters) - # action = tf.reshape(action, [-1, 1, 1, filters]) - - frames = tf.concat([cur_frame, prev_frame], axis=3) - frames = tf.reshape(frames, [-1, 210, 160, 6]) - # frames = tf.Print(frames, [tf.shape(frames)], "frames shape=") - - h1 = tf.layers.conv2d(frames, filters=64, strides=2, kernel_size=(8, 8), padding="SAME", activation=tf.nn.relu) - h2 = tf.layers.conv2d(h1, filters=128, strides=2, kernel_size=(6, 6), padding="SAME", activation=tf.nn.relu) - h3 = tf.layers.conv2d(h2, filters=128, strides=2, kernel_size=(6, 6), padding="SAME", activation=tf.nn.relu) - h4 = tf.layers.conv2d(h3, filters=128, strides=2, kernel_size=(4, 4), padding="SAME", activation=tf.nn.relu) - h45 = tf.reshape(h4, [-1, 14 * 10 * 128]) - h5 = tf.layers.dense(h45, 2048, activation=tf.nn.relu) - h6 = tf.layers.dense(h5, 2048, activation=tf.nn.relu) - h7 = tf.layers.dense(h6, 14 * 10 * 128, activation=tf.nn.relu) - h8 = tf.reshape(h7, [-1, 14, 10, 128]) - - h9 = deconv2d(h8, 1, (4, 4), 128, activation=tf.nn.relu) - h9 = h9[:, :27, :, :] - h10 = deconv2d(h9, 2, (6, 6), 128, activation=tf.nn.relu) - h10 = h10[:, :53, :, :] - h11 = deconv2d(h10, 3, (6, 6), 128, activation=tf.nn.relu) - h11 = h11[:, :105, :, :] - h12 = deconv2d(h11, 4, (8, 8), 3 * 256, activation=tf.identity) - - reward = tf.layers.flatten(h12) - - return {"targets": h12, "reward": reward} diff --git a/tensor2tensor/models/research/rl.py b/tensor2tensor/models/research/rl.py index 2c5181d95..858d6964e 100644 --- a/tensor2tensor/models/research/rl.py +++ b/tensor2tensor/models/research/rl.py @@ -50,9 +50,6 @@ def ppo_base_v1(): hparams.add_hparam("num_eval_agents", 3) hparams.add_hparam("video_during_eval", True) hparams.add_hparam("save_models_every_epochs", 30) - hparams.add_hparam("optimization_batch_size", 50) - hparams.add_hparam("max_gradients_norm", 0.5) - hparams.add_hparam("simulated_environment", False) return hparams diff --git a/tensor2tensor/models/research/transformer_moe.py b/tensor2tensor/models/research/transformer_moe.py index 02a51dc08..57d82edf9 100644 --- a/tensor2tensor/models/research/transformer_moe.py +++ b/tensor2tensor/models/research/transformer_moe.py @@ -46,7 +46,7 @@ # "a/a/a#": Encoder only model (3 layers) # "#a/a/a": Decoder only model (3 layers) # "a/a-moe#a/a/a": Encoder (2 layers with 1 moe), decoder (3 layers) -# Note that all combinations are not necessarily possibles (some attention +# Note that not all combinations are necessarily possible (some attention # types are not necessarily compatible with the encoder, or can't accept certain # types of masking) diff --git a/tensor2tensor/models/resnet.py b/tensor2tensor/models/resnet.py index 39f3dd723..30275e7ca 100644 --- a/tensor2tensor/models/resnet.py +++ b/tensor2tensor/models/resnet.py @@ -223,7 +223,7 @@ def bottleneck_block(inputs, The output `Tensor` of the block.
""" # TODO(chrisying): this block is technically the post-activation resnet-v1 - # bottleneck unit. Test with v2 (pre-activation) and replace if there is no + # bottlneck unit. Test with v2 (pre-activation) and replace if there is no # difference for consistency. shortcut = inputs if projection_shortcut is not None: diff --git a/tensor2tensor/models/slicenet.py b/tensor2tensor/models/slicenet.py index 0410ff7d1..fe5dab52d 100644 --- a/tensor2tensor/models/slicenet.py +++ b/tensor2tensor/models/slicenet.py @@ -282,7 +282,7 @@ class SliceNet(t2t_model.T2TModel): def body(self, features): target_modality_name = ( self._problem_hparams.target_modality.name) - # If we're just predicting a class, there is no use for a decoder. + # If we're just predicing a class, there is no use for a decoder. run_decoder = "class_label_modality" not in target_modality_name return slicenet_internal( features["inputs"], diff --git a/tensor2tensor/models/transformer.py b/tensor2tensor/models/transformer.py index 488fbcbee..6e2220258 100644 --- a/tensor2tensor/models/transformer.py +++ b/tensor2tensor/models/transformer.py @@ -51,7 +51,7 @@ class Transformer(t2t_model.T2TModel): def __init__(self, *args, **kwargs): super(Transformer, self).__init__(*args, **kwargs) - self.attention_weights = dict() # For visualizing attention heads. + self.attention_weights = dict() # For vizualizing attention heads. def encode(self, inputs, target_space, hparams, features=None): """Encode transformer inputs. @@ -60,7 +60,7 @@ def encode(self, inputs, target_space, hparams, features=None): inputs: Transformer inputs [batch_size, input_length, input_height, hidden_dim] which will be flattened along the two spatial dimensions. target_space: scalar, target space ID. - hparams: hyperparameters for model. + hparams: hyperparmeters for model. features: optionally pass the entire features dictionary as well. This is needed now for "packed" datasets. @@ -69,7 +69,7 @@ def encode(self, inputs, target_space, hparams, features=None): encoder_output: Encoder representation. [batch_size, input_length, hidden_dim] encoder_decoder_attention_bias: Bias and mask weights for - encoder-decoder attention. [batch_size, input_length] + encodre-decoder attention. [batch_size, input_length] """ inputs = common_layers.flatten4d3d(inputs) @@ -106,7 +106,7 @@ def decode(self, encoder-decoder attention. [batch_size, input_length] decoder_self_attention_bias: Bias and mask weights for decoder self-attention. [batch_size, decoder_length] - hparams: hyperparameters for model. + hparams: hyperparmeters for model. cache: dict, containing tensors which are the results of previous attentions, used for fast decoding. nonpadding: optional Tensor with shape [batch_size, decoder_length] @@ -142,7 +142,7 @@ def body(self, features): Args: features: Map of features to the model. Should contain the following: "inputs": Transformer inputs [batch_size, input_length, hidden_dim] - "targets": Target decoder outputs. + "tragets": Target decoder outputs. [batch_size, decoder_length, hidden_dim] "target_space_id" @@ -214,7 +214,7 @@ def _beam_decode(self, features, decode_length, beam_size, top_beams, alpha): beam_size: number of beams. top_beams: an integer. How many of the beams to return. alpha: Float that controls the length penalty. larger the alpha, stronger - the preference for longer translations. + the preference for slonger translations. Returns: A dict of decoding results { @@ -253,7 +253,7 @@ def _fast_decode(self, beam_size: number of beams. top_beams: an integer. 
How many of the beams to return. alpha: Float that controls the length penalty. The larger the alpha, the stronger - the preference for longer translations. + the preference for longer translations. Returns: A dict of decoding results { @@ -432,7 +432,7 @@ def fast_decode(encoder_output, beam_size: number of beams. top_beams: an integer. How many of the beams to return. alpha: Float that controls the length penalty. The larger the alpha, the stronger - the preference for longer translations. + the preference for longer translations. eos_id: End-of-sequence symbol in beam search. batch_size: an integer scalar - must be passed if there is no input @@ -625,7 +625,7 @@ def transformer_prepare_decoder(targets, hparams, features=None): Returns: decoder_input: a Tensor, bottom of decoder stack - decoder_self_attention_bias: a bias tensor for use in decoder self-attention + decoder_self_attention_bias: a bias tensor for use in decoder self-attention """ if hparams.prepend_mode == "prepend_inputs_full_attention": decoder_self_attention_bias = ( @@ -679,10 +679,10 @@ def transformer_encoder(encoder_input, indicating what positions are not padding. This must either be passed in, which we do for "packed" datasets, or inferred from encoder_self_attention_bias. The knowledge about padding is used - for pad_remover(efficiency) and to mask out padding in convolutional + for pad_remover (efficiency) and to mask out padding in convolutional layers. save_weights_to: an optional dictionary to capture attention weights - for visualization; the weights tensor will be appended there under + for visualization; the weights tensor will be appended there under a string key created from the variable scope (including name). make_image_summary: Whether to make an attention image summary. @@ -758,11 +758,11 @@ def transformer_decoder(decoder_input, name: a string nonpadding: optional Tensor with shape [batch_size, encoder_length] indicating what positions are not padding. This is used - to mask out padding in convolutional layers. We generally only + to mask out padding in convolutional layers. We generally only need this mask for "packed" datasets, because for ordinary datasets, no padding is ever followed by nonpadding. save_weights_to: an optional dictionary to capture attention weights - for visualization; the weights tensor will be appended there under + for visualization; the weights tensor will be appended there under a string key created from the variable scope (including name). make_image_summary: Whether to make an attention image summary. @@ -832,14 +832,14 @@ def transformer_ffn_layer(x, Args: x: a Tensor of shape [batch_size, length, hparams.hidden_size] - hparams: hyperparameters for model + hparams: hyperparameters for model pad_remover: an expert_utils.PadRemover object tracking the padding positions. If provided, when using convolutional settings, the padding is removed before applying the convolution, and restored afterward. This can give a significant speedup. conv_padding: a string - either "LEFT" or "SAME". nonpadding_mask: an optional Tensor with shape [batch_size, length]. - needed for convolutional layers with "SAME" padding. + needed for convolutional layers with "SAME" padding. Contains 1.0 in positions corresponding to nonpadding.
Returns: @@ -982,7 +982,7 @@ def transformer_base(): @registry.register_hparams def transformer_big(): - """HParams for transformer big model on WMT.""" + """HParams for transformer big model on WMT.""" hparams = transformer_base() hparams.hidden_size = 1024 hparams.filter_size = 4096 @@ -993,7 +993,7 @@ def transformer_big(): @registry.register_hparams def transformer_big_single_gpu(): - """HParams for transformer big model for single GPU.""" + """HParams for transformer big model for single GPU.""" hparams = transformer_big() hparams.layer_prepostprocess_dropout = 0.1 hparams.learning_rate_warmup_steps = 16000 @@ -1002,7 +1002,7 @@ def transformer_big_single_gpu(): @registry.register_hparams def transformer_base_single_gpu(): - """HParams for transformer base model for single GPU.""" + """HParams for transformer base model for single GPU.""" hparams = transformer_base() hparams.batch_size = 2048 hparams.learning_rate_warmup_steps = 16000 @@ -1011,7 +1011,7 @@ def transformer_base_single_gpu(): @registry.register_hparams def transformer_parsing_base(): - """HParams for parsing on WSJ only.""" + """HParams for parsing on WSJ only.""" hparams = transformer_base() hparams.attention_dropout = 0.2 hparams.layer_prepostprocess_dropout = 0.2 @@ -1025,7 +1025,7 @@ def transformer_parsing_base(): @registry.register_hparams def transformer_parsing_big(): - """HParams for parsing on WSJ semi-supervised.""" + """HParams for parsing on WSJ semi-supervised.""" hparams = transformer_big() hparams.max_length = 512 hparams.shared_source_target_embedding = False @@ -1038,7 +1038,7 @@ def transformer_parsing_big(): @registry.register_hparams def transformer_parsing_ice(): - """HParams for parsing and tagging Icelandic text.""" + """HParams for parsing and tagging Icelandic text.""" hparams = transformer_base_single_gpu() hparams.batch_size = 4096 hparams.shared_embedding_and_softmax_weights = False @@ -1437,17 +1437,17 @@ def transformer_clean_big_tpu(): def transformer_tpu_with_conv(): """Cut down on the number of heads, and use convs instead.""" hparams = transformer_tpu() - hparams.num_heads = 4 # heads are expensive on TPU + hparams.num_heads = 4 # heads are expensive on TPU hparams.ffn_layer = "conv_relu_conv" return hparams @registry.register_hparams def transformer_lm_tpu_0(): - """HParams for training languagemodel_lm1b8k on tpu. 92M Params.""" + """HParams for training languagemodel_lm1b8k on TPU. 92M Params.""" hparams = transformer_clean_big() update_hparams_for_tpu(hparams) - hparams.num_heads = 4 # heads are expensive on TPU + hparams.num_heads = 4 # heads are expensive on TPU hparams.batch_size = 4096 hparams.shared_embedding_and_softmax_weights = False hparams.layer_prepostprocess_dropout = 0.1 @@ -1456,7 +1456,7 @@ def transformer_lm_tpu_1(): @registry.register_hparams def transformer_lm_tpu_1(): - """HParams for training languagemodel_lm1b8k on TPU. 335M Params.""" + """HParams for training languagemodel_lm1b8k on TPU.
335M Params.""" hparams = transformer_lm_tpu_0() hparams.hidden_size = 2048 hparams.filter_size = 8192 @@ -1465,7 +1465,7 @@ def transformer_lm_tpu_1(): @registry.register_hparams def transformer_librispeech(): - """HParams for training ASR model on Librispeech.""" + """HParams for training ASR model on Librispeech.""" hparams = transformer_base() hparams.num_heads = 4 @@ -1482,7 +1482,7 @@ def transformer_librispeech(): @registry.register_hparams def transformer_librispeech_tpu(): - """HParams for training ASR model on Librispeech on TPU.""" + """HParams for training ASR model on Librispeech on TPU.""" hparams = transformer_librispeech() update_hparams_for_tpu(hparams) @@ -1493,7 +1493,7 @@ def transformer_librispeech_tpu(): @registry.register_hparams def transformer_supervised_attention(): - """HParams for supervised attention problems.""" + """HParams for supervised attention problems.""" hparams = transformer_base() # Attention loss type (KL-divergence or MSE). hparams.add_hparam("expected_attention_loss_type", "kl_divergence") diff --git a/tensor2tensor/models/vanilla_gan.py b/tensor2tensor/models/vanilla_gan.py index e78d56679..100d60549 100644 --- a/tensor2tensor/models/vanilla_gan.py +++ b/tensor2tensor/models/vanilla_gan.py @@ -125,7 +125,7 @@ def body(self, features): features: a dictionary with the tensors. Returns: - A pair (predictions, losses) where predictions is the generated image + A pair (predictions, losses) where predictions is the generated image and losses is a dictionary of losses (that get added for the final loss). """ features["targets"] = features["inputs"] diff --git a/tensor2tensor/notebooks/hello_t2t-rl.ipynb b/tensor2tensor/notebooks/hello_t2t-rl.ipynb index b209007d6..d7e0eb6e1 100644 --- a/tensor2tensor/notebooks/hello_t2t-rl.ipynb +++ b/tensor2tensor/notebooks/hello_t2t-rl.ipynb @@ -7,7 +7,7 @@ "colab": { "autoexec": { "startup": false, - "wait_interval": 0.0 + "wait_interval": 0 } }, "colab_type": "code", @@ -50,7 +50,7 @@ "colab": { "autoexec": { "startup": false, - "wait_interval": 0.0 + "wait_interval": 0 } }, "colab_type": "code", @@ -157,7 +157,7 @@ }, "outputs": [], "source": [ - "agent_policy_path = os.path.join(ppo_dir, \"model{}.ckpt.index\".format(iteration_num))[:-6]" + "model_path = os.path.join(ppo_dir, \"model{}.ckpt.index\".format(iteration_num))[:-6]" ] }, { @@ -175,7 +175,7 @@ }, "outputs": [], "source": [ - "sys.argv = [sys.argv[0], \"--agent_policy_path\", agent_policy_path]" + "sys.argv = [sys.argv[0], \"--model_path\", model_path]" ] }, { @@ -325,7 +325,7 @@ "provenance": [ { "file_id": "1-VScmaLkMqWiSbqgUCFWefzisSREd8l1", - "timestamp": 1.512175750497E12 + "timestamp": 1512175750497 } ], "version": "0.3.2", diff --git a/tensor2tensor/rl/README.md b/tensor2tensor/rl/README.md index b163a16a5..ffd595911 100644 --- a/tensor2tensor/rl/README.md +++ b/tensor2tensor/rl/README.md @@ -1,11 +1,11 @@ # Tensor2Tensor experimental Model-Based Reinforcement Learning. -The rl package intention is to provide the ability to run reinforcement -algorithms within TensorFlow's computation graph, in order to do model-based -RL using environment models from Tensor2Tensor. It's very experimental +The rl package aims to provide the ability to run reinforcement +algorithms within TensorFlow's computation graph, in order to do model-based +RL using environment models from Tensor2Tensor. It's very experimental for now and under heavy development. -Currently the only supported algorithm is Proximal Policy Optimization - PPO.
+Currently the only supported algorithm is Proximal Policy Optimization - PPO. # Sample usages @@ -35,7 +35,7 @@ python tensor2tensor/bin/t2t-datagen \ --data_dir=~/t2t_data \ --tmp_dir=~/t2t_data/tmp \ --problem=gym_pong_trajectories_from_policy \ - --agent_policy_path [model] + --model_path [model] ``` ## Training model for frames generation based on randomly played games diff --git a/tensor2tensor/rl/collect.py b/tensor2tensor/rl/collect.py index a43b3a551..8e81dafa7 100644 --- a/tensor2tensor/rl/collect.py +++ b/tensor2tensor/rl/collect.py @@ -18,8 +18,7 @@ import tensorflow as tf -def define_collect(policy_factory, batch_env, hparams, - eval_phase, policy_to_actions_lambda=None, scope=""): +def define_collect(policy_factory, batch_env, hparams, eval_phase): """Collect trajectories.""" eval_phase = tf.convert_to_tensor(eval_phase) memory_shape = [hparams.epoch_length] + [batch_env.observ.shape.as_list()[0]] @@ -35,9 +34,8 @@ def define_collect(policy_factory, batch_env, hparams, ] memory = [tf.Variable(tf.zeros(shape, dtype), trainable=False) for (shape, dtype) in memories_shapes_and_types] - with tf.variable_scope(scope): - cumulative_rewards = tf.get_variable("cumulative_rewards", len(batch_env), - trainable=False) + cumulative_rewards = tf.get_variable("cumulative_rewards", len(batch_env), + trainable=False) should_reset_var = tf.Variable(True, trainable=False) @@ -61,12 +59,9 @@ def step(index, scores_sum, scores_num): obs_copy = batch_env.observ + 0 actor_critic = policy_factory(tf.expand_dims(obs_copy, 0)) policy = actor_critic.policy - if policy_to_actions_lambda: - action = policy_to_actions_lambda(policy) - else: - action = tf.cond(eval_phase, - policy.mode, - policy.sample) + action = tf.cond(eval_phase, + policy.mode, + policy.sample) postprocessed_action = actor_critic.action_postprocessing(action) simulate_output = batch_env.simulate(postprocessed_action[0, ...]) pdf = policy.prob(action)[0] diff --git a/tensor2tensor/rl/envs/atari_wrappers.py b/tensor2tensor/rl/envs/atari_wrappers.py new file mode 100644 index 000000000..b8dd425ec --- /dev/null +++ b/tensor2tensor/rl/envs/atari_wrappers.py @@ -0,0 +1,139 @@ +# coding=utf-8 +# Copyright 2018 The Tensor2Tensor Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Various wrappers copied from OpenAI Baselines.""" + +from collections import deque +import gym +import numpy as np + + +# Adapted from the link below.
+# https://github.com/openai/baselines/blob/master/baselines/common/atari_wrappers.py + + +class WarpFrame(gym.ObservationWrapper): + """Wrap a frame.""" + + def __init__(self, env): + """Warp frames to 84x84 as done in the Nature paper and later work.""" + gym.ObservationWrapper.__init__(self, env) + self.width = 84 + self.height = 84 + self.observation_space = gym.spaces.Box( + low=0, high=255, + shape=(self.height, self.width, 1), dtype=np.uint8) + + def observation(self, frame): + import cv2 # pylint: disable=g-import-not-at-top + cv2.ocl.setUseOpenCL(False) + frame = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY) + frame = cv2.resize(frame, (self.width, self.height), + interpolation=cv2.INTER_AREA) + return frame[:, :, None] + + +class LazyFrames(object): + """Lazy frame storage.""" + + def __init__(self, frames): + """Lazy frame storage. + + This object ensures that common frames between the observations + are only stored once. It exists purely to optimize memory usage + which can be huge for DQN's 1M frames replay buffers. + This object should only be converted to numpy array before being passed + to the model. + + Args: + frames: the frames. + """ + self._frames = frames + + def __array__(self, dtype=None): + out = np.concatenate(self._frames, axis=2) + if dtype is not None: + out = out.astype(dtype) + return out + + +class FrameStack(gym.Wrapper): + """Stack frames.""" + + def __init__(self, env, k): + """Stack k last frames. Returns lazy array, memory efficient.""" + gym.Wrapper.__init__(self, env) + self.k = k + self.frames = deque([], maxlen=k) + shp = env.observation_space.shape + self.observation_space = gym.spaces.Box( + low=0, high=255, shape=(shp[0], shp[1], shp[2] * k), dtype=np.uint8) + + def reset(self): + ob = self.env.reset() + for _ in range(self.k): + self.frames.append(ob) + return self._get_ob() + + def step(self, action): + ob, reward, done, info = self.env.step(action) + self.frames.append(ob) + return self._get_ob(), reward, done, info + + def _get_ob(self): + assert len(self.frames) == self.k + return LazyFrames(list(self.frames)) + + +class MaxAndSkipEnv(gym.Wrapper): + """Max and skip env.""" + + def __init__(self, env, skip=4): + """Return only every `skip`-th frame.""" + gym.Wrapper.__init__(self, env) + # Most recent raw observations (for max pooling across time steps). 
+ self._obs_buffer = np.zeros((2,) + env.observation_space.shape, + dtype=np.uint8) + self._skip = skip + + def reset(self, **kwargs): + return self.env.reset(**kwargs) + + def step(self, action): + """Repeat action, sum reward, and max over last observations.""" + total_reward = 0.0 + done = None + for i in range(self._skip): + obs, reward, done, info = self.env.step(action) + if i == self._skip - 2: self._obs_buffer[0] = obs + if i == self._skip - 1: self._obs_buffer[1] = obs + total_reward += reward + if done: + break + # Note that the observation on the done=True frame + # doesn't matter. + max_frame = self._obs_buffer.max(axis=0) + + return max_frame, total_reward, done, info + + +def wrap_atari(env, warp=False, frame_skip=False, frame_stack=False): + """Apply the selected wrappers; frame_skip and frame_stack also carry the counts.""" + if warp: + env = WarpFrame(env) + if frame_skip: + env = MaxAndSkipEnv(env, frame_skip) + if frame_stack: + env = FrameStack(env, frame_stack) + return env diff --git a/tensor2tensor/rl/envs/frame1.png b/tensor2tensor/rl/envs/frame1.png deleted file mode 100644 index e8288549c479d2f29af33a02c20edaaace05536e..0000000000000000000000000000000000000000 GIT binary patch (binary image data omitted) diff --git a/tensor2tensor/rl/envs/frame2.png b/tensor2tensor/rl/envs/frame2.png deleted file mode 100644 GIT binary patch (binary image data omitted) diff --git a/tensor2tensor/rl/envs/tf_atari_wrappers.py b/tensor2tensor/rl/envs/tf_atari_wrappers.py deleted file mode 100644 - new_done = tf.logical_or(done, self._time_elapsed > self.timelimit) - inc = self._time_elapsed.assign_add(tf.ones_like(self._time_elapsed)) - - with tf.control_dependencies([inc]): - return tf.identity(reward), tf.identity(new_done) - - def reset(self, indices=None): - op_zero = tf.scatter_update(self._time_elapsed, indices, tf.zeros(tf.shape(indices), dtype=tf.int32)) - with tf.control_dependencies([op_zero]): - return self._batch_env.reset(indices) - - - -class MemoryWrapper(WrapperBase): - - #This is a singleton class - # singleton = None - - def __init__(self, batch_env): - super().__init__(batch_env) - # assert MemoryWrapper.singleton == None, "The class cannot be instantiated multiple times" - MemoryWrapper.singleton = self - assert self._length==1, "We support only one environment" - - infinity = 10000000 - self._speculum = tf.FIFOQueue(infinity, dtypes=[tf.string, tf.float32, tf.int32, tf.bool]) - - self._observ = self._batch_env.observ - - - def simulate(self, action): - with tf.name_scope('environment/simulate'): #TODO: Do we need this?
- reward, done = self._batch_env.simulate(action) - encoded_image = tf.image.encode_png(tf.cast(self._batch_env.observ[0, ...], tf.uint8)) - # done = tf.Print(done, [encoded_image], "im_size=", summarize=1 ) - with tf.control_dependencies([reward, done]): - - enqueue_op = self._speculum.enqueue([encoded_image, reward, action, done]) - - with tf.control_dependencies([enqueue_op]): - return tf.identity(reward), tf.identity(done) diff --git a/tensor2tensor/rl/envs/utils.py b/tensor2tensor/rl/envs/utils.py index 31ec9707d..59732fed0 100644 --- a/tensor2tensor/rl/envs/utils.py +++ b/tensor2tensor/rl/envs/utils.py @@ -33,14 +33,10 @@ import gym from tensor2tensor.rl.envs import batch_env -from tensor2tensor.rl.envs import py_func_batch_env -from tensor2tensor.rl.envs import simulated_batch_env -from tensor2tensor.rl.envs import tf_atari_wrappers +from tensor2tensor.rl.envs import in_graph_batch_env import tensorflow as tf - - class EvalVideoWrapper(gym.Wrapper): """Wrapper for recording videos during eval phase. @@ -85,7 +81,7 @@ def _reset(self, **kwargs): class ExternalProcessEnv(object): - """Step environment in a separate process for lock free parallelism.""" + """Step environment in a separate process for lock free parallelism.""" # Message types for communication via the pipe. _ACCESS = 1 _RECEIVE = 2 _SEND = 3 _EXCEPTION = 4 _CLOSE = 5 def __init__(self, constructor, xvfb): - """Step environment in a separate process for lock free parallelism. + """Step environment in a separate process for lock free parallelism. The environment will be created in the external process by calling the specified callable. This can be an environment class, or a function @@ -230,7 +226,7 @@ def _receive(self): Raises: Exception: An exception was raised inside the worker process. - KeyError: The received message is of an unknown type. + KeyError: The received message is of an unknown type. Returns: Payload object of the message. @@ -281,19 +277,8 @@ def _worker(self, constructor, conn): conn.send((self._EXCEPTION, stacktrace)) conn.close() -def batch_env_factory(environment_lambda, hparams, num_agents, xvfb=False): - # define env - wrappers = hparams.in_graph_wrappers if hasattr(hparams, "in_graph_wrappers") else [] - if hparams.simulated_environment: - batch_env = define_simulated_batch_env(num_agents) - else: - batch_env = define_batch_env(environment_lambda, num_agents, xvfb=xvfb) # TODO -video? - for w in wrappers: - batch_env = w[0](batch_env, **w[1]) - return batch_env - -def define_batch_env(constructor, num_agents, xvfb=False): +def define_batch_env(constructor, num_agents, xvfb=False, env_processes=True): """Create environments and apply all desired wrappers. Args: @@ -306,17 +291,12 @@ def define_batch_env(constructor, num_agents, xvfb=False): In-graph environments object. """ with tf.variable_scope("environments"): - envs = [ - ExternalProcessEnv(constructor, xvfb) - for _ in range(num_agents)] - env = batch_env.BatchEnv(envs, blocking=False) - env = py_func_batch_env.PyFuncBatchEnv(env) + if env_processes: + envs = [ + ExternalProcessEnv(constructor, xvfb) + for _ in range(num_agents)] + else: + envs = [constructor() for _ in range(num_agents)] + env = batch_env.BatchEnv(envs, blocking=not env_processes) + env = in_graph_batch_env.InGraphBatchEnv(env) return env - - -def define_simulated_batch_env(num_agents): - #TODO: pm->Błażej. Should the paramters be infered.
- observ_shape, observ_dtype, action_shape, action_dtype = (210, 160, 3), tf.float32, [], tf.int32 - batch_env = simulated_batch_env.SimulatedBatchEnv(num_agents, observ_shape, observ_dtype, action_shape, action_dtype) - - return batch_env diff --git a/tensor2tensor/rl/model_rl_experiment.py b/tensor2tensor/rl/model_rl_experiment.py deleted file mode 100644 index ee9e05211..000000000 --- a/tensor2tensor/rl/model_rl_experiment.py +++ /dev/null @@ -1,92 +0,0 @@ -import os -import tempfile -from tensor2tensor import problems -from tensor2tensor.bin import t2t_trainer -from tensor2tensor.utils import trainer_lib -from tensor2tensor.rl import rl_trainer_lib -from tensor2tensor.rl.envs.tf_atari_wrappers import PongT2TGeneratorHackWrapper -from tensor2tensor.rl.envs.tf_atari_wrappers import TimeLimitWrapper -import tensorflow as tf -import time -import datetime - -flags = tf.flags -FLAGS = flags.FLAGS - - -def train(hparams, output_dir): - prefix = output_dir - #remove trash - # prefix = "~/trash/loop_{}".format(random.randint(10000, 99999)) - data_dir = os.path.expanduser(prefix + "/data") - tmp_dir = os.path.expanduser(prefix + "/tmp") - output_dir = os.path.expanduser(prefix + "/output") - tf.gfile.MakeDirs(data_dir) - tf.gfile.MakeDirs(tmp_dir) - tf.gfile.MakeDirs(output_dir) - last_model = "" - start_time = time.time() - line = ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> " - for iloop in range(hparams.epochs): - time_delta = time.time() - start_time - print(line+"Step {}.1. - generate data from policy. " - "Time: {}".format(iloop, str(datetime.timedelta(seconds=time_delta)))) - FLAGS.problems = "gym_discrete_problem" - FLAGS.agent_policy_path = last_model - gym_problem = problems.problem(FLAGS.problems) - gym_problem.num_steps = hparams.true_env_generator_num_steps - iter_data_dir = os.path.join(data_dir, str(iloop)) - tf.gfile.MakeDirs(iter_data_dir) - gym_problem.generate_data(iter_data_dir, tmp_dir) - - time_delta = time.time() - start_time - print(line+"Step {}.2. - generate env model. " - "Time: {}".format(iloop, str(datetime.timedelta(seconds=time_delta)))) - # 2. generate env model - FLAGS.data_dir = iter_data_dir - FLAGS.output_dir = output_dir - FLAGS.model = hparams.generative_model - FLAGS.hparams_set = hparams.generative_model_params - FLAGS.train_steps = hparams.model_train_steps - FLAGS.eval_steps = 1 - t2t_trainer.main([]) - - time_delta = time.time() - start_time - print(line+"Step {}.3. - evalue env model. " - "Time: {}".format(iloop, str(datetime.timedelta(seconds=time_delta)))) - gym_simulated_problem = problems.problem("gym_simulated_discrete_problem") - gym_simulated_problem.num_steps = hparams.simulated_env_generator_num_steps - gym_simulated_problem.generate_data(iter_data_dir, tmp_dir) - - # time_delta = time.time() - start_time - print(line+"Step {}.4. - train PPO in model env." 
- " Time: {}".format(iloop, str(datetime.timedelta(seconds=time_delta)))) - ppo_epochs_num=hparams.ppo_epochs_num - ppo_hparams = trainer_lib.create_hparams("atari_base", "epochs_num={},simulated_environment=True,eval_every_epochs=0,save_models_every_epochs={}".format(ppo_epochs_num+1, ppo_epochs_num), - data_dir=output_dir) - ppo_hparams.epoch_length = hparams.ppo_epoch_length - ppo_dir = tempfile.mkdtemp(dir=data_dir, prefix="ppo_") - in_graph_wrappers = [(TimeLimitWrapper, {"timelimit": 150}), - (PongT2TGeneratorHackWrapper, {"add_value": -2})] + gym_problem.in_graph_wrappers - ppo_hparams.add_hparam("in_graph_wrappers", in_graph_wrappers) - rl_trainer_lib.train(ppo_hparams, "PongNoFrameskip-v4", ppo_dir) - - last_model = ppo_dir + "/model{}.ckpt".format(ppo_epochs_num) - - -def main(_): - hparams = tf.contrib.training.HParams( - epochs=100, - true_env_generator_num_steps=100, - generative_model="static_basic_conv_gen", - generative_model_params="basic_conv_small", - model_train_steps=80, - simulated_env_generator_num_steps=300, - ppo_epochs_num=2, - ppo_epoch_length=300, - ) - train(hparams, tempfile.mkdtemp()) - - -if __name__ == "__main__": - tf.app.run() diff --git a/tensor2tensor/rl/ppo.py b/tensor2tensor/rl/ppo.py index 2257dfac2..706e3c6b4 100644 --- a/tensor2tensor/rl/ppo.py +++ b/tensor2tensor/rl/ppo.py @@ -21,14 +21,15 @@ import tensorflow as tf -def get_optimiser(config): - if config.optimizer == 'Adam': - return tf.train.AdamOptimizer(learning_rate=config.learning_rate) - return config.optimizer(learning_rate=config.learning_rate) +def get_optimizer(config): + if config.optimizer == "Adam": + return tf.train.AdamOptimizer(config.learning_rate) + return config.optimizer(config.learning_rate) -def define_ppo_step(data_points, policy_factory, optimizer, config): - observation, action, discounted_reward, norm_advantage, old_pdf = data_points +def define_ppo_step(observation, action, reward, done, value, old_pdf, + policy_factory, config): + """Step of PPO.""" new_policy_dist, new_value, _ = policy_factory(observation) new_pdf = new_policy_dist.prob(action) @@ -36,30 +37,35 @@ def define_ppo_step(data_points, policy_factory, optimizer, config): clipped_ratio = tf.clip_by_value(ratio, 1 - config.clipping_coef, 1 + config.clipping_coef) - surrogate_objective = tf.minimum(clipped_ratio * norm_advantage, - ratio * norm_advantage) + advantage = calculate_generalized_advantage_estimator( + reward, value, done, config.gae_gamma, config.gae_lambda) + + advantage_mean, advantage_variance = tf.nn.moments(advantage, axes=[0, 1], + keep_dims=True) + advantage_normalized = tf.stop_gradient( + (advantage - advantage_mean)/(tf.sqrt(advantage_variance) + 1e-8)) + + surrogate_objective = tf.minimum(clipped_ratio * advantage_normalized, + ratio * advantage_normalized) policy_loss = -tf.reduce_mean(surrogate_objective) - value_error = new_value - discounted_reward + value_error = calculate_generalized_advantage_estimator( + reward, new_value, done, config.gae_gamma, config.gae_lambda) value_loss = config.value_loss_coef * tf.reduce_mean(value_error ** 2) entropy = new_policy_dist.entropy() entropy_loss = -config.entropy_loss_coef * tf.reduce_mean(entropy) + optimizer = get_optimizer(config) losses = [policy_loss, value_loss, entropy_loss] - gradients = [list(zip(*optimizer.compute_gradients(loss))) - for loss in losses] + gradients = [list(zip(*optimizer.compute_gradients(loss))) for loss in losses] gradients_norms = [tf.global_norm(gradient[0]) for gradient in gradients] gradients_flat = 
sum([gradient[0] for gradient in gradients], ()) gradients_variables_flat = sum([gradient[1] for gradient in gradients], ()) - if config.max_gradients_norm: - gradients_flat, _ = tf.clip_by_global_norm(gradients_flat, - config.max_gradients_norm) - optimize_op = optimizer.apply_gradients(zip(gradients_flat, gradients_variables_flat)) @@ -68,53 +74,26 @@ def define_ppo_step(data_points, policy_factory, optimizer, config): def define_ppo_epoch(memory, policy_factory, config): + """PPO epoch.""" observation, reward, done, action, old_pdf, value = memory - # This is to avoid propagating gradients through simulated environment. + # This is to avoid propagating gradients through the simulated environment. observation = tf.stop_gradient(observation) action = tf.stop_gradient(action) reward = tf.stop_gradient(reward) - if hasattr(config, "rewards_preprocessing_fun"): - reward = config.rewards_preprocessing_fun(reward) done = tf.stop_gradient(done) value = tf.stop_gradient(value) old_pdf = tf.stop_gradient(old_pdf) - advantage = calculate_generalized_advantage_estimator( - reward, value, done, config.gae_gamma, config.gae_lambda) - - discounted_reward = tf.stop_gradient(advantage + value) - - advantage_mean, advantage_variance = tf.nn.moments(advantage, axes=[0, 1], - keep_dims=True) - advantage_normalized = tf.stop_gradient( - (advantage - advantage_mean)/(tf.sqrt(advantage_variance) + 1e-8)) + ppo_step_rets = tf.scan( + lambda _1, _2: define_ppo_step( # pylint: disable=g-long-lambda + observation, action, reward, done, value, + old_pdf, policy_factory, config), + tf.range(config.optimization_epochs), + [0., 0., 0., 0., 0., 0.], + parallel_iterations=1) - add_lists_elementwise = lambda l1, l2: [x + y for x, y in zip(l1, l2)] - - number_of_batches = (config.epoch_length * config.optimization_epochs - / config.optimization_batch_size) - - dataset = tf.data.Dataset.from_tensor_slices( - (observation, action, discounted_reward, advantage_normalized, old_pdf)) - dataset = dataset.shuffle(buffer_size=config.epoch_length, - reshuffle_each_iteration=True) - dataset = dataset.repeat(config.optimization_epochs) - dataset = dataset.batch(config.optimization_batch_size) - iterator = dataset.make_initializable_iterator() - optimizer = get_optimiser(config) - - with tf.control_dependencies([iterator.initializer]): - ppo_step_rets = tf.scan( - lambda a, i: add_lists_elementwise( # pylint: disable=g-long-lambda - a, define_ppo_step(iterator.get_next(), policy_factory, optimizer, - config)), - tf.range(number_of_batches), - [0., 0., 0., 0., 0., 0.], - parallel_iterations=1) - - ppo_summaries = [tf.reduce_mean(ret) / number_of_batches - for ret in ppo_step_rets] + ppo_summaries = [tf.reduce_mean(ret) for ret in ppo_step_rets] summaries_names = ["policy_loss", "value_loss", "entropy_loss", "policy_gradient", "value_gradient", "entropy_gradient"] @@ -133,7 +112,7 @@ def calculate_generalized_advantage_estimator( """Generalized advantage estimator.""" # Below is a slight weirdness: we set the last reward to 0.
- # This makes the advantage to be 0 in the last timestep + # This makes the advantage to be 0 in the last timestep reward = tf.concat([reward[:-1, :], value[-1:, :]], axis=0) next_value = tf.concat([value[1:, :], tf.zeros_like(value[-1:, :])], axis=0) next_not_done = 1 - tf.cast(tf.concat([done[1:, :], diff --git a/tensor2tensor/rl/rl_trainer_lib.py b/tensor2tensor/rl/rl_trainer_lib.py index 4ba225256..4ff386362 100644 --- a/tensor2tensor/rl/rl_trainer_lib.py +++ b/tensor2tensor/rl/rl_trainer_lib.py @@ -29,9 +29,8 @@ from tensor2tensor.models.research import rl # pylint: disable=unused-import from tensor2tensor.rl import collect from tensor2tensor.rl import ppo +from tensor2tensor.rl.envs import atari_wrappers from tensor2tensor.rl.envs import utils -from tensor2tensor.rl.envs import tf_atari_wrappers - import tensorflow as tf @@ -40,31 +39,27 @@ def define_train(hparams, environment_spec, event_dir): """Define the training setup.""" - policy_lambda = hparams.network - - if environment_spec == "stacked_pong": - environment_spec = lambda: gym.make("PongNoFrameskip-v4") - wrappers = hparams.in_graph_wrappers if hasattr(hparams, "in_graph_wrappers") else [] - wrappers.append((tf_atari_wrappers.MaxAndSkipEnv, {"skip": 4})) - hparams.in_graph_wrappers = wrappers if isinstance(environment_spec, str): env_lambda = lambda: gym.make(environment_spec) else: env_lambda = environment_spec + policy_lambda = hparams.network + env = env_lambda() + action_space = env.action_space - batch_env = utils.batch_env_factory(env_lambda, hparams, num_agents=hparams.num_agents) + batch_env = utils.define_batch_env(env_lambda, hparams.num_agents) policy_factory = tf.make_template( "network", - functools.partial(policy_lambda, batch_env.action_space, hparams)) + functools.partial(policy_lambda, action_space, hparams)) - with tf.variable_scope("", reuse=tf.AUTO_REUSE): + with tf.variable_scope("train"): memory, collect_summary = collect.define_collect( policy_factory, batch_env, hparams, eval_phase=False) - ppo_summary = ppo.define_ppo_epoch(memory, policy_factory, hparams) - summary = tf.summary.merge([collect_summary, ppo_summary]) + ppo_summary = ppo.define_ppo_epoch(memory, policy_factory, hparams) + summary = tf.summary.merge([collect_summary, ppo_summary]) - with tf.variable_scope("eval", reuse=tf.AUTO_REUSE): + with tf.variable_scope("eval"): eval_env_lambda = env_lambda if event_dir and hparams.video_during_eval: # Some environments reset environments automatically, when reached done @@ -73,42 +68,32 @@ eval_env_lambda = lambda: gym.wrappers.Monitor( # pylint: disable=g-long-lambda env_lambda(), event_dir, video_callable=lambda i: i % d == 0) wrapped_eval_env_lambda = lambda: utils.EvalVideoWrapper(eval_env_lambda()) - # eval_batch_env = utils.define_batch_env(wrapped_eval_env_lambda, hparams.num_eval_agents, - # xvfb=hparams.video_during_eval) - eval_batch_env = utils.batch_env_factory(wrapped_eval_env_lambda, hparams, - num_agents=hparams.num_eval_agents, xvfb=hparams.video_during_eval) - - # _, eval_summary = collect.define_collect( - # policy_factory, eval_batch_env, hparams, eval_phase=True) - #TODO: pm -> Błażej. I'm to tired to fix it now.
- eval_summary = tf.no_op() - return summary, eval_summary + _, eval_summary = collect.define_collect( + policy_factory, + utils.define_batch_env(wrapped_eval_env_lambda, hparams.num_eval_agents, + xvfb=hparams.video_during_eval), + hparams, eval_phase=True) + return summary, eval_summary, policy_factory def train(hparams, environment_spec, event_dir=None): """Train.""" - train_summary_op, eval_summary_op = define_train(hparams, environment_spec, + if environment_spec == "stacked_pong": + environment_spec = lambda: atari_wrappers.wrap_atari( # pylint: disable=g-long-lambda + gym.make("PongNoFrameskip-v4"), + warp=False, frame_skip=4, frame_stack=False) + train_summary_op, eval_summary_op, _ = define_train(hparams, environment_spec, event_dir) if event_dir: summary_writer = tf.summary.FileWriter( event_dir, graph=tf.get_default_graph(), flush_secs=60) model_saver = tf.train.Saver(tf.global_variables(".*network_parameters.*")) - # TODO(blazej): Make sure that policy is restored properly. else: summary_writer = None model_saver = None - if hparams.simulated_environment: - env_model_loader = tf.train.Saver(tf.global_variables(".*basic_conv_gen.*")) - else: - env_model_loader = None with tf.Session() as sess: sess.run(tf.global_variables_initializer()) - if env_model_loader: - ckpts = tf.train.get_checkpoint_state(hparams.data_dir) - ckpt = ckpts.model_checkpoint_path - env_model_loader.restore(sess, ckpt) for epoch_index in range(hparams.epochs_num): summary = sess.run(train_summary_op) if summary_writer: diff --git a/tensor2tensor/rl/t2t_rl_trainer.py b/tensor2tensor/rl/t2t_rl_trainer.py index bd3780a9b..3abe22723 100644 --- a/tensor2tensor/rl/t2t_rl_trainer.py +++ b/tensor2tensor/rl/t2t_rl_trainer.py @@ -27,7 +27,7 @@ FLAGS = flags.FLAGS # To maintain compatibility with some internal libs, we guard against these flag -# definitions possibly erring. Apologies for the ugliness. +# definitions possibly erring. Apologies for the ugliness. try: flags.DEFINE_string("output_dir", "", "Base output directory for run.") except: # pylint: disable=bare-except diff --git a/tensor2tensor/utils/adafactor.py b/tensor2tensor/utils/adafactor.py index 979f1b3be..31c3a5558 100644 --- a/tensor2tensor/utils/adafactor.py +++ b/tensor2tensor/utils/adafactor.py @@ -32,7 +32,7 @@ class AdafactorOptimizer(tf.train.Optimizer): 1. For a two-dimensional AxB weight matrix, Adafactor uses only A+B auxiliary parameters to maintain the second-moment estimator, instead of AB. - This is advantageous on memory-limited systems. In addition, beta1 + This is advantageous on memory-limited systems. In addition, beta1 (momentum) is set to zero by default, saving an additional auxiliary parameter per weight. @@ -332,7 +332,7 @@ def _simulated_quantize(x, num_bits, quantization_noise): quantization_noise is a float32 Tensor containing values in [0, 1). Each value in quantization_noise should take different values across different steps, approximating a uniform distribution over [0, 1). - In the case of replicated TPU training, quantization_noise should be identical + In the case of replicated TPU training, quantization_noise should be identical across replicas in order to keep the parameters identical across replicas.
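To make the quantization-noise idea above concrete, here is a minimal NumPy sketch (the function name and scaling scheme are illustrative assumptions, not the library's implementation): adding a shared noise sample in [0, 1) before flooring gives stochastic rounding that is unbiased in expectation, and deriving that sample from the step number via the golden ratio keeps it identical across replicas at any given step.

```python
import numpy as np

def simulated_quantize_sketch(x, num_bits, quantization_noise):
  # Stochastic rounding to a grid of 2**num_bits levels covering the
  # tensor's magnitude; the shared noise stands in for fresh randomness.
  scale = (np.max(np.abs(x)) + 1e-9) / 2 ** (num_bits - 1)
  return np.floor(x / scale + quantization_noise) * scale

phi = (5 ** 0.5 - 1) / 2    # golden ratio fraction, as in the hunk below
step = 1000
noise = (phi * step) % 1.0  # same value on every replica at this step
x = np.random.RandomState(0).randn(4).astype(np.float32)
print(simulated_quantize_sketch(x, num_bits=16, quantization_noise=noise))
```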
The natural choice for quantization_noise would be tf.random_uniform(), @@ -383,7 +383,7 @@ def _quantization_noise_from_step_num(): """ step = tf.to_int32(tf.train.get_or_create_global_step()) + 1 phi = ((5 ** 0.5) - 1) / 2 - # Naive computation tf.mod(phi * step, 1.0) in float32 would be disastrous + # Naive computation tf.mod(phi * step, 1.0) in float32 would be disastrous # due to loss of precision when the step number gets large. # Computation in doubles does not work on TPU, so we use this complicated # alternative computation which does not suffer from these roundoff errors. diff --git a/tensor2tensor/utils/beam_search.py b/tensor2tensor/utils/beam_search.py index a83dc38ae..10ee6c1f7 100644 --- a/tensor2tensor/utils/beam_search.py +++ b/tensor2tensor/utils/beam_search.py @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -"""Implementation of beam search with penalties.""" +"""Implementation of beam search with penalties.""" from __future__ import absolute_import from __future__ import division @@ -94,7 +94,7 @@ def log_prob_from_logits(logits): def compute_batch_indices(batch_size, beam_size): - """Computes the i'th coordinate that contains the batch index for gathers. + """Computes the i'th coordinate that contains the batch index for gathers. Batch pos is a tensor like [[0,0,0,0,],[1,1,1,1],..]. It says which batch the beam item is in. This will create the i of the i,j coordinate @@ -135,7 +135,7 @@ def compute_topk_scores_and_seq(sequences, scores, scores_to_gather, flags, [batch_size, beam_size]. We will return the gathered scores from here. Scores to gather is different from scores because for grow_alive, we will need to return log_probs, while for grow_finished, we will need to return - the length penalized scores. + the length penalized scores. flags: Tensor of bools for sequences that say whether a sequence has reached EOS or not beam_size: int @@ -188,7 +188,7 @@ def beam_search(symbols_to_logits_fn, stop_early=True): """Beam search with length penalties. - Requires a function that can take the currently decoded symbols and return + Requires a function that can take the currently decoded symbols and return the logits for the next symbol. The implementation is inspired by https://arxiv.org/abs/1609.08144. @@ -229,7 +229,7 @@ def beam_search(symbols_to_logits_fn, Returns: Tuple of (decoded beams [batch_size, beam_size, decode_length] - decoding probabilities [batch_size, beam_size]) + decoding probabilities [batch_size, beam_size]) """ batch_size = common_layers.shape_list(initial_ids)[0] @@ -320,7 +320,7 @@ def grow_alive(curr_seq, curr_scores, curr_log_probs, curr_finished, states): "grow_alive", states) def grow_topk(i, alive_seq, alive_log_probs, states): - r"""Inner beam search loop. + r"""Inner beam search loop. This function takes the current alive sequences, and grows them to topk sequences where k = 2*beam. We use 2*beam because, we could have beam_size @@ -361,14 +361,14 @@ def grow_topk(i, alive_seq, alive_log_probs, states): # Convert logits to normalized log probs candidate_log_probs = log_prob_from_logits(logits) - # Multiply the probabilities by the current probabilities of the beam. + # Multiply the probabilities by the current probabilities of the beam. # (batch_size, beam_size, vocab_size) + (batch_size, beam_size, 1) log_probs = candidate_log_probs + tf.expand_dims(alive_log_probs, axis=2) length_penalty = tf.pow(((5.
+ tf.to_float(i + 1)) / 6.), alpha) curr_scores = log_probs / length_penalty - # Flatten out (beam_size, vocab_size) probs in to a list of possibilities + # Flatten out (beam_size, vocab_size) probs into a list of possibilities flat_curr_scores = tf.reshape(curr_scores, [-1, beam_size * vocab_size]) topk_scores, topk_ids = tf.nn.top_k(flat_curr_scores, k=beam_size * 2) @@ -381,7 +381,7 @@ def grow_topk(i, alive_seq, alive_log_probs, states): topk_ids %= vocab_size # Unflatten the ids # The next three steps are to create coordinates for tf.gather_nd to pull - # out the correct sequences from id's that we need to grow. + # out the correct sequences from id's that we need to grow. # We will also use the coordinates to gather the booleans of the beam items # that survived. batch_pos = compute_batch_indices(batch_size, beam_size * 2) @@ -447,7 +447,7 @@ def inner_loop(i, alive_seq, alive_log_probs, finished_seq, finished_scores, Log probs of the alive sequences, New finished sequences, Scores of the new finished sequences, - Flags indicating which sequence in finished as reached EOS, + Flags indicating which sequence in finished has reached EOS, dict of final decoding states) """ @@ -471,7 +471,7 @@ def _is_finished(i, unused_alive_seq, alive_log_probs, unused_finished_seq, """Checking termination condition. We terminate when we decoded up to decode_length or the lowest scoring item - in finished has a greater score that the highest prob item in alive divided + in finished has a greater score than the highest prob item in alive divided by the max length penalty Args: @@ -488,24 +488,24 @@ def _is_finished(i, unused_alive_seq, alive_log_probs, unused_finished_seq, if not stop_early: return tf.less(i, decode_length) max_length_penalty = tf.pow(((5. + tf.to_float(decode_length)) / 6.), alpha) - # The best possible score of the most likely alive sequence + # The best possible score of the most likely alive sequence lower_bound_alive_scores = alive_log_probs[:, 0] / max_length_penalty # Now to compute the lowest score of a finished sequence in finished # If the sequence isn't finished, we multiply its score by 0. since # scores are all -ve, taking the min will give us the score of the lowest # finished item. - lowest_score_of_finished_in_finished = tf.reduce_min( + lowest_score_of_finished_in_finished = tf.reduce_min( finished_scores * tf.to_float(finished_in_finished), axis=1) # If none of the sequences have finished, then the min will be 0 and # we have to replace it by -ve INF if it is. The score of any seq in alive # will be much higher than -ve INF and the termination condition will not # be met. - lowest_score_of_finished_in_finished += ( + lowest_score_of_finished_in_finished += ( (1. - tf.to_float(tf.reduce_any(finished_in_finished, 1))) * -INF) bound_is_met = tf.reduce_all( - tf.greater(lowest_score_of_finished_in_finished, + tf.greater(lowest_score_of_finished_in_finished, lower_bound_alive_scores)) return tf.logical_and( diff --git a/tensor2tensor/utils/bleu_hook.py b/tensor2tensor/utils/bleu_hook.py index 968a103ff..2c854cdba 100644 --- a/tensor2tensor/utils/bleu_hook.py +++ b/tensor2tensor/utils/bleu_hook.py @@ -39,7 +39,7 @@ def _get_ngrams(segment, max_order): - """Extracts all n-grams up to a given maximum order from an input segment. + """Extracts all n-grams up to a given maximum order from an input segment. Args: segment: text segment from which n-grams will be extracted. @@ -130,7 +130,7 @@ def bleu_score(predictions, labels, **unused_kwargs): and use brevity penalty.
Also, this does not have beam search. Args: - predictions: tensor, model predictions + predictions: tensor, model predictions labels: tensor, gold output. Returns: diff --git a/tensor2tensor/utils/cloud_tpu.py b/tensor2tensor/utils/cloud_tpu.py index 1518e69ae..96d011568 100644 --- a/tensor2tensor/utils/cloud_tpu.py +++ b/tensor2tensor/utils/cloud_tpu.py @@ -305,7 +305,7 @@ def tpu_tunnel(vm_name, tpu_ip): time.sleep(1) if tunnel_process.poll() is not None: raise ValueError("SSH failed") - tf.logging.info("Set up port forwarding. Local ports: %s", local_ports) + tf.logging.info("Set up port forwarding. Local ports: %s", local_ports) yield local_ports, tunnel_process.pid diff --git a/tensor2tensor/utils/data_reader.py b/tensor2tensor/utils/data_reader.py index d7be24e7e..8cece2625 100644 --- a/tensor2tensor/utils/data_reader.py +++ b/tensor2tensor/utils/data_reader.py @@ -136,7 +136,7 @@ def _batching_scheme(batch_size, min_length=0): """A batching scheme based on model hyperparameters. - Every batch contains a number of sequences divisible by `shard_multiplier`. + Every batch contains a number of sequences divisible by `shard_multiplier`. Args: batch_size: int, total number of tokens in a batch. @@ -177,7 +177,7 @@ def _batching_scheme(batch_size, ] max_batch_size = max(batch_sizes) # Since the Datasets API only allows a single constant for window_size, - # and it needs to divide all bucket_batch_sizes, we pick a highly-composite + # and it needs to divide all bucket_batch_sizes, we pick a highly-composite # window size and then round down all batch sizes to divisors of that window # size, so that a window can always be divided evenly into batches. # TODO(noam): remove this when Dataset API improves. diff --git a/tensor2tensor/utils/decoding.py b/tensor2tensor/utils/decoding.py index 65616191c..437463514 100644 --- a/tensor2tensor/utils/decoding.py +++ b/tensor2tensor/utils/decoding.py @@ -343,7 +343,7 @@ def input_fn(): return input_fn -def decode_interactively(estimator, hparams, decode_hp, checkpoint_path=None): +def decode_interactively(estimator, hparams, decode_hp): """Interactive decoding.""" def input_fn(): @@ -353,7 +353,7 @@ def input_fn(): example = _interactive_input_tensor_to_features_dict(example, hparams) return example - result_iter = estimator.predict(input_fn, checkpoint_path=checkpoint_path) + result_iter = estimator.predict(input_fn) for result in result_iter: problem_idx = result["problem_choice"] is_image = False # TODO(lukaszkaiser): find out from problem id / class. diff --git a/tensor2tensor/utils/diet.py b/tensor2tensor/utils/diet.py index 24a215f96..7f1915e4d 100644 --- a/tensor2tensor/utils/diet.py +++ b/tensor2tensor/utils/diet.py @@ -54,7 +54,7 @@ def diet_expert(x, hidden_size, params): """A two-layer feed-forward network with relu activation on hidden layer. Uses diet variables. - Recomputes hidden layer on backprop to save activation memory. + Recomputes hidden layer on backprop to save activation memory. Args: x: a Tensor with shape [batch, io_size] @@ -120,7 +120,7 @@ class DietAdamOptimizer(DietVariableOptimizer): Diet variables should be created with the DietAdamOptimizer.get_variable() method. The resulting variables - have extra fields pointing to the optimizer and to the accumulator + have extra fields pointing to the optimizer and to the accumulator slots. The variable is kept in quantized form, so you need to call @@ -135,7 +135,7 @@ class DietAdamOptimizer(DietVariableOptimizer): diet_expert() for an example of how all of this is done.
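For orientation, a hedged usage sketch of the diet_expert() interface referenced in this docstring (the params factory shown is an assumption; check diet.py in your checkout for the exact constructor):

```python
import tensorflow as tf
from tensor2tensor.utils import diet

x = tf.zeros([16, 512])  # [batch, io_size]
# Assumed params factory; the exact name may differ between versions.
params = diet.diet_adam_optimizer_params()
y = diet.diet_expert(x, hidden_size=2048, params=params)  # [batch, io_size]
```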
To facilitate fixed-point quantization and to make it easier to - choose a learning rate, all variables are initialized with unit + choose a learning rate, all variables are initialized with unit normal initialization. If you want smaller values, downscale on the outside. """ diff --git a/tensor2tensor/utils/expert_utils.py b/tensor2tensor/utils/expert_utils.py index 2bfd35f01..1d465b8e7 100644 --- a/tensor2tensor/utils/expert_utils.py +++ b/tensor2tensor/utils/expert_utils.py @@ -66,7 +66,7 @@ def add_scope(scope=None, scope_fn=None): """Return a decorator which adds a TF name/variable scope to a function. Note that the function returned by the decorator accepts an additional 'name' - parameter, which can overwrite the name scope given when the function is + parameter, which can overwrite the name scope given when the function is created. Args: @@ -412,7 +412,7 @@ def _my_top_k(x, k): tf.nn.top_k is implemented for GPU, but the gradient, sparse_to_dense, seems not to be, so if we use tf.nn.top_k, then both the top_k and its gradient go on cpu. Once this is not an issue, this function becomes - obsolete and should be replaced by tf.nn.top_k. + obsolete and should be replaced by tf.nn.top_k. Args: x: a 2d Tensor. @@ -587,12 +587,12 @@ def restore(self, x): @add_name_scope("map_ids") def map_ids(x, indices, map_fn): - """Apply a function to each coordinate ids of a multidimensional tensor. + """Apply a function to each coordinate ids of a multidimensional tensor. This allows to process each sequence of a batch independently. This is similar to tf.map_fn but with tensor where the batch dim has been flatten. - Warning: The indices ids have to be contiguous and ordered in memory as the + Warning: The indices ids have to be contiguous and ordered in memory as the output vector for each of the ids are simply concatenated after being processed. Ex: if your indices are [0,2,2,1,2,0], the output will contain the processed diff --git a/tensor2tensor/utils/flags.py b/tensor2tensor/utils/flags.py index 08b40efdf..2017cf019 100644 --- a/tensor2tensor/utils/flags.py +++ b/tensor2tensor/utils/flags.py @@ -101,7 +101,7 @@ flags.DEFINE_float("worker_gpu_memory_fraction", 0.95, "Fraction of GPU memory to allocate.") flags.DEFINE_integer("ps_gpu", 0, "How many GPUs to use per ps.") -flags.DEFINE_string("gpu_order", "", "Optional order for daisy-chaining GPUs." +flags.DEFINE_string("gpu_order", "", "Optional order for daisy-chaining GPUs." " e.g. \"1 3 2 4\"") flags.DEFINE_string("ps_job", "/job:ps", "name of ps job") flags.DEFINE_integer("ps_replicas", 0, "How many ps replicas.") diff --git a/tensor2tensor/utils/metrics.py b/tensor2tensor/utils/metrics.py index 9ab9adf9d..bb31a4dec 100644 --- a/tensor2tensor/utils/metrics.py +++ b/tensor2tensor/utils/metrics.py @@ -254,7 +254,7 @@ def image_summary(predictions, features, hparams): Returns: summary_proto: containing the summary images. - weights: A Tensor of zeros of the same shape as predictions. + weights: A Tensor of zeros of the same shape as predictions.
""" del hparams results = tf.cast(tf.argmax(predictions, axis=-1), tf.uint8) @@ -285,9 +285,9 @@ def create_evaluation_metrics(problems, model_hparams): def make_problem_specific_metric_fn(metric_fn, problem_idx, weights_fn): """Create a metric fn conditioned on problem_idx.""" - def problem_metric_fn(predictions, features, labels): + def problem_metric_fn(predictions, features): """Metric fn.""" - #labels = features.get("targets", None) + labels = features.get("targets", None) problem_choice = features.get("problem_choice", 0) # Send along the entire features dict if the metric fn has the kwarg diff --git a/tensor2tensor/utils/rouge.py b/tensor2tensor/utils/rouge.py index cb3c9af4b..627b8d2ea 100644 --- a/tensor2tensor/utils/rouge.py +++ b/tensor2tensor/utils/rouge.py @@ -16,7 +16,7 @@ # coding=utf-8 """ROUGE metric implementation. -This is a modified and slightly extended version of +This is a modified and slightly extended verison of https://github.com/miso-belica/sumy/blob/dev/sumy/evaluation/rouge.py. """ @@ -116,7 +116,7 @@ def rouge_l_sentence_level(eval_sentences, ref_sentences): Args: eval_sentences: The sentences that have been picked by the summarizer - ref_sentences: The sentences from the reference set + ref_sentences: The sentences from the referene set Returns: A float: F_lcs @@ -138,7 +138,7 @@ def rouge_l_fscore(predictions, labels, **unused_kwargs): or decode the ids and tokenize the output. Args: - predictions: tensor, model predictions + predictions: tensor, model predicitons labels: tensor, gold output. Returns: @@ -221,7 +221,7 @@ def rouge_2_fscore(predictions, labels, **unused_kwargs): or decode the ids and tokenize the output. Args: - predictions: tensor, model predictions + predictions: tensor, model predicitons labels: tensor, gold output. Returns: diff --git a/tensor2tensor/utils/t2t_model.py b/tensor2tensor/utils/t2t_model.py index fa65889e3..716a6321d 100644 --- a/tensor2tensor/utils/t2t_model.py +++ b/tensor2tensor/utils/t2t_model.py @@ -352,7 +352,7 @@ def top(self, body_output, features): "problem_hparams.target_modality is a dict.") return self._top_single(body_output, target_modality, features) - def _loss_single(self, logits, target_modality, feature): + def _loss_single(self, logits, target_modality, features): # The current bfloat16 version still uses float32 for most parts of backward # propagation to keep model quality, so cast back before computing the loss # value. 
@@ -362,7 +362,7 @@ def _loss_single(self, logits, target_modality, feature): return (tf.constant(0., dtype=tf.float32), tf.constant(1., dtype=tf.float32)) - loss_num, loss_den = target_modality.loss(logits, feature) + loss_num, loss_den = target_modality.loss(logits, features["targets"]) loss_num *= self._problem_hparams.loss_multiplier return loss_num, loss_den @@ -377,7 +377,7 @@ def loss(self, logits, features): "of problem_hparams.target_modality's dict.") losses = {} for k, v in six.iteritems(logits): - losses[k] = self._loss_single(v, target_modality[k], features[k]) + losses[k] = self._loss_single(v, target_modality[k], features) return tf.add_n([n / d for n, d in losses.values()]) else: if self._problem_hparams: @@ -387,7 +387,7 @@ def loss(self, logits, features): assert not isinstance(target_modality, dict), ( "model_body must return a dictionary of logits when " "problem_hparams.target_modality is a dict.") - return self._loss_single(logits, target_modality, features["targets"]) + return self._loss_single(logits, target_modality, features) def optimize(self, loss, num_async_replicas=1): """Return a training op minimizing loss.""" @@ -499,7 +499,7 @@ def infer(self, beam_size: number of beams. top_beams: an integer. How many of the beams to return. alpha: Float that controls the length penalty. larger the alpha, stronger - the preference for longer translations. + the preference for slonger translations. Returns: A dict of decoding results { @@ -549,7 +549,7 @@ def _beam_decode(self, features, decode_length, beam_size, top_beams, alpha): beam_size: number of beams. top_beams: an integer. How many of the beams to return. alpha: Float that controls the length penalty. larger the alpha, stronger - the preference for longer translations. + the preference for slonger translations. Returns: samples: an integer `Tensor`. Top samples from the beam search @@ -569,7 +569,7 @@ def _beam_decode_slow(self, features, decode_length, beam_size, top_beams, beam_size: number of beams. top_beams: an integer. How many of the beams to return. alpha: Float that controls the length penalty. larger the alpha, stronger - the preference for longer translations. + the preference for slonger translations. Returns: samples: an integer `Tensor`. 
Top samples from the beam search @@ -765,7 +765,7 @@ def fn_not_eos(): tf.squeeze(result[:, -1, :, :]), text_encoder.EOS_ID) not_eos = tf.cond( - # We only check for early stopping if there is at least 1 element ( + # We only check for early stoping if there is at least 1 element ( # otherwise not_eos will crash) tf.not_equal(length, 0), fn_not_eos, @@ -774,7 +774,7 @@ def fn_not_eos(): return tf.cond( tf.equal(batch_size, 1), - # If batch_size == 1, we check EOS for early stopping + # If batch_size == 1, we check EOS for early stoping lambda: tf.logical_and(not_overflow, not_eos), # Else, just wait for max length lambda: not_overflow) @@ -1027,9 +1027,9 @@ def estimator_spec_eval(self, features, logits, labels, loss, losses_dict): if isinstance(logits, dict): # the key is located in the center of metric_name: "metrics-%s/%s/%s" k = metric_name.split("/")[1] - eval_metrics[metric_name] = metric_fn(logits[k], features, features[k]) + eval_metrics[metric_name] = metric_fn(logits[k], features) else: - eval_metrics[metric_name] = metric_fn(logits, features, features["targets"]) + eval_metrics[metric_name] = metric_fn(logits, features) if isinstance(logits, dict): predictions = logits else: diff --git a/tensor2tensor/utils/yellowfin.py b/tensor2tensor/utils/yellowfin.py index d90c75bc5..87b3ec9dd 100644 --- a/tensor2tensor/utils/yellowfin.py +++ b/tensor2tensor/utils/yellowfin.py @@ -57,7 +57,7 @@ def __init__(self, Set to 1.0 in the paper. momentum: A Tensor or a floating point value. The momentum. Set to 0.0 in the paper. - clip_thresh: A Tensor or a floating point value. The clipping threshold for + clip_thresh: A Tensor or a floating point value. The cliping threshold for `tf.clip_by_global_norm`. If None, no clipping will be carried out. beta: A float value or a constant float tensor. The smoothing parameter for estimations. @@ -358,7 +358,7 @@ def _get_cubic_root(self): # We substitute x, which is sqrt(mu), with x = y + 1. # It gives y^3 + py = q # where p = (D^2 h_min^2)/(2*C) and q = -p. - # We use the Vieta's substitution to compute the root. + # We use the Vieta's substution to compute the root. # There is only one real solution y (which is in [0, 1] ). # http://mathworld.wolfram.com/VietasSubstitution.html assert_array = [ @@ -390,7 +390,7 @@ def _get_cubic_root(self): return x def _get_lr_tensor(self): - """Get lr minimizing the surrogate. + """Get lr minimzing the surrogate. Returns: The lr_t. @@ -461,7 +461,7 @@ def get_name(self): return self._momentum_optimizer.get_name() def apply_gradients(self, grads_and_vars, global_step=None, name=None): - """Applying gradients and tune hyperparams with YellowFin. + """Applying gradients aand tune hyperparams with YellowFin. Args: grads_and_vars: List of (gradient, variable) pairs as returned by @@ -501,7 +501,7 @@ def apply_gradients(self, grads_and_vars, global_step=None, name=None): # Begin lr and mu tuning. with tf.variable_scope("prepare_yellowFin_variables"): # the dependencies ideally only need to be after clip is done, - # i.e. depends on self._grads. However, the control_dependencies + # i.e. dependes on self._grads. However, the control_dependencies # does not support indexed slice for sparse gradients. # The alternative dependencies here might be slightly slower due # to less parallelization. @@ -543,7 +543,7 @@ def compute_gradients(self, Can be GATE_NONE, GATE_OP, or GATE_GRAPH. aggregation_method: Specifies the method used to combine gradient terms. Valid values are defined in the class AggregationMethod. 
- colocate_gradients_with_ops: If True, try collocating gradients with + colocate_gradients_with_ops: If True, try colocating gradients with the corresponding op. name: Optional name for the returned operation. Default to the name passed to the Optimizer constructor. @@ -571,7 +571,7 @@ def minimize(self, colocate_gradients_with_ops=False, name=None, grad_loss=None): - """Adapted from TensorFlow Optimizer base class member function. + """Adapted from Tensorflow Optimizer base class member function. Add operations to minimize `loss` by updating `var_list`. This method simply combines calls `compute_gradients()` and @@ -590,7 +590,7 @@ def minimize(self, Can be GATE_NONE, GATE_OP, or GATE_GRAPH. aggregation_method: Specifies the method used to combine gradient terms. Valid values are defined in the class AggregationMethod. - colocate_gradients_with_ops: If True, try collocating gradients with + colocate_gradients_with_ops: If True, try colocating gradients with the corresponding op. name: Optional name for the returned operation. grad_loss: Optional. A Tensor holding the gradient computed for loss. diff --git a/tensor2tensor/visualization/attention.py b/tensor2tensor/visualization/attention.py index 56ece8154..e2a0a0551 100644 --- a/tensor2tensor/visualization/attention.py +++ b/tensor2tensor/visualization/attention.py @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -"""Module for postprocessing and displaying transformer attentions. +"""Module for postprocessing and displaying tranformer attentions. This module is designed to be called from an ipython notebook. """ diff --git a/tensor2tensor/visualization/visualization.py b/tensor2tensor/visualization/visualization.py index 119e7dbb3..f11074a96 100644 --- a/tensor2tensor/visualization/visualization.py +++ b/tensor2tensor/visualization/visualization.py @@ -73,7 +73,7 @@ def get_vis_data_from_string(self, sess, input_string): Args: sess: A tf.Session object. - input_string: The input sentence to be translated and visualized. + input_string: The input setence to be translated and visulized. Returns: Tuple of ( @@ -114,14 +114,14 @@ def get_vis_data_from_string(self, sess, input_string): def build_model(hparams_set, model_name, data_dir, problem_name, beam_size=1): - """Build the graph required to fetch the attention weights. + """Build the graph required to featch the attention weights. Args: hparams_set: HParams set to build the model with. model_name: Name of model. - data_dir: Path to directory containing training data. + data_dir: Path to directory contatining training data. problem_name: Name of problem. - beam_size: (Optional) Number of beams to use when decoding a translation. + beam_size: (Optional) Number of beams to use when decoding a traslation. If set to 1 (default) then greedy decoding is used. Returns: @@ -147,7 +147,7 @@ def build_model(hparams_set, model_name, data_dir, problem_name, beam_size=1): # Must be called after building the training graph, so that the dict will # have been filled with the attention tensors. BUT before creating the - # inference graph otherwise the dict will be filled with tensors from + # interence graph otherwise the dict will be filled with tensors from # inside a tf.while_loop from decoding and are marked unfetchable. 
att_mats = get_att_mats(translate_model) From d140a776d01b3d6e03d7137141084ab84f86cd8f Mon Sep 17 00:00:00 2001 From: Ryan Sepassi Date: Fri, 6 Apr 2018 15:18:15 -0700 Subject: [PATCH 02/29] Change all_problems import style to allow for smaller internal builds PiperOrigin-RevId: 191948130 --- tensor2tensor/bin/t2t_datagen.py | 2 +- tensor2tensor/data_generators/all_problems.py | 84 +++++++++---------- 2 files changed, 43 insertions(+), 43 deletions(-) diff --git a/tensor2tensor/bin/t2t_datagen.py b/tensor2tensor/bin/t2t_datagen.py index d00c54f59..862abca84 100644 --- a/tensor2tensor/bin/t2t_datagen.py +++ b/tensor2tensor/bin/t2t_datagen.py @@ -38,8 +38,8 @@ import numpy as np +from tensor2tensor import problems as problems_lib # pylint: disable=unused-import from tensor2tensor.data_generators import algorithmic_math -from tensor2tensor.data_generators import all_problems # pylint: disable=unused-import from tensor2tensor.data_generators import audio from tensor2tensor.data_generators import generator_utils from tensor2tensor.data_generators import snli diff --git a/tensor2tensor/data_generators/all_problems.py b/tensor2tensor/data_generators/all_problems.py index 313d56df3..495f85fc8 100644 --- a/tensor2tensor/data_generators/all_problems.py +++ b/tensor2tensor/data_generators/all_problems.py @@ -18,47 +18,47 @@ from __future__ import division from __future__ import print_function -# pylint: disable=unused-import -from tensor2tensor.data_generators import algorithmic -from tensor2tensor.data_generators import algorithmic_math -from tensor2tensor.data_generators import audio -from tensor2tensor.data_generators import celeba -from tensor2tensor.data_generators import cifar -from tensor2tensor.data_generators import cipher -from tensor2tensor.data_generators import cnn_dailymail -from tensor2tensor.data_generators import desc2code -from tensor2tensor.data_generators import fsns -from tensor2tensor.data_generators import gym -from tensor2tensor.data_generators import ice_parsing -from tensor2tensor.data_generators import imagenet -from tensor2tensor.data_generators import imdb -from tensor2tensor.data_generators import librispeech -from tensor2tensor.data_generators import lm1b -from tensor2tensor.data_generators import mnist -from tensor2tensor.data_generators import mscoco -from tensor2tensor.data_generators import multinli -from tensor2tensor.data_generators import ocr -from tensor2tensor.data_generators import problem_hparams -from tensor2tensor.data_generators import ptb -from tensor2tensor.data_generators import snli -from tensor2tensor.data_generators import squad -from tensor2tensor.data_generators import translate_encs -from tensor2tensor.data_generators import translate_ende -from tensor2tensor.data_generators import translate_enfr -from tensor2tensor.data_generators import translate_enmk -from tensor2tensor.data_generators import translate_envi -from tensor2tensor.data_generators import translate_enzh -from tensor2tensor.data_generators import twentybn -from tensor2tensor.data_generators import wiki -from tensor2tensor.data_generators import wsj_parsing +import importlib +modules = [ + 'tensor2tensor.data_generators.algorithmic', + 'tensor2tensor.data_generators.algorithmic_math', + 'tensor2tensor.data_generators.audio', + 'tensor2tensor.data_generators.celeba', + 'tensor2tensor.data_generators.cifar', + 'tensor2tensor.data_generators.cipher', + 'tensor2tensor.data_generators.cnn_dailymail', + 'tensor2tensor.data_generators.desc2code', + 'tensor2tensor.data_generators.fsns', + 
'tensor2tensor.data_generators.gene_expression', + 'tensor2tensor.data_generators.gym', + 'tensor2tensor.data_generators.ice_parsing', + 'tensor2tensor.data_generators.imagenet', + 'tensor2tensor.data_generators.imdb', + 'tensor2tensor.data_generators.librispeech', + 'tensor2tensor.data_generators.lm1b', + 'tensor2tensor.data_generators.mnist', + 'tensor2tensor.data_generators.mscoco', + 'tensor2tensor.data_generators.multinli', + 'tensor2tensor.data_generators.ocr', + 'tensor2tensor.data_generators.problem_hparams', + 'tensor2tensor.data_generators.ptb', + 'tensor2tensor.data_generators.snli', + 'tensor2tensor.data_generators.squad', + 'tensor2tensor.data_generators.translate_encs', + 'tensor2tensor.data_generators.translate_ende', + 'tensor2tensor.data_generators.translate_enfr', + 'tensor2tensor.data_generators.translate_enmk', + 'tensor2tensor.data_generators.translate_envi', + 'tensor2tensor.data_generators.translate_enzh', + 'tensor2tensor.data_generators.twentybn', + 'tensor2tensor.data_generators.wiki', + 'tensor2tensor.data_generators.wsj_parsing', +] -# Problem modules that require optional dependencies -# pylint: disable=g-import-not-at-top -try: - # Requires h5py - from tensor2tensor.data_generators import gene_expression -except ImportError: - pass -# pylint: enable=g-import-not-at-top -# pylint: enable=unused-import + +for module in modules: + try: + importlib.import_module(module) + except ImportError: + pass From 6c6010f44a968abf515daf06d2ea850daf971313 Mon Sep 17 00:00:00 2001 From: Aurko Roy Date: Mon, 9 Apr 2018 09:27:13 -0700 Subject: [PATCH 03/29] Use non-causal attention for non-autoregressive decoding from latents PiperOrigin-RevId: 192137977 --- tensor2tensor/models/research/transformer_vae.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/tensor2tensor/models/research/transformer_vae.py b/tensor2tensor/models/research/transformer_vae.py index b5dbc9690..6121aa3f3 100644 --- a/tensor2tensor/models/research/transformer_vae.py +++ b/tensor2tensor/models/research/transformer_vae.py @@ -147,7 +147,8 @@ def decode_transformer(encoder_output, targets, hparams, name, - task=None): + task=None, + causal=True): """Original Transformer decoder.""" orig_hparams = hparams if name == "extra": @@ -166,6 +167,9 @@ def decode_transformer(encoder_output, decoder_input = tf.nn.dropout(decoder_input, 1.0 - hparams.layer_prepostprocess_dropout) + if not causal: + decoder_self_bias *= 0. + decoder_output = transformer.transformer_decoder( decoder_input, encoder_output, @@ -184,6 +188,7 @@ def decode_transformer(encoder_output, # Prepare decoder inputs and bias. decoder_input, _, _, bias = cia.prepare_decoder(targets, hparams) + # Add class label to decoder input. 
if not hparams.drop_inputs: decoder_input += tf.reshape( @@ -438,6 +443,8 @@ def bn_inputs(): if not hparams.do_refine: masking -= tf.random_uniform([]) * hparams.unmasked_percentage masking = tf.minimum(tf.maximum(masking, 0.0), 1.0) + if hparams.use_predict_mask: + masking = predict_mask if hparams.mode == tf.estimator.ModeKeys.PREDICT: masking = predict_mask mask = tf.less(masking, tf.random_uniform( @@ -457,7 +464,8 @@ def bn_inputs(): if hparams.task == "translate": targets = tf.concat([tf.reverse(latents_dense, [1]), targets], axis=1) - res = decode_transformer(inputs, ed, targets, hparams, "decoder") + res = decode_transformer(inputs, ed, targets, hparams, "decoder", + causal=hparams.causal) if hparams.do_ae: if hparams.task == "translate": res = res[:, common_layers.shape_list(latents_dense)[1]:, :, :] @@ -678,11 +686,13 @@ def transformer_ae_small(): # Add an hparam for number of reiduals hparams.add_hparam("num_residuals", 1) # Reshape method for DVQ: slice, project + hparams.add_hparam("causal", True) hparams.add_hparam("reshape_method", "slice") hparams.add_hparam("trainable_projections", False) hparams.add_hparam("unmasked_percentage", 0.1) hparams.add_hparam("do_ae", True) hparams.add_hparam("do_mask", True) + hparams.add_hparam("use_predict_mask", True) hparams.add_hparam("do_refine", False) hparams.add_hparam("do_attend_compress", False) hparams.add_hparam("do_attend_decompress", True) From dc72552c8e80d51dda0614801c21d585c20a28b7 Mon Sep 17 00:00:00 2001 From: Lukasz Kaiser Date: Mon, 9 Apr 2018 13:45:49 -0700 Subject: [PATCH 04/29] Expose input queue preprocessing so that frames can be concatenated on the fly. PiperOrigin-RevId: 192180939 --- tensor2tensor/data_generators/gym.py | 43 +++++++++++------ tensor2tensor/data_generators/problem.py | 61 ++++++++++++++++-------- 2 files changed, 71 insertions(+), 33 deletions(-) diff --git a/tensor2tensor/data_generators/gym.py b/tensor2tensor/data_generators/gym.py index 0cdfe0fa9..2ff5ba934 100644 --- a/tensor2tensor/data_generators/gym.py +++ b/tensor2tensor/data_generators/gym.py @@ -55,9 +55,7 @@ def __init__(self, *args, **kwargs): def example_reading_spec(self, label_repr=None): data_fields = { - "inputs": tf.FixedLenFeature([210, 160, 3], tf.int64), - "inputs_prev": tf.FixedLenFeature([210, 160, 3], tf.int64), - "targets": tf.FixedLenFeature([210, 160, 3], tf.int64), + "frame": tf.FixedLenFeature([210, 160, 3], tf.int64), "action": tf.FixedLenFeature([1], tf.int64), "reward": tf.FixedLenFeature([1], tf.int64) } @@ -103,6 +101,29 @@ def num_shards(self): def num_dev_shards(self): return 1 + def preprocess(self, dataset, mode, hparams): + def unbatch(batched_features, n): + """Split each feature in batched_features into a list of n tensors.""" + result = {} + for k, v in batched_features.iteritems(): + result[k] = [tf.squeeze(t, axis=0) for t in tf.split(v, n)] + return result + + def features_from_batch(batched_prefeatures): + """Construct final features from the batched inputs.""" + unbatched = unbatch(batched_prefeatures, 3) + frames = unbatched["frame"] + return {"inputs_prev": frames[0], + "inputs": frames[1], + "targets": frames[2], + "action": unbatched["action"][1], + "reward": unbatched["reward"][1]} + + # Batch and construct features. 
+ batch_dataset = dataset.apply( + tf.contrib.data.batch_and_drop_remainder(3)) + return batch_dataset.map(features_from_batch) + def get_action(self, observation=None): return self.env.action_space.sample() @@ -119,10 +140,7 @@ def hparams(self, defaults, unused_model_hparams): def generator(self, data_dir, tmp_dir): self.env.reset() action = self.get_action() - prev_observation, observation = None, None for _ in range(self.num_steps): - prev_prev_observation = prev_observation - prev_observation = observation observation, reward, done, _ = self.env.step(action) action = self.get_action(observation) if done: @@ -130,13 +148,10 @@ def generator(self, data_dir, tmp_dir): def flatten(nparray): flat1 = [x for sublist in nparray.tolist() for x in sublist] return [x for sublist in flat1 for x in sublist] - if prev_prev_observation is not None: - yield {"inputs_prev": flatten(prev_prev_observation), - "inputs": flatten(prev_observation), - "action": [action], - "done": [done], - "reward": [int(reward)], - "targets": flatten(observation)} + yield {"frame": flatten(observation), + "action": [action], + "done": [done], + "reward": [int(reward)]} def generate_data(self, data_dir, tmp_dir, task_id=-1): train_paths = self.training_filepaths( @@ -163,7 +178,7 @@ def num_actions(self): @property def num_rewards(self): - return 2 + return 3 @property def num_steps(self): diff --git a/tensor2tensor/data_generators/problem.py b/tensor2tensor/data_generators/problem.py index bcbb1abd2..81bcdad11 100644 --- a/tensor2tensor/data_generators/problem.py +++ b/tensor2tensor/data_generators/problem.py @@ -323,6 +323,37 @@ def eval_metrics(self): # END SUBCLASS INTERFACE # ============================================================================ + def preprocess(self, dataset, mode, hparams): + """Runtime preprocessing on the whole dataset. + + Return a tf.data.Datset -- the preprocessed version of the given one. + By default this function calls preprocess_example. + + Args: + dataset: the Dataset of already decoded but not yet preprocessed features. + mode: tf.estimator.ModeKeys + hparams: HParams, model hyperparameters + + Returns: + a Dataset + """ + def _preprocess(example): + examples = self.preprocess_example(example, mode, hparams) + if not isinstance(examples, tf.data.Dataset): + examples = tf.data.Dataset.from_tensors(examples) + return examples + + is_training = mode == tf.estimator.ModeKeys.TRAIN + if hasattr(tf.contrib.data, "parallel_interleave"): + dataset = dataset.apply( + tf.contrib.data.parallel_interleave( + _preprocess, sloppy=is_training, cycle_length=8)) + else: + dataset = dataset.interleave(_preprocess, cycle_length=8, + block_length=16) + + return dataset + def training_filepaths(self, data_dir, num_shards, shuffled): file_basename = self.dataset_filename() if not shuffled: @@ -514,15 +545,16 @@ def dataset(self, data_filepattern) # Functions used in dataset transforms below - def _load_records(filename): + def _load_records_and_preprocess(filename): # Load records from file with an 8MiB read buffer. - return tf.data.TFRecordDataset(filename, buffer_size=8 * 1024 * 1024) - - def _preprocess(example): - examples = self.preprocess_example(example, mode, hparams) - if not isinstance(examples, tf.data.Dataset): - examples = tf.data.Dataset.from_tensors(examples) - return examples + dataset = tf.data.TFRecordDataset(filename, buffer_size=8 * 1024 * 1024) + # Decode. + dataset = dataset.map(self.decode_example, num_parallel_calls=num_threads) + # Preprocess if requested. 
+ # Note that preprocessing should happen per-file as order may matter. + if preprocess: + dataset = self.preprocess(dataset, mode, hparams) + return dataset if len(data_files) < num_partitions: raise ValueError( @@ -539,20 +571,11 @@ def _preprocess(example): if hasattr(tf.contrib.data, "parallel_interleave"): dataset = dataset.apply( tf.contrib.data.parallel_interleave( - _load_records, sloppy=is_training, cycle_length=8)) + _load_records_and_preprocess, sloppy=is_training, cycle_length=8)) else: - dataset = dataset.interleave(_load_records, cycle_length=8, + dataset = dataset.interleave(_load_records_and_preprocess, cycle_length=8, block_length=16) - dataset = dataset.map(self.decode_example, num_parallel_calls=num_threads) - if preprocess: - if hasattr(tf.contrib.data, "parallel_interleave"): - dataset = dataset.apply( - tf.contrib.data.parallel_interleave( - _preprocess, sloppy=is_training, cycle_length=8)) - else: - dataset = dataset.interleave(_preprocess, cycle_length=8, - block_length=16) dataset = dataset.map( self.maybe_reverse_and_copy, num_parallel_calls=num_threads) From 51a77ce22c7b2f89439c9d572263e63900ca088e Mon Sep 17 00:00:00 2001 From: Aurko Roy Date: Mon, 9 Apr 2018 13:52:25 -0700 Subject: [PATCH 05/29] Pass num_residuals to bottleneck function. PiperOrigin-RevId: 192182206 --- tensor2tensor/layers/discretization.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tensor2tensor/layers/discretization.py b/tensor2tensor/layers/discretization.py index c1596d89d..b956b12d0 100644 --- a/tensor2tensor/layers/discretization.py +++ b/tensor2tensor/layers/discretization.py @@ -142,7 +142,10 @@ def embedding_lookup(x, x_means_hot = nearest_neighbor(x, means, block_v_size, random_top_k, soft_em, inv_temp, ema_count) x_means_hot_flat = tf.reshape(x_means_hot, [-1, num_blocks, block_v_size]) - x_means = tf.matmul(tf.transpose(x_means_hot_flat, perm=[1, 0, 2]), means) + x_means_idx = tf.argmax(x_means_hot_flat, axis=-1) + x_means = tf.matmul( + tf.transpose(tf.one_hot(x_means_idx, block_v_size), perm=[1, 0, 2]), + means) x_means = tf.transpose(x_means, [1, 0, 2]) q_loss = tf.reduce_mean(tf.square((tf.stop_gradient(x) - x_means))) e_loss = tf.reduce_mean(tf.square(x - tf.stop_gradient(x_means))) From 9ce3b240adcda0550991ded227497f535ddda52a Mon Sep 17 00:00:00 2001 From: Lukasz Kaiser Date: Mon, 9 Apr 2018 14:32:51 -0700 Subject: [PATCH 06/29] Add SRU gate (https://arxiv.org/abs/1709.02755) to common layers. PiperOrigin-RevId: 192188989 --- tensor2tensor/data_generators/problem.py | 2 +- tensor2tensor/layers/common_layers.py | 58 ++++++++++++++++++++++ tensor2tensor/layers/common_layers_test.py | 8 +++ 3 files changed, 67 insertions(+), 1 deletion(-) diff --git a/tensor2tensor/data_generators/problem.py b/tensor2tensor/data_generators/problem.py index 81bcdad11..a4e56c0ed 100644 --- a/tensor2tensor/data_generators/problem.py +++ b/tensor2tensor/data_generators/problem.py @@ -551,7 +551,7 @@ def _load_records_and_preprocess(filename): # Decode. dataset = dataset.map(self.decode_example, num_parallel_calls=num_threads) # Preprocess if requested. - # Note that preprocessing should happen per-file as order may matter. + # Note that preprocessing should happen per-file as order may matter. 
if preprocess: dataset = self.preprocess(dataset, mode, hparams) return dataset diff --git a/tensor2tensor/layers/common_layers.py b/tensor2tensor/layers/common_layers.py index 5dc088234..14b3e0a2e 100644 --- a/tensor2tensor/layers/common_layers.py +++ b/tensor2tensor/layers/common_layers.py @@ -1872,6 +1872,64 @@ def gated_linear_unit_layer(x, name=None): return x * tf.nn.sigmoid(gating_x) +def sru(x, num_layers=2, + activation=None, initial_state=None, name=None, reuse=None): + """SRU cell as in https://arxiv.org/abs/1709.02755. + + As defined in the paper: + (1) x'_t = W x_t + (2) f_t = sigmoid(Wf x_t + bf) + (3) r_t = sigmoid(Wr x_t + br) + (4) c_t = f_t * c_{t-1} + (1 - f_t) * x'_t + (5) h_t = r_t * activation(c_t) + (1 - r_t) * x_t + + Args: + x: A tensor of shape [batch, ..., channels] ; ... is treated as time. + num_layers: How many SRU layers; default is 2 as results for 1 disappoint. + activation: Optional activation function, try tf.nn.tanh or tf.nn.relu. + initial_state: Optional initial c-state, set to zeros if None. + name: Optional name, "sru" by default. + reuse: Optional reuse. + + Returns: + A tensor of the same shape as x. + + Raises: + ValueError: if num_layers is not positive. + """ + if num_layers < 1: + raise ValueError("Number of layers must be positive: %d" % num_layers) + with tf.variable_scope(name, default_name="sru", values=[x], reuse=reuse): + # We assume x is [batch, ..., channels] and treat all ... as time. + x_shape = shape_list(x) + x = tf.reshape(x, [x_shape[0], -1, x_shape[-1]]) + x = tf.transpose(x, [1, 0, 2]) # Scan assumes time on axis 0. + initial_state = initial_state or tf.zeros([x_shape[0], x_shape[-1]]) + # SRU state manipulation function. + def next_state(cur_state, (cur_x_times_one_minus_f, cur_f)): + return cur_f * cur_state + cur_x_times_one_minus_f + # Calculate SRU on each layer. + for i in xrange(num_layers): + # The parallel part of the SRU. + x_orig = x + x, f, r = tf.split(tf.layers.dense(x, 3 * x_shape[-1], + name="kernel_%d" % i), 3, axis=-1) + f, r = tf.sigmoid(f), tf.sigmoid(r) + x_times_one_minus_f = x * (1.0 - f) # Compute in parallel for speed. + # Calculate states. + c_states = tf.scan(next_state, (x_times_one_minus_f, f), + initializer=initial_state, + parallel_iterations=2, name="scan_%d" % i) + # Final output. + if activation is not None: + c_states = activation(c_states) + h = c_states * r + (1.0 - r) * x_orig + x = h # Next layer. + # Transpose back to batch-major. 
+ x = tf.transpose(x, [1, 0, 2]) + return tf.reshape(x, x_shape) + + def linear_set_layer(layer_size, inputs, context=None, diff --git a/tensor2tensor/layers/common_layers_test.py b/tensor2tensor/layers/common_layers_test.py index 31ada31dc..2cc36d42d 100644 --- a/tensor2tensor/layers/common_layers_test.py +++ b/tensor2tensor/layers/common_layers_test.py @@ -228,6 +228,14 @@ def testConvGRU(self): self.assertEqual(res1.shape, (5, 7, 3, 11)) self.assertEqual(res2.shape, (5, 7, 3, 11)) + def testSRU(self): + x = np.random.rand(5, 7, 3, 11) + with self.test_session() as session: + y = common_layers.sru(tf.constant(x, dtype=tf.float32)) + session.run(tf.global_variables_initializer()) + res = session.run(y) + self.assertEqual(res.shape, (5, 7, 3, 11)) + def testLayerNorm(self): x = np.random.rand(5, 7, 11) with self.test_session() as session: From dbbbf2515842ae876faf7632331d27ac8013cbfd Mon Sep 17 00:00:00 2001 From: Aurko Roy Date: Mon, 9 Apr 2018 19:46:47 -0700 Subject: [PATCH 07/29] Remove kmeans_lr_factor which is not being used anywhere PiperOrigin-RevId: 192228040 --- tensor2tensor/models/research/transformer_vae.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tensor2tensor/models/research/transformer_vae.py b/tensor2tensor/models/research/transformer_vae.py index 6121aa3f3..5d88ed2c4 100644 --- a/tensor2tensor/models/research/transformer_vae.py +++ b/tensor2tensor/models/research/transformer_vae.py @@ -703,7 +703,6 @@ def transformer_ae_small(): hparams.add_hparam("num_compress_steps", 3) hparams.add_hparam("startup_steps", 10000) hparams.add_hparam("mask_startup_steps", 50000) - hparams.add_hparam("kmeans_lr_factor", 0.002) hparams.add_hparam("z_dropout", 0.1) hparams.add_hparam("is_2d", 0) hparams.add_hparam("softmax_k", 0) @@ -732,7 +731,6 @@ def imagetransformer_ae_cifar(): hparams.filter_size = 512 hparams.num_compress_steps = 3 hparams.startup_steps = 10000 - hparams.kmeans_lr_factor = 0.0 hparams.is_2d = 0 hparams.learning_rate_warmup_steps = 8000 hparams.learning_rate = 0.2 From d0389f2b30f0af40389e39ae0276330f0a055d18 Mon Sep 17 00:00:00 2001 From: T2T Team Date: Tue, 10 Apr 2018 13:16:35 -0700 Subject: [PATCH 08/29] Convergence test for transformer with BF16 activation. PiperOrigin-RevId: 192334686 --- tensor2tensor/models/transformer.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tensor2tensor/models/transformer.py b/tensor2tensor/models/transformer.py index 6e2220258..ed981632c 100644 --- a/tensor2tensor/models/transformer.py +++ b/tensor2tensor/models/transformer.py @@ -1354,6 +1354,14 @@ def transformer_tpu(): return hparams +@registry.register_hparams +def transformer_tpu_bf16_activation(): + """HParams for Transformer model with BF16 activation on TPU.""" + hparams = transformer_tpu() + hparams.activation_dtype = "bfloat16" + return hparams + + @registry.register_hparams def transformer_packed_tpu(): """Deprecated alias for transformer_tpu().""" From ba249e0e4708c2642d68f4514dcb94c860be72c9 Mon Sep 17 00:00:00 2001 From: Noam Shazeer Date: Tue, 10 Apr 2018 16:00:42 -0700 Subject: [PATCH 09/29] Added config transformer_big_enfr_tpu. 
PiperOrigin-RevId: 192362931 --- tensor2tensor/models/transformer.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tensor2tensor/models/transformer.py b/tensor2tensor/models/transformer.py index ed981632c..f08254c00 100644 --- a/tensor2tensor/models/transformer.py +++ b/tensor2tensor/models/transformer.py @@ -1220,6 +1220,15 @@ def transformer_big_enfr(): return hparams +@registry.register_hparams +def transformer_big_enfr_tpu(): + hparams = transformer_big_enfr() + # For performance, use fewer heads so that matrix dimensions are at least 128 + hparams.num_heads = 8 + update_hparams_for_tpu(hparams) + return hparams + + @registry.register_hparams def transformer_big_dr2(): hparams = transformer_big_dr1() From b344e85049daa73b31701b0316a7c8a0ba8a5e1e Mon Sep 17 00:00:00 2001 From: Aurko Roy Date: Tue, 10 Apr 2018 17:16:44 -0700 Subject: [PATCH 10/29] Add separate hparam for bottleneck hidden_size and for ImageTransformer PiperOrigin-RevId: 192376027 --- tensor2tensor/layers/discretization.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensor2tensor/layers/discretization.py b/tensor2tensor/layers/discretization.py index b956b12d0..2f537d31b 100644 --- a/tensor2tensor/layers/discretization.py +++ b/tensor2tensor/layers/discretization.py @@ -96,9 +96,9 @@ def nearest_neighbor(x, # computing cluster probabilities if soft_em: ema_count = tf.expand_dims(ema_count + 1., 0) - c_probs = ema_count / tf.reduce_sum(ema_count, 2, keepdims=True) + c_probs = ema_count / tf.reduce_sum(ema_count, axis=2, keepdims=True) nearest_hot = tf.exp(-inv_temp * dist) * c_probs - nearest_hot /= tf.reduce_sum(nearest_hot, 2, keepdims=True) + nearest_hot /= tf.reduce_sum(nearest_hot, axis=2, keepdims=True) else: if random_top_k > 1: _, top_k_idx = tf.nn.top_k(-dist, k=random_top_k) From 2bade591782d8e92e8e84f7e9bfcce7705010c13 Mon Sep 17 00:00:00 2001 From: Aurko Roy Date: Tue, 10 Apr 2018 19:52:14 -0700 Subject: [PATCH 11/29] Mask soft_em for some steps to avoid Nan loss. PiperOrigin-RevId: 192393208 --- tensor2tensor/layers/discretization.py | 30 +++++++++++++++---- .../models/research/transformer_vae.py | 2 ++ 2 files changed, 26 insertions(+), 6 deletions(-) diff --git a/tensor2tensor/layers/discretization.py b/tensor2tensor/layers/discretization.py index 2f537d31b..6b01dcd7e 100644 --- a/tensor2tensor/layers/discretization.py +++ b/tensor2tensor/layers/discretization.py @@ -67,6 +67,7 @@ def nearest_neighbor(x, block_v_size, random_top_k=1, soft_em=False, + soft_em_startup_steps=10000, inv_temp=1.0, ema_count=None): """Find the nearest element in means to elements in x. @@ -78,6 +79,8 @@ def nearest_neighbor(x, block_v_size: Number of table entries per block. random_top_k: Noisy top-k if this is bigger than 1 (Default: 1). soft_em: If True then use soft EM rather than hard EM (Default: False). + soft_em_startup_steps: Number of steps before soft_em activates + (Default: 10000). inv_temp: Inverse temperature for soft EM (Default: 1.) ema_count: Table of counts for each embedding corresponding to how many examples in a batch it was the closest to (Default: None). 
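
Before the hunk below, a rough NumPy illustration of the soft-EM assignment that these arguments control (toy values, standalone; not the patch's TF code): each codebook entry is weighted by exp(-inv_temp * distance) times a count-based cluster prior, then normalized into a soft assignment.

    import numpy as np

    # Toy soft-EM assignment for one input against 3 codebook entries.
    inv_temp = 1.0
    dist = np.array([[0.1, 2.0, 0.5]])      # hypothetical distances
    ema_count = np.array([5.0, 1.0, 4.0])   # how often each code was chosen
    c_probs = ema_count / ema_count.sum()   # count-based cluster prior

    nearest_hot = np.exp(-inv_temp * dist) * c_probs
    nearest_hot /= nearest_hot.sum(axis=-1, keepdims=True)
    print(nearest_hot)  # soft assignment, rows sum to 1
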
@@ -95,10 +98,12 @@ def nearest_neighbor(x, # computing cluster probabilities if soft_em: - ema_count = tf.expand_dims(ema_count + 1., 0) - c_probs = ema_count / tf.reduce_sum(ema_count, axis=2, keepdims=True) + ema_count = tf.expand_dims(ema_count, 0) + c_probs = ema_count / tf.reduce_sum(ema_count, 2, keepdims=True) + mask = common_layers.inverse_lin_decay(soft_em_startup_steps) + c_probs = mask * c_probs + (1 - mask) * tf.ones_like(c_probs) nearest_hot = tf.exp(-inv_temp * dist) * c_probs - nearest_hot /= tf.reduce_sum(nearest_hot, axis=2, keepdims=True) + nearest_hot /= tf.reduce_sum(nearest_hot, 2, keepdims=True) else: if random_top_k > 1: _, top_k_idx = tf.nn.top_k(-dist, k=random_top_k) @@ -119,6 +124,7 @@ def embedding_lookup(x, block_v_size, random_top_k=1, soft_em=False, + soft_em_startup_steps=10000, inv_temp=1.0, ema_count=None): """Compute nearest neighbors and loss for training the embeddings via DVQ. @@ -131,6 +137,8 @@ def embedding_lookup(x, block_v_size: Number of table entries per block. random_top_k: Noisy top-k if this is bigger than 1 (Default: 1). soft_em: If True then use soft EM rather than hard EM (Default: False). + soft_em_startup_steps: Number of steps before soft_em activates + (Default: 10000). inv_temp: Inverse temperature for soft EM (Default: 1.) ema_count: Table of counts for each embedding corresponding to how many examples in a batch it was the closest to (Default: None). @@ -139,8 +147,15 @@ def embedding_lookup(x, The nearest neighbor in one hot form, the nearest neighbor itself, the commitment loss, embedding training loss. """ - x_means_hot = nearest_neighbor(x, means, block_v_size, random_top_k, soft_em, - inv_temp, ema_count) + x_means_hot = nearest_neighbor( + x, + means, + block_v_size, + random_top_k, + soft_em=soft_em, + soft_em_startup_steps=soft_em_startup_steps, + inv_temp=inv_temp, + ema_count=ema_count) x_means_hot_flat = tf.reshape(x_means_hot, [-1, num_blocks, block_v_size]) x_means_idx = tf.argmax(x_means_hot_flat, axis=-1) x_means = tf.matmul( @@ -410,6 +425,7 @@ def discrete_bottleneck(x, discrete_mix=0.5, random_top_k=1, soft_em=False, + soft_em_startup_steps=10000, inv_temp=1.0, epsilon=1e-5, softmax_k=0, @@ -448,6 +464,8 @@ def discrete_bottleneck(x, (Default: 0.5). random_top_k: Noisy top-k for DVQ (Default: 1). soft_em: If True then use soft EM rather than hard EM (Default: False). + soft_em_startup_steps: Number of steps before soft_em activates + (Default: 10000). inv_temp: Inverse temperature for soft EM (Default: 1.) epsilon: Epsilon parameter for DVQ (Default: 1e-5). softmax_k: If > 1 then do top-k softmax (Default: 0). 
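
The startup masking added above can be pictured with a plain-Python sketch (toy numbers; `inverse_lin_decay` is approximated here, and the constant is hypothetical): early in training the count-based prior is blended toward all-ones, which becomes uniform after normalization, so noisy early counts cannot zero out codebook entries and produce NaN losses.

    # Sketch of the blending in the hunk above, not the TF code itself.
    def inverse_lin_decay(max_step, step):
        # Roughly what common_layers.inverse_lin_decay computes.
        return min(float(step) / max_step, 1.0)

    soft_em_startup_steps = 10000
    c_prob = 0.05  # hypothetical learned prior for one codebook entry

    for step in (0, 2500, 10000, 50000):
        mask = inverse_lin_decay(soft_em_startup_steps, step)
        print(step, mask * c_prob + (1 - mask) * 1.0)
    # step 0 -> 1.0 (uniform); by step 10000 the learned prior takes over.
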
@@ -556,7 +574,7 @@ def discrete_bottleneck(x, for i in range(num_residuals): x_means_hot_res, x_means_res, q_loss_res, e_loss_res = embedding_lookup( x_res, means[i], num_blocks, block_v_size, random_top_k, soft_em, - inv_temp, ema_count[i]) + soft_em_startup_steps, inv_temp, ema_count[i]) # Update the ema variables if ema: diff --git a/tensor2tensor/models/research/transformer_vae.py b/tensor2tensor/models/research/transformer_vae.py index 5d88ed2c4..a49b85f1f 100644 --- a/tensor2tensor/models/research/transformer_vae.py +++ b/tensor2tensor/models/research/transformer_vae.py @@ -514,6 +514,7 @@ def __init__(self, *args, **kwargs): discrete_mix=self._hparams.d_mix, random_top_k=self._hparams.random_top_k, soft_em=self.hparams.soft_em, + soft_em_startup_steps=self.hparams.soft_em_startup_steps, inv_temp=self.hparams.inv_temp, epsilon=self._hparams.epsilon, softmax_k=self._hparams.softmax_k, @@ -715,6 +716,7 @@ def transformer_ae_small(): hparams.add_hparam("ema", True) hparams.add_hparam("random_top_k", 1) hparams.add_hparam("soft_em", False) + hparams.add_hparam("soft_em_startup_steps", 10000) hparams.add_hparam("inv_temp", 1.0) hparams.kl_warmup_steps = 150000 hparams.force_full_predict = True From dedb5dbc626fa8e480d46dc3f548a9e8e6ad52fc Mon Sep 17 00:00:00 2001 From: Aurko Roy Date: Wed, 11 Apr 2018 11:14:19 -0700 Subject: [PATCH 12/29] We still need a tf.where like condition since we can't take convex comb. of Nan PiperOrigin-RevId: 192478191 --- tensor2tensor/layers/discretization.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tensor2tensor/layers/discretization.py b/tensor2tensor/layers/discretization.py index 6b01dcd7e..6ade1e38b 100644 --- a/tensor2tensor/layers/discretization.py +++ b/tensor2tensor/layers/discretization.py @@ -100,11 +100,15 @@ def nearest_neighbor(x, if soft_em: ema_count = tf.expand_dims(ema_count, 0) c_probs = ema_count / tf.reduce_sum(ema_count, 2, keepdims=True) - mask = common_layers.inverse_lin_decay(soft_em_startup_steps) + c_probs = tf.where( + tf.less(tf.to_int32(tf.train.get_global_step()), soft_em_startup_steps), + tf.ones_like(c_probs, dtype=tf.float32), c_probs) + mask = common_layers.inverse_lin_decay(2 * soft_em_startup_steps) c_probs = mask * c_probs + (1 - mask) * tf.ones_like(c_probs) nearest_hot = tf.exp(-inv_temp * dist) * c_probs nearest_hot /= tf.reduce_sum(nearest_hot, 2, keepdims=True) else: + dist = tf.Print(dist, [dist], message="dist=") if random_top_k > 1: _, top_k_idx = tf.nn.top_k(-dist, k=random_top_k) nearest_idx = tf.gather( From 9cf2bf8f0d5109f0c9c9da95937474e608453283 Mon Sep 17 00:00:00 2001 From: Lukasz Kaiser Date: Wed, 11 Apr 2018 14:04:48 -0700 Subject: [PATCH 13/29] Spelling corrections. 
PiperOrigin-RevId: 192505347 --- README.md | 1 + docs/cloud_tpu.md | 4 +- docs/distributed_training.md | 2 +- docs/index.md | 1 + docs/walkthrough.md | 1 + tensor2tensor/bin/t2t_avg_all.py | 2 +- tensor2tensor/bin/t2t_decoder.py | 3 +- tensor2tensor/bin/t2t_trainer.py | 4 +- tensor2tensor/data_generators/desc2code.py | 6 +- .../data_generators/generator_utils.py | 2 +- .../data_generators/problem_hparams.py | 2 +- tensor2tensor/data_generators/ptb.py | 2 +- .../data_generators/speech_recognition.py | 4 +- tensor2tensor/data_generators/translate.py | 2 +- tensor2tensor/data_generators/wiki.py | 6 +- tensor2tensor/data_generators/wsj_parsing.py | 6 +- tensor2tensor/layers/common_attention.py | 55 +++++++++---------- tensor2tensor/layers/common_attention_test.py | 2 +- tensor2tensor/layers/common_hparams.py | 2 +- .../layers/common_image_attention.py | 6 +- tensor2tensor/layers/common_layers.py | 12 ++-- tensor2tensor/layers/discretization.py | 2 +- tensor2tensor/models/image_transformer.py | 2 +- tensor2tensor/models/image_transformer_2d.py | 8 +-- .../models/research/transformer_vae.py | 2 +- tensor2tensor/models/resnet.py | 2 +- tensor2tensor/models/slicenet.py | 2 +- tensor2tensor/models/transformer.py | 46 ++++++++-------- tensor2tensor/rl/README.md | 6 +- tensor2tensor/rl/envs/utils.py | 6 +- tensor2tensor/rl/ppo.py | 2 +- tensor2tensor/rl/t2t_rl_trainer.py | 2 +- tensor2tensor/utils/adafactor.py | 6 +- tensor2tensor/utils/beam_search.py | 22 ++++---- tensor2tensor/utils/bleu_hook.py | 6 +- tensor2tensor/utils/cloud_tpu.py | 2 +- tensor2tensor/utils/data_reader.py | 2 +- tensor2tensor/utils/decoding.py | 4 +- tensor2tensor/utils/diet.py | 8 +-- tensor2tensor/utils/expert_utils.py | 2 +- tensor2tensor/utils/flags.py | 2 +- tensor2tensor/utils/rouge.py | 8 +-- tensor2tensor/utils/t2t_model.py | 12 ++-- tensor2tensor/utils/yellowfin.py | 18 +++--- tensor2tensor/visualization/attention.py | 2 +- tensor2tensor/visualization/visualization.py | 10 ++-- 46 files changed, 154 insertions(+), 155 deletions(-) diff --git a/README.md b/README.md index 12e05e936..889d5ad64 100644 --- a/README.md +++ b/README.md @@ -143,6 +143,7 @@ There are a number of translation data-sets in T2T: * English-French: `--problems=translate_enfr_wmt32k` * English-Czech: `--problems=translate_encs_wmt32k` * English-Chinese: `--problems=translate_enzh_wmt32k` +* English-Vietnamese: `--problems=translate_envi_iwslt32k` You can get translations in the other direction by appending `_rev` to the problem name, e.g., for German-English use diff --git a/docs/cloud_tpu.md b/docs/cloud_tpu.md index cfc0c0a96..d923ee02e 100644 --- a/docs/cloud_tpu.md +++ b/docs/cloud_tpu.md @@ -1,7 +1,7 @@ # Running on Cloud TPUs Tensor2Tensor supports running on Google Cloud Platforms TPUs, chips -specialized for ML training. See the official tutorial for [running Transfomer +specialized for ML training. See the official tutorial for [running Transformer on Cloud TPUs](https://cloud.google.com/tpu/docs/tutorials/transformer) or read on for more T2T models on TPUs. @@ -14,7 +14,7 @@ Transformer: You can run the Transformer model on a number of problems, from translation through language modeling to sentiment analysis. -See the official tutorial for [running Transfomer +See the official tutorial for [running Transformer on Cloud TPUs](https://cloud.google.com/tpu/docs/tutorials/transformer) for some examples and try out your own problems. 
diff --git a/docs/distributed_training.md b/docs/distributed_training.md index 95b499f87..74ae0e536 100644 --- a/docs/distributed_training.md +++ b/docs/distributed_training.md @@ -68,7 +68,7 @@ For example: TF_CONFIG=$JOB_TF_CONFIG t2t-trainer $JOB_FLAGS --model=transformer ... ``` -Modify the `--worker_gpu` and `--ps_gpu` flags, which specify how many gpus are +Modify the `--worker_gpu` and `--ps_gpu` flags, which specify how many GPUs are on each master and ps, respectively, as needed for your machine/cluster setup. ## Command-line flags for eval jobs diff --git a/docs/index.md b/docs/index.md index b7d0236c9..060e10471 100644 --- a/docs/index.md +++ b/docs/index.md @@ -107,6 +107,7 @@ There are a number of translation data-sets in T2T: * English-French: `--problems=translate_enfr_wmt32k` * English-Czech: `--problems=translate_encs_wmt32k` * English-Chinese: `--problems=translate_enzh_wmt32k` +* English-Vietnamese: `--problems=translate_envi_iwslt32k` You can get translations in the other direction by appending `_rev` to the problem name, e.g., for German-English use diff --git a/docs/walkthrough.md b/docs/walkthrough.md index 12e05e936..889d5ad64 100644 --- a/docs/walkthrough.md +++ b/docs/walkthrough.md @@ -143,6 +143,7 @@ There are a number of translation data-sets in T2T: * English-French: `--problems=translate_enfr_wmt32k` * English-Czech: `--problems=translate_encs_wmt32k` * English-Chinese: `--problems=translate_enzh_wmt32k` +* English-Vietnamese: `--problems=translate_envi_iwslt32k` You can get translations in the other direction by appending `_rev` to the problem name, e.g., for German-English use diff --git a/tensor2tensor/bin/t2t_avg_all.py b/tensor2tensor/bin/t2t_avg_all.py index 0b0aa266d..694ab26ed 100644 --- a/tensor2tensor/bin/t2t_avg_all.py +++ b/tensor2tensor/bin/t2t_avg_all.py @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -"""Script to continously average last N checkpoints in a given directory.""" +"""Script to continuously average last N checkpoints in a given directory.""" from __future__ import absolute_import from __future__ import division from __future__ import print_function diff --git a/tensor2tensor/bin/t2t_decoder.py b/tensor2tensor/bin/t2t_decoder.py index fd103a6a1..25f47eace 100644 --- a/tensor2tensor/bin/t2t_decoder.py +++ b/tensor2tensor/bin/t2t_decoder.py @@ -82,7 +82,8 @@ def create_decode_hparams(): def decode(estimator, hparams, decode_hp): if FLAGS.decode_interactive: - decoding.decode_interactively(estimator, hparams, decode_hp) + decoding.decode_interactively(estimator, hparams, decode_hp, + checkpoint_path=FLAGS.checkpoint_path) elif FLAGS.decode_from_file: decoding.decode_from_file(estimator, FLAGS.decode_from_file, hparams, decode_hp, FLAGS.decode_to_file, diff --git a/tensor2tensor/bin/t2t_trainer.py b/tensor2tensor/bin/t2t_trainer.py index 7d8db041b..b82f7f6e4 100644 --- a/tensor2tensor/bin/t2t_trainer.py +++ b/tensor2tensor/bin/t2t_trainer.py @@ -59,7 +59,7 @@ flags.DEFINE_bool("profile", False, "Profile performance?") # To maintain compatibility with some internal libs, we guard against these flag -# definitions possibly erroring. Apologies for the ugliness. +# definitions possibly erring. Apologies for the ugliness. 
try: flags.DEFINE_string("master", "", "Address of TensorFlow master.") flags.DEFINE_string("output_dir", "", "Base output directory for run.") @@ -178,7 +178,7 @@ def create_run_config(hp): save_ckpt_secs = FLAGS.save_checkpoints_secs or None if save_ckpt_secs: save_ckpt_steps = None - assert FLAGS.output_dir + assert FLAGS.output_dir or FLAGS.checkpoint_path return trainer_lib.create_run_config( model_dir=os.path.expanduser(FLAGS.output_dir), master=FLAGS.master, diff --git a/tensor2tensor/data_generators/desc2code.py b/tensor2tensor/data_generators/desc2code.py index 1e72746fb..145279a84 100644 --- a/tensor2tensor/data_generators/desc2code.py +++ b/tensor2tensor/data_generators/desc2code.py @@ -140,8 +140,8 @@ def generate_encoded_samples(self, data_dir, tmp_dir, dataset_split): samples = list(generator_samples(tmp_dir, self.pb_constants)) # Split between train and dev - # Suffle to get problems from diverse sources (CodeChef and CodeForces) and - # dificulties in each set. + # Shuffle to get problems from diverse sources (CodeChef and CodeForces) and + # difficulties in each set. # Need to sort the samples first before shuffling (as walk() isn't # deterministic) samples.sort(key=lambda x: x.desc_file) # in-place @@ -289,7 +289,7 @@ def next_sample(subdir, dirs, files): # pylint: disable=unused-argument for f in tf.gfile.Glob(code_pattern): with tf.gfile.GFile(f, mode="r") as target_file: # Hack to filter C++/Java files. In theory some python comments could - # make the file be concidered as C++ but in practice the chance of + # make the file be considered as C++ but in practice the chance of # getting a false negative is low. content = target_file.read() if not any(p in content for p in pb_cst.filter_patterns): diff --git a/tensor2tensor/data_generators/generator_utils.py b/tensor2tensor/data_generators/generator_utils.py index a628252a5..3078f8dfe 100644 --- a/tensor2tensor/data_generators/generator_utils.py +++ b/tensor2tensor/data_generators/generator_utils.py @@ -227,7 +227,7 @@ def maybe_download(directory, filename, uri): def maybe_download_from_drive(directory, filename, url): - """Download filename from google drive unless it's already in directory. + """Download filename from Google drive unless it's already in directory. Args: directory: path to the directory that will be used. diff --git a/tensor2tensor/data_generators/problem_hparams.py b/tensor2tensor/data_generators/problem_hparams.py index c69a78eb9..262a0dc51 100644 --- a/tensor2tensor/data_generators/problem_hparams.py +++ b/tensor2tensor/data_generators/problem_hparams.py @@ -31,7 +31,7 @@ import tensorflow as tf -# TODO(rsepassi): Merge these problems with their data generators. Currenlty +# TODO(rsepassi): Merge these problems with their data generators. Currently # they only implement the hparams. diff --git a/tensor2tensor/data_generators/ptb.py b/tensor2tensor/data_generators/ptb.py index af455749d..4ac3911b9 100644 --- a/tensor2tensor/data_generators/ptb.py +++ b/tensor2tensor/data_generators/ptb.py @@ -58,7 +58,7 @@ def _build_vocab(filename, vocab_path, vocab_size): Args: filename: file to read list of words from. vocab_path: path where to save the vocabulary. - vocab_size: size of the vocablulary to generate. + vocab_size: size of the vocabulary to generate. 
""" data = _read_words(filename) counter = collections.Counter(data) diff --git a/tensor2tensor/data_generators/speech_recognition.py b/tensor2tensor/data_generators/speech_recognition.py index 25cea7cc5..6c4645a56 100644 --- a/tensor2tensor/data_generators/speech_recognition.py +++ b/tensor2tensor/data_generators/speech_recognition.py @@ -85,7 +85,7 @@ def compute_mel_filterbank_features( sample_rate: sampling rate of the waveform dither: stddev of Gaussian noise added to waveform to prevent quantization artefacts - preemphasis: waveform high-pass filtering costant + preemphasis: waveform high-pass filtering constant frame_length: frame length in ms frame_step: frame_Step in ms fft_length: number of fft bins @@ -182,7 +182,7 @@ def encode(self, s): """ # Make sure that the data is a single channel, 16bit, 16kHz wave. # TODO(chorowski): the directory may not be writable, this should fallback - # to a temp path, and provide instructions for instaling sox. + # to a temp path, and provide instructions for installing sox. if not s.endswith(".wav"): out_filepath = s + ".wav" if not os.path.exists(out_filepath): diff --git a/tensor2tensor/data_generators/translate.py b/tensor2tensor/data_generators/translate.py index e0b9c6d3f..8d5cf808f 100644 --- a/tensor2tensor/data_generators/translate.py +++ b/tensor2tensor/data_generators/translate.py @@ -170,7 +170,7 @@ def is_generate_per_split(self): def get_or_create_vocab(self, data_dir, tmp_dir, force_get=False): """Get vocab for distill problems.""" - # We assume that voab file is present in data_dir, directory where the + # We assume that vocab file is present in data_dir directory where the # data generated will be stored. vocab_filepath = os.path.join(data_dir, self.vocab_filename) encoder = text_encoder.SubwordTextEncoder(vocab_filepath) diff --git a/tensor2tensor/data_generators/wiki.py b/tensor2tensor/data_generators/wiki.py index 80f1ed36d..db18ced36 100644 --- a/tensor2tensor/data_generators/wiki.py +++ b/tensor2tensor/data_generators/wiki.py @@ -177,7 +177,7 @@ def scramble(self, seq): @registry.register_problem class LanguagemodelWikiScrambleL128(LanguagemodelWikiScramble): - """Sequence length 128, 50% scrambed.""" + """Sequence length 128, 50% scrambled.""" @property def sequence_length(self): @@ -209,7 +209,7 @@ class LanguagemodelWikiNorefV8kL1k(LanguagemodelWikiXmlV8kL1k): Special pages (non-articles) are dropped. - This more closely resemples plain text, though there are still some xml + This more closely resembles plain text, though there are still some xml elements, like tables. 
Each article is prefixed by a line containing the title and length in @@ -228,7 +228,7 @@ def vocab_filename(self): return "vocab.wiki_noref.%d" % self.approx_vocab_size def filepath_to_unicode_strings(self, filepath): - """Overriddes the base class to clean up the xml dump before tokenizing.""" + """Overrides the base class to clean up the xml dump before tokenizing.""" dump = text_encoder.to_unicode_ignore_errors(tf.gfile.Open(filepath).read()) pages = _dump_to_pages(dump) ret = u"" diff --git a/tensor2tensor/data_generators/wsj_parsing.py b/tensor2tensor/data_generators/wsj_parsing.py index bef82eb1b..867277de9 100644 --- a/tensor2tensor/data_generators/wsj_parsing.py +++ b/tensor2tensor/data_generators/wsj_parsing.py @@ -63,11 +63,11 @@ def token_generator(tree_path, source_token_vocab, target_token_vocab, This generator assumes the files at source_path and target_path have the same number of lines and yields dictionaries of "inputs" and "targets" - where inputs and targets are token ids from source and taret lines + where inputs and targets are token ids from source and target lines converted to integers using the token_map. Args: - tree_path: path to the file with wsj format trees, one per line. + tree_path: path to the file with WSJ format trees, one per line. source_token_vocab: GenericVocabulary object for source vocabulary. target_token_vocab: GenericVocabulary object for target vocabulary. eos: integer to append at the end of each sequence (default: None). @@ -92,7 +92,7 @@ def parsing_token_generator(data_dir, tmp_dir, train, source_vocab_size, """Generator for parsing as a sequence-to-sequence task that uses tokens. This generator assumes the files parsing_{train,dev}.trees, which contain - trees in wsj format. + trees in WSJ format. Args: data_dir: path to the data directory. diff --git a/tensor2tensor/layers/common_attention.py b/tensor2tensor/layers/common_attention.py index 0ccb72745..8eda5b662 100644 --- a/tensor2tensor/layers/common_attention.py +++ b/tensor2tensor/layers/common_attention.py @@ -37,7 +37,7 @@ from tensorflow.python.framework import function -# Struct conatining the sequences ids and order on a batch (are send to the +# Struct containing the sequences ids and order on a batch (are send to the # expert to allow them to compute the bias mask) BatchInfo = collections.namedtuple("BatchInfo", "coordinates, order") @@ -57,9 +57,9 @@ def get_standardized_layers(hparams, dp=None, ps_devices=None): Args: hparams (tf.HParams): the model hparameters - dp (expert_utils.Parallelism): A data paralelism object. If not given, + dp (expert_utils.Parallelism): A data parallelism object. If not given, the dp calls are simply ignored. - ps_devices: a reference to model._ps_devices (only used by the moe layer) + ps_devices: a reference to model._ps_devices (only used by the MOE layer) Returns: dict[str:fct]: A dictionary containing the standardized functions @@ -82,9 +82,9 @@ def register_layer( fct_in (fct): The function to register default_args (list): The default parameters to add to the function. default_kwargs (dict): The default parameters to add to the function. - Those arguments can be overwriten when calling the function. + Those arguments can be overwritten when calling the function. use_dp (bool): Wrap the function call within a dataparalellism object if - dp is available. Some layers (like moe) must be called without dp. + dp is available. Some layers (like MOE) must be called without dp. 
recompute_grad (bool): If True, recompute the function during the backward pass to save memory @@ -319,8 +319,8 @@ def add_standard_attention_hparams(hparams): hparams.add_hparam("attention_red_nonlinearity", "none") # Fully connected layers flags - # To be more concistent, should use filter_size to also controle the moe - # size if moe_hidden_sizes not set + # To be more consistent, should use filter_size to also control the MOE + # size if moe_hidden_sizes is not set. hparams.add_hparam("filter_size", 2048) hparams.add_hparam("relu_dropout", 0.0) @@ -400,7 +400,7 @@ def get_timing_signal_1d(length, memory inputs to attention. The use of relative position is possible because sin(x+y) and cos(x+y) can be - experessed in terms of y, sin(x) and cos(x). + expressed in terms of y, sin(x) and cos(x). In particular, we use a geometric sequence of timescales starting with min_timescale and ending with max_timescale. The number of different @@ -861,8 +861,8 @@ def to_float(bc): bc_v = tf.expand_dims(to_float(batch_coordinates_q), 1) bc_h = tf.expand_dims(to_float(batch_coordinates_k), 0) - bias_batch = bc_h - bc_v # Broadcast to create [length_q, length_k] mask - # Theshold non zeros to 1.0 + bias_batch = bc_h - bc_v # Broadcast to create [length_q, length_k] mask. + # Threshold non zeros to 1.0. bias_batch = condition_fn(bias_batch) bias_batch *= -1e9 # Set non zeros to -infinity return bias_batch @@ -877,9 +877,9 @@ def to_float(bc): # Mask similar to upper triangular mask, but allow dispatching attention_bias_future = functools.partial( attention_bias_batch, - # Elems can attend to themself (otherwise would use bias_batch + 1.0) - # No tf.abs to consider the order - # tf.maximum and tf.minimum to threshold the values + # Elems can attend to themselves (otherwise would use bias_batch + 1.0). + # No tf.abs to consider the order, + # tf.maximum and tf.minimum to threshold the values. condition_fn=lambda bias: tf.maximum(0.0, tf.minimum(1.0, bias)), ) @@ -1060,7 +1060,7 @@ def grouped_attention_multihead(query_antecedent, memory_target_density indicates on average how many groups a key-value pair should participate in. - We use auxialiary losses to ensure that each group contains roughly + We use auxiliary losses to ensure that each group contains roughly the same number of queries and the same number of key-value pairs. If for a given sequence, the actual number of queries/pairs sent to an expert exceeds this target by a factor of more than @@ -1316,7 +1316,7 @@ def dot_product_attention(q, name: an optional string make_image_summary: True if you want an image summary. save_weights_to: an optional dictionary to capture attention weights - for vizualization; the weights tensor will be appended there under + for visualization; the weights tensor will be appended there under a string key created from the variable scope (including name). dropout_broadcast_dims: an optional list of integers less than 4 specifying in which dimensions to broadcast the dropout decisions. @@ -2509,7 +2509,7 @@ def multihead_attention(query_antecedent, [batch_size, length_q, hidden_dim] unless the cache dict is provided in which case only the last memory position is calculated and the output shape is [batch_size, 1, hidden_dim] - Optionaly returns an additional loss parameters (ex: load balance loss for + Optionally returns additional loss parameters (ex: load balance loss for the experts) returned by the attention_type function.
Raises: @@ -2787,7 +2787,7 @@ def parameter_attention(x, @expert_utils.add_name_scope() def coordinate_tensor(shape, axis): - """Return a tensor with given shape containing coordinte along given axis. + """Return a tensor with given shape containing coordinate along given axis. Args: shape: a Tensor representing the shape of the output Tensor @@ -2879,7 +2879,7 @@ def add_or_set_if(prev_bias, new_bias, condition): def mask_and_call_attention(x): """Function applied once for each sequence of the batch.""" - # Mask to prevent sequences of attenting to the future + # Mask to prevent sequences from attending to the future length = common_layers.shape_list(x)[1] # x has shape [1, length,...] bias_past = tf.reshape( attention_bias_lower_triangle(length), [length, length]) @@ -2972,7 +2972,7 @@ def expert_dot_product(q, k, v, info_q, info_k): """Perform dot product on a subset of the sequence. Can add a mask to the attention to prevent sequences from attending to each other - and to prevent attention to the futur. + and to prevent attention to the future. Args: q (tf.Tensor): Queries of shape [length_expert_q, depth_k] @@ -3201,16 +3201,12 @@ def flatten_batch(x): gates_q = tf.stack(list_gates_q) gates_k = tf.stack(list_gates_k) - # Process each head separatly + # Process each head separately. v_out = map_fn_switch( lambda args: dot_product_single_head(bi=bi, *args), elems=(q, k, v, gates_q, gates_k), dtype=(tf.float32), parallel_iterations=2, - # back_prop=True, - # swap_memory=False, - # infer_shape=True, - # name=None use_map_fn=use_map_fn, ) @@ -3436,9 +3432,8 @@ def conv_elems_1d(x, factor, out_depth=None): Merge/restore/compress factors positions of dim depth of the input into a single position of dim out_depth. - This is basically just a strided convolution without overlapp - between each strides. - The original length has to be divided by factor. + This is basically just a strided convolution without overlap + between strides. The original length has to be divisible by factor. Args: x (tf.Tensor): shape [batch_size, length, depth] @@ -3481,7 +3476,7 @@ def local_reduction_attention(x, block_length, multihead_params): def dot_product_self_local_attention_flattened(q, k, v): """Strided block local self-attention. - No overlapp between the blocks. + No overlap between the blocks. Args: q (tf.Tensor): shape [batch, heads, length, depth_k] @@ -3562,7 +3557,7 @@ def multihead_self_attention_reduced( Args: x (tf.Tensor): float32 of shape [batch, length, depth] - memory_antecedent (tf.Tensor): Unsuported for now + memory_antecedent (tf.Tensor): Unsupported for now bias (tf.Tensor): Ignored factor (int): compression factor for the memory sequence multihead_params (dict): parameters for multihead attention @@ -3584,7 +3579,7 @@ def multihead_self_attention_reduced( depth = x.get_shape().as_list()[-1] - # Could try to have some overlapp between the blocks but that would + # Could try to have some overlap between the blocks but that would # create conv artifacts, would make it difficult to not attend to the future # within one group and the padding should be handled specially.
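The conv_elems_1d hunk above describes a strided convolution whose kernel size equals its stride, so positions are pooled in non-overlapping groups. As a reference for that comment, here is a minimal NumPy sketch of the equivalent reshape-plus-projection; compress_1d and its argument names are illustrative only, not the T2T implementation:

```python
# A no-overlap strided conv (kernel_size == stride == factor) over a
# [length, depth] sequence is the same as grouping `factor` consecutive
# positions and applying one dense projection to each group.
import numpy as np

def compress_1d(x, factor, w):
  """x: [length, depth]; w: [factor * depth, out_depth]."""
  length, depth = x.shape
  # The original length has to be divisible by factor.
  assert length % factor == 0
  grouped = x.reshape(length // factor, factor * depth)
  return grouped @ w  # [length // factor, out_depth]

x = np.random.randn(8, 4)          # length=8, depth=4
w = np.random.randn(2 * 4, 6)      # factor=2, out_depth=6
print(compress_1d(x, 2, w).shape)  # (4, 6)
```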
diff --git a/tensor2tensor/layers/common_attention_test.py b/tensor2tensor/layers/common_attention_test.py index bb84a515d..61ff5a6d5 100644 --- a/tensor2tensor/layers/common_attention_test.py +++ b/tensor2tensor/layers/common_attention_test.py @@ -403,7 +403,7 @@ def testDotProductAttentionRelative(self): self.assertEqual(res.shape, (5, 7, 12, 32)) def testBiasBatchCoordinates(self): - """Testing the batch cooridnates mask.""" + """Testing the batch coordinates mask.""" q = tf.constant([0, 0, 1, 1, 1, 1, 2, 2, 2], dtype=tf.int32) q = tf.expand_dims(q, axis=-1) diff --git a/tensor2tensor/layers/common_hparams.py b/tensor2tensor/layers/common_hparams.py index ea0e93fbd..083634785 100644 --- a/tensor2tensor/layers/common_hparams.py +++ b/tensor2tensor/layers/common_hparams.py @@ -220,7 +220,7 @@ def basic_params1(): scheduled_sampling_warmup_steps=50000, scheduled_sampling_gold_mixin_prob=0.5, # This setting controls whether to copy variables around in a daisy chain - # (if true) or leave their placement to Tensorflow. It only affects multi + # (if true) or leave their placement to TensorFlow. It only affects multi # device training and mostly should be turned on for performance. One # exception are recurrent models: with dynamic loops it must be off. daisy_chain_variables=True, diff --git a/tensor2tensor/layers/common_image_attention.py b/tensor2tensor/layers/common_image_attention.py index 23730c0d6..f80b04e49 100644 --- a/tensor2tensor/layers/common_image_attention.py +++ b/tensor2tensor/layers/common_image_attention.py @@ -537,12 +537,12 @@ def prepare_decoder(targets, hparams): # At inference, they are [batch, curr_infer_length, 1, 1] if hparams.mode == tf.contrib.learn.ModeKeys.INFER: curr_infer_length = targets_shape[1] - if hparams.block_rastor_scan: + if hparams.block_raster_scan: assert hparams.img_len*channels % hparams.query_shape[1] == 0 assert hparams.img_len % hparams.query_shape[0] == 0 total_block_width = hparams.img_len*channels - # Decoding is in block rastor scan order. We divide the image into - # hparams.query_shape blocks and then decode each block in rastor scan. + # Decoding is in block raster scan order. We divide the image into - # hparams.query_shape blocks and then decode each block in raster scan. # To make that compatible with our inference pipeline, pad the target so # that rows is a multiple of query_shape and columns is a multiple of # hparams.img_len*channels diff --git a/tensor2tensor/layers/common_layers.py b/tensor2tensor/layers/common_layers.py index 14b3e0a2e..16836adee 100644 --- a/tensor2tensor/layers/common_layers.py +++ b/tensor2tensor/layers/common_layers.py @@ -1698,7 +1698,7 @@ def padded_cross_entropy(logits, label_smoothing: a floating point `Scalar`. weights_fn: A function from labels to weights. reduce_sum: a Boolean, whether to sum at the end or not. - gaussian: If true, use a gaussian distribution for label smoothing + gaussian: If true, use a Gaussian distribution for label smoothing Returns: loss_numerator: a `Scalar`. Sum of losses. @@ -1747,9 +1747,9 @@ def smoothing_cross_entropy(logits, labels: Tensor of size [batch_size, ?, ?, ?] vocab_size: Tensor representing the size of the vocabulary. confidence: Used to determine on and off values for label smoothing. - If `gaussian` is true, `confidence` is the variance to the gaussian + If `gaussian` is true, `confidence` is the variance of the Gaussian distribution.
- gaussian: Uses a gaussian distribution for label smoothing + gaussian: Uses a Gaussian distribution for label smoothing Returns: @@ -1993,7 +1993,7 @@ def ravanbakhsh_set_layer(layer_size, name=None): """Layer from Deep Sets paper: https://arxiv.org/abs/1611.04500 . - More parameter-efficient verstion of a linear-set-layer with context. + More parameter-efficient version of a linear-set-layer with context. Args: layer_size: Dimension to transform the input vectors to. @@ -2719,7 +2719,7 @@ def grad_fn(inputs, variables, outputs, output_grads): grad_vars = [tf.cast(grad_var, tf.bfloat16) for grad_var in grad_vars] if is_on_tpu(): # TODO(noam): remove this hack once XLA does the right thing. - # Force the gradinets on the inputs to be computed before the variables + # Force the gradients on the inputs to be computed before the variables # are updated. This saves memory by preventing XLA from making an extra # copy of the variables. grad_vars = force_dependency(grad_vars, grad_inputs) @@ -2761,7 +2761,7 @@ def dense(x, units, **kwargs): fn = lambda x: tf.layers.dense(x, units, **kwargs) if is_on_tpu(): # TODO(noam): remove this hack once XLA does the right thing. - # Forces the gradinets on the inputs to be computed before the variables + # Forces the gradients on the inputs to be computed before the variables # are updated. This saves memory by preventing XLA from making an extra # copy of the variables. return _recompute_grad(fn, [x]) diff --git a/tensor2tensor/layers/discretization.py b/tensor2tensor/layers/discretization.py index 6ade1e38b..75519f5a5 100644 --- a/tensor2tensor/layers/discretization.py +++ b/tensor2tensor/layers/discretization.py @@ -616,7 +616,7 @@ def discrete_bottleneck(x, x_means += x_means_res x_means_hot.append(x_means_hot_res) - # Get the discrete latent represenation + # Get the discrete latent representation x_means_hot = tf.stack(x_means_hot, axis=1) x_means_idx = tf.argmax(x_means_hot, axis=-1) diff --git a/tensor2tensor/models/image_transformer.py b/tensor2tensor/models/image_transformer.py index 0f6244e36..f0130b195 100644 --- a/tensor2tensor/models/image_transformer.py +++ b/tensor2tensor/models/image_transformer.py @@ -164,7 +164,7 @@ def image_transformer_base(): hparams.add_hparam("num_decoder_layers", 12) hparams.sep_rgb_embed = False hparams.add_hparam("dec_attention_type", cia.AttentionType.LOCAL_1D) - hparams.add_hparam("block_rastor_scan", False) + hparams.add_hparam("block_raster_scan", False) # multipos attention params hparams.add_hparam("q_filter_width", 1) diff --git a/tensor2tensor/models/image_transformer_2d.py b/tensor2tensor/models/image_transformer_2d.py index 101126d31..cdcd0c654 100644 --- a/tensor2tensor/models/image_transformer_2d.py +++ b/tensor2tensor/models/image_transformer_2d.py @@ -155,7 +155,7 @@ def image_transformer2d_base(): # attention type related params hparams.add_hparam("enc_attention_type", cia.AttentionType.GLOBAL) hparams.add_hparam("dec_attention_type", cia.AttentionType.LOCAL_2D) - hparams.add_hparam("block_rastor_scan", False) + hparams.add_hparam("block_raster_scan", False) # multipos attention params hparams.add_hparam("q_filter_width", 1) @@ -169,7 +169,7 @@ def image_transformer2d_base(): def imagetransformer2d_base(): hparams = image_transformer2d_base() hparams.dec_attention_type = cia.AttentionType.LOCAL_2D - hparams.block_rastor_scan = True + hparams.block_raster_scan = True return hparams @@ -329,7 +329,7 @@ def img2img_transformer2d_base(): hparams.num_encoder_layers = 4 hparams.num_decoder_layers = 8 
hparams.dec_attention_type = cia.AttentionType.LOCAL_2D - hparams.block_rastor_scan = True + hparams.block_raster_scan = True return hparams @@ -382,7 +382,7 @@ def img2img_transformer_base(): hparams.block_length = 256 hparams.block_width = 256 hparams.dec_attention_type = cia.AttentionType.LOCAL_1D - hparams.block_rastor_scan = False + hparams.block_raster_scan = False return hparams diff --git a/tensor2tensor/models/research/transformer_vae.py b/tensor2tensor/models/research/transformer_vae.py index a49b85f1f..ec2966b6a 100644 --- a/tensor2tensor/models/research/transformer_vae.py +++ b/tensor2tensor/models/research/transformer_vae.py @@ -781,7 +781,7 @@ def imagetransformer_ae_cifar(): hparams.num_decoder_layers = 12 hparams.sep_rgb_embed = False hparams.add_hparam("dec_attention_type", cia.AttentionType.LOCAL_1D) - hparams.add_hparam("block_rastor_scan", False) + hparams.add_hparam("block_raster_scan", False) # multipos attention params hparams.add_hparam("q_filter_width", 1) diff --git a/tensor2tensor/models/resnet.py b/tensor2tensor/models/resnet.py index 30275e7ca..39f3dd723 100644 --- a/tensor2tensor/models/resnet.py +++ b/tensor2tensor/models/resnet.py @@ -223,7 +223,7 @@ def bottleneck_block(inputs, The output `Tensor` of the block. """ # TODO(chrisying): this block is technically the post-activation resnet-v1 - # bottlneck unit. Test with v2 (pre-activation) and replace if there is no + # bottleneck unit. Test with v2 (pre-activation) and replace if there is no # difference for consistency. shortcut = inputs if projection_shortcut is not None: diff --git a/tensor2tensor/models/slicenet.py b/tensor2tensor/models/slicenet.py index fe5dab52d..0410ff7d1 100644 --- a/tensor2tensor/models/slicenet.py +++ b/tensor2tensor/models/slicenet.py @@ -282,7 +282,7 @@ class SliceNet(t2t_model.T2TModel): def body(self, features): target_modality_name = ( self._problem_hparams.target_modality.name) - # If we're just predicing a class, there is no use for a decoder. + # If we're just predicting a class, there is no use for a decoder. run_decoder = "class_label_modality" not in target_modality_name return slicenet_internal( features["inputs"], diff --git a/tensor2tensor/models/transformer.py b/tensor2tensor/models/transformer.py index f08254c00..32e2eaf9e 100644 --- a/tensor2tensor/models/transformer.py +++ b/tensor2tensor/models/transformer.py @@ -69,7 +69,7 @@ def encode(self, inputs, target_space, hparams, features=None): encoder_output: Encoder representation. [batch_size, input_length, hidden_dim] encoder_decoder_attention_bias: Bias and mask weights for - encodre-decoder attention. [batch_size, input_length] + encoder-decoder attention. [batch_size, input_length] """ inputs = common_layers.flatten4d3d(inputs) @@ -214,7 +214,7 @@ def _beam_decode(self, features, decode_length, beam_size, top_beams, alpha): beam_size: number of beams. top_beams: an integer. How many of the beams to return. alpha: Float that controls the length penalty. larger the alpha, stronger - the preference for slonger translations. + the preference for longer translations. Returns: A dict of decoding results { @@ -253,7 +253,7 @@ def _fast_decode(self, beam_size: number of beams. top_beams: an integer. How many of the beams to return. alpha: Float that controls the length penalty. larger the alpha, stronger - the preference for slonger translations. + the preference for longer translations. Returns: A dict of decoding results { @@ -432,7 +432,7 @@ def fast_decode(encoder_output, beam_size: number of beams. 
top_beams: an integer. How many of the beams to return. alpha: Float that controls the length penalty. larger the alpha, stronger - the preference for slonger translations. + the preference for longer translations. eos_id: End-of-sequence symbol in beam search. batch_size: an integer scalar - must be passed if there is no input @@ -625,7 +625,7 @@ def transformer_prepare_decoder(targets, hparams, features=None): Returns: decoder_input: a Tensor, bottom of decoder stack - decoder_self_attention_bias: a bias tensor for use in encoder self-attention + decoder_self_attention_bias: a bias tensor for use in decoder self-attention """ if hparams.prepend_mode == "prepend_inputs_full_attention": decoder_self_attention_bias = ( @@ -679,10 +679,10 @@ def transformer_encoder(encoder_input, indicating what positions are not padding. This must either be passed in, which we do for "packed" datasets, or inferred from encoder_self_attention_bias. The knowledge about padding is used - for pad_remover(efficiency) and to mask out padding in convoltutional + for pad_remover(efficiency) and to mask out padding in convolutional layers. save_weights_to: an optional dictionary to capture attention weights - for vizualization; the weights tensor will be appended there under + for visualization; the weights tensor will be appended there under a string key created from the variable scope (including name). make_image_summary: Whether to make an attention image summary. @@ -758,11 +758,11 @@ def transformer_decoder(decoder_input, name: a string nonpadding: optional Tensor with shape [batch_size, encoder_length] indicating what positions are not padding. This is used - to mask out padding in convoltutional layers. We generally only + to mask out padding in convolutional layers. We generally only need this mask for "packed" datasets, because for ordinary datasets, no padding is ever followed by nonpadding. save_weights_to: an optional dictionary to capture attention weights - for vizualization; the weights tensor will be appended there under + for visualization; the weights tensor will be appended there under a string key created from the variable scope (including name). make_image_summary: Whether to make an attention image summary. @@ -839,7 +839,7 @@ def transformer_ffn_layer(x, can give a significant speedup. conv_padding: a string - either "LEFT" or "SAME". nonpadding_mask: an optional Tensor with shape [batch_size, length]. - needed for convolutoinal layers with "SAME" padding. + needed for convolutional layers with "SAME" padding. Contains 1.0 in positions corresponding to nonpadding. 
Returns: @@ -982,7 +982,7 @@ def transformer_base(): @registry.register_hparams def transformer_big(): - """HParams for transfomer big model on WMT.""" + """HParams for transformer big model on WMT.""" hparams = transformer_base() hparams.hidden_size = 1024 hparams.filter_size = 4096 @@ -993,7 +993,7 @@ def transformer_big(): @registry.register_hparams def transformer_big_single_gpu(): - """HParams for transformer big model for single gpu.""" + """HParams for transformer big model for single GPU.""" hparams = transformer_big() hparams.layer_prepostprocess_dropout = 0.1 hparams.learning_rate_warmup_steps = 16000 @@ -1002,7 +1002,7 @@ def transformer_big_single_gpu(): @registry.register_hparams def transformer_base_single_gpu(): - """HParams for transformer base model for single gpu.""" + """HParams for transformer base model for single GPU.""" hparams = transformer_base() hparams.batch_size = 2048 hparams.learning_rate_warmup_steps = 16000 @@ -1011,7 +1011,7 @@ def transformer_base_single_gpu(): @registry.register_hparams def transformer_parsing_base(): - """Hparams for parsing on wsj only.""" + """HParams for parsing on WSJ only.""" hparams = transformer_base() hparams.attention_dropout = 0.2 hparams.layer_prepostprocess_dropout = 0.2 @@ -1025,7 +1025,7 @@ def transformer_parsing_base(): @registry.register_hparams def transformer_parsing_big(): - """HParams for parsing on wsj semi-supervised.""" + """HParams for parsing on WSJ semi-supervised.""" hparams = transformer_big() hparams.max_length = 512 hparams.shared_source_target_embedding = False @@ -1038,7 +1038,7 @@ def transformer_parsing_big(): @registry.register_hparams def transformer_parsing_ice(): - """Hparams for parsing and tagging Icelandic text.""" + """HParams for parsing and tagging Icelandic text.""" hparams = transformer_base_single_gpu() hparams.batch_size = 4096 hparams.shared_embedding_and_softmax_weights = False @@ -1454,17 +1454,17 @@ def transformer_clean_big_tpu(): def transformer_tpu_with_conv(): """Cut down on the number of heads, and use convs instead.""" hparams = transformer_tpu() - hparams.num_heads = 4 # heads are expensive on tpu + hparams.num_heads = 4 # Heads are expensive on TPUs. hparams.ffn_layer = "conv_relu_conv" return hparams @registry.register_hparams def transformer_lm_tpu_0(): - """Hparams for training languagemodel_lm1b8k on tpu. 92M Params.""" + """HParams for training languagemodel_lm1b8k on tpu. 92M Params.""" hparams = transformer_clean_big() update_hparams_for_tpu(hparams) - hparams.num_heads = 4 # heads are expensive on tpu + hparams.num_heads = 4 # Heads are expensive on TPUs. hparams.batch_size = 4096 hparams.shared_embedding_and_softmax_weights = False hparams.layer_prepostprocess_dropout = 0.1 @@ -1473,7 +1473,7 @@ def transformer_lm_tpu_0(): @registry.register_hparams def transformer_lm_tpu_1(): - """Hparams for training languagemodel_lm1b8k on tpu. 335M Params.""" + """HParams for training languagemodel_lm1b8k on tpu. 
335M Params.""" hparams = transformer_lm_tpu_0() hparams.hidden_size = 2048 hparams.filter_size = 8192 @@ -1482,7 +1482,7 @@ def transformer_librispeech(): - """Hparams for training ASR model on Librispeech.""" + """HParams for training ASR model on Librispeech.""" hparams = transformer_base() hparams.num_heads = 4 @@ -1499,7 +1499,7 @@ def transformer_librispeech_tpu(): - """Hparams for training ASR model on Librispeech on TPU.""" + """HParams for training ASR model on Librispeech on TPU.""" hparams = transformer_librispeech() update_hparams_for_tpu(hparams) @@ -1510,7 +1510,7 @@ def transformer_supervised_attention(): - """Hparams for supervised attention problems.""" + """HParams for supervised attention problems.""" hparams = transformer_base() # Attention loss type (KL-divergence or MSE). hparams.add_hparam("expected_attention_loss_type", "kl_divergence") diff --git a/tensor2tensor/rl/README.md b/tensor2tensor/rl/README.md index ffd595911..d925aef09 100644 --- a/tensor2tensor/rl/README.md +++ b/tensor2tensor/rl/README.md @@ -1,8 +1,8 @@ # Tensor2Tensor experimental Model-Based Reinforcement Learning. -The rl package intention is to provide possiblity to run reinforcement -algorithms within Tensorflow's computation graph, in order to do model-based -RL using envoronment models from Tensor2Tensor. It's very experimental +The rl package aims to make it possible to run reinforcement learning +algorithms within TensorFlow's computation graph, in order to do model-based +RL using environment models from Tensor2Tensor. It's very experimental for now and under heavy development. Currently the only supported algorithm is Proximal Policy Optimization - PPO. diff --git a/tensor2tensor/rl/envs/utils.py b/tensor2tensor/rl/envs/utils.py index 59732fed0..4f6fb8891 100644 --- a/tensor2tensor/rl/envs/utils.py +++ b/tensor2tensor/rl/envs/utils.py @@ -81,7 +81,7 @@ def _reset(self, **kwargs): class ExternalProcessEnv(object): - """Step environment in a separate process for lock free paralellism.""" + """Step environment in a separate process for lock free parallelism.""" # Message types for communication via the pipe. _ACCESS = 1 @@ -91,7 +91,7 @@ class ExternalProcessEnv(object): _CLOSE = 5 def __init__(self, constructor, xvfb): - """Step environment in a separate process for lock free paralellism. + """Step environment in a separate process for lock free parallelism. The environment will be created in the external process by calling the specified callable. This can be an environment class, or a function @@ -226,7 +226,7 @@ def _receive(self): Raises: Exception: An exception was raised inside the worker process. - KeyError: The reveived message is of an unknown type. + KeyError: The received message is of an unknown type. Returns: Payload object of the message. diff --git a/tensor2tensor/rl/ppo.py b/tensor2tensor/rl/ppo.py index 706e3c6b4..a1a677147 100644 --- a/tensor2tensor/rl/ppo.py +++ b/tensor2tensor/rl/ppo.py @@ -112,7 +112,7 @@ def calculate_generalized_advantage_estimator( """Generalized advantage estimator.""" # Below is slight weirdness, we set the last reward to 0.
- # This makes the adventantage to be 0 in the last timestep + # This makes the advantage 0 in the last timestep reward = tf.concat([reward[:-1, :], value[-1:, :]], axis=0) next_value = tf.concat([value[1:, :], tf.zeros_like(value[-1:, :])], axis=0) next_not_done = 1 - tf.cast(tf.concat([done[1:, :], diff --git a/tensor2tensor/rl/t2t_rl_trainer.py b/tensor2tensor/rl/t2t_rl_trainer.py index 3abe22723..bd3780a9b 100644 --- a/tensor2tensor/rl/t2t_rl_trainer.py +++ b/tensor2tensor/rl/t2t_rl_trainer.py @@ -27,7 +27,7 @@ FLAGS = flags.FLAGS # To maintain compatibility with some internal libs, we guard against these flag -# definitions possibly erroring. Apologies for the ugliness. +# definitions possibly raising errors. Apologies for the ugliness. try: flags.DEFINE_string("output_dir", "", "Base output directory for run.") except: # pylint: disable=bare-except diff --git a/tensor2tensor/utils/adafactor.py b/tensor2tensor/utils/adafactor.py index 31c3a5558..979f1b3be 100644 --- a/tensor2tensor/utils/adafactor.py +++ b/tensor2tensor/utils/adafactor.py @@ -32,7 +32,7 @@ class AdafactorOptimizer(tf.train.Optimizer): 1. For a two-dimensional AxB weight matrix, Adafactor uses only A+B auxiliary parameters to maintain the second-moment estimator, instead of AB. - This is advantagous on memory-limited systems. In addition, beta1 + This is advantageous on memory-limited systems. In addition, beta1 (momentum) is set to zero by default, saving an additional auxiliary parameter per weight. @@ -332,7 +332,7 @@ def _simulated_quantize(x, num_bits, quantization_noise): quantization_noise is a float32 Tensor containing values in [0, 1). Each value in quantization_noise should take different values across different steps, approximating a uniform distribution over [0, 1). - In the case of relicated TPU training, quantization_noise should be identical + In the case of replicated TPU training, quantization_noise should be identical across replicas in order to keep the parameters identical across replicas. The natural choice for quantization_noise would be tf.random_uniform(), @@ -383,7 +383,7 @@ def _quantization_noise_from_step_num(): """ step = tf.to_int32(tf.train.get_or_create_global_step()) + 1 phi = ((5 ** 0.5) - 1) / 2 - # Naive computation tf.mod(phi * step, 1.0) in float32 would be disasterous + # Naive computation tf.mod(phi * step, 1.0) in float32 would be disastrous # due to loss of precision when the step number gets large. # Computation in doubles does not work on TPU, so we use this complicated # alternative computation which does not suffer from these roundoff errors. diff --git a/tensor2tensor/utils/beam_search.py b/tensor2tensor/utils/beam_search.py index 10ee6c1f7..fa09a1d1b 100644 --- a/tensor2tensor/utils/beam_search.py +++ b/tensor2tensor/utils/beam_search.py @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -"""Implemetation of beam seach with penalties.""" +"""Implementation of beam search with penalties.""" from __future__ import absolute_import from __future__ import division @@ -94,7 +94,7 @@ def log_prob_from_logits(logits): def compute_batch_indices(batch_size, beam_size): - """Computes the i'th coodinate that contains the batch index for gathers. + """Computes the i'th coordinate that contains the batch index for gathers. Batch pos is a tensor like [[0,0,0,0,],[1,1,1,1],..]. It says which batch the beam item is in.
This will create the i of the i,j coordinate @@ -188,7 +188,7 @@ def beam_search(symbols_to_logits_fn, stop_early=True): """Beam search with length penalties. - Requires a function that can take the currently decoded sybmols and return + Requires a function that can take the currently decoded symbols and return the logits for the next symbol. The implementation is inspired by https://arxiv.org/abs/1609.08144. @@ -320,7 +320,7 @@ def grow_alive(curr_seq, curr_scores, curr_log_probs, curr_finished, states): "grow_alive", states) def grow_topk(i, alive_seq, alive_log_probs, states): - r"""Inner beam seach loop. + r"""Inner beam search loop. This function takes the current alive sequences, and grows them to topk sequences where k = 2*beam. We use 2*beam because, we could have beam_size @@ -361,14 +361,14 @@ def grow_topk(i, alive_seq, alive_log_probs, states): # Convert logits to normalized log probs candidate_log_probs = log_prob_from_logits(logits) - # Multiply the probabilites by the current probabilites of the beam. + # Multiply the probabilities by the current probabilities of the beam. # (batch_size, beam_size, vocab_size) + (batch_size, beam_size, 1) log_probs = candidate_log_probs + tf.expand_dims(alive_log_probs, axis=2) length_penalty = tf.pow(((5. + tf.to_float(i + 1)) / 6.), alpha) curr_scores = log_probs / length_penalty - # Flatten out (beam_size, vocab_size) probs in to a list of possibilites + # Flatten out (beam_size, vocab_size) probs into a list of possibilities flat_curr_scores = tf.reshape(curr_scores, [-1, beam_size * vocab_size]) topk_scores, topk_ids = tf.nn.top_k(flat_curr_scores, k=beam_size * 2) @@ -381,7 +381,7 @@ def grow_topk(i, alive_seq, alive_log_probs, states): topk_ids %= vocab_size # Unflatten the ids # The next three steps are to create coordinates for tf.gather_nd to pull - # out the correct seqences from id's that we need to grow. + # out the correct sequences from ids that we need to grow. # We will also use the coordinates to gather the booleans of the beam items # that survived. batch_pos = compute_batch_indices(batch_size, beam_size * 2) @@ -407,7 +407,7 @@ def inner_loop(i, alive_seq, alive_log_probs, finished_seq, finished_scores, finished_flags, states): - """Inner beam seach loop. + """Inner beam search loop. There are three groups of tensors, alive, finished, and topk. The alive group contains information about the current alive sequences @@ -447,7 +447,7 @@ Log probs of the alive sequences, New finished sequences, Scores of the new finished sequences, - Flags inidicating which sequence in finished as reached EOS, + Flags indicating which sequence in finished has reached EOS, dict of final decoding states) """ @@ -471,7 +471,7 @@ def _is_finished(i, unused_alive_seq, alive_log_probs, unused_finished_seq, """Checking termination condition. We terminate when we decoded up to decode_length or the lowest scoring item - in finished has a greater score that the higest prob item in alive divided + in finished has a greater score than the highest prob item in alive divided by the max length penalty Args: @@ -488,7 +488,7 @@ def _is_finished(i, unused_alive_seq, alive_log_probs, unused_finished_seq, if not stop_early: return tf.less(i, decode_length) max_length_penalty = tf.pow(((5.
+ tf.to_float(decode_length)) / 6.), alpha) - # The best possible score of the most likley alive sequence + # The best possible score of the most likely alive sequence. lower_bound_alive_scores = alive_log_probs[:, 0] / max_length_penalty # Now to compute the lowest score of a finished sequence in finished diff --git a/tensor2tensor/utils/bleu_hook.py b/tensor2tensor/utils/bleu_hook.py index 2c854cdba..925d0cfae 100644 --- a/tensor2tensor/utils/bleu_hook.py +++ b/tensor2tensor/utils/bleu_hook.py @@ -39,7 +39,7 @@ def _get_ngrams(segment, max_order): - """Extracts all n-grams upto a given maximum order from an input segment. + """Extracts all n-grams up to a given maximum order from an input segment. Args: segment: text segment from which n-grams will be extracted. @@ -47,7 +47,7 @@ def _get_ngrams(segment, max_order): methods. Returns: - The Counter containing all n-grams upto max_order in segment + The Counter containing all n-grams up to max_order in segment with a count of how many times each n-gram occurred. """ ngram_counts = collections.Counter() @@ -130,7 +130,7 @@ def bleu_score(predictions, labels, **unused_kwargs): and use brevity penalty. Also, this does not have beam search. Args: - predictions: tensor, model predicitons + predictions: tensor, model predictions labels: tensor, gold output. Returns: diff --git a/tensor2tensor/utils/cloud_tpu.py b/tensor2tensor/utils/cloud_tpu.py index 96d011568..1518e69ae 100644 --- a/tensor2tensor/utils/cloud_tpu.py +++ b/tensor2tensor/utils/cloud_tpu.py @@ -305,7 +305,7 @@ def tpu_tunnel(vm_name, tpu_ip): time.sleep(1) if tunnel_process.poll() is not None: raise ValueError("SSH failed") - tf.logging.info("Set up port fowarding. Local ports: %s", local_ports) + tf.logging.info("Set up port forwarding. Local ports: %s", local_ports) yield local_ports, tunnel_process.pid diff --git a/tensor2tensor/utils/data_reader.py b/tensor2tensor/utils/data_reader.py index 8cece2625..09eb9417e 100644 --- a/tensor2tensor/utils/data_reader.py +++ b/tensor2tensor/utils/data_reader.py @@ -177,7 +177,7 @@ def _batching_scheme(batch_size, ] max_batch_size = max(batch_sizes) # Since the Datasets API only allows a single constant for window_size, - # and it needs divide all bucket_batch_sizes, we pick a highly-compoisite + # and it needs divide all bucket_batch_sizes, we pick a highly-composite # window size and then round down all batch sizes to divisors of that window # size, so that a window can always be divided evenly into batches. # TODO(noam): remove this when Dataset API improves. diff --git a/tensor2tensor/utils/decoding.py b/tensor2tensor/utils/decoding.py index 437463514..65616191c 100644 --- a/tensor2tensor/utils/decoding.py +++ b/tensor2tensor/utils/decoding.py @@ -343,7 +343,7 @@ def input_fn(): return input_fn -def decode_interactively(estimator, hparams, decode_hp): +def decode_interactively(estimator, hparams, decode_hp, checkpoint_path=None): """Interactive decoding.""" def input_fn(): @@ -353,7 +353,7 @@ def input_fn(): example = _interactive_input_tensor_to_features_dict(example, hparams) return example - result_iter = estimator.predict(input_fn) + result_iter = estimator.predict(input_fn, checkpoint_path=checkpoint_path) for result in result_iter: problem_idx = result["problem_choice"] is_image = False # TODO(lukaszkaiser): find out from problem id / class. 
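For reference on the beam_search hunks above: both length_penalty and max_length_penalty instantiate lp(i) = ((5 + i) / 6) ** alpha from https://arxiv.org/abs/1609.08144, and _is_finished compares the worst finished score against the best score an alive hypothesis could still reach. A minimal pure-Python sketch of that bound, with made-up numbers (the real code operates on TF tensors):

```python
def length_penalty(length, alpha):
  # lp(i) = ((5 + i) / 6) ** alpha
  return ((5.0 + length) / 6.0) ** alpha

alpha, decode_length = 0.6, 50
best_alive_log_prob = -4.2    # log prob of the top alive hypothesis
lowest_finished_score = -0.9  # worst score kept among finished hypotheses

# Log probs only decrease as a hypothesis grows, so dividing the current
# best alive log prob by the maximum length penalty bounds its final score.
# If even that bound cannot beat the worst finished score, stop early.
bound = best_alive_log_prob / length_penalty(decode_length, alpha)
print(lowest_finished_score > bound)  # True -> decoding can stop
```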
diff --git a/tensor2tensor/utils/diet.py b/tensor2tensor/utils/diet.py index 7f1915e4d..df78c77a1 100644 --- a/tensor2tensor/utils/diet.py +++ b/tensor2tensor/utils/diet.py @@ -54,7 +54,7 @@ def diet_expert(x, hidden_size, params): """A two-layer feed-forward network with relu activation on hidden layer. Uses diet variables. - Recompuets hidden layer on backprop to save activation memory. + Recomputes hidden layer on backprop to save activation memory. Args: x: a Tensor with shape [batch, io_size] @@ -120,7 +120,7 @@ class DietAdamOptimizer(DietVariableOptimizer): Diet variables should be created with the DietAdamOptimizer.get_variable() method. The resulting variables - have extra fields pointing to the otpimizer and to the accumulator + have extra fields pointing to the optimizer and to the accumulator slots. The variable is kept in quantized form, so you need to call @@ -135,7 +135,7 @@ class DietAdamOptimizer(DietVariableOptimizer): diet_expert() for an example of how all of this is done. To facilitate fixed-point quantization and to make it easier to - choose a learning rate, all varaibles are initialized with unit + choose a learning rate, all variables are initialized with unit normal initialization. If you want smaller values, downscale on the outside. """ @@ -185,7 +185,7 @@ def update_variable(self, var, grad_var): global_step**-0.5) else: assert params.learning_rate_decay_scheme == "none" - lrate *= tf.minumum(global_step / params.learning_rate_warmup_steps, 1.0) + lrate *= tf.minimum(global_step / params.learning_rate_warmup_steps, 1.0) # compute adjustment due to second moment slots = params.slots[var.op.name] diff --git a/tensor2tensor/utils/expert_utils.py b/tensor2tensor/utils/expert_utils.py index 1d465b8e7..3f49aa6c2 100644 --- a/tensor2tensor/utils/expert_utils.py +++ b/tensor2tensor/utils/expert_utils.py @@ -412,7 +412,7 @@ def _my_top_k(x, k): tf.nn.top_k is implemented for GPU, but the gradient, sparse_to_dense, seems not to be, so if we use tf.nn.top_k, then both the top_k and its gradient go on cpu. Once this is not an issue, this function becomes - obselete and should be replaced by tf.nn.top_k. + obsolete and should be replaced by tf.nn.top_k. Args: x: a 2d Tensor. diff --git a/tensor2tensor/utils/flags.py b/tensor2tensor/utils/flags.py index 2017cf019..08b40efdf 100644 --- a/tensor2tensor/utils/flags.py +++ b/tensor2tensor/utils/flags.py @@ -101,7 +101,7 @@ flags.DEFINE_float("worker_gpu_memory_fraction", 0.95, "Fraction of GPU memory to allocate.") flags.DEFINE_integer("ps_gpu", 0, "How many GPUs to use per ps.") -flags.DEFINE_string("gpu_order", "", "Optional order for daisy-chaining gpus." +flags.DEFINE_string("gpu_order", "", "Optional order for daisy-chaining GPUs." " e.g. \"1 3 2 4\"") flags.DEFINE_string("ps_job", "/job:ps", "name of ps job") flags.DEFINE_integer("ps_replicas", 0, "How many ps replicas.") diff --git a/tensor2tensor/utils/rouge.py b/tensor2tensor/utils/rouge.py index 627b8d2ea..cb3c9af4b 100644 --- a/tensor2tensor/utils/rouge.py +++ b/tensor2tensor/utils/rouge.py @@ -16,7 +16,7 @@ # coding=utf-8 """ROUGE metric implementation. -This is a modified and slightly extended verison of +This is a modified and slightly extended version of https://github.com/miso-belica/sumy/blob/dev/sumy/evaluation/rouge.py. 
""" @@ -116,7 +116,7 @@ def rouge_l_sentence_level(eval_sentences, ref_sentences): Args: eval_sentences: The sentences that have been picked by the summarizer - ref_sentences: The sentences from the referene set + ref_sentences: The sentences from the reference set Returns: A float: F_lcs @@ -138,7 +138,7 @@ def rouge_l_fscore(predictions, labels, **unused_kwargs): or decode the ids and tokenize the output. Args: - predictions: tensor, model predicitons + predictions: tensor, model predictions labels: tensor, gold output. Returns: @@ -221,7 +221,7 @@ def rouge_2_fscore(predictions, labels, **unused_kwargs): or decode the ids and tokenize the output. Args: - predictions: tensor, model predicitons + predictions: tensor, model predictions labels: tensor, gold output. Returns: diff --git a/tensor2tensor/utils/t2t_model.py b/tensor2tensor/utils/t2t_model.py index 716a6321d..a9c18399f 100644 --- a/tensor2tensor/utils/t2t_model.py +++ b/tensor2tensor/utils/t2t_model.py @@ -499,7 +499,7 @@ def infer(self, beam_size: number of beams. top_beams: an integer. How many of the beams to return. alpha: Float that controls the length penalty. larger the alpha, stronger - the preference for slonger translations. + the preference for longer translations. Returns: A dict of decoding results { @@ -549,7 +549,7 @@ def _beam_decode(self, features, decode_length, beam_size, top_beams, alpha): beam_size: number of beams. top_beams: an integer. How many of the beams to return. alpha: Float that controls the length penalty. larger the alpha, stronger - the preference for slonger translations. + the preference for longer translations. Returns: samples: an integer `Tensor`. Top samples from the beam search @@ -569,7 +569,7 @@ def _beam_decode_slow(self, features, decode_length, beam_size, top_beams, beam_size: number of beams. top_beams: an integer. How many of the beams to return. alpha: Float that controls the length penalty. larger the alpha, stronger - the preference for slonger translations. + the preference for longer translations. Returns: samples: an integer `Tensor`. Top samples from the beam search @@ -765,8 +765,8 @@ def fn_not_eos(): tf.squeeze(result[:, -1, :, :]), text_encoder.EOS_ID) not_eos = tf.cond( - # We only check for early stoping if there is at least 1 element ( - # otherwise not_eos will crash) + # We only check for early stopping if there is at least 1 element ( + # otherwise not_eos will crash). tf.not_equal(length, 0), fn_not_eos, lambda: True, @@ -774,7 +774,7 @@ def fn_not_eos(): return tf.cond( tf.equal(batch_size, 1), - # If batch_size == 1, we check EOS for early stoping + # If batch_size == 1, we check EOS for early stopping. lambda: tf.logical_and(not_overflow, not_eos), # Else, just wait for max length lambda: not_overflow) diff --git a/tensor2tensor/utils/yellowfin.py b/tensor2tensor/utils/yellowfin.py index 87b3ec9dd..8653d2baf 100644 --- a/tensor2tensor/utils/yellowfin.py +++ b/tensor2tensor/utils/yellowfin.py @@ -57,8 +57,8 @@ def __init__(self, Set to 1.0 in the paper. momentum: A Tensor or a floating point value. The momentum. Set to 0.0 in the paper. - clip_thresh: A Tensor or a floating point value. The cliping threshold for - `tf.clip_by_global_norm`. If None, no clipping will be carried out. + clip_thresh: A Tensor or a floating point value. The clipping threshold + for `tf.clip_by_global_norm`. If None, no clipping will be carried out. beta: A float value or a constant float tensor. The smoothing parameter for estimations. 
curvature_window_width: An int value or a constant int tensor. @@ -358,7 +358,7 @@ def _get_cubic_root(self): # We substitute x, which is sqrt(mu), with x = y + 1. # It gives y^3 + py = q # where p = (D^2 h_min^2)/(2*C) and q = -p. - # We use the Vieta's substution to compute the root. + # We use Vieta's substitution to compute the root. # There is only one real solution y (which is in [0, 1] ). # http://mathworld.wolfram.com/VietasSubstitution.html assert_array = [ @@ -390,7 +390,7 @@ return x def _get_lr_tensor(self): - """Get lr minimzing the surrogate. + """Get lr minimizing the surrogate. Returns: The lr_t. @@ -461,7 +461,7 @@ def get_name(self): return self._momentum_optimizer.get_name() def apply_gradients(self, grads_and_vars, global_step=None, name=None): - """Applying gradients aand tune hyperparams with YellowFin. + """Apply gradients and tune hyperparams with YellowFin. Args: grads_and_vars: List of (gradient, variable) pairs as returned by @@ -501,7 +501,7 @@ def apply_gradients(self, grads_and_vars, global_step=None, name=None): # Begin lr and mu tuning. with tf.variable_scope("prepare_yellowFin_variables"): # the dependencies ideally only need to be after clip is done, - # i.e. dependes on self._grads. However, the control_dependencies + # i.e. depends on self._grads. However, the control_dependencies # does not support indexed slice for sparse gradients. # The alternative dependencies here might be slightly slower due # to less parallelization. @@ -543,7 +543,7 @@ def compute_gradients(self, Can be GATE_NONE, GATE_OP, or GATE_GRAPH. aggregation_method: Specifies the method used to combine gradient terms. Valid values are defined in the class AggregationMethod. - colocate_gradients_with_ops: If True, try colocating gradients with + colocate_gradients_with_ops: If True, try collocating gradients with the corresponding op. name: Optional name for the returned operation. Default to the name passed to the Optimizer constructor. @@ -571,7 +571,7 @@ def minimize(self, colocate_gradients_with_ops=False, name=None, grad_loss=None): - """Adapted from Tensorflow Optimizer base class member function. + """Adapted from TensorFlow Optimizer base class member function. Add operations to minimize `loss` by updating `var_list`. This method simply combines calls `compute_gradients()` and @@ -590,7 +590,7 @@ def minimize(self, Can be GATE_NONE, GATE_OP, or GATE_GRAPH. aggregation_method: Specifies the method used to combine gradient terms. Valid values are defined in the class AggregationMethod. - colocate_gradients_with_ops: If True, try colocating gradients with + colocate_gradients_with_ops: If True, try collocating gradients with the corresponding op. name: Optional name for the returned operation. grad_loss: Optional. A Tensor holding the gradient computed for loss. diff --git a/tensor2tensor/visualization/attention.py b/tensor2tensor/visualization/attention.py index e2a0a0551..56ece8154 100644 --- a/tensor2tensor/visualization/attention.py +++ b/tensor2tensor/visualization/attention.py @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -"""Module for postprocessing and displaying tranformer attentions. +"""Module for postprocessing and displaying transformer attentions. This module is designed to be called from an ipython notebook.
""" diff --git a/tensor2tensor/visualization/visualization.py b/tensor2tensor/visualization/visualization.py index f11074a96..119e7dbb3 100644 --- a/tensor2tensor/visualization/visualization.py +++ b/tensor2tensor/visualization/visualization.py @@ -73,7 +73,7 @@ def get_vis_data_from_string(self, sess, input_string): Args: sess: A tf.Session object. - input_string: The input setence to be translated and visulized. + input_string: The input sentence to be translated and visualized. Returns: Tuple of ( @@ -114,14 +114,14 @@ def get_vis_data_from_string(self, sess, input_string): def build_model(hparams_set, model_name, data_dir, problem_name, beam_size=1): - """Build the graph required to featch the attention weights. + """Build the graph required to fetch the attention weights. Args: hparams_set: HParams set to build the model with. model_name: Name of model. - data_dir: Path to directory contatining training data. + data_dir: Path to directory containing training data. problem_name: Name of problem. - beam_size: (Optional) Number of beams to use when decoding a traslation. + beam_size: (Optional) Number of beams to use when decoding a translation. If set to 1 (default) then greedy decoding is used. Returns: @@ -147,7 +147,7 @@ def build_model(hparams_set, model_name, data_dir, problem_name, beam_size=1): # Must be called after building the training graph, so that the dict will # have been filled with the attention tensors. BUT before creating the - # interence graph otherwise the dict will be filled with tensors from + # inference graph otherwise the dict will be filled with tensors from # inside a tf.while_loop from decoding and are marked unfetchable. att_mats = get_att_mats(translate_model) From afb5c2240cd7b3c4d4fef116b8b06f965a0516b2 Mon Sep 17 00:00:00 2001 From: Aidan Gomez Date: Wed, 11 Apr 2018 18:26:41 -0700 Subject: [PATCH 14/29] Add distillation to T2T PiperOrigin-RevId: 192542052 --- tensor2tensor/bin/t2t_distill.py | 95 ++++++++++++ tensor2tensor/data_generators/cifar.py | 114 +++++++++++++- tensor2tensor/models/__init__.py | 1 + tensor2tensor/models/distillation.py | 196 +++++++++++++++++++++++++ tensor2tensor/models/resnet.py | 72 +++++---- tensor2tensor/models/resnet_test.py | 1 + tensor2tensor/utils/learning_rate.py | 1 + 7 files changed, 451 insertions(+), 29 deletions(-) create mode 100644 tensor2tensor/bin/t2t_distill.py create mode 100644 tensor2tensor/models/distillation.py diff --git a/tensor2tensor/bin/t2t_distill.py b/tensor2tensor/bin/t2t_distill.py new file mode 100644 index 000000000..be31a2ba7 --- /dev/null +++ b/tensor2tensor/bin/t2t_distill.py @@ -0,0 +1,95 @@ +# coding=utf-8 +# Copyright 2018 The Tensor2Tensor Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +r"""Perform distillation for a teacher to student. + +This script is intended to be used with --model=distillation. See the model for +example hyperparameters and usage. 
+""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os + +# Dependency imports + +from tensor2tensor import models # pylint: disable=unused-import +from tensor2tensor import problems as problems_lib # pylint: disable=unused-import +from tensor2tensor.bin import t2t_trainer +from tensor2tensor.utils import cloud_mlengine +from tensor2tensor.utils import flags as t2t_flags # pylint: disable=unused-import +from tensor2tensor.utils import trainer_lib +from tensor2tensor.utils import usr_dir + +import tensorflow as tf + +flags = tf.flags +FLAGS = flags.FLAGS + + +def main(argv): + tf.logging.set_verbosity(tf.logging.INFO) + trainer_lib.set_random_seed(FLAGS.random_seed) + usr_dir.import_usr_dir(FLAGS.t2t_usr_dir) + t2t_trainer.log_registry() + + if FLAGS.cloud_mlengine: + return cloud_mlengine.launch() + + if FLAGS.generate_data: + t2t_trainer.generate_data() + + if cloud_mlengine.job_dir(): + FLAGS.output_dir = cloud_mlengine.job_dir() + + if argv: + t2t_trainer.set_hparams_from_args(argv[1:]) + + with t2t_trainer.maybe_cloud_tpu(): + root_output_dir = FLAGS.output_dir + + # Train Teacher ============ + hparams = t2t_trainer.create_hparams() + hparams.distill_phase = "train" + teacher_dir = os.path.join(root_output_dir, "teacher") + FLAGS.output_dir = teacher_dir + + exp_fn = t2t_trainer.create_experiment_fn() + run_config = t2t_trainer.create_run_config(hparams) + exp = exp_fn(run_config, hparams) + if t2t_trainer.is_chief(): + t2t_trainer.save_metadata(hparams) + t2t_trainer.execute_schedule(exp) + # ========================== + # Train Student ============ + hparams = t2t_trainer.create_hparams() + hparams.add_hparam("teacher_dir", teacher_dir) + hparams.distill_phase = "distill" + student_dir = os.path.join(root_output_dir, "student") + FLAGS.output_dir = student_dir + + exp_fn = t2t_trainer.create_experiment_fn() + run_config = t2t_trainer.create_run_config(hparams) + exp = exp_fn(run_config, hparams) + + if t2t_trainer.is_chief(): + t2t_trainer.save_metadata(hparams) + t2t_trainer.execute_schedule(exp) + # ========================== + + +if __name__ == "__main__": + tf.app.run() diff --git a/tensor2tensor/data_generators/cifar.py b/tensor2tensor/data_generators/cifar.py index 3dd5c8f39..9a1a80ef3 100644 --- a/tensor2tensor/data_generators/cifar.py +++ b/tensor2tensor/data_generators/cifar.py @@ -77,12 +77,17 @@ def cifar_generator(cifar_version, tmp_dir, training, how_many, start_from=0): test_files = _CIFAR10_TEST_FILES prefix = _CIFAR10_PREFIX image_size = _CIFAR10_IMAGE_SIZE - elif cifar_version == "cifar100": + label_key = "labels" + elif cifar_version == "cifar100" or cifar_version == "cifar20": url = _CIFAR100_URL train_files = _CIFAR100_TRAIN_FILES test_files = _CIFAR100_TEST_FILES prefix = _CIFAR100_PREFIX image_size = _CIFAR100_IMAGE_SIZE + if cifar_version == "cifar100": + label_key = "fine_labels" + else: + label_key = "coarse_labels" _get_cifar(tmp_dir, url) data_files = train_files if training else test_files @@ -97,7 +102,7 @@ def cifar_generator(cifar_version, tmp_dir, training, how_many, start_from=0): all_images.extend([ np.squeeze(images[j]).transpose((1, 2, 0)) for j in xrange(num_images) ]) - labels = data["labels" if cifar_version == "cifar10" else "fine_labels"] + labels = data[label_key] all_labels.extend([labels[j] for j in xrange(num_images)]) return image_utils.image_generator( all_images[start_from:start_from + how_many], @@ -417,3 +422,108 @@ def hparams(self, defaults, 
unused_model_hparams): p.max_expected_batch_size_per_shard = 4 p.input_space_id = 1 p.target_space_id = 1 + + +@registry.register_problem +class ImageCifar20Tune(mnist.ImageMnistTune): + """Cifar-20 Tune.""" + + @property + def num_classes(self): + return 20 + + @property + def num_channels(self): + return 3 + + @property + def class_labels(self): + return [ + "aquatic mammals", + "fish", + "flowers", + "food containers", + "fruit and vegetables", + "household electrical devices", + "household furniture", + "insects", + "large carnivores", + "large man-made outdoor things", + "large natural outdoor scenes", + "large omnivores and herbivores", + "medium-sized mammals", + "non-insect invertebrates", + "people", + "reptiles", + "small mammals", + "trees", + "vehicles 1", + "vehicles 2", + ] + + def preprocess_example(self, example, mode, unused_hparams): + image = example["inputs"] + image.set_shape([_CIFAR100_IMAGE_SIZE, _CIFAR100_IMAGE_SIZE, 3]) + if mode == tf.estimator.ModeKeys.TRAIN: + image = image_utils.cifar_image_augmentation(image) + if not self._was_reversed: + image = tf.image.per_image_standardization(image) + example["inputs"] = image + return example + + def generator(self, data_dir, tmp_dir, is_training): + if is_training: + return cifar_generator("cifar20", tmp_dir, True, 48000) + else: + return cifar_generator("cifar20", tmp_dir, True, 2000, 48000) + + +@registry.register_problem +class ImageCifar20(ImageCifar20Tune): + + def generator(self, data_dir, tmp_dir, is_training): + if is_training: + return cifar_generator("cifar20", tmp_dir, True, 50000) + else: + return cifar_generator("cifar20", tmp_dir, False, 10000) + + +@registry.register_problem +class ImageCifar20Plain(ImageCifar20): + + def preprocess_example(self, example, mode, unused_hparams): + image = example["inputs"] + image.set_shape([_CIFAR100_IMAGE_SIZE, _CIFAR100_IMAGE_SIZE, 3]) + if not self._was_reversed: + image = tf.image.per_image_standardization(image) + example["inputs"] = image + return example + + +@registry.register_problem +class ImageCifar20PlainGen(ImageCifar20Plain): + """CIFAR-20 32x32 for image generation without standardization preprep.""" + + def dataset_filename(self): + return "image_cifar20_plain" # Reuse CIFAR-20 plain data. + + def preprocess_example(self, example, mode, unused_hparams): + example["inputs"].set_shape([_CIFAR100_IMAGE_SIZE, _CIFAR100_IMAGE_SIZE, 3]) + example["inputs"] = tf.to_int64(example["inputs"]) + return example + + +@registry.register_problem +class ImageCifar20Plain8(ImageCifar20): + """CIFAR-20 rescaled to 8x8 for output: Conditional image generation.""" + + def dataset_filename(self): + return "image_cifar20_plain" # Reuse CIFAR-20 plain data. 
+ + def preprocess_example(self, example, mode, unused_hparams): + image = example["inputs"] + image = image_utils.resize_by_area(image, 8) + if not self._was_reversed: + image = tf.image.per_image_standardization(image) + example["inputs"] = image + return example diff --git a/tensor2tensor/models/__init__.py b/tensor2tensor/models/__init__.py index 301c6d42a..c76570d9a 100644 --- a/tensor2tensor/models/__init__.py +++ b/tensor2tensor/models/__init__.py @@ -25,6 +25,7 @@ from tensor2tensor.layers import modalities from tensor2tensor.models import basic from tensor2tensor.models import bytenet +from tensor2tensor.models import distillation from tensor2tensor.models import image_transformer from tensor2tensor.models import image_transformer_2d from tensor2tensor.models import lstm diff --git a/tensor2tensor/models/distillation.py b/tensor2tensor/models/distillation.py new file mode 100644 index 000000000..3f468f54c --- /dev/null +++ b/tensor2tensor/models/distillation.py @@ -0,0 +1,196 @@ +# coding=utf-8 +# Copyright 2018 The Tensor2Tensor Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Traditional Student-Teacher Distillation.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +# Dependency imports + +from tensor2tensor.layers import common_hparams +from tensor2tensor.utils import registry +from tensor2tensor.utils import t2t_model + +import tensorflow as tf + + +@registry.register_model +class Distillation(t2t_model.T2TModel): + """Distillation from a teacher to a student network. + + First, a teacher is trained on a task; second, a student is trained to + perform the task while matching the teacher's softened outputs. For more + details, see the paper below. + + The hparams passed to this model should include the desired + {teacher/student}_model and {teacher/student}_hparams to be used. Also, + specify the distillation temperature and task-distillation balance.
+ + Distilling the Knowledge in a Neural Network + Hinton, Vinyals and Dean + https://arxiv.org/abs/1503.02531 + """ + + def __init__(self, + hparams, + mode=tf.estimator.ModeKeys.TRAIN, + problem_hparams=None, + data_parallelism=None, + decode_hparams=None): + assert hparams.distill_phase in ["train", "distill"] + + if hparams.distill_phase == "train" and hparams.teacher_learning_rate: + hparams.learning_rate = hparams.teacher_learning_rate + elif hparams.distill_phase == "distill" and hparams.student_learning_rate: + hparams.learning_rate = hparams.student_learning_rate + + self.teacher_hparams = registry.hparams(hparams.teacher_hparams)() + self.teacher_model = registry.model( + hparams.teacher_model)(self.teacher_hparams, mode, problem_hparams, + data_parallelism, decode_hparams) + self.student_hparams = registry.hparams(hparams.student_hparams)() + self.student_model = registry.model( + hparams.student_model)(self.student_hparams, mode, problem_hparams, + data_parallelism, decode_hparams) + super(Distillation, self).__init__(hparams, mode, problem_hparams, + data_parallelism, decode_hparams) + + def body(self, features): + hp = self.hparams + is_distill = hp.distill_phase == "distill" + + targets = features["targets_raw"] + targets = tf.squeeze(targets, [1, 2, 3]) + one_hot_targets = tf.one_hot(targets, hp.num_classes, dtype=tf.float32) + + # Teacher Network + with tf.variable_scope("teacher"): + teacher_outputs = self.teacher_model.body(features) + tf.logging.info("teacher output shape: %s" % teacher_outputs.get_shape()) + teacher_outputs = tf.reduce_mean(teacher_outputs, axis=[1, 2]) + teacher_logits = tf.layers.dense(teacher_outputs, hp.num_classes) + + teacher_task_xent = tf.nn.softmax_cross_entropy_with_logits_v2( + labels=one_hot_targets, logits=teacher_logits) + outputs = teacher_logits + + if is_distill: + # Load teacher weights + tf.train.init_from_checkpoint(hp.teacher_dir, {"teacher/": "teacher/"}) + # Do not train the teacher + trainable_vars = tf.get_collection_ref(tf.GraphKeys.TRAINABLE_VARIABLES) + del trainable_vars[:] + + # Student Network + if is_distill: + with tf.variable_scope("student"): + student_outputs = self.student_model.body(features) + tf.logging.info( + "student output shape: %s" % student_outputs.get_shape()) + student_outputs = tf.reduce_mean(student_outputs, axis=[1, 2]) + student_logits = tf.layers.dense(student_outputs, hp.num_classes) + + student_task_xent = tf.nn.softmax_cross_entropy_with_logits_v2( + labels=one_hot_targets, logits=student_logits) + teacher_targets = tf.nn.softmax(teacher_logits / hp.distill_temperature) + student_distill_xent = tf.nn.softmax_cross_entropy_with_logits_v2( + labels=tf.stop_gradient(teacher_targets), logits=student_logits) + + outputs = student_logits + + # Summaries + tf.summary.scalar("distill_xent", student_distill_xent) + + if not is_distill: + phase_loss = teacher_task_xent + else: + phase_loss = hp.task_balance * student_task_xent + phase_loss += (1 - hp.task_balance) * student_distill_xent + + losses = {"training": phase_loss} + outputs = tf.reshape(outputs, [-1, 1, 1, 1, outputs.shape[1]]) + + return outputs, losses + + def top(self, body_output, features): + return body_output + + +def distill_base(): + """Set of hyperparameters.""" + # Base + hparams = common_hparams.basic_params1() + + # teacher/student parameters + hparams.add_hparam("teacher_model", "") + hparams.add_hparam("teacher_hparams", "") + hparams.add_hparam("student_model", "") + hparams.add_hparam("student_hparams", "") + + # Distillation 
parameters
+  # WARNING: distill_phase hparam will be overwritten in /bin/t2t_distill.py
+  hparams.add_hparam("distill_phase", None)
+  hparams.add_hparam("task_balance", 1.0)
+  hparams.add_hparam("distill_temperature", 1.0)
+  hparams.add_hparam("num_classes", 10)
+
+  # Optional phase-specific hyperparameters
+  hparams.add_hparam("teacher_learning_rate", None)
+  hparams.add_hparam("student_learning_rate", None)
+
+  # Training parameters (stolen from ResNet)
+  hparams.batch_size = 128
+  hparams.optimizer = "Momentum"
+  hparams.optimizer_momentum_momentum = 0.9
+  hparams.optimizer_momentum_nesterov = True
+  hparams.weight_decay = 1e-4
+  hparams.clip_grad_norm = 0.0
+  # (base_lr=0.1) * (batch_size=128*8 (on TPU, or 8 GPUs)=1024) / (256.)
+  hparams.learning_rate = 0.4
+  hparams.learning_rate_decay_scheme = "cosine"
+  # For image_imagenet224, 120k training steps, which effectively makes this a
+  # cosine decay (i.e. no cycles).
+  hparams.learning_rate_cosine_cycle_steps = 120000
+  hparams.initializer = "normal_unit_scaling"
+  hparams.initializer_gain = 2.
+
+  return hparams
+
+
+@registry.register_hparams
+def distill_resnet_32_to_15_cifar20x5():
+  """Set of hyperparameters."""
+  hparams = distill_base()
+  hparams.teacher_model = "resnet"
+  hparams.teacher_hparams = "resnet_cifar_32"
+  hparams.student_model = "resnet"
+  hparams.student_hparams = "resnet_cifar_15"
+
+  hparams.optimizer_momentum_nesterov = True
+  # (base_lr=0.1) * (batch_size=128*8 (on TPU, or 8 GPUs)=1024) / (256.)
+  hparams.teacher_learning_rate = 0.25 * 128. * 8. / 256.
+  hparams.student_learning_rate = 0.2 * 128. * 8. / 256.
+  hparams.learning_rate_decay_scheme = "piecewise"
+  hparams.add_hparam("learning_rate_boundaries", [40000, 60000, 80000])
+  hparams.add_hparam("learning_rate_multiples", [0.1, 0.01, 0.001])
+
+  hparams.task_balance = 0.28
+  hparams.distill_temperature = 2.0
+
+  hparams.num_classes = 20
+
+  return hparams
diff --git a/tensor2tensor/models/resnet.py b/tensor2tensor/models/resnet.py
index 39f3dd723..d889ba328 100644
--- a/tensor2tensor/models/resnet.py
+++ b/tensor2tensor/models/resnet.py
@@ -313,6 +313,7 @@ def projection_shortcut(inputs):
 def resnet_v2(inputs,
               block_fn,
               layers,
+              filters,
               data_format="channels_first",
               is_training=False):
   """Resnet model.
@@ -321,9 +322,11 @@
     inputs: `Tensor` images.
     block_fn: `function` for the block to use within the model. Either
       `residual_block` or `bottleneck_block`.
-    layers: list of 4 `int`s denoting the number of blocks to include in each
-      of the 4 block groups. Each group consists of blocks that take inputs of
-      the same resolution.
+    layers: list of 3 or 4 `int`s denoting the number of blocks to include in
+      each of the 3 or 4 block groups. Each group consists of blocks that take
+      inputs of the same resolution.
+    filters: list of 4 or 5 `int`s denoting the number of filters to use in
+      each block group.
     data_format: `str`, "channels_first" `[batch, channels, height, width]` or
       "channels_last" `[batch, height, width, channels]`.
     is_training: bool, build in training mode or not.
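To make the two-phase objective implemented in distillation.py above concrete, here is a minimal standalone sketch of the loss that `Distillation.body` computes in the "distill" phase. Tensor shapes are assumed to be [batch, num_classes], and the helper name `distillation_loss` is ours, not part of the patch:

```python
import tensorflow as tf

def distillation_loss(teacher_logits, student_logits, one_hot_targets,
                      distill_temperature=2.0, task_balance=0.28):
  """Blend the hard-label task loss with the soft-label distillation loss."""
  # Soften the teacher's outputs; stop_gradient keeps the teacher frozen.
  teacher_targets = tf.stop_gradient(
      tf.nn.softmax(teacher_logits / distill_temperature))
  student_task_xent = tf.nn.softmax_cross_entropy_with_logits_v2(
      labels=one_hot_targets, logits=student_logits)
  student_distill_xent = tf.nn.softmax_cross_entropy_with_logits_v2(
      labels=teacher_targets, logits=student_logits)
  # Same weighting as the "distill" branch of Distillation.body above.
  return (task_balance * student_task_xent +
          (1.0 - task_balance) * student_distill_xent)
```

The defaults mirror `distill_resnet_32_to_15_cifar20x5` above, where `task_balance=0.28` and `distill_temperature=2.0`.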
@@ -333,7 +336,7 @@ def resnet_v2(inputs, """ inputs = conv2d_fixed_padding( inputs=inputs, - filters=64, + filters=filters[0], kernel_size=7, strides=2, data_format=data_format) @@ -350,7 +353,7 @@ def resnet_v2(inputs, inputs = block_layer( inputs=inputs, - filters=64, + filters=filters[1], block_fn=block_fn, blocks=layers[0], strides=1, @@ -359,7 +362,7 @@ def resnet_v2(inputs, data_format=data_format) inputs = block_layer( inputs=inputs, - filters=128, + filters=filters[2], block_fn=block_fn, blocks=layers[1], strides=2, @@ -368,32 +371,24 @@ def resnet_v2(inputs, data_format=data_format) inputs = block_layer( inputs=inputs, - filters=256, + filters=filters[3], block_fn=block_fn, blocks=layers[2], strides=2, is_training=is_training, name="block_layer3", data_format=data_format) - inputs = block_layer( - inputs=inputs, - filters=512, - block_fn=block_fn, - blocks=layers[3], - strides=2, - is_training=is_training, - name="block_layer4", - data_format=data_format) + if filters[4]: + inputs = block_layer( + inputs=inputs, + filters=filters[4], + block_fn=block_fn, + blocks=layers[3], + strides=2, + is_training=is_training, + name="block_layer4", + data_format=data_format) - inputs = tf.layers.average_pooling2d( - inputs=inputs, - pool_size=7, - strides=1, - padding="VALID", - data_format=data_format) - inputs = tf.identity(inputs, "final_avg_pool") - inputs = tf.reshape(inputs, - [-1, 2048 if block_fn is bottleneck_block else 512]) return inputs @@ -421,11 +416,10 @@ def body(self, features): inputs, block_fns[hp.block_fn], hp.layer_sizes, + hp.filter_sizes, data_format, is_training=hp.mode == tf.estimator.ModeKeys.TRAIN) - out = tf.expand_dims(out, 1) - out = tf.expand_dims(out, 1) return out @@ -440,6 +434,7 @@ def resnet_base(): # Model-specific parameters hparams.add_hparam("layer_sizes", [3, 4, 6, 3]) + hparams.add_hparam("filter_sizes", [64, 64, 128, 256, 512]) hparams.add_hparam("block_fn", "bottleneck") hparams.add_hparam("use_nchw", True) @@ -478,6 +473,29 @@ def resnet_18(): return hp +@registry.register_hparams +def resnet_cifar_15(): + """Set of hyperparameters.""" + hp = resnet_base() + hp.block_fn = "residual" + hp.layer_sizes = [2, 2, 2] + hp.filter_sizes = [16, 16, 32, 64, None] + + hp.learning_rate = 0.1 * 128. * 8. / 256. 
+ hp.learning_rate_decay_scheme = "piecewise" + hp.add_hparam("learning_rate_boundaries", [40000, 60000, 80000]) + hp.add_hparam("learning_rate_multiples", [0.1, 0.01, 0.001]) + + return hp + + +@registry.register_hparams +def resnet_cifar_32(): + hp = resnet_cifar_15() + hp.layer_sizes = [5, 5, 5] + return hp + + @registry.register_hparams def resnet_34(): hp = resnet_base() diff --git a/tensor2tensor/models/resnet_test.py b/tensor2tensor/models/resnet_test.py index 12aca39bb..c5c7312da 100644 --- a/tensor2tensor/models/resnet_test.py +++ b/tensor2tensor/models/resnet_test.py @@ -49,6 +49,7 @@ def _testResnet(self, img_size, output_size): hparams = resnet_tiny_cpu() p_hparams = problem_hparams.test_problem_hparams(vocab_size, vocab_size) p_hparams.input_modality["inputs"] = (registry.Modalities.IMAGE, None) + p_hparams.target_modality = (registry.Modalities.CLASS_LABEL, vocab_size) with self.test_session() as session: features = { "inputs": tf.constant(x, dtype=tf.int32), diff --git a/tensor2tensor/utils/learning_rate.py b/tensor2tensor/utils/learning_rate.py index 169b59348..f7fe33100 100644 --- a/tensor2tensor/utils/learning_rate.py +++ b/tensor2tensor/utils/learning_rate.py @@ -87,6 +87,7 @@ def _piecewise_learning_rate(step, boundaries, values): Scaled value for the learning rate. """ values = [1.0] + values + boundaries = [float(x) for x in boundaries] return tf.train.piecewise_constant( step, boundaries, values, name="piecewise_lr") From feeff2bd8992ad7cfe9f0cd62948b8ec7a92106f Mon Sep 17 00:00:00 2001 From: Lukasz Kaiser Date: Thu, 12 Apr 2018 09:23:01 -0700 Subject: [PATCH 15/29] More spelling corrections. PiperOrigin-RevId: 192621142 --- README.md | 2 +- docs/walkthrough.md | 2 +- tensor2tensor/bin/t2t_bleu.py | 2 +- tensor2tensor/data_generators/algorithmic_math.py | 8 ++++---- tensor2tensor/data_generators/speech_recognition.py | 2 +- tensor2tensor/data_generators/text_encoder.py | 2 +- tensor2tensor/data_generators/translate_enzh.py | 2 +- tensor2tensor/data_generators/wiki.py | 2 +- tensor2tensor/layers/common_attention.py | 12 ++++++------ tensor2tensor/layers/common_hparams.py | 8 ++++---- tensor2tensor/layers/common_layers.py | 2 +- tensor2tensor/models/research/attention_lm.py | 2 +- tensor2tensor/models/research/attention_lm_moe.py | 4 ++-- tensor2tensor/models/research/transformer_moe.py | 2 +- tensor2tensor/models/transformer.py | 10 +++++----- tensor2tensor/models/vanilla_gan.py | 2 +- tensor2tensor/rl/README.md | 4 ++-- tensor2tensor/utils/beam_search.py | 10 +++++----- tensor2tensor/utils/data_reader.py | 2 +- tensor2tensor/utils/expert_utils.py | 6 +++--- tensor2tensor/utils/metrics.py | 2 +- 21 files changed, 44 insertions(+), 44 deletions(-) diff --git a/README.md b/README.md index 889d5ad64..e8d46add5 100644 --- a/README.md +++ b/README.md @@ -328,7 +328,7 @@ inference. Users can easily switch between problems, models, and hyperparameter sets by using the `--model`, `--problems`, and `--hparams_set` flags. Specific hyperparameters can be overridden with the `--hparams` flag. `--schedule` and related flags control local and distributed training/evaluation -([distributed training documentation](https://github.com/tensorflow/tensor2tensor/tree/master/tensor2tensor/g3doc/distributed_training.md)). +([distributed training documentation](https://github.com/tensorflow/tensor2tensor/tree/master/docs/distributed_training.md)). 
## Adding your own components diff --git a/docs/walkthrough.md b/docs/walkthrough.md index 889d5ad64..e8d46add5 100644 --- a/docs/walkthrough.md +++ b/docs/walkthrough.md @@ -328,7 +328,7 @@ inference. Users can easily switch between problems, models, and hyperparameter sets by using the `--model`, `--problems`, and `--hparams_set` flags. Specific hyperparameters can be overridden with the `--hparams` flag. `--schedule` and related flags control local and distributed training/evaluation -([distributed training documentation](https://github.com/tensorflow/tensor2tensor/tree/master/tensor2tensor/g3doc/distributed_training.md)). +([distributed training documentation](https://github.com/tensorflow/tensor2tensor/tree/master/docs/distributed_training.md)). ## Adding your own components diff --git a/tensor2tensor/bin/t2t_bleu.py b/tensor2tensor/bin/t2t_bleu.py index 4eeb84eec..74117454d 100644 --- a/tensor2tensor/bin/t2t_bleu.py +++ b/tensor2tensor/bin/t2t_bleu.py @@ -74,7 +74,7 @@ flags.DEFINE_string("translation", None, "Path to the MT system translation file") flags.DEFINE_string("translations_dir", None, - "Directory with translated files to be evaulated.") + "Directory with translated files to be evaluated.") flags.DEFINE_string("event_dir", None, "Where to store the event file.") flags.DEFINE_string("bleu_variant", "both", diff --git a/tensor2tensor/data_generators/algorithmic_math.py b/tensor2tensor/data_generators/algorithmic_math.py index c3a028b12..689fa4b41 100644 --- a/tensor2tensor/data_generators/algorithmic_math.py +++ b/tensor2tensor/data_generators/algorithmic_math.py @@ -181,7 +181,7 @@ def algebra_inverse_solve(left, right, var, solve_ops): right- Expression on the right side of the op. to_tree- The tree on the other side of the equal sign. The canceled out expression will be moved here. - new_from_tree- The resuling from_tree after the algebraic + new_from_tree- The resulting from_tree after the algebraic manipulation. new_to_tree- The resulting to_tree after the algebraic manipulation. @@ -355,8 +355,8 @@ def generate_calculus_integrate_sample(vlist, ops, min_depth, max_depth, # functions: Dict of special function names. Maps human readable string names to # single char names used in flist. # ops: Dict mapping op symbols (chars) to ExprOp instances. -# solve_ops: Encodes rules for how to algebraicly cancel out each operation. See -# doc-string for `algebra_inverse_solve`. +# solve_ops: Encodes rules for how to algebraically cancel out each operation. +# See doc-string for `algebra_inverse_solve`. # int_encoder: Function that maps a string to a list of tokens. Use this to # encode an expression to feed into a model. # int_decoder: Function that maps a list of tokens to a string. Use this to @@ -377,7 +377,7 @@ def math_dataset_init(alphabet_size=26, digits=None, functions=None): Args: alphabet_size: How many possible variables there are. Max 52. - digits: How many numerical digits to encode as tokens, "0" throuh + digits: How many numerical digits to encode as tokens, "0" through str(digits-1), or None to encode no digits. functions: Defines special functions. A dict mapping human readable string names, like "log", "exp", "sin", "cos", etc., to single chars. 
Each diff --git a/tensor2tensor/data_generators/speech_recognition.py b/tensor2tensor/data_generators/speech_recognition.py index 6c4645a56..2777cd9cf 100644 --- a/tensor2tensor/data_generators/speech_recognition.py +++ b/tensor2tensor/data_generators/speech_recognition.py @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -"""Common classes for automatic speech recogntion (ASR) datasets. +"""Common classes for automatic speech recognition (ASR) datasets. The audio import uses sox to generate normalized waveforms, please install it as appropriate (e.g. using apt-get or yum). diff --git a/tensor2tensor/data_generators/text_encoder.py b/tensor2tensor/data_generators/text_encoder.py index a0059845a..f80416fdd 100644 --- a/tensor2tensor/data_generators/text_encoder.py +++ b/tensor2tensor/data_generators/text_encoder.py @@ -348,7 +348,7 @@ def store_to_file(self, filename): def _escape_token(token, alphabet): """Escape away underscores and OOV characters and append '_'. - This allows the token to be experessed as the concatenation of a list + This allows the token to be expressed as the concatenation of a list of subtokens from the vocabulary. The underscore acts as a sentinel which allows us to invertibly concatenate multiple such lists. diff --git a/tensor2tensor/data_generators/translate_enzh.py b/tensor2tensor/data_generators/translate_enzh.py index 444fc9834..9e2f56e04 100644 --- a/tensor2tensor/data_generators/translate_enzh.py +++ b/tensor2tensor/data_generators/translate_enzh.py @@ -172,7 +172,7 @@ class TranslateEnzhWmt32k(translate.TranslateProblem): CWMT: - http://nlp.nju.edu.cn/cwmt-wmt/ - - Website contrains instructions for FTP server access. + - Website contains instructions for FTP server access. - You'll need to download CASIA, CASICT, DATUM2015, DATUM2017, NEU datasets diff --git a/tensor2tensor/data_generators/wiki.py b/tensor2tensor/data_generators/wiki.py index db18ced36..5222b5a62 100644 --- a/tensor2tensor/data_generators/wiki.py +++ b/tensor2tensor/data_generators/wiki.py @@ -190,7 +190,7 @@ def scramble_fraction(self): @registry.register_problem class LanguagemodelWikiScrambleL1k(LanguagemodelWikiScramble): - """Sequence length 1024, 50% scrambed.""" + """Sequence length 1024, 50% scrambled.""" @property def sequence_length(self): diff --git a/tensor2tensor/layers/common_attention.py b/tensor2tensor/layers/common_attention.py index 8eda5b662..a6b4f919d 100644 --- a/tensor2tensor/layers/common_attention.py +++ b/tensor2tensor/layers/common_attention.py @@ -83,7 +83,7 @@ def register_layer( default_args (list): The default parameters to add to the function. default_kwargs (dict): The default parameters to add to the function. Those arguments can be overwritten when calling the function. - use_dp (bool): Wrap the function call within a dataparalellism object if + use_dp (bool): Wrap the function call within a dataparallelism object if dp is available. Some layers (like MOE) must be called without dp. recompute_grad (bool): If True, recompute the function during the backward pass to save memory @@ -1378,7 +1378,7 @@ def _relative_attention_inner(x, y, z, transpose): x: Tensor with shape [batch_size, heads, length, length or depth]. y: Tensor with shape [batch_size, heads, length, depth]. z: Tensor with shape [length, length, depth]. - transpose: Whether to tranpose inner matrices of y and z. Should be true if + transpose: Whether to transpose inner matrices of y and z. 
Should be true if last dimension of x is depth, not length. Returns: @@ -1422,7 +1422,7 @@ def dot_product_attention_relative(q, k: a Tensor with shape [batch, heads, length, depth]. v: a Tensor with shape [batch, heads, length, depth]. bias: bias Tensor. - max_relative_position: an integer specifying the maxmimum distance between + max_relative_position: an integer specifying the maximum distance between inputs that unique position embeddings should be learned for. dropout_rate: a floating point number. image_shapes: optional tuple of integer scalars. @@ -2141,7 +2141,7 @@ def gather_indices_2d(x, block_shape, block_stride): def make_2d_block_raster_mask(query_shape, memory_flange): - """creates a mask for 2d block raster scany. + """Creates a mask for 2d block raster scan. The query mask can look to the left, top left, top, and top right, but not to the right. Inside the query, we have the standard raster scan @@ -2661,7 +2661,7 @@ def ffn_self_attention_layer(x, We use self-attention to do feedforward computations. We apply this function positionwise where for each position, we linearly transform the output to have depth filter_depth, and break up the result depth-wise into num_parts - contiguous parts. The parts self-attentd, we concatenate the results + contiguous parts. The parts self-attend, we concatenate the results depth-wise, and we linearly transform to a depth of output_depth. The goal is to get multiplicative interactions between components of a representation. @@ -2764,7 +2764,7 @@ def parameter_attention(x, x, total_key_depth, use_bias=False, name="q_transform") if dropout_rate: # This is a cheaper form of attention dropout where we use to use - # the same dropout decisions across batch elemets and query positions, + # the same dropout decisions across batch elements and query positions, # but different decisions across heads and memory positions. v = tf.nn.dropout( v, 1.0 - dropout_rate, noise_shape=[num_heads, memory_rows, 1]) diff --git a/tensor2tensor/layers/common_hparams.py b/tensor2tensor/layers/common_hparams.py index 083634785..3b2d5f802 100644 --- a/tensor2tensor/layers/common_hparams.py +++ b/tensor2tensor/layers/common_hparams.py @@ -102,13 +102,13 @@ def basic_params1(): moe_loss_coef=1e-2, # Sequences of operations to perform on layer input and layer output. # Used by common_layers.layer_preprocess, common_layers.layer_postprocess - # Each character repsesnts an operation: + # Each character represents an operation: # none: no preprocessing # d: apply dropout # n: apply normalization (see norm_type and norm_epsilon) # a: add layer input (residual connection - only during postprocess) # The special string "none" is used instead of the empty string - # to indicate no pre/postprocesisng, since the empty string causes + # to indicate no pre/postprocessing, since the empty string causes # trouble for hyperparameter tuning. # TODO(noam): The current settings ("", "dan") are the published version # of the transformer. ("n", "da") seems better for harder-to-learn @@ -174,13 +174,13 @@ def basic_params1(): # The maximum length of "input" sequence. # Sequences longer than this value will be truncated. 0 or negative values # mean there is no maximum or truncation. - # You can change this behavior by overridding preprocess_example() method + # You can change this behavior by overriding preprocess_example() method # in your problem class. max_input_seq_length=0, # The maximum length of "target" sequence. # Sequences longer than this value will be truncated. 
0 or negative values # mean there is no maximum or truncation. - # You can change this behavior by overridding preprocess_example() method + # You can change this behavior by overriding preprocess_example() method # in your problem class. max_target_seq_length=0, # if nonzero, we split the target sequences on example read. diff --git a/tensor2tensor/layers/common_layers.py b/tensor2tensor/layers/common_layers.py index 16836adee..538f487ee 100644 --- a/tensor2tensor/layers/common_layers.py +++ b/tensor2tensor/layers/common_layers.py @@ -1201,7 +1201,7 @@ def add_timing_signal(x, min_timescale=1, max_timescale=1e4, num_timescales=16): and the target of the attention. The use of relative position is possible because sin(x+y) and cos(x+y) can be - experessed in terms of y, sin(x) and cos(x). + expressed in terms of y, sin(x) and cos(x). In particular, we use a geometric sequence of timescales starting with min_timescale and ending with max_timescale. For each timescale, we diff --git a/tensor2tensor/models/research/attention_lm.py b/tensor2tensor/models/research/attention_lm.py index 30277d6f3..cbc45c4e7 100644 --- a/tensor2tensor/models/research/attention_lm.py +++ b/tensor2tensor/models/research/attention_lm.py @@ -70,7 +70,7 @@ def attention_lm_prepare_decoder(targets, hparams): Returns: decoder_input: a Tensor, bottom of decoder stack decoder_self_attention_bias: a Tensor, containing large negative values - to implement masked attention and possibly baises for diagonal alignments + to implement masked attention and possibly biases for diagonal alignments """ if hparams.prepend_mode == "prepend_inputs_full_attention": decoder_self_attention_bias = ( diff --git a/tensor2tensor/models/research/attention_lm_moe.py b/tensor2tensor/models/research/attention_lm_moe.py index ea65496cb..49ca3d20f 100644 --- a/tensor2tensor/models/research/attention_lm_moe.py +++ b/tensor2tensor/models/research/attention_lm_moe.py @@ -163,7 +163,7 @@ def _diet_expert(x): def print_shape(x, suffix, debug=False): # To help debugging, print the input/output shapes at inference and eval # Inference for long sequences can take a long time, so that's help to - # see the progession of the generation + # see the progression of the generation if not debug and hparams.mode == ModeKeys.TRAIN: return x return tf.Print(x, [tf.shape(x)], "shape_x_{}".format(suffix)) @@ -368,7 +368,7 @@ def attention_lm_moe_prepare_decoder(targets, hparams): Returns: decoder_input: a Tensor, bottom of decoder stack decoder_self_attention_bias: a Tensor, containing large negative values - to implement masked attention and possibly baises for diagonal alignments + to implement masked attention and possibly biases for diagonal alignments pad_remover (expert_utils.PadRemover): an util object to remove padding """ targets_pad_mask = common_attention.embedding_to_padding(targets) diff --git a/tensor2tensor/models/research/transformer_moe.py b/tensor2tensor/models/research/transformer_moe.py index 57d82edf9..02a51dc08 100644 --- a/tensor2tensor/models/research/transformer_moe.py +++ b/tensor2tensor/models/research/transformer_moe.py @@ -46,7 +46,7 @@ # "a/a/a#": Encoder only model (3 layers) # "#a/a/a": Decoder only model (3 layers) # "a/a-moe#a/a/a": Encoder (2 layers with 1 moe), decoder (3 layers) -# Note that all combinaisons are not necessarily possibles (some attention +# Note that all combinations are not necessarily possibles (some attention # types are not necessarily compatible with the encoder, or can't accept certain # types of masking) diff 
--git a/tensor2tensor/models/transformer.py b/tensor2tensor/models/transformer.py index 32e2eaf9e..01f15a27c 100644 --- a/tensor2tensor/models/transformer.py +++ b/tensor2tensor/models/transformer.py @@ -51,7 +51,7 @@ class Transformer(t2t_model.T2TModel): def __init__(self, *args, **kwargs): super(Transformer, self).__init__(*args, **kwargs) - self.attention_weights = dict() # For vizualizing attention heads. + self.attention_weights = dict() # For visualizing attention heads. def encode(self, inputs, target_space, hparams, features=None): """Encode transformer inputs. @@ -60,7 +60,7 @@ def encode(self, inputs, target_space, hparams, features=None): inputs: Transformer inputs [batch_size, input_length, input_height, hidden_dim] which will be flattened along the two spatial dimensions. target_space: scalar, target space ID. - hparams: hyperparmeters for model. + hparams: hyperparameters for model. features: optionally pass the entire features dictionary as well. This is needed now for "packed" datasets. @@ -106,7 +106,7 @@ def decode(self, encoder-decoder attention. [batch_size, input_length] decoder_self_attention_bias: Bias and mask weights for decoder self-attention. [batch_size, decoder_length] - hparams: hyperparmeters for model. + hparams: hyperparameters for model. cache: dict, containing tensors which are the results of previous attentions, used for fast decoding. nonpadding: optional Tensor with shape [batch_size, decoder_length] @@ -142,7 +142,7 @@ def body(self, features): Args: features: Map of features to the model. Should contain the following: "inputs": Transformer inputs [batch_size, input_length, hidden_dim] - "tragets": Target decoder outputs. + "targets": Target decoder outputs. [batch_size, decoder_length, hidden_dim] "target_space_id" @@ -832,7 +832,7 @@ def transformer_ffn_layer(x, Args: x: a Tensor of shape [batch_size, length, hparams.hidden_size] - hparams: hyperparmeters for model + hparams: hyperparameters for model pad_remover: an expert_utils.PadRemover object tracking the padding positions. If provided, when using convolutional settings, the padding is removed before applying the convolution, and restored afterward. This diff --git a/tensor2tensor/models/vanilla_gan.py b/tensor2tensor/models/vanilla_gan.py index 100d60549..e78d56679 100644 --- a/tensor2tensor/models/vanilla_gan.py +++ b/tensor2tensor/models/vanilla_gan.py @@ -125,7 +125,7 @@ def body(self, features): features: a dictionary with the tensors. Returns: - A pair (predictions, losses) where preditions is the generated image + A pair (predictions, losses) where predictions is the generated image and losses is a dictionary of losses (that get added for the final loss). """ features["targets"] = features["inputs"] diff --git a/tensor2tensor/rl/README.md b/tensor2tensor/rl/README.md index d925aef09..3c3e5f976 100644 --- a/tensor2tensor/rl/README.md +++ b/tensor2tensor/rl/README.md @@ -1,11 +1,11 @@ # Tensor2Tensor experimental Model-Based Reinforcement Learning. -The rl package intention is to provide possibility to run reinforcement +The rl package intention is to provide the ability to run reinforcement algorithms within TensorFlow's computation graph, in order to do model-based RL using environment models from Tensor2Tensor. It's very experimental for now and under heavy development. -Currently the only supported algorithm is Proximy Policy Optimization - PPO. +Currently the only supported algorithm is Proximal Policy Optimization - PPO. 
# Sample usages diff --git a/tensor2tensor/utils/beam_search.py b/tensor2tensor/utils/beam_search.py index fa09a1d1b..f4364550c 100644 --- a/tensor2tensor/utils/beam_search.py +++ b/tensor2tensor/utils/beam_search.py @@ -135,7 +135,7 @@ def compute_topk_scores_and_seq(sequences, scores, scores_to_gather, flags, [batch_size, beam_size]. We will return the gathered scores from here. Scores to gather is different from scores because for grow_alive, we will need to return log_probs, while for grow_finished, we will need to return - the length penalized scors. + the length penalized scores. flags: Tensor of bools for sequences that say whether a sequence has reached EOS or not beam_size: int @@ -229,7 +229,7 @@ def beam_search(symbols_to_logits_fn, Returns: Tuple of (decoded beams [batch_size, beam_size, decode_length] - decoding probablities [batch_size, beam_size]) + decoding probabilities [batch_size, beam_size]) """ batch_size = common_layers.shape_list(initial_ids)[0] @@ -495,17 +495,17 @@ def _is_finished(i, unused_alive_seq, alive_log_probs, unused_finished_seq, # If the sequence isn't finished, we multiply it's score by 0. since # scores are all -ve, taking the min will give us the score of the lowest # finished item. - lowest_score_of_fininshed_in_finished = tf.reduce_min( + lowest_score_of_finished_in_finished = tf.reduce_min( finished_scores * tf.to_float(finished_in_finished), axis=1) # If none of the sequences have finished, then the min will be 0 and # we have to replace it by -ve INF if it is. The score of any seq in alive # will be much higher than -ve INF and the termination condition will not # be met. - lowest_score_of_fininshed_in_finished += ( + lowest_score_of_finished_in_finished += ( (1. - tf.to_float(tf.reduce_any(finished_in_finished, 1))) * -INF) bound_is_met = tf.reduce_all( - tf.greater(lowest_score_of_fininshed_in_finished, + tf.greater(lowest_score_of_finished_in_finished, lower_bound_alive_scores)) return tf.logical_and( diff --git a/tensor2tensor/utils/data_reader.py b/tensor2tensor/utils/data_reader.py index 09eb9417e..d7be24e7e 100644 --- a/tensor2tensor/utils/data_reader.py +++ b/tensor2tensor/utils/data_reader.py @@ -136,7 +136,7 @@ def _batching_scheme(batch_size, min_length=0): """A batching scheme based on model hyperparameters. - Every batch containins a number of sequences divisible by `shard_multiplier`. + Every batch contains a number of sequences divisible by `shard_multiplier`. Args: batch_size: int, total number of tokens in a batch. diff --git a/tensor2tensor/utils/expert_utils.py b/tensor2tensor/utils/expert_utils.py index 3f49aa6c2..2bfd35f01 100644 --- a/tensor2tensor/utils/expert_utils.py +++ b/tensor2tensor/utils/expert_utils.py @@ -66,7 +66,7 @@ def add_scope(scope=None, scope_fn=None): """Return a decorator which add a TF name/variable scope to a function. Note that the function returned by the decorator accept an additional 'name' - parameter, which can overwritte the name scope given when the function is + parameter, which can overwrite the name scope given when the function is created. Args: @@ -587,12 +587,12 @@ def restore(self, x): @add_name_scope("map_ids") def map_ids(x, indices, map_fn): - """Apply a function to each coordinate ids of a multidimentional tensor. + """Apply a function to each coordinate ids of a multidimensional tensor. This allows to process each sequence of a batch independently. This is similar to tf.map_fn but with tensor where the batch dim has been flatten. 
-  Warning: The indices ids have to be contigous and orderd in memory as the
+  Warning: The indices ids have to be contiguous and ordered in memory as the
   output vector for each of the ids are simply concatenated after being
   processed.
   Ex: if your indices are [0,2,2,1,2,0], the output will contains the processed
diff --git a/tensor2tensor/utils/metrics.py b/tensor2tensor/utils/metrics.py
index bb31a4dec..8d584f266 100644
--- a/tensor2tensor/utils/metrics.py
+++ b/tensor2tensor/utils/metrics.py
@@ -254,7 +254,7 @@ def image_summary(predictions, features, hparams):
   Returns:
     summary_proto: containing the summary images.
-    weights: A Tensor of zeros of the same shape as preditions.
+    weights: A Tensor of zeros of the same shape as predictions.
   """
   del hparams
   results = tf.cast(tf.argmax(predictions, axis=-1), tf.uint8)

From f504d63426c973b847049094638801468c84cf33 Mon Sep 17 00:00:00 2001
From: T2T Team
Date: Thu, 12 Apr 2018 10:00:49 -0700
Subject: [PATCH 16/29] Enable bfloat16 weights for Transformer model
 (Experimental version).

PiperOrigin-RevId: 192626522
---
 tensor2tensor/layers/common_hparams.py |  4 ++
 tensor2tensor/layers/common_layers.py  | 21 +++++++
 tensor2tensor/models/transformer.py    | 21 +++++--
 tensor2tensor/utils/adafactor.py       | 85 +++++++++++++++++++++++---
 tensor2tensor/utils/t2t_model.py       | 19 +++---
 5 files changed, 132 insertions(+), 18 deletions(-)

diff --git a/tensor2tensor/layers/common_hparams.py b/tensor2tensor/layers/common_hparams.py
index 3b2d5f802..fffe674f7 100644
--- a/tensor2tensor/layers/common_hparams.py
+++ b/tensor2tensor/layers/common_hparams.py
@@ -232,6 +232,10 @@ def basic_params1():
       # Set this to the dtype used for activation. Variables will still be
       # stored in float32.
       activation_dtype="float32",
+      # Experimental: set weight_dtype="bfloat16" to use bfloat16 for both
+      # weights and activations. Model quality may be worse. Model quality
+      # appears to be close to baseline with large batch sizes (>4k).
+      weight_dtype="float32",
   )
diff --git a/tensor2tensor/layers/common_layers.py b/tensor2tensor/layers/common_layers.py
index 538f487ee..0248acb7d 100644
--- a/tensor2tensor/layers/common_layers.py
+++ b/tensor2tensor/layers/common_layers.py
@@ -79,6 +79,26 @@ def bfloat16_var_getter(getter, *args, **kwargs):
   return var
 
 
+def bfloat16_weights_var_getter(getter, *args, **kwargs):
+  """A custom getter function for bfloat16 variables.
+
+  Variables maintain storage in bfloat16.
+
+  Args:
+    getter: A custom getter.
+    *args: Arguments.
+    **kwargs: Keyword arguments.
+  Returns:
+    Variables with the correct dtype.
+  Raises:
+    KeyError: if "dtype" is not provided as a kwarg.
+  """
+  requested_dtype = kwargs["dtype"]
+  if requested_dtype in (tf.bfloat16, tf.float32):
+    kwargs["dtype"] = tf.bfloat16
+  return getter(*args, **kwargs)
+
+
 def dropout_with_broadcast_dims(x, keep_prob, broadcast_dims=None, **kwargs):
   """Like tf.nn.dropout but takes broadcast_dims instead of noise_shape.
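A short usage sketch for the getter added above (our illustration, assuming the surrounding TF 1.x variable-scope API): installing it as a `custom_getter` makes float32 variable requests come back with bfloat16 storage, which is how the t2t_model.py hunk later in this patch wires it up when `hparams.weight_dtype == "bfloat16"`:

```python
import tensorflow as tf

# bfloat16_weights_var_getter as defined in the hunk above.
with tf.variable_scope("body",
                       custom_getter=bfloat16_weights_var_getter):
  # Requested as float32; the getter rewrites the dtype kwarg, so the
  # variable is created and stored as bfloat16.
  w = tf.get_variable("w", shape=[1024, 512], dtype=tf.float32,
                      initializer=tf.zeros_initializer())
```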
@@ -1727,6 +1747,7 @@ def padded_cross_entropy(logits, labels = tf.reshape(labels, [-1]) else: logits, labels = pad_with_zeros(logits, labels) + logits = tf.cast(logits, tf.float32) xent = smoothing_cross_entropy(logits, labels, vocab_size, confidence, gaussian=gaussian) weights = weights_fn(labels) diff --git a/tensor2tensor/models/transformer.py b/tensor2tensor/models/transformer.py index 01f15a27c..bd736f7bb 100644 --- a/tensor2tensor/models/transformer.py +++ b/tensor2tensor/models/transformer.py @@ -595,8 +595,8 @@ def transformer_prepare_encoder(inputs, target_space, hparams, features=None): 32, ishape_static[-1], name="target_space_embedding", - dtype=tf.bfloat16 - if hparams.activation_dtype == "bfloat16" else tf.float32) + dtype=tf.bfloat16 if hparams.activation_dtype == "bfloat16" or + hparams.weight_dtype == "bfloat16" else tf.float32) emb_target_space = tf.reshape(emb_target_space, [1, 1, -1]) encoder_input += emb_target_space if hparams.pos == "timing": @@ -605,7 +605,8 @@ def transformer_prepare_encoder(inputs, target_space, hparams, features=None): encoder_input, inputs_position) else: encoder_input = common_attention.add_timing_signal_1d(encoder_input) - if hparams.activation_dtype == "bfloat16": + if (hparams.activation_dtype == "bfloat16" or + hparams.weight_dtype == "bfloat16"): encoder_self_attention_bias = tf.cast(encoder_self_attention_bias, tf.bfloat16) encoder_decoder_attention_bias = tf.cast(encoder_decoder_attention_bias, @@ -654,7 +655,8 @@ def transformer_prepare_decoder(targets, hparams, features=None): decoder_input, targets_position) else: decoder_input = common_attention.add_timing_signal_1d(decoder_input) - if hparams.activation_dtype == "bfloat16": + if (hparams.activation_dtype == "bfloat16" or + hparams.weight_dtype == "bfloat16"): decoder_self_attention_bias = tf.cast(decoder_self_attention_bias, tf.bfloat16) return (decoder_input, decoder_self_attention_bias) @@ -1517,3 +1519,14 @@ def transformer_supervised_attention(): # Multiplier to the encoder-decoder expected attention loss. hparams.add_hparam("expected_attention_loss_multiplier", 1.0) return hparams + + +@registry.register_hparams +def transformer_tpu_1b(): + """Hparams for training with 1B parameters.""" + hparams = transformer_tpu() + hparams.hidden_size = 2048 + hparams.filter_size = 8192 + hparams.num_hidden_layers = 8 + hparams.batch_size = 1024 + return hparams diff --git a/tensor2tensor/utils/adafactor.py b/tensor2tensor/utils/adafactor.py index 979f1b3be..58e4f2752 100644 --- a/tensor2tensor/utils/adafactor.py +++ b/tensor2tensor/utils/adafactor.py @@ -169,7 +169,8 @@ def _create_slots(self, var_list): self._get_or_make_slot(var, r_val, "vr", self._name) self._get_or_make_slot(var, c_val, "vc", self._name) else: - self._zeros_slot(var, "v", self._name) + v_val = tf.zeros(shape, dtype=tf.float32) + self._get_or_make_slot(var, v_val, "v", self._name) def _apply_dense(self, grad, var): return self._resource_apply_dense(grad, var) @@ -194,12 +195,13 @@ def _parameter_scale(self, var): return tf.maximum(reduce_rms(var), 0.001) def _resource_apply_dense(self, grad, var): + grad = tf.to_float(grad) grad_squared = tf.square(grad) + 1e-30 grad_squared_mean = tf.reduce_mean(grad_squared) decay_rate = self._decay_rate update_scale = self._learning_rate if self._multiply_by_parameter_scale: - update_scale *= self._parameter_scale(var) + update_scale *= tf.to_float(self._parameter_scale(var)) # HACK: Make things dependent on grad. 
# This confounds the XLA rewriter and keeps it from fusing computations # across different variables. This fusion is a bad for HBM usage, since @@ -236,16 +238,18 @@ def _resource_apply_dense(self, grad, var): subtrahend = update_scale * x if self._beta1: m = self.get_slot(var, "m") - new_m = self._beta1 * m + (1.0 - self._beta1) * subtrahend - updates.append(tf.assign(m, new_m, use_locking=self._use_locking)) + new_m = self._beta1 * tf.to_float(m) + (1.0 - self._beta1) * subtrahend subtrahend = new_m + new_m = tf.cast(new_m, var.dtype) + updates.append(tf.assign(m, new_m, use_locking=self._use_locking)) + new_val = tf.to_float(var) - subtrahend + if var.dtype == tf.bfloat16: + new_val = _to_bfloat16_unbiased(new_val) if self._simulated_quantize_bits: new_val = _simulated_quantize( var - subtrahend, self._simulated_quantize_bits, self._quantization_noise) - var_update = tf.assign(var, new_val, use_locking=self._use_locking) - else: - var_update = tf.assign_sub(var, subtrahend, use_locking=self._use_locking) + var_update = tf.assign(var, new_val, use_locking=self._use_locking) updates = [var_update] + updates return tf.group(*updates) @@ -392,3 +396,70 @@ def _quantization_noise_from_step_num(): ret += (((phi * (2 ** i)) % 1.0) # double-precision computation in python * tf.to_float(tf.mod(step // (2 ** i), 2))) return tf.mod(ret, 1.0) + + +def _randomized_roundoff_to_bfloat16(x, quantization_noise, cand1, cand2): + """Round-off x to cand1 or to cand2 in an unbiased way. + + Cand1 and cand2 are the same shape as x. + For every element of x, the corresponding elements of cand1 and cand2 should + be the two closest bfloat16 values to x. Order does not matter. + cand1 and cand2 must differ from each other. + + Args: + x: A float32 Tensor. + quantization_noise: A Tensor broadcastable to the shape of x containing + random uniform values in [0.0, 1.0]. + cand1: A bfloat16 Tensor the same shape as x. + cand2: A bfloat16 Tensor the same shape as x. + + Returns: + A bfloat16 Tensor. + """ + cand1_f = tf.to_float(cand1) + cand2_f = tf.to_float(cand2) + step_size = cand2_f - cand1_f + fpart = (x - cand1_f) / step_size + ret = tf.where(tf.greater(fpart, quantization_noise), cand2, cand1) + return ret + + +def _to_bfloat16_unbiased(x): + """Convert a float32 to a bfloat16 using randomized roundoff. + + Note: If this ever produces worse results than using float32 all the way + through, we should try to diagnose and fix it. There are several things + to try: + + 1. Encode parameter x for storage purposes as + _to_bfloat16_unbiased(tf.pow(x, 5)) . This gives 5x the + resolution while incurring overflow and underflow at 10^9 and 10^-9 + instead of 10^37 and 10^-37. Comes at a cost of extracting fifth roots + to decode parameters. Or use some other such scheme. + + 2. In this function, use actual random numbers, different for each parameter + as opposed to the same for every parameter in the graph. + + 3. Look for bugs in this function. + + Args: + x: A float32 Tensor. + Returns: + A float32 Tensor. + """ + # Not using random_uniform here due to a problem on TPU in that random seeds + # are not respected, which may cause the parameters on different replicas + # to go out-of-sync. + quantization_noise = _quantization_noise_from_step_num() + x_sign = tf.sign(x) + # Make sure x is positive. If it is zero, the two candidates are identical. 
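+  # Worked example (illustrative, not part of the original comments):
+  # bfloat16 keeps 7 explicit mantissa bits, so between 1.0 and 2.0 the
+  # representable grid has spacing 2**-7 = 0.0078125. For x = 1.003 the two
+  # candidates are 1.0 and 1.0078125, and we round up with probability
+  # (1.003 - 1.0) / 0.0078125 ~= 0.38, so the rounded value equals x in
+  # expectation; this is what makes the roundoff "unbiased".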
+ x = x * x_sign + 1e-30 + cand1 = tf.to_bfloat16(x) + cand1_f = tf.to_float(cand1) + # This relies on the fact that for a positive bfloat16 b, + # b * 1.005 gives you the next higher bfloat16 and b*0.995 gives you the + # next lower one. Both 1.005 and 0.995 are ballpark estimation. + cand2 = tf.to_bfloat16( + tf.where(tf.greater(x, cand1_f), cand1_f * 1.005, cand1_f * 0.995)) + ret = _randomized_roundoff_to_bfloat16(x, quantization_noise, cand1, cand2) + return ret * tf.to_bfloat16(x_sign) diff --git a/tensor2tensor/utils/t2t_model.py b/tensor2tensor/utils/t2t_model.py index a9c18399f..0af523340 100644 --- a/tensor2tensor/utils/t2t_model.py +++ b/tensor2tensor/utils/t2t_model.py @@ -130,9 +130,12 @@ def has_input(self): return True def call(self, features): - tf.get_variable_scope().set_custom_getter(common_layers.bfloat16_var_getter - if self.hparams.activation_dtype - == "bfloat16" else None) + custom_getter = None + if self.hparams.activation_dtype == "bfloat16": + custom_getter = common_layers.bfloat16_var_getter + if self.hparams.weight_dtype == "bfloat16": + custom_getter = common_layers.bfloat16_weights_var_getter + tf.get_variable_scope().set_custom_getter(custom_getter) tf.get_variable_scope().set_initializer( optimize.get_variable_initializer(self.hparams)) with self._eager_var_store.as_default(): @@ -219,7 +222,8 @@ def model_fn_sharded(self, sharded_features): def model_fn(self, features): transformed_features = self.bottom(features) - if self.hparams.activation_dtype == "bfloat16": + if (self.hparams.activation_dtype == "bfloat16" or + self.hparams.weight_dtype == "bfloat16"): for k, v in six.iteritems(transformed_features): if v.dtype == tf.float32: transformed_features[k] = tf.cast(v, tf.bfloat16) @@ -356,7 +360,6 @@ def _loss_single(self, logits, target_modality, features): # The current bfloat16 version still uses float32 for most parts of backward # propagation to keep model quality, so cast back before computing the loss # value. - logits = tf.cast(logits, tf.float32) if not target_modality: log_warn(_no_problem_err("loss")) return (tf.constant(0., dtype=tf.float32), @@ -1263,7 +1266,8 @@ def host_call_fn(**kwargs): with tf.contrib.summary.create_file_writer(model_dir).as_default(): with tf.contrib.summary.always_record_summaries(): for name, value in six.iteritems(kwargs): - tf.contrib.summary.scalar(name, tf.reduce_mean(value), step=gs) + tf.contrib.summary.scalar( + name, tf.reduce_mean(tf.to_float(value)), step=gs) return tf.contrib.summary.all_summary_ops() @@ -1359,7 +1363,8 @@ def average_sharded_losses(sharded_losses): if isinstance(all_shards[0], tuple): sharded_num, sharded_den = zip(*all_shards) mean_loss = ( - tf.add_n(sharded_num) / tf.maximum(1.0, tf.add_n(sharded_den))) + tf.add_n(sharded_num) / tf.maximum( + tf.cast(1.0, sharded_den[0].dtype), tf.add_n(sharded_den))) else: mean_loss = tf.reduce_mean(all_shards) From 6abc9395d59cbc7b53bdeadcde2a09d1e50ef70b Mon Sep 17 00:00:00 2001 From: Noam Shazeer Date: Thu, 12 Apr 2018 10:48:58 -0700 Subject: [PATCH 17/29] Starting some experiments using Transformer on lm1b language modeling benchmark. 
PiperOrigin-RevId: 192634354 --- tensor2tensor/models/__init__.py | 1 + .../models/research/lm_experiments.py | 79 +++++++++++++++++++ 2 files changed, 80 insertions(+) create mode 100644 tensor2tensor/models/research/lm_experiments.py diff --git a/tensor2tensor/models/__init__.py b/tensor2tensor/models/__init__.py index c76570d9a..76c51f581 100644 --- a/tensor2tensor/models/__init__.py +++ b/tensor2tensor/models/__init__.py @@ -46,6 +46,7 @@ from tensor2tensor.models.research import basic_conv_gen from tensor2tensor.models.research import cycle_gan from tensor2tensor.models.research import gene_expression +from tensor2tensor.models.research import lm_experiments from tensor2tensor.models.research import multimodel from tensor2tensor.models.research import rl from tensor2tensor.models.research import super_lm diff --git a/tensor2tensor/models/research/lm_experiments.py b/tensor2tensor/models/research/lm_experiments.py new file mode 100644 index 000000000..a8d68583d --- /dev/null +++ b/tensor2tensor/models/research/lm_experiments.py @@ -0,0 +1,79 @@ +# coding=utf-8 +# Copyright 2018 The Tensor2Tensor Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Experiments with Language Models. + +Train languagemodel_lm1b32k_packed and measure log-ppl/token (dev). +These numbers need to be multiplied by 1.107893 to get log-ppl/word + for comparison with published results. + +Basic training regimen is 300k steps * 8 cores * batch_size=4096 + = about 10 epochs + +Make sure to eval on CPU or GPU using a large number of steps (1000), since the +TPU eval code doesn't know how to stop at the end of the dev data. Also need +to set activation_type=float32 for eval, since there is currently a conflict +between daisy_chain_getter and activation_type=bfloat16. + +RESULTS: + lmx_base: log-ppl/tok=3.40 PPL/word=43.2 (10 hours*8 cores) + lmx_h1k_f4k: + lmx_h2k_f8k: +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensor2tensor.models import transformer +from tensor2tensor.utils import registry + + +@registry.register_hparams +def lmx_base(): + """Transformer on languagemodel_lm1b32k_packed. 50M Params.""" + hparams = transformer.transformer_tpu() + # sharing is counterproductive when underparameterized + hparams.shared_embedding_and_softmax_weights = False + # we judge by log-ppl, so label smoothing hurts. + hparams.label_smoothing = 0.0 + # This makes the batch size on GPU the same as on TPU for a packed problem + # with sequence length 256. + # TODO(noam): fix the mess that is the data reading pipeline. + hparams.max_length = 256 + # larger batch since we only have a decoder + hparams.batch_size = 4096 + # save some memory so we can have a larger model + hparams.activation_dtype = "bfloat16" + return hparams + + +@registry.register_hparams +def lmx_h1k_f4k(): + """Transformer on languagemodel_lm1b32k_packed. 
140M Params."""
+  hparams = lmx_base()
+  hparams.hidden_size = 1024
+  hparams.filter_size = 4096
+  return hparams
+
+
+@registry.register_hparams
+def lmx_h2k_f8k():
+  """HParams for training languagemodel_lm1b32k_packed. 430M Params."""
+  hparams = lmx_base()
+  hparams.hidden_size = 2048
+  hparams.filter_size = 8192
+  return hparams
+

From b13b0eb51000fdc8ee886fa02612c034c3599bfe Mon Sep 17 00:00:00 2001
From: Noam Shazeer
Date: Thu, 12 Apr 2018 10:52:55 -0700
Subject: [PATCH 18/29] global gradient norm is expensive to compute - don't
 do it by default.

PiperOrigin-RevId: 192634994
---
 tensor2tensor/utils/optimize.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/tensor2tensor/utils/optimize.py b/tensor2tensor/utils/optimize.py
index 7b976131f..86d8a3b7c 100644
--- a/tensor2tensor/utils/optimize.py
+++ b/tensor2tensor/utils/optimize.py
@@ -45,10 +45,10 @@ def optimize(loss, learning_rate, hparams, use_tpu=False):
     opt = tf.contrib.tpu.CrossShardOptimizer(opt)
   tf.summary.scalar("learning_rate", learning_rate)
-  opt_summaries = ["loss", "global_gradient_norm"]
+  opt_summaries = ["loss"]
   if hparams.summarize_grads:
     tf.logging.info("Summarizing gradients")
-    opt_summaries.extend(["gradients", "gradient_norm"])
+    opt_summaries.extend(["gradients", "gradient_norm", "global_gradient_norm"])
 
   if hparams.clip_grad_norm:
     tf.logging.info("Clipping gradients, norm: %0.5f", hparams.clip_grad_norm)
@@ -129,7 +129,8 @@ def weight_decay_and_noise(loss, hparams, learning_rate, var_list=None):
   noise_vars = [v for v in var_list if "/body/" in v.name]
 
   weight_decay_loss = weight_decay(hparams.weight_decay, decay_vars)
-  tf.summary.scalar("losses/weight_decay", weight_decay_loss)
+  if hparams.weight_decay:
+    tf.summary.scalar("losses/weight_decay", weight_decay_loss)
   weight_noise_ops = weight_noise(hparams.weight_noise, learning_rate,
                                   noise_vars)

From 81fe9c565c7fb1dc022597e9d794e7d31fc9764c Mon Sep 17 00:00:00 2001
From: Etienne Pot
Date: Thu, 12 Apr 2018 11:13:56 -0700
Subject: [PATCH 19/29] Fix SymbolModality for prepend_inputs_masked_attention

PiperOrigin-RevId: 192638880
---
 tensor2tensor/layers/modalities.py | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/tensor2tensor/layers/modalities.py b/tensor2tensor/layers/modalities.py
index 992ea5b95..e60726cde 100644
--- a/tensor2tensor/layers/modalities.py
+++ b/tensor2tensor/layers/modalities.py
@@ -59,7 +59,15 @@ def targets_weights_fn(self):
     if hp and hp.prepend_mode != "none":
       assert (hp.prepend_mode == "prepend_inputs_masked_attention" or
               hp.prepend_mode == "prepend_inputs_full_attention")
-      weights_fn = common_layers.weights_prepend_inputs_to_targets
+
+      if (
+          # In masked attention mode, during training, the network tries to
+          # autoregressively predict the inputs portion, while evaluation
+          # is only done on the outputs.
+          hp.prepend_mode != "prepend_inputs_masked_attention" or
+          hp.mode != tf.estimator.ModeKeys.TRAIN
+      ):
+        weights_fn = common_layers.weights_prepend_inputs_to_targets
 
     return weights_fn

From 43da5c3006e09f29c75784e07950c0563d926c69 Mon Sep 17 00:00:00 2001
From: Lukasz Kaiser
Date: Thu, 12 Apr 2018 11:14:57 -0700
Subject: [PATCH 20/29] Clean up RL code.
PiperOrigin-RevId: 192639051 --- tensor2tensor/data_generators/gym.py | 334 +++++++++--------- .../models/research/basic_conv_gen.py | 78 +++- tensor2tensor/models/research/rl.py | 3 + tensor2tensor/notebooks/hello_t2t-rl.ipynb | 10 +- tensor2tensor/rl/README.md | 2 +- tensor2tensor/rl/collect.py | 17 +- tensor2tensor/rl/envs/atari_wrappers.py | 139 -------- tensor2tensor/rl/envs/in_graph_batch_env.py | 91 +---- tensor2tensor/rl/envs/py_func_batch_env.py | 169 +++++++++ tensor2tensor/rl/envs/simulated_batch_env.py | 150 ++++++++ tensor2tensor/rl/envs/tf_atari_wrappers.py | 187 ++++++++++ tensor2tensor/rl/envs/utils.py | 53 ++- tensor2tensor/rl/model_rl_experiment.py | 119 +++++++ tensor2tensor/rl/ppo.py | 81 +++-- tensor2tensor/rl/rl_trainer_lib.py | 66 ++-- tensor2tensor/utils/metrics.py | 4 +- tensor2tensor/utils/t2t_model.py | 14 +- 17 files changed, 1027 insertions(+), 490 deletions(-) delete mode 100644 tensor2tensor/rl/envs/atari_wrappers.py create mode 100644 tensor2tensor/rl/envs/py_func_batch_env.py create mode 100644 tensor2tensor/rl/envs/simulated_batch_env.py create mode 100644 tensor2tensor/rl/envs/tf_atari_wrappers.py create mode 100644 tensor2tensor/rl/model_rl_experiment.py diff --git a/tensor2tensor/data_generators/gym.py b/tensor2tensor/data_generators/gym.py index 2ff5ba934..6a9756cba 100644 --- a/tensor2tensor/data_generators/gym.py +++ b/tensor2tensor/data_generators/gym.py @@ -19,79 +19,132 @@ from __future__ import division from __future__ import print_function -import functools +from collections import deque +import functools +import os # Dependency imports - import gym -import numpy as np from tensor2tensor.data_generators import generator_utils from tensor2tensor.data_generators import problem - from tensor2tensor.models.research import rl -from tensor2tensor.rl import rl_trainer_lib # pylint: disable=unused-import -from tensor2tensor.rl.envs import atari_wrappers - -from tensor2tensor.utils import metrics +from tensor2tensor.rl import collect +from tensor2tensor.rl.envs import tf_atari_wrappers as atari +from tensor2tensor.rl.envs.utils import batch_env_factory from tensor2tensor.utils import registry import tensorflow as tf +from tensorflow.contrib.training import HParams + flags = tf.flags FLAGS = flags.FLAGS -flags.DEFINE_string("model_path", "", "File with model for pong") +flags.DEFINE_string("agent_policy_path", "", "File with model for pong") + + +def moviepy_editor(): + """Access to moviepy that fails gracefully without a moviepy install.""" + try: + from moviepy import editor # pylint: disable=g-import-not-at-top + except ImportError: + raise ImportError("pip install moviepy to record videos") + return editor +@registry.register_problem class GymDiscreteProblem(problem.Problem): """Gym environment with discrete actions and rewards.""" def __init__(self, *args, **kwargs): super(GymDiscreteProblem, self).__init__(*args, **kwargs) - self._env = None + self.num_channels = 3 + self.history_size = 2 + + # defaults + self.environment_spec = lambda: gym.make("PongNoFrameskip-v4") + self.in_graph_wrappers = [(atari.MaxAndSkipWrapper, {"skip": 4})] + self.collect_hparams = rl.atari_base() + self.num_steps = 1000 + self.movies = True + self.movies_fps = 24 + self.simulated_environment = None + self.warm_up = 70 + + def _setup(self): + in_graph_wrappers = [(atari.ShiftRewardWrapper, {"add_value": 2}), + (atari.MemoryWrapper, {})] + self.in_graph_wrappers + env_hparams = HParams(in_graph_wrappers=in_graph_wrappers, + simulated_environment=self.simulated_environment) 
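+    # The wrappers prepended above do double duty: ShiftRewardWrapper offsets
+    # the raw reward by add_value (e.g. Pong's {-1, 0, 1} become {1, 2, 3}),
+    # and MemoryWrapper records the (observation, reward, action) stream that
+    # the generator below reads back through data_get_op.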
+ + generator_batch_env = batch_env_factory( + self.environment_spec, env_hparams, num_agents=1, xvfb=False) + + with tf.variable_scope("", reuse=tf.AUTO_REUSE): + policy_lambda = self.collect_hparams.network + policy_factory = tf.make_template( + "network", + functools.partial(policy_lambda, self.environment_spec().action_space, + self.collect_hparams), + create_scope_now_=True, + unique_name_="network") - def example_reading_spec(self, label_repr=None): + with tf.variable_scope("", reuse=tf.AUTO_REUSE): + sample_policy = lambda policy: 0 * policy.sample() + self.collect_hparams.epoch_length = 10 + _, self.collect_trigger_op = collect.define_collect( + policy_factory, generator_batch_env, self.collect_hparams, + eval_phase=False, policy_to_actions_lambda=sample_policy, + scope="define_collect") + + self.avilable_data_size_op = atari.MemoryWrapper.singleton.speculum.size() + self.data_get_op = atari.MemoryWrapper.singleton.speculum.dequeue() + self.history_buffer = deque(maxlen=self.history_size+1) + + def example_reading_spec(self, label_repr=None): data_fields = { - "frame": tf.FixedLenFeature([210, 160, 3], tf.int64), + "targets_encoded": tf.FixedLenFeature((), tf.string), + "image/format": tf.FixedLenFeature((), tf.string), "action": tf.FixedLenFeature([1], tf.int64), - "reward": tf.FixedLenFeature([1], tf.int64) + "reward": tf.FixedLenFeature([1], tf.int64), + # "done": tf.FixedLenFeature([1], tf.int64) } - return data_fields, None - - def eval_metrics(self): - return [metrics.Metrics.ACC, metrics.Metrics.ACC_PER_SEQ, - metrics.Metrics.NEG_LOG_PERPLEXITY, metrics.Metrics.IMAGE_SUMMARY] - - @property - def env_name(self): - # This is the name of the Gym environment for this problem. - raise NotImplementedError() + for x in range(self.history_size): + data_fields["inputs_encoded_{}".format(x)] = tf.FixedLenFeature( + (), tf.string) + + data_items_to_decoders = { + "targets": tf.contrib.slim.tfexample_decoder.Image( + image_key="targets_encoded", + format_key="image/format", + shape=[210, 160, 3], + channels=3), + # Just do a pass through. 
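+        # The Tensor decoders copy the already-numeric action and reward
+        # features through unchanged, while the Image decoders above and
+        # below decode the PNG-encoded frames.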
+ "action": tf.contrib.slim.tfexample_decoder.Tensor(tensor_key="action"), + "reward": tf.contrib.slim.tfexample_decoder.Tensor(tensor_key="reward"), + } - @property - def env(self): - if self._env is None: - self._env = gym.make(self.env_name) - return self._env + for x in range(self.history_size): + key = "inputs_{}".format(x) + data_items_to_decoders[key] = tf.contrib.slim.tfexample_decoder.Image( + image_key="inputs_encoded_{}".format(x), + format_key="image/format", + shape=[210, 160, 3], + channels=3) - @property - def num_channels(self): - return 3 + return data_fields, data_items_to_decoders @property def num_actions(self): - raise NotImplementedError() + return 4 @property def num_rewards(self): - raise NotImplementedError() - - @property - def num_steps(self): - raise NotImplementedError() + return 2 @property def num_shards(self): @@ -101,57 +154,73 @@ def num_shards(self): def num_dev_shards(self): return 1 - def preprocess(self, dataset, mode, hparams): - def unbatch(batched_features, n): - """Split each feature in batched_features into a list of n tensors.""" - result = {} - for k, v in batched_features.iteritems(): - result[k] = [tf.squeeze(t, axis=0) for t in tf.split(v, n)] - return result - - def features_from_batch(batched_prefeatures): - """Construct final features from the batched inputs.""" - unbatched = unbatch(batched_prefeatures, 3) - frames = unbatched["frame"] - return {"inputs_prev": frames[0], - "inputs": frames[1], - "targets": frames[2], - "action": unbatched["action"][1], - "reward": unbatched["reward"][1]} - - # Batch and construct features. - batch_dataset = dataset.apply( - tf.contrib.data.batch_and_drop_remainder(3)) - return batch_dataset.map(features_from_batch) - def get_action(self, observation=None): return self.env.action_space.sample() def hparams(self, defaults, unused_model_hparams): p = defaults - p.input_modality = {"inputs": ("image", 256), - "inputs_prev": ("image", 256), - "reward": ("symbol", self.num_rewards), - "action": ("symbol", self.num_actions)} - p.target_modality = ("image", 256) + # The hard coded +1 after "symbol" refers to the fact + # that 0 is a special symbol meaning padding + # when symbols are e.g. 0, 1, 2, 3 we + # shift them to 0, 1, 2, 3, 4. 
+ p.input_modality = {"action": ("symbol:identity", self.num_actions)} + + for x in range(self.history_size): + p.input_modality["inputs_{}".format(x)] = ("image", 256) + + p.target_modality = {"targets": ("image", 256), + "reward": ("symbol", self.num_rewards + 1)} + p.input_space_id = problem.SpaceID.IMAGE p.target_space_id = problem.SpaceID.IMAGE + def restore_networks(self, sess): + model_saver = tf.train.Saver( + tf.global_variables(".*network_parameters.*")) + if FLAGS.agent_policy_path: + model_saver.restore(sess, FLAGS.agent_policy_path) + def generator(self, data_dir, tmp_dir): - self.env.reset() - action = self.get_action() - for _ in range(self.num_steps): - observation, reward, done, _ = self.env.step(action) - action = self.get_action(observation) - if done: - self.env.reset() - def flatten(nparray): - flat1 = [x for sublist in nparray.tolist() for x in sublist] - return [x for sublist in flat1 for x in sublist] - yield {"frame": flatten(observation), - "action": [action], - "done": [done], - "reward": [int(reward)]} + self._setup() + clip_files = [] + with tf.Session() as sess: + sess.run(tf.global_variables_initializer()) + self.restore_networks(sess) + + pieces_generated = 0 + while pieces_generated < self.num_steps + self.warm_up: + avilable_data_size = sess.run(self.avilable_data_size_op) + if avilable_data_size > 0: + observ, reward, action, _ = sess.run(self.data_get_op) + self.history_buffer.append(observ) + + if self.movies and pieces_generated > self.warm_up: + file_name = os.path.join(tmp_dir, + "output_{}.png".format(pieces_generated)) + clip_files.append(file_name) + with open(file_name, "wb") as f: + f.write(observ) + + if len(self.history_buffer) == self.history_size+1: + pieces_generated += 1 + ret_dict = { + "targets_encoded": [observ], + "image/format": ["png"], + "action": [int(action)], + # "done": [bool(done)], + "reward": [int(reward)], + } + for i, v in enumerate(list(self.history_buffer)[:-1]): + ret_dict["inputs_encoded_{}".format(i)] = [v] + if pieces_generated > self.warm_up: + yield ret_dict + else: + sess.run(self.collect_trigger_op) + + if self.movies: + clip = moviepy_editor().ImageSequenceClip(clip_files, fps=self.movies_fps) + clip_path = os.path.join(data_dir, "output_{}.mp4".format(self.name)) + clip.write_videofile(clip_path, fps=self.movies_fps, codec="mpeg4") def generate_data(self, data_dir, tmp_dir, task_id=-1): train_paths = self.training_filepaths( @@ -165,93 +234,24 @@ def generate_data(self, data_dir, tmp_dir, task_id=-1): @registry.register_problem -class GymPongRandom5k(GymDiscreteProblem): - """Pong game, random actions.""" - - @property - def env_name(self): - return "PongDeterministic-v4" - - @property - def num_actions(self): - return 4 - - @property - def num_rewards(self): - return 3 - - @property - def num_steps(self): - return 5000 - - -@registry.register_problem -class GymPongTrajectoriesFromPolicy(GymDiscreteProblem): - """Pong game, loaded actions.""" +class GymSimulatedDiscreteProblem(GymDiscreteProblem): + """Simulated gym environment with discrete actions and rewards.""" def __init__(self, *args, **kwargs): - super(GymPongTrajectoriesFromPolicy, self).__init__(*args, **kwargs) - self._env = None - self._last_policy_op = None - self._max_frame_pl = None - self._last_action = self.env.action_space.sample() - self._skip = 4 - self._skip_step = 0 - self._obs_buffer = np.zeros((2,) + self.env.observation_space.shape, - dtype=np.uint8) - - def generator(self, data_dir, tmp_dir): - env_spec = lambda: 
atari_wrappers.wrap_atari( # pylint: disable=g-long-lambda - gym.make(self.env_name), - warp=False, - frame_skip=4, - frame_stack=False) - hparams = rl.atari_base() - with tf.variable_scope("train", reuse=tf.AUTO_REUSE): - policy_lambda = hparams.network - policy_factory = tf.make_template( - "network", - functools.partial(policy_lambda, env_spec().action_space, hparams)) - self._max_frame_pl = tf.placeholder( - tf.float32, self.env.observation_space.shape) - actor_critic = policy_factory(tf.expand_dims(tf.expand_dims( - self._max_frame_pl, 0), 0)) - policy = actor_critic.policy - self._last_policy_op = policy.mode() - with tf.Session() as sess: - model_saver = tf.train.Saver( - tf.global_variables(".*network_parameters.*")) - model_saver.restore(sess, FLAGS.model_path) - for item in super(GymPongTrajectoriesFromPolicy, - self).generator(data_dir, tmp_dir): - yield item - - # TODO(blazej0): For training of atari agents wrappers are usually used. - # Below we have a hacky solution which is a workaround to be used together - # with atari_wrappers.MaxAndSkipEnv. - def get_action(self, observation=None): - if self._skip_step == self._skip - 2: self._obs_buffer[0] = observation - if self._skip_step == self._skip - 1: self._obs_buffer[1] = observation - self._skip_step = (self._skip_step + 1) % self._skip - if self._skip_step == 0: - max_frame = self._obs_buffer.max(axis=0) - self._last_action = int(tf.get_default_session().run( - self._last_policy_op, - feed_dict={self._max_frame_pl: max_frame})[0, 0]) - return self._last_action - - @property - def env_name(self): - return "PongDeterministic-v4" - - @property - def num_actions(self): - return 4 - - @property - def num_rewards(self): - return 2 - - @property - def num_steps(self): - return 5000 + super(GymSimulatedDiscreteProblem, self).__init__(*args, **kwargs) + # TODO(lukaszkaiser): pull it outside + self.in_graph_wrappers = [(atari.TimeLimitWrapper, {"timelimit": 150}), + (atari.MaxAndSkipWrapper, {"skip": 4})] + self.simulated_environment = True + self.movies_fps = 2 + + def restore_networks(self, sess): + super(GymSimulatedDiscreteProblem, self).restore_networks(sess) + + # TODO(lukaszkaiser): adjust regexp for different models + env_model_loader = tf.train.Saver(tf.global_variables(".*basic_conv_gen.*")) + sess = tf.get_default_session() + + ckpts = tf.train.get_checkpoint_state(FLAGS.output_dir) + ckpt = ckpts.model_checkpoint_path + env_model_loader.restore(sess, ckpt) diff --git a/tensor2tensor/models/research/basic_conv_gen.py b/tensor2tensor/models/research/basic_conv_gen.py index 144042896..129c71f07 100644 --- a/tensor2tensor/models/research/basic_conv_gen.py +++ b/tensor2tensor/models/research/basic_conv_gen.py @@ -41,7 +41,7 @@ def body(self, features): cur_frame = tf.to_float(features["inputs"]) prev_frame = tf.to_float(features["inputs_prev"]) x = tf.concat([cur_frame, prev_frame], axis=-1) - for _ in xrange(hparams.num_compress_steps): + for _ in range(hparams.num_compress_steps): x = tf.layers.conv2d(x, filters, kernel2, activation=common_layers.belu, strides=(2, 2), padding="SAME") x = common_layers.layer_norm(x) @@ -52,7 +52,7 @@ def body(self, features): x = tf.concat([x, action + zeros], axis=-1) # Run a stack of convolutions. 
- for i in xrange(hparams.num_hidden_layers): + for i in range(hparams.num_hidden_layers): with tf.variable_scope("layer%d" % i): y = tf.layers.conv2d(x, filters, kernel1, activation=common_layers.belu, strides=(1, 1), padding="SAME") @@ -61,7 +61,7 @@ def body(self, features): else: x = common_layers.layer_norm(x + y) # Up-convolve. - for _ in xrange(hparams.num_compress_steps): + for _ in range(hparams.num_compress_steps): filters //= 2 x = tf.layers.conv2d_transpose( x, filters, kernel2, activation=common_layers.belu, @@ -101,6 +101,76 @@ def basic_conv(): @registry.register_hparams def basic_conv_small(): """Small conv model.""" - hparams = common_hparams.basic_params1() + hparams = basic_conv() hparams.hidden_size = 32 return hparams + + +@registry.register_hparams +def basic_conv_small_per_image_standardization(): + """Small conv model.""" + hparams = common_hparams.basic_params1() + hparams.kernel_sizes = [(3, 3), (5, 5)] + hparams.filter_numbers = [32, 3*256] + hparams.batch_size = 2 + hparams.add_hparam("per_image_standardization", True) + return hparams + + +@registry.register_model +class MichiganBasicConvGen(t2t_model.T2TModel): + + def body(self, features): + def standardize_images(x): + """Image standardization on batches.""" + with tf.name_scope("standardize_images", [x]): + x = tf.to_float(x) + x_mean = tf.reduce_mean(x, axis=[1, 2, 3], keep_dims=True) + x_variance = tf.reduce_mean( + tf.square(x - x_mean), axis=[1, 2, 3], keep_dims=True) + x_shape = common_layers.shape_list(x) + num_pixels = tf.to_float(x_shape[1] * x_shape[2] * 3) + x = (x - x_mean) / tf.maximum(tf.sqrt(x_variance), tf.rsqrt(num_pixels)) + return x + + def deconv2d(cur, i, kernel_size, output_filters, activation=tf.nn.relu): + thicker = common_layers.conv( + cur, + output_filters * 4, + kernel_size, + padding="SAME", + activation=activation, + name="deconv2d" + str(i)) + return tf.depth_to_space(thicker, 2) + + cur_frame = standardize_images(features["inputs_0"]) + prev_frame = standardize_images(features["inputs_1"]) + + frames = tf.concat([cur_frame, prev_frame], axis=3) + frames = tf.reshape(frames, [-1, 210, 160, 6]) + + h1 = tf.layers.conv2d(frames, filters=64, strides=2, kernel_size=(8, 8), + padding="SAME", activation=tf.nn.relu) + h2 = tf.layers.conv2d(h1, filters=128, strides=2, kernel_size=(6, 6), + padding="SAME", activation=tf.nn.relu) + h3 = tf.layers.conv2d(h2, filters=128, strides=2, kernel_size=(6, 6), + padding="SAME", activation=tf.nn.relu) + h4 = tf.layers.conv2d(h3, filters=128, strides=2, kernel_size=(4, 4), + padding="SAME", activation=tf.nn.relu) + h45 = tf.reshape(h4, [-1, 14 * 10 * 128]) + h5 = tf.layers.dense(h45, 2048, activation=tf.nn.relu) + h6 = tf.layers.dense(h5, 2048, activation=tf.nn.relu) + h7 = tf.layers.dense(h6, 14 * 10 * 128, activation=tf.nn.relu) + h8 = tf.reshape(h7, [-1, 14, 10, 128]) + + h9 = deconv2d(h8, 1, (4, 4), 128, activation=tf.nn.relu) + h9 = h9[:, :27, :, :] + h10 = deconv2d(h9, 2, (6, 6), 128, activation=tf.nn.relu) + h10 = h10[:, :53, :, :] + h11 = deconv2d(h10, 3, (6, 6), 128, activation=tf.nn.relu) + h11 = h11[:, :105, :, :] + h12 = deconv2d(h11, 4, (8, 8), 3 * 256, activation=tf.identity) + + reward = tf.layers.flatten(h12) + + return {"targets": h12, "reward": reward} diff --git a/tensor2tensor/models/research/rl.py b/tensor2tensor/models/research/rl.py index 858d6964e..2c5181d95 100644 --- a/tensor2tensor/models/research/rl.py +++ b/tensor2tensor/models/research/rl.py @@ -50,6 +50,9 @@ def ppo_base_v1(): hparams.add_hparam("num_eval_agents", 3) 
hparams.add_hparam("video_during_eval", True) hparams.add_hparam("save_models_every_epochs", 30) + hparams.add_hparam("optimization_batch_size", 50) + hparams.add_hparam("max_gradients_norm", 0.5) + hparams.add_hparam("simulated_environment", False) return hparams diff --git a/tensor2tensor/notebooks/hello_t2t-rl.ipynb b/tensor2tensor/notebooks/hello_t2t-rl.ipynb index d7e0eb6e1..b209007d6 100644 --- a/tensor2tensor/notebooks/hello_t2t-rl.ipynb +++ b/tensor2tensor/notebooks/hello_t2t-rl.ipynb @@ -7,7 +7,7 @@ "colab": { "autoexec": { "startup": false, - "wait_interval": 0 + "wait_interval": 0.0 } }, "colab_type": "code", @@ -50,7 +50,7 @@ "colab": { "autoexec": { "startup": false, - "wait_interval": 0 + "wait_interval": 0.0 } }, "colab_type": "code", @@ -157,7 +157,7 @@ }, "outputs": [], "source": [ - "model_path = os.path.join(ppo_dir, \"model{}.ckpt.index\".format(iteration_num))[:-6]" + "agent_policy_path = os.path.join(ppo_dir, \"model{}.ckpt.index\".format(iteration_num))[:-6]" ] }, { @@ -175,7 +175,7 @@ }, "outputs": [], "source": [ - "sys.argv = [sys.argv[0], \"--model_path\", model_path]" + "sys.argv = [sys.argv[0], \"--agent_policy_path\", agent_policy_path]" ] }, { @@ -325,7 +325,7 @@ "provenance": [ { "file_id": "1-VScmaLkMqWiSbqgUCFWefzisSREd8l1", - "timestamp": 1512175750497 + "timestamp": 1.512175750497E12 } ], "version": "0.3.2", diff --git a/tensor2tensor/rl/README.md b/tensor2tensor/rl/README.md index 3c3e5f976..b163a16a5 100644 --- a/tensor2tensor/rl/README.md +++ b/tensor2tensor/rl/README.md @@ -35,7 +35,7 @@ python tensor2tensor/bin/t2t-datagen \ --data_dir=~/t2t_data \ --tmp_dir=~/t2t_data/tmp \ --problem=gym_pong_trajectories_from_policy \ - --model_path [model] + --agent_policy_path [model] ``` ## Training model for frames generation based on randomly played games diff --git a/tensor2tensor/rl/collect.py b/tensor2tensor/rl/collect.py index 8e81dafa7..a43b3a551 100644 --- a/tensor2tensor/rl/collect.py +++ b/tensor2tensor/rl/collect.py @@ -18,7 +18,8 @@ import tensorflow as tf -def define_collect(policy_factory, batch_env, hparams, eval_phase): +def define_collect(policy_factory, batch_env, hparams, + eval_phase, policy_to_actions_lambda=None, scope=""): """Collect trajectories.""" eval_phase = tf.convert_to_tensor(eval_phase) memory_shape = [hparams.epoch_length] + [batch_env.observ.shape.as_list()[0]] @@ -34,8 +35,9 @@ def define_collect(policy_factory, batch_env, hparams, eval_phase): ] memory = [tf.Variable(tf.zeros(shape, dtype), trainable=False) for (shape, dtype) in memories_shapes_and_types] - cumulative_rewards = tf.get_variable("cumulative_rewards", len(batch_env), - trainable=False) + with tf.variable_scope(scope): + cumulative_rewards = tf.get_variable("cumulative_rewards", len(batch_env), + trainable=False) should_reset_var = tf.Variable(True, trainable=False) @@ -59,9 +61,12 @@ def step(index, scores_sum, scores_num): obs_copy = batch_env.observ + 0 actor_critic = policy_factory(tf.expand_dims(obs_copy, 0)) policy = actor_critic.policy - action = tf.cond(eval_phase, - policy.mode, - policy.sample) + if policy_to_actions_lambda: + action = policy_to_actions_lambda(policy) + else: + action = tf.cond(eval_phase, + policy.mode, + policy.sample) postprocessed_action = actor_critic.action_postprocessing(action) simulate_output = batch_env.simulate(postprocessed_action[0, ...]) pdf = policy.prob(action)[0] diff --git a/tensor2tensor/rl/envs/atari_wrappers.py b/tensor2tensor/rl/envs/atari_wrappers.py deleted file mode 100644 index b8dd425ec..000000000 --- 
a/tensor2tensor/rl/envs/atari_wrappers.py +++ /dev/null @@ -1,139 +0,0 @@ -# coding=utf-8 -# Copyright 2018 The Tensor2Tensor Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Various wrappers copied for Gym Baselines.""" - -from collections import deque -import gym -import numpy as np - - -# Adapted from the link below. -# https://github.com/openai/baselines/blob/master/baselines/common/atari_wrappers.py - - -class WarpFrame(gym.ObservationWrapper): - """Wrap a frame.""" - - def __init__(self, env): - """Warp frames to 84x84 as done in the Nature paper and later work.""" - gym.ObservationWrapper.__init__(self, env) - self.width = 84 - self.height = 84 - self.observation_space = gym.spaces.Box( - low=0, high=255, - shape=(self.height, self.width, 1), dtype=np.uint8) - - def observation(self, frame): - import cv2 # pylint: disable=g-import-not-at-top - cv2.ocl.setUseOpenCL(False) - frame = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY) - frame = cv2.resize(frame, (self.width, self.height), - interpolation=cv2.INTER_AREA) - return frame[:, :, None] - - -class LazyFrames(object): - """Lazy frame storage.""" - - def __init__(self, frames): - """Lazy frame storage. - - This object ensures that common frames between the observations - are only stored once. It exists purely to optimize memory usage - which can be huge for DQN's 1M frames replay buffers. - This object should only be converted to numpy array before being passed - to the model. - - Args: - frames: the frames. - """ - self._frames = frames - - def __array__(self, dtype=None): - out = np.concatenate(self._frames, axis=2) - if dtype is not None: - out = out.astype(dtype) - return out - - -class FrameStack(gym.Wrapper): - """Stack frames.""" - - def __init__(self, env, k): - """Stack k last frames. Returns lazy array, memory efficient.""" - gym.Wrapper.__init__(self, env) - self.k = k - self.frames = deque([], maxlen=k) - shp = env.observation_space.shape - self.observation_space = gym.spaces.Box( - low=0, high=255, shape=(shp[0], shp[1], shp[2] * k), dtype=np.uint8) - - def reset(self): - ob = self.env.reset() - for _ in range(self.k): - self.frames.append(ob) - return self._get_ob() - - def step(self, action): - ob, reward, done, info = self.env.step(action) - self.frames.append(ob) - return self._get_ob(), reward, done, info - - def _get_ob(self): - assert len(self.frames) == self.k - return LazyFrames(list(self.frames)) - - -class MaxAndSkipEnv(gym.Wrapper): - """Max and skip env.""" - - def __init__(self, env, skip=4): - """Return only every `skip`-th frame.""" - gym.Wrapper.__init__(self, env) - # Most recent raw observations (for max pooling across time steps). 
- self._obs_buffer = np.zeros((2,) + env.observation_space.shape, - dtype=np.uint8) - self._skip = skip - - def reset(self, **kwargs): - return self.env.reset(**kwargs) - - def step(self, action): - """Repeat action, sum reward, and max over last observations.""" - total_reward = 0.0 - done = None - for i in range(self._skip): - obs, reward, done, info = self.env.step(action) - if i == self._skip - 2: self._obs_buffer[0] = obs - if i == self._skip - 1: self._obs_buffer[1] = obs - total_reward += reward - if done: - break - # Note that the observation on the done=True frame - # doesn't matter - max_frame = self._obs_buffer.max(axis=0) - - return max_frame, total_reward, done, info - - -def wrap_atari(env, warp=False, frame_skip=False, frame_stack=False): - if warp: - env = WarpFrame(env) - if frame_skip: - env = MaxAndSkipEnv(env, frame_skip) - if frame_stack: - env = FrameStack(env, frame_stack) - return env diff --git a/tensor2tensor/rl/envs/in_graph_batch_env.py b/tensor2tensor/rl/envs/in_graph_batch_env.py index 44e30ddd9..e671d8f1b 100644 --- a/tensor2tensor/rl/envs/in_graph_batch_env.py +++ b/tensor2tensor/rl/envs/in_graph_batch_env.py @@ -22,37 +22,11 @@ from __future__ import division from __future__ import print_function -# Dependency imports - -import gym - -import tensorflow as tf - class InGraphBatchEnv(object): - """Batch of environments inside the TensorFlow graph. - - The batch of environments will be stepped and reset inside of the graph using - a tf.py_func(). The current batch of observations, actions, rewards, and done - flags are held in according variables. + """Abstract class for batch of environments inside the TensorFlow graph. """ - def __init__(self, batch_env): - """Batch of environments inside the TensorFlow graph. - - Args: - batch_env: Batch environment. - """ - self._batch_env = batch_env - observ_shape = self._parse_shape(self._batch_env.observation_space) - observ_dtype = self._parse_dtype(self._batch_env.observation_space) - self.action_shape = list(self._parse_shape(self._batch_env.action_space)) - self.action_dtype = self._parse_dtype(self._batch_env.action_space) - with tf.variable_scope('env_temporary'): - self._observ = tf.Variable( - tf.zeros((len(self._batch_env),) + observ_shape, observ_dtype), - name='observ', trainable=False) - def __getattr__(self, name): """Forward unimplemented attributes to one of the original environments. @@ -83,17 +57,7 @@ def simulate(self, action): Returns: Operation. """ - with tf.name_scope('environment/simulate'): - if action.dtype in (tf.float16, tf.float32, tf.float64): - action = tf.check_numerics(action, 'action') - observ_dtype = self._parse_dtype(self._batch_env.observation_space) - observ, reward, done = tf.py_func( - lambda a: self._batch_env.step(a)[:3], [action], - [observ_dtype, tf.float32, tf.bool], name='step') - observ = tf.check_numerics(observ, 'observ') - reward = tf.check_numerics(reward, 'reward') - with tf.control_dependencies([self._observ.assign(observ)]): - return tf.identity(reward), tf.identity(done) + raise NotImplementedError def reset(self, indices=None): """Reset the batch of environments. @@ -104,26 +68,7 @@ def reset(self, indices=None): Returns: Batch tensor of the new observations. """ - return tf.cond( - tf.cast(tf.shape(indices)[0], tf.bool), - lambda: self._reset_non_empty(indices), lambda: 0.0) - - def _reset_non_empty(self, indices): - """Reset the batch of environments. - - Args: - indices: The batch indices of the environments to reset; defaults to all. 
- - Returns: - Batch tensor of the new observations. - """ - observ_dtype = self._parse_dtype(self._batch_env.observation_space) - observ = tf.py_func( - self._batch_env.reset, [indices], observ_dtype, name='reset') - observ = tf.check_numerics(observ, 'observ') - with tf.control_dependencies([ - tf.scatter_update(self._observ, indices, observ)]): - return tf.identity(observ) + raise NotImplementedError @property def observ(self): @@ -133,33 +78,3 @@ def observ(self): def close(self): """Send close messages to the external process and join them.""" self._batch_env.close() - - def _parse_shape(self, space): - """Get a tensor shape from a OpenAI Gym space. - - Args: - space: Gym space. - - Returns: - Shape tuple. - """ - if isinstance(space, gym.spaces.Discrete): - return () - if isinstance(space, gym.spaces.Box): - return space.shape - raise NotImplementedError() - - def _parse_dtype(self, space): - """Get a tensor dtype from a OpenAI Gym space. - - Args: - space: Gym space. - - Returns: - TensorFlow data type. - """ - if isinstance(space, gym.spaces.Discrete): - return tf.int32 - if isinstance(space, gym.spaces.Box): - return tf.float32 - raise NotImplementedError() diff --git a/tensor2tensor/rl/envs/py_func_batch_env.py b/tensor2tensor/rl/envs/py_func_batch_env.py new file mode 100644 index 000000000..518c7bf29 --- /dev/null +++ b/tensor2tensor/rl/envs/py_func_batch_env.py @@ -0,0 +1,169 @@ +# coding=utf-8 +# Copyright 2018 The Tensor2Tensor Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Batch of environments inside the TensorFlow graph.""" + +# The code was based on Danijar Hafner's code from tf.agents: +# https://github.com/tensorflow/agents/blob/master/agents/tools/in_graph_batch_env.py + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +# Dependency imports + +import gym + +from tensor2tensor.rl.envs.in_graph_batch_env import InGraphBatchEnv + +import tensorflow as tf + + +class PyFuncBatchEnv(InGraphBatchEnv): + """Batch of environments inside the TensorFlow graph. + + The batch of environments will be stepped and reset inside of the graph using + a tf.py_func(). The current batch of observations, actions, rewards, and done + flags are held in according variables. + """ + + def __init__(self, batch_env): + """Batch of environments inside the TensorFlow graph. + + Args: + batch_env: Batch environment. + """ + self._batch_env = batch_env + observ_shape = self._parse_shape(self._batch_env.observation_space) + observ_dtype = self._parse_dtype(self._batch_env.observation_space) + self.action_shape = list(self._parse_shape(self._batch_env.action_space)) + self.action_dtype = self._parse_dtype(self._batch_env.action_space) + with tf.variable_scope('env_temporary'): + self._observ = tf.Variable( + tf.zeros((len(self._batch_env),) + observ_shape, observ_dtype), + name='observ', trainable=False) + + def __getattr__(self, name): + """Forward unimplemented attributes to one of the original environments. 
+ + Args: + name: Attribute that was accessed. + + Returns: + Value behind the attribute name in one of the original environments. + """ + return getattr(self._batch_env, name) + + def __len__(self): + """Number of combined environments.""" + return len(self._batch_env) + + def __getitem__(self, index): + """Access an underlying environment by index.""" + return self._batch_env[index] + + def simulate(self, action): + """Step the batch of environments. + + The results of the step can be accessed from the variables defined below. + + Args: + action: Tensor holding the batch of actions to apply. + + Returns: + Operation. + """ + with tf.name_scope('environment/simulate'): + if action.dtype in (tf.float16, tf.float32, tf.float64): + action = tf.check_numerics(action, 'action') + observ_dtype = self._parse_dtype(self._batch_env.observation_space) + observ, reward, done = tf.py_func( + lambda a: self._batch_env.step(a)[:3], [action], + [observ_dtype, tf.float32, tf.bool], name='step') + observ = tf.check_numerics(observ, 'observ') + reward = tf.check_numerics(reward, 'reward') + reward.set_shape((len(self),)) + done.set_shape((len(self),)) + with tf.control_dependencies([self._observ.assign(observ)]): + return tf.identity(reward), tf.identity(done) + + def reset(self, indices=None): + """Reset the batch of environments. + + Args: + indices: The batch indices of the environments to reset. + + Returns: + Batch tensor of the new observations. + """ + return tf.cond( + tf.cast(tf.shape(indices)[0], tf.bool), + lambda: self._reset_non_empty(indices), lambda: 0.0) + + def _reset_non_empty(self, indices): + """Reset the batch of environments. + + Args: + indices: The batch indices of the environments to reset; defaults to all. + + Returns: + Batch tensor of the new observations. + """ + observ_dtype = self._parse_dtype(self._batch_env.observation_space) + observ = tf.py_func( + self._batch_env.reset, [indices], observ_dtype, name='reset') + observ = tf.check_numerics(observ, 'observ') + with tf.control_dependencies([ + tf.scatter_update(self._observ, indices, observ)]): + return tf.identity(observ) + + @property + def observ(self): + """Access the variable holding the current observation.""" + return self._observ + + def close(self): + """Send close messages to the external process and join them.""" + self._batch_env.close() + + def _parse_shape(self, space): + """Get a tensor shape from a OpenAI Gym space. + + Args: + space: Gym space. + + Returns: + Shape tuple. + """ + if isinstance(space, gym.spaces.Discrete): + return () + if isinstance(space, gym.spaces.Box): + return space.shape + raise NotImplementedError() + + def _parse_dtype(self, space): + """Get a tensor dtype from a OpenAI Gym space. + + Args: + space: Gym space. + + Returns: + TensorFlow data type. + """ + if isinstance(space, gym.spaces.Discrete): + return tf.int32 + if isinstance(space, gym.spaces.Box): + return tf.float32 + raise NotImplementedError() diff --git a/tensor2tensor/rl/envs/simulated_batch_env.py b/tensor2tensor/rl/envs/simulated_batch_env.py new file mode 100644 index 000000000..69dfcff94 --- /dev/null +++ b/tensor2tensor/rl/envs/simulated_batch_env.py @@ -0,0 +1,150 @@ +# coding=utf-8 +# Copyright 2018 The Tensor2Tensor Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Batch of environments inside the TensorFlow graph.""" + +# The code was based on Danijar Hafner's code from tf.agents: +# https://github.com/tensorflow/agents/blob/master/agents/tools/in_graph_batch_env.py + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +# Dependency imports + +import gym + +import pkg_resources + +from tensor2tensor.rl.envs.in_graph_batch_env import InGraphBatchEnv +from tensor2tensor.utils import registry +from tensor2tensor.utils import trainer_lib + +import tensorflow as tf + + +flags = tf.flags +FLAGS = flags.FLAGS + + +class SimulatedBatchEnv(InGraphBatchEnv): + """Batch of environments inside the TensorFlow graph. + + The batch of environments will be stepped and reset inside of the graph using + a tf.py_func(). The current batch of observations, actions, rewards, and done + flags are held in according variables. + """ + + def __init__(self, length, observ_shape, observ_dtype, action_shape, + action_dtype): + """Batch of environments inside the TensorFlow graph.""" + self.length = length + hparams = trainer_lib.create_hparams( + FLAGS.hparams_set, problem_name=FLAGS.problems, data_dir="UNUSED") + hparams.force_full_predict = True + self._model = registry.model(FLAGS.model)( + hparams, tf.estimator.ModeKeys.PREDICT) + + self.action_shape = action_shape + self.action_dtype = action_dtype + + with open(pkg_resources.resource_filename( + "tensor2tensor.rl.envs", "frame1.png"), "rb") as f: + png_frame_1_raw = f.read() + + with open(pkg_resources.resource_filename( + "tensor2tensor.rl.envs", "frame2.png"), "rb") as f: + png_frame_2_raw = f.read() + + self.frame_1 = tf.expand_dims(tf.cast(tf.image.decode_png(png_frame_1_raw), + tf.float32), 0) + self.frame_2 = tf.expand_dims(tf.cast(tf.image.decode_png(png_frame_2_raw), + tf.float32), 0) + + shape = (self.length,) + observ_shape + self._observ = tf.Variable(tf.zeros(shape, observ_dtype), trainable=False) + self._prev_observ = tf.Variable(tf.zeros(shape, observ_dtype), + trainable=False) + self._starting_observ = tf.Variable(tf.zeros(shape, observ_dtype), + trainable=False) + + observ_dtype = tf.int64 + self._observ_not_sure_why_we_need_this = tf.Variable( + tf.zeros((self.length,) + observ_shape, observ_dtype), + name="observ_new", trainable=False) + + self._reward_not_sure_why_we_need_this = tf.Variable( + tf.zeros((self.length, 1), observ_dtype), + name="reward_new", trainable=False) + + @property + def action_space(self): + return gym.make("PongNoFrameskip-v4").action_space + + def __len__(self): + """Number of combined environments.""" + return self.length + + def simulate(self, action): + with tf.name_scope("environment/simulate"): + inputs = {"inputs_0": self._prev_observ.read_value(), + "inputs_1": self._observ.read_value(), + "action": action, + "targets": self._observ_not_sure_why_we_need_this, + "reward": self._reward_not_sure_why_we_need_this} + model_output = self._model(inputs) + observ_expaned = model_output[0]["targets"] + observ = tf.cast(tf.argmax(observ_expaned, axis=-1), tf.float32) + reward = tf.constant(0, tf.float32, 
shape=(self.length,)) + done = tf.constant(False, tf.bool, shape=(self.length,)) + + with tf.control_dependencies([observ]): + with tf.control_dependencies([self._prev_observ.assign(self._observ)]): + with tf.control_dependencies([self._observ.assign(observ)]): + return tf.identity(reward), tf.identity(done) + + def reset(self, indices=None): + """Reset the batch of environments. + + Args: + indices: The batch indices of the environments to reset. + + Returns: + Batch tensor of the new observations. + """ + return tf.cond( + tf.cast(tf.shape(indices)[0], tf.bool), + lambda: self._reset_non_empty(indices), lambda: 0.0) + + def _reset_non_empty(self, indices): + """Reset the batch of environments. + + Args: + indices: The batch indices of the environments to reset; defaults to all. + + Returns: + Batch tensor of the new observations. + """ + observ = tf.gather(self._observ, indices) + observ = 0.0 * tf.check_numerics(observ, "observ") + with tf.control_dependencies([ + tf.scatter_update(self._observ, indices, observ + self.frame_2), + tf.scatter_update(self._prev_observ, indices, observ + self.frame_1)]): + return tf.identity(self._observ.read_value()) + + @property + def observ(self): + """Access the variable holding the current observation.""" + return self._observ diff --git a/tensor2tensor/rl/envs/tf_atari_wrappers.py b/tensor2tensor/rl/envs/tf_atari_wrappers.py new file mode 100644 index 000000000..61bff7ab2 --- /dev/null +++ b/tensor2tensor/rl/envs/tf_atari_wrappers.py @@ -0,0 +1,187 @@ +# coding=utf-8 +# Copyright 2018 The Tensor2Tensor Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Batch of environments inside the TensorFlow graph.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +# Dependency imports + +from tensor2tensor.rl.envs.in_graph_batch_env import InGraphBatchEnv + +import tensorflow as tf + + +class WrapperBase(InGraphBatchEnv): + """Base wrapper class.""" + + def __init__(self, batch_env): + self._length = len(batch_env) + self._batch_env = batch_env + self.action_shape = batch_env.action_shape + self.action_dtype = batch_env.action_dtype + + @property + def observ(self): + """Access the variable holding the current observation.""" + return self._observ + + def __len__(self): + """Number of combined environments.""" + return self._length + + def reset(self, indices=None): + return self._batch_env.reset(indices) + + +class TransformWrapper(WrapperBase): + """Transform wrapper.""" + + def __init__(self, batch_env, transform_observation=None, + transform_reward=tf.identity, transform_done=tf.identity): + super().__init__(batch_env) + if transform_observation is not None: + _, observ_shape, observ_dtype = transform_observation # pylint: disable=unpacking-non-sequence + self._observ = tf.Variable( + tf.zeros(len(self) + observ_shape, observ_dtype), trainable=False) + else: + self._observ = self._batch_env.observ + + self.transform_observation = transform_observation + self.transform_reward = transform_reward + self.transform_done = transform_done + + def simulate(self, action): + with tf.name_scope("environment/simulate"): # Do we need this? + reward, done = self._batch_env.simulate(action) + with tf.control_dependencies([reward]): + if self.transform_observation: + observ = self.transform_observation[0](self._batch_env.observ) + assign_op = self._observ.assign(observ) + else: + assign_op = tf.no_op() # TODO(lukaszkaiser): looks as if it's broken. + with tf.control_dependencies([assign_op]): + return self.transform_reward(reward), self.transform_done(done) + + +class WarpFrameWrapper(TransformWrapper): + """Wrap frames.""" + + def __init__(self, batch_env): + """Warp frames to 84x84 as done in the Nature paper and later work.""" + + dims = [84, 84] + nature_transform = lambda o: tf.image.rgb_to_grayscale( # pylint: disable=g-long-lambda + tf.image.resize_images(o, dims)) + + super().__init__(batch_env, transform_observation=( + nature_transform, dims, tf.float32)) + + +class ShiftRewardWrapper(TransformWrapper): + """Shift the reward.""" + + def __init__(self, batch_env, add_value): + shift_reward = lambda r: tf.add(r, add_value) + super().__init__(batch_env, transform_reward=shift_reward) + + +class MaxAndSkipWrapper(WrapperBase): + """Max and skip wrapper.""" + + def __init__(self, batch_env, skip=4): + super().__init__(batch_env) + self.skip = skip + self._observ = None + observs_shape = batch_env.observ.shape + observ_dtype = tf.float32 + + self._observ = tf.Variable(tf.zeros(observs_shape, observ_dtype), + trainable=False) + + def simulate(self, action): + with tf.name_scope("environment/simulate"): # Do we need this? 
+ initializer = (tf.zeros_like(self._observ), + tf.fill((len(self),), 0.0), tf.fill((len(self),), False)) + + def not_done_step(a, _): + reward, done = self._batch_env.simulate(action) + with tf.control_dependencies([reward, done]): + r0 = tf.maximum(a[0], self._batch_env.observ) + r1 = tf.add(a[1], reward) + r2 = tf.logical_or(a[2], done) + + return (r0, r1, r2) + + simulate_ret = tf.scan(not_done_step, tf.range(self.skip), + initializer=initializer, parallel_iterations=1, + infer_shape=False) + simulate_ret = [ret[-1, ...] for ret in simulate_ret] + + with tf.control_dependencies([self._observ.assign(simulate_ret[0])]): + return tf.identity(simulate_ret[1]), tf.identity(simulate_ret[2]) + + +class TimeLimitWrapper(WrapperBase): + """Time limit wrapper.""" + + # TODO(lukaszkaiser): Check if TimeLimitWrapper does what it's supposed to do. + def __init__(self, batch_env, timelimit=100): + super().__init__(batch_env) + self.timelimit = timelimit + self._time_elapsed = tf.Variable(tf.zeros((len(self),), tf.int32), + trainable=False) + + def simulate(self, action): + with tf.name_scope("environment/simulate"): + reward, done = self._batch_env.simulate(action) + with tf.control_dependencies([reward, done]): + new_done = tf.logical_or(done, self._time_elapsed > self.timelimit) + inc = self._time_elapsed.assign_add(tf.ones_like(self._time_elapsed)) + + with tf.control_dependencies([inc]): + return tf.identity(reward), tf.identity(new_done) + + def reset(self, indices=None): + op_zero = tf.scatter_update(self._time_elapsed, indices, + tf.zeros(tf.shape(indices), dtype=tf.int32)) + with tf.control_dependencies([op_zero]): + return self._batch_env.reset(indices) + + +class MemoryWrapper(WrapperBase): + """Memory wrapper.""" + + def __init__(self, batch_env): + super().__init__(batch_env) + MemoryWrapper.singleton = self + assert self._length == 1, "We support only one environment" + infinity = 10000000 + self.speculum = tf.FIFOQueue(infinity, dtypes=[ + tf.string, tf.float32, tf.int32, tf.bool]) + self._observ = self._batch_env.observ + + def simulate(self, action): + with tf.name_scope("environment/simulate"): # Do we need this? + reward, done = self._batch_env.simulate(action) + encoded_image = tf.image.encode_png( + tf.cast(self._batch_env.observ[0, ...], tf.uint8)) + with tf.control_dependencies([reward, done]): + enqueue_op = self.speculum.enqueue( + [encoded_image, reward, action, done]) + with tf.control_dependencies([enqueue_op]): + return tf.identity(reward), tf.identity(done) diff --git a/tensor2tensor/rl/envs/utils.py b/tensor2tensor/rl/envs/utils.py index 4f6fb8891..971581f46 100644 --- a/tensor2tensor/rl/envs/utils.py +++ b/tensor2tensor/rl/envs/utils.py @@ -33,7 +33,10 @@ import gym from tensor2tensor.rl.envs import batch_env -from tensor2tensor.rl.envs import in_graph_batch_env + +from tensor2tensor.rl.envs import py_func_batch_env +from tensor2tensor.rl.envs import simulated_batch_env + import tensorflow as tf @@ -278,25 +281,37 @@ def _worker(self, constructor, conn): conn.close() -def define_batch_env(constructor, num_agents, xvfb=False, env_processes=True): - """Create environments and apply all desired wrappers. +def batch_env_factory(environment_lambda, hparams, num_agents, xvfb=False): + """Factory of batch envs.""" + wrappers = hparams.in_graph_wrappers if hasattr( + hparams, "in_graph_wrappers") else [] - Args: - constructor: Constructor of an OpenAI gym environment. - num_agents: Number of environments to combine in the batch. - xvfb: Frame buffer. 
- env_processes: Whether to step environment in external processes. + if hparams.simulated_environment: + cur_batch_env = define_simulated_batch_env(num_agents) + else: + cur_batch_env = define_batch_env(environment_lambda, num_agents, xvfb=xvfb) + for w in wrappers: + cur_batch_env = w[0](cur_batch_env, **w[1]) + return cur_batch_env - Returns: - In-graph environments object. - """ + +def define_batch_env(constructor, num_agents, xvfb=False): + """Create environments and apply all desired wrappers.""" with tf.variable_scope("environments"): - if env_processes: - envs = [ - ExternalProcessEnv(constructor, xvfb) - for _ in range(num_agents)] - else: - envs = [constructor() for _ in range(num_agents)] - env = batch_env.BatchEnv(envs, blocking=not env_processes) - env = in_graph_batch_env.InGraphBatchEnv(env) + envs = [ + ExternalProcessEnv(constructor, xvfb) + for _ in range(num_agents)] + env = batch_env.BatchEnv(envs, blocking=False) + env = py_func_batch_env.PyFuncBatchEnv(env) return env + + +def define_simulated_batch_env(num_agents): + # TODO(blazej0): the parameters should be inferred. + observ_shape = (210, 160, 3) + observ_dtype = tf.float32 + action_shape = [] + action_dtype = tf.int32 + cur_batch_env = simulated_batch_env.SimulatedBatchEnv( + num_agents, observ_shape, observ_dtype, action_shape, action_dtype) + return cur_batch_env diff --git a/tensor2tensor/rl/model_rl_experiment.py b/tensor2tensor/rl/model_rl_experiment.py new file mode 100644 index 000000000..ec543815f --- /dev/null +++ b/tensor2tensor/rl/model_rl_experiment.py @@ -0,0 +1,119 @@ +# coding=utf-8 +# Copyright 2018 The Tensor2Tensor Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Training of model-based RL agents.""" + +import datetime +import os +import tempfile +import time + +# Dependency imports + +from tensor2tensor import problems +from tensor2tensor.bin import t2t_trainer +from tensor2tensor.rl import rl_trainer_lib +from tensor2tensor.rl.envs.tf_atari_wrappers import PongT2TGeneratorHackWrapper +from tensor2tensor.rl.envs.tf_atari_wrappers import TimeLimitWrapper +from tensor2tensor.utils import trainer_lib + +import tensorflow as tf + + +flags = tf.flags +FLAGS = flags.FLAGS + + +def train(hparams, output_dir): + """Training function.""" + prefix = output_dir + data_dir = os.path.expanduser(prefix + "/data") + tmp_dir = os.path.expanduser(prefix + "/tmp") + output_dir = os.path.expanduser(prefix + "/output") + tf.gfile.MakeDirs(data_dir) + tf.gfile.MakeDirs(tmp_dir) + tf.gfile.MakeDirs(output_dir) + last_model = "" + start_time = time.time() + line = ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> " + for iloop in range(hparams.epochs): + time_delta = time.time() - start_time + print(line+"Step {}.1. - generate data from policy. 
" + "Time: {}".format(iloop, str(datetime.timedelta(seconds=time_delta)))) + FLAGS.problems = "gym_discrete_problem" + FLAGS.agent_policy_path = last_model + gym_problem = problems.problem(FLAGS.problems) + gym_problem.num_steps = hparams.true_env_generator_num_steps + iter_data_dir = os.path.join(data_dir, str(iloop)) + tf.gfile.MakeDirs(iter_data_dir) + gym_problem.generate_data(iter_data_dir, tmp_dir) + + time_delta = time.time() - start_time + print(line+"Step {}.2. - generate env model. " + "Time: {}".format(iloop, str(datetime.timedelta(seconds=time_delta)))) + # 2. generate env model + FLAGS.data_dir = iter_data_dir + FLAGS.output_dir = output_dir + FLAGS.model = hparams.generative_model + FLAGS.hparams_set = hparams.generative_model_params + FLAGS.train_steps = hparams.model_train_steps + FLAGS.eval_steps = 1 + t2t_trainer.main([]) + + time_delta = time.time() - start_time + print(line+"Step {}.3. - evalue env model. " + "Time: {}".format(iloop, str(datetime.timedelta(seconds=time_delta)))) + gym_simulated_problem = problems.problem("gym_simulated_discrete_problem") + gym_simulated_problem.num_steps = hparams.simulated_env_generator_num_steps + gym_simulated_problem.generate_data(iter_data_dir, tmp_dir) + + # time_delta = time.time() - start_time + print(line + "Step {}.4. - train PPO in model env." + " Time: {}".format(iloop, + str(datetime.timedelta(seconds=time_delta)))) + ppo_epochs_num = hparams.ppo_epochs_num + ppo_hparams = trainer_lib.create_hparams( + "atari_base", + "epochs_num={},simulated_environment=True,eval_every_epochs=0," + "save_models_every_epochs={}".format(ppo_epochs_num+1, ppo_epochs_num), + data_dir=output_dir) + ppo_hparams.epoch_length = hparams.ppo_epoch_length + ppo_dir = tempfile.mkdtemp(dir=data_dir, prefix="ppo_") + in_graph_wrappers = [ + (TimeLimitWrapper, {"timelimit": 150}), + (PongT2TGeneratorHackWrapper, {"add_value": -2})] + in_graph_wrappers += gym_problem.in_graph_wrappers + ppo_hparams.add_hparam("in_graph_wrappers", in_graph_wrappers) + rl_trainer_lib.train(ppo_hparams, "PongNoFrameskip-v4", ppo_dir) + + last_model = ppo_dir + "/model{}.ckpt".format(ppo_epochs_num) + + +def main(_): + hparams = tf.contrib.training.HParams( + epochs=100, + true_env_generator_num_steps=100, + generative_model="static_basic_conv_gen", + generative_model_params="basic_conv_small", + model_train_steps=80, + simulated_env_generator_num_steps=300, + ppo_epochs_num=2, + ppo_epoch_length=300, + ) + train(hparams, tempfile.mkdtemp()) + + +if __name__ == "__main__": + tf.app.run() diff --git a/tensor2tensor/rl/ppo.py b/tensor2tensor/rl/ppo.py index a1a677147..ca0481e9e 100644 --- a/tensor2tensor/rl/ppo.py +++ b/tensor2tensor/rl/ppo.py @@ -21,15 +21,15 @@ import tensorflow as tf -def get_optimizer(config): +def get_optimiser(config): if config.optimizer == "Adam": - return tf.train.AdamOptimizer(config.learning_rate) - return config.optimizer(config.learning_rate) + return tf.train.AdamOptimizer(learning_rate=config.learning_rate) + return config.optimizer(learning_rate=config.learning_rate) -def define_ppo_step(observation, action, reward, done, value, old_pdf, - policy_factory, config): - """Step of PPO.""" +def define_ppo_step(data_points, policy_factory, optimizer, config): + """Define ppo step.""" + observation, action, discounted_reward, norm_advantage, old_pdf = data_points new_policy_dist, new_value, _ = policy_factory(observation) new_pdf = new_policy_dist.prob(action) @@ -37,35 +37,30 @@ def define_ppo_step(observation, action, reward, done, value, old_pdf, 
clipped_ratio = tf.clip_by_value(ratio, 1 - config.clipping_coef, 1 + config.clipping_coef) - advantage = calculate_generalized_advantage_estimator( - reward, value, done, config.gae_gamma, config.gae_lambda) - - advantage_mean, advantage_variance = tf.nn.moments(advantage, axes=[0, 1], - keep_dims=True) - advantage_normalized = tf.stop_gradient( - (advantage - advantage_mean)/(tf.sqrt(advantage_variance) + 1e-8)) - - surrogate_objective = tf.minimum(clipped_ratio * advantage_normalized, - ratio * advantage_normalized) + surrogate_objective = tf.minimum(clipped_ratio * norm_advantage, + ratio * norm_advantage) policy_loss = -tf.reduce_mean(surrogate_objective) - value_error = calculate_generalized_advantage_estimator( - reward, new_value, done, config.gae_gamma, config.gae_lambda) + value_error = new_value - discounted_reward value_loss = config.value_loss_coef * tf.reduce_mean(value_error ** 2) entropy = new_policy_dist.entropy() entropy_loss = -config.entropy_loss_coef * tf.reduce_mean(entropy) - optimizer = get_optimizer(config) losses = [policy_loss, value_loss, entropy_loss] - gradients = [list(zip(*optimizer.compute_gradients(loss))) for loss in losses] + gradients = [list(zip(*optimizer.compute_gradients(loss))) + for loss in losses] gradients_norms = [tf.global_norm(gradient[0]) for gradient in gradients] gradients_flat = sum([gradient[0] for gradient in gradients], ()) gradients_variables_flat = sum([gradient[1] for gradient in gradients], ()) + if config.max_gradients_norm: + gradients_flat, _ = tf.clip_by_global_norm(gradients_flat, + config.max_gradients_norm) + optimize_op = optimizer.apply_gradients(zip(gradients_flat, gradients_variables_flat)) @@ -77,23 +72,51 @@ def define_ppo_epoch(memory, policy_factory, config): """PPO epoch.""" observation, reward, done, action, old_pdf, value = memory - # This is to avoid propagating gradients though simulation of simulation + # This is to avoid propagating gradients through simulated environment. 
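+ # (The collected memory is treated as constant data during optimization;
+ # with a simulated - and hence differentiable - environment, the graph
+ # could otherwise backpropagate into the world model that produced it.)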
observation = tf.stop_gradient(observation) action = tf.stop_gradient(action) reward = tf.stop_gradient(reward) + if hasattr(config, "rewards_preprocessing_fun"): + reward = config.rewards_preprocessing_fun(reward) done = tf.stop_gradient(done) value = tf.stop_gradient(value) old_pdf = tf.stop_gradient(old_pdf) - ppo_step_rets = tf.scan( - lambda _1, _2: define_ppo_step( # pylint: disable=g-long-lambda - observation, action, reward, done, value, - old_pdf, policy_factory, config), - tf.range(config.optimization_epochs), - [0., 0., 0., 0., 0., 0.], - parallel_iterations=1) + advantage = calculate_generalized_advantage_estimator( + reward, value, done, config.gae_gamma, config.gae_lambda) + + discounted_reward = tf.stop_gradient(advantage + value) + + advantage_mean, advantage_variance = tf.nn.moments(advantage, axes=[0, 1], + keep_dims=True) + advantage_normalized = tf.stop_gradient( + (advantage - advantage_mean)/(tf.sqrt(advantage_variance) + 1e-8)) - ppo_summaries = [tf.reduce_mean(ret) for ret in ppo_step_rets] + add_lists_elementwise = lambda l1, l2: [x + y for x, y in zip(l1, l2)] + + number_of_batches = (config.epoch_length * config.optimization_epochs + / config.optimization_batch_size) + + dataset = tf.data.Dataset.from_tensor_slices( + (observation, action, discounted_reward, advantage_normalized, old_pdf)) + dataset = dataset.shuffle(buffer_size=config.epoch_length, + reshuffle_each_iteration=True) + dataset = dataset.repeat(config.optimization_epochs) + dataset = dataset.batch(config.optimization_batch_size) + iterator = dataset.make_initializable_iterator() + optimizer = get_optimiser(config) + + with tf.control_dependencies([iterator.initializer]): + ppo_step_rets = tf.scan( + lambda a, i: add_lists_elementwise( # pylint: disable=g-long-lambda + a, define_ppo_step(iterator.get_next(), policy_factory, optimizer, + config)), + tf.range(number_of_batches), + [0., 0., 0., 0., 0., 0.], + parallel_iterations=1) + + ppo_summaries = [tf.reduce_mean(ret) / number_of_batches + for ret in ppo_step_rets] summaries_names = ["policy_loss", "value_loss", "entropy_loss", "policy_gradient", "value_gradient", "entropy_gradient"] diff --git a/tensor2tensor/rl/rl_trainer_lib.py b/tensor2tensor/rl/rl_trainer_lib.py index 4ff386362..e710a2eeb 100644 --- a/tensor2tensor/rl/rl_trainer_lib.py +++ b/tensor2tensor/rl/rl_trainer_lib.py @@ -29,37 +29,41 @@ from tensor2tensor.models.research import rl # pylint: disable=unused-import from tensor2tensor.rl import collect from tensor2tensor.rl import ppo -from tensor2tensor.rl.envs import atari_wrappers +from tensor2tensor.rl.envs import tf_atari_wrappers from tensor2tensor.rl.envs import utils import tensorflow as tf - - def define_train(hparams, environment_spec, event_dir): """Define the training setup.""" + policy_lambda = hparams.network + + if environment_spec == "stacked_pong": + environment_spec = lambda: gym.make("PongNoFrameskip-v4") + wrappers = hparams.in_graph_wrappers if hasattr( + hparams, "in_graph_wrappers") else [] + wrappers.append((tf_atari_wrappers.MaxAndSkipWrapper, {"skip": 4})) + hparams.in_graph_wrappers = wrappers if isinstance(environment_spec, str): env_lambda = lambda: gym.make(environment_spec) else: env_lambda = environment_spec - policy_lambda = hparams.network - env = env_lambda() - action_space = env.action_space - batch_env = utils.define_batch_env(env_lambda, hparams.num_agents) + batch_env = utils.batch_env_factory( + env_lambda, hparams, num_agents=hparams.num_agents) policy_factory = tf.make_template( "network", -
functools.partial(policy_lambda, action_space, hparams)) + functools.partial(policy_lambda, batch_env.action_space, hparams)) - with tf.variable_scope("train"): + with tf.variable_scope("", reuse=tf.AUTO_REUSE): memory, collect_summary = collect.define_collect( policy_factory, batch_env, hparams, eval_phase=False) - ppo_summary = ppo.define_ppo_epoch(memory, policy_factory, hparams) - summary = tf.summary.merge([collect_summary, ppo_summary]) + ppo_summary = ppo.define_ppo_epoch(memory, policy_factory, hparams) + summary = tf.summary.merge([collect_summary, ppo_summary]) - with tf.variable_scope("eval"): + with tf.variable_scope("eval", reuse=tf.AUTO_REUSE): eval_env_lambda = env_lambda if event_dir and hparams.video_during_eval: # Some environments reset environments automatically, when reached done @@ -68,32 +72,46 @@ def define_train(hparams, environment_spec, event_dir): eval_env_lambda = lambda: gym.wrappers.Monitor( # pylint: disable=g-long-lambda env_lambda(), event_dir, video_callable=lambda i: i % d == 0) wrapped_eval_env_lambda = lambda: utils.EvalVideoWrapper(eval_env_lambda()) - _, eval_summary = collect.define_collect( - policy_factory, - utils.define_batch_env(wrapped_eval_env_lambda, hparams.num_eval_agents, - xvfb=hparams.video_during_eval), - hparams, eval_phase=True) - return summary, eval_summary, policy_factory + # eval_batch_env = utils.define_batch_env( + # wrapped_eval_env_lambda, hparams.num_eval_agents, + # xvfb=hparams.video_during_eval) + eval_batch_env = utils.batch_env_factory( + wrapped_eval_env_lambda, hparams, + num_agents=hparams.num_eval_agents, xvfb=hparams.video_during_eval) + + # TODO(blazej0): correct to the version below. + corrected = False + eval_summary = tf.no_op() + if corrected: + _, eval_summary = collect.define_collect( + policy_factory, eval_batch_env, hparams, eval_phase=True) + return summary, eval_summary def train(hparams, environment_spec, event_dir=None): """Train.""" - if environment_spec == "stacked_pong": - environment_spec = lambda: atari_wrappers.wrap_atari( # pylint: disable=g-long-lambda - gym.make("PongNoFrameskip-v4"), - warp=False, frame_skip=4, frame_stack=False) - train_summary_op, eval_summary_op, _ = define_train(hparams, environment_spec, - event_dir) + train_summary_op, eval_summary_op = define_train(hparams, environment_spec, + event_dir) if event_dir: summary_writer = tf.summary.FileWriter( event_dir, graph=tf.get_default_graph(), flush_secs=60) model_saver = tf.train.Saver(tf.global_variables(".*network_parameters.*")) + # TODO(blazej): Make sure that policy is restored properly. 
else: summary_writer = None model_saver = None + if hparams.simulated_environment: + env_model_loader = tf.train.Saver(tf.global_variables(".*basic_conv_gen.*")) + else: + env_model_loader = None + with tf.Session() as sess: sess.run(tf.global_variables_initializer()) + if env_model_loader: + ckpts = tf.train.get_checkpoint_state(hparams.data_dir) + ckpt = ckpts.model_checkpoint_path + env_model_loader.restore(sess, ckpt) for epoch_index in range(hparams.epochs_num): summary = sess.run(train_summary_op) if summary_writer: diff --git a/tensor2tensor/utils/metrics.py b/tensor2tensor/utils/metrics.py index 8d584f266..5bcd77388 100644 --- a/tensor2tensor/utils/metrics.py +++ b/tensor2tensor/utils/metrics.py @@ -285,9 +285,9 @@ def create_evaluation_metrics(problems, model_hparams): def make_problem_specific_metric_fn(metric_fn, problem_idx, weights_fn): """Create a metric fn conditioned on problem_idx.""" - def problem_metric_fn(predictions, features): + def problem_metric_fn(predictions, features, labels): """Metric fn.""" - labels = features.get("targets", None) + # labels = features.get("targets", None) problem_choice = features.get("problem_choice", 0) # Send along the entire features dict if the metric fn has the kwarg diff --git a/tensor2tensor/utils/t2t_model.py b/tensor2tensor/utils/t2t_model.py index 0af523340..6efc393e8 100644 --- a/tensor2tensor/utils/t2t_model.py +++ b/tensor2tensor/utils/t2t_model.py @@ -356,7 +356,7 @@ def top(self, body_output, features): "problem_hparams.target_modality is a dict.") return self._top_single(body_output, target_modality, features) - def _loss_single(self, logits, target_modality, features): + def _loss_single(self, logits, target_modality, feature): # The current bfloat16 version still uses float32 for most parts of backward # propagation to keep model quality, so cast back before computing the loss # value. 
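For orientation, a minimal sketch of the calling convention these t2t_model.py hunks converge on; the shapes and the two-headed setup below are hypothetical, not part of the patch. When a model body returns a dict of logits, each key k is now scored against features[k], and metric fns receive the label tensor as an explicit argument instead of extracting it from features:

    # Hypothetical sketch, assuming a model with two named target features.
    logits = {
        "targets": tf.random_normal([8, 10, 1, 1, 256]),   # main head
        "aux_labels": tf.random_normal([8, 10, 1, 1, 5]),  # auxiliary head
    }
    features = {
        "targets": tf.zeros([8, 10, 1, 1], dtype=tf.int64),
        "aux_labels": tf.zeros([8, 10, 1, 1], dtype=tf.int64),
    }
    # Loss: one term per head, each key against the feature of the same name:
    #   losses[k] = self._loss_single(logits[k], target_modality[k], features[k])
    # Metrics: labels are threaded through as an explicit third argument:
    #   metric_fn(logits[k], features, features[k])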
@@ -365,7 +365,7 @@ def _loss_single(self, logits, target_modality, features): return (tf.constant(0., dtype=tf.float32), tf.constant(1., dtype=tf.float32)) - loss_num, loss_den = target_modality.loss(logits, features["targets"]) + loss_num, loss_den = target_modality.loss(logits, feature) loss_num *= self._problem_hparams.loss_multiplier return loss_num, loss_den @@ -380,7 +380,7 @@ def loss(self, logits, features): "of problem_hparams.target_modality's dict.") losses = {} for k, v in six.iteritems(logits): - losses[k] = self._loss_single(v, target_modality[k], features) + losses[k] = self._loss_single(v, target_modality[k], features[k]) return tf.add_n([n / d for n, d in losses.values()]) else: if self._problem_hparams: @@ -390,7 +390,7 @@ def loss(self, logits, features): assert not isinstance(target_modality, dict), ( "model_body must return a dictionary of logits when " "problem_hparams.target_modality is a dict.") - return self._loss_single(logits, target_modality, features) + return self._loss_single(logits, target_modality, features["targets"]) def optimize(self, loss, num_async_replicas=1): """Return a training op minimizing loss.""" @@ -1030,9 +1030,11 @@ def estimator_spec_eval(self, features, logits, labels, loss, losses_dict): if isinstance(logits, dict): # the key is located in the center of metric_name: "metrics-%s/%s/%s" k = metric_name.split("/")[1] - eval_metrics[metric_name] = metric_fn(logits[k], features) + eval_metrics[metric_name] = metric_fn( + logits[k], features, features[k]) else: - eval_metrics[metric_name] = metric_fn(logits, features) + eval_metrics[metric_name] = metric_fn( + logits, features, features["targets"]) if isinstance(logits, dict): predictions = logits else: From 1ffdf1fe112f966bb62ccaace140c7e8a71bad34 Mon Sep 17 00:00:00 2001 From: T2T Team Date: Thu, 12 Apr 2018 12:15:01 -0700 Subject: [PATCH 21/29] Add metrics for sigmoidal outputs and one-hot labels. 
Change SigmoidClassLabelModality to work with binary labels for each class (generate data for it, with targets being tf.FixedLenSequenceFeature) PiperOrigin-RevId: 192649471 --- tensor2tensor/data_generators/text_encoder.py | 33 +++++++ tensor2tensor/layers/modalities.py | 44 ++++++++- tensor2tensor/utils/metrics.py | 91 ++++++++++++++++++ tensor2tensor/utils/metrics_test.py | 92 +++++++++++++++++++ 4 files changed, 257 insertions(+), 3 deletions(-) diff --git a/tensor2tensor/data_generators/text_encoder.py b/tensor2tensor/data_generators/text_encoder.py index f80416fdd..5398c3930 100644 --- a/tensor2tensor/data_generators/text_encoder.py +++ b/tensor2tensor/data_generators/text_encoder.py @@ -224,6 +224,39 @@ def vocab_size(self): return len(self._class_labels) +class OneHotClassLabelEncoder(TextEncoder): + """One-hot encoder for class labels.""" + + def __init__(self, class_labels=None, class_labels_fname=None): + super(OneHotClassLabelEncoder, self).__init__() + assert class_labels or class_labels_fname + assert not (class_labels and class_labels_fname) + + if class_labels_fname: + with tf.gfile.Open(class_labels_fname) as f: + class_labels = [label.strip() for label in f.readlines()] + + self._class_labels = class_labels + + def encode(self, label_str, on_value=1, off_value=0): + e = np.zeros(self.vocab_size, dtype=np.int32) + if off_value != 0: + e.fill(off_value) + e[self._class_labels.index(label_str)] = on_value + return e.tolist() + + def decode(self, label_id): + if isinstance(label_id, np.ndarray): + label_id = np.squeeze(label_id).astype(np.int8).tolist() + assert isinstance(label_id, list) + assert len(label_id) == self.vocab_size + return self._class_labels[label_id.index(1)] + + @property + def vocab_size(self): + return len(self._class_labels) + + class TokenTextEncoder(TextEncoder): """Encoder based on a user-supplied vocabulary (file or list).""" diff --git a/tensor2tensor/layers/modalities.py b/tensor2tensor/layers/modalities.py index e60726cde..050d5a79e 100644 --- a/tensor2tensor/layers/modalities.py +++ b/tensor2tensor/layers/modalities.py @@ -586,11 +586,49 @@ class SigmoidClassLabelModality(ClassLabelModality): @property def name(self): return "sigmoid_class_symbol_modality_%d_%d" % (self._vocab_size, - self.body_input_depth) + self._body_input_depth) def loss(self, top_out, targets): - loss_scale = tf.nn.sigmoid_cross_entropy_with_logits( - labels=targets, logits=top_out, name="SigmoidCrossEntropy") + # Expect inputs of size [batch-size, timesteps, 1, num-classes], where the + # last dimension of num-classes represents logits for binary labels + loss_scale = tf.losses.sigmoid_cross_entropy( + multi_class_labels=targets, logits=top_out) + # Weigh all classes equally + weights = self.targets_weights_fn(targets) + loss_denom = tf.reduce_sum(weights) + return loss_scale, loss_denom + + +@registry.register_class_label_modality("sigmoid_pooling") +class SigmoidMaxPoolingClassLabelModality(ClassLabelModality): + """Sigmoid cross-entropy applied on max-pooling over timesteps.""" + + @property + def name(self): + return "sigmoid_max_pooling_class_symbol_modality_%d_%d" % ( + self._vocab_size, self._body_input_depth) + + def top(self, body_output, _): + """Transform inputs from model space to target space. + + Max-pool over timesteps, then a linear layer to logits. + + Args: + body_output: A Tensor with shape [batch, timesteps, 1, body_output_size].
+ + Returns: + a Tensor with shape [batch_size, 1, 1, vocab_size] + """ + with tf.variable_scope(self.name): + x = body_output + x = tf.reduce_max(x, axis=1, keepdims=True) + return tf.layers.dense(x, self._vocab_size) + + def loss(self, top_out, targets): + # Expect inputs of size [batch-size, 1, 1, num-classes], where the + # last dimension of num-classes represents binary labels for each class + loss_scale = tf.losses.sigmoid_cross_entropy( + multi_class_labels=targets, logits=top_out) # Weigh all classes equally weights = self.targets_weights_fn(targets) loss_denom = tf.reduce_sum(weights) diff --git a/tensor2tensor/utils/metrics.py b/tensor2tensor/utils/metrics.py index 5bcd77388..0cba1adda 100644 --- a/tensor2tensor/utils/metrics.py +++ b/tensor2tensor/utils/metrics.py @@ -51,6 +51,10 @@ class Metrics(object): EDIT_DISTANCE = "edit_distance" SET_PRECISION = "set_precision" SET_RECALL = "set_recall" + SIGMOID_ACCURACY_ONE_HOT = "sigmoid_accuracy_one_hot" + SIGMOID_RECALL_ONE_HOT = "sigmoid_recall_one_hot" + SIGMOID_PRECISION_ONE_HOT = "sigmoid_precision_one_hot" + SIGMOID_CROSS_ENTROPY_ONE_HOT = "sigmoid_cross_entropy_one_hot" IMAGE_SUMMARY = "image_summary" @@ -265,6 +269,89 @@ def image_summary(predictions, features, hparams): return summary, tf.zeros_like(predictions) +def sigmoid_accuracy_one_hot(logits, labels, weights_fn=None): + """Calculate accuracy for a set, given one-hot labels and logits. + + Args: + logits: Tensor of size [batch-size, o=1, p=1, num-classes] + labels: Tensor of size [batch-size, o=1, p=1, num-classes] + weights_fn: Function that takes in labels and weighs examples (unused) + Returns: + accuracy (scalar), weights + """ + with tf.variable_scope("sigmoid_accuracy_one_hot", values=[logits, labels]): + del weights_fn + predictions = tf.nn.sigmoid(logits) + labels = tf.argmax(labels, -1) + predictions = tf.argmax(predictions, -1) + _, accuracy = tf.metrics.accuracy(labels=labels, predictions=predictions) + return accuracy, tf.constant(1.0) + + +def sigmoid_precision_one_hot(logits, labels, weights_fn=None): + """Calculate precision for a set, given one-hot labels and logits. + + Predictions are converted to one-hot, + as predictions[example][arg-max(example)] = 1 + + Args: + logits: Tensor of size [batch-size, o=1, p=1, num-classes] + labels: Tensor of size [batch-size, o=1, p=1, num-classes] + weights_fn: Function that takes in labels and weighs examples (unused) + Returns: + precision (scalar), weights + """ + with tf.variable_scope("sigmoid_precision_one_hot", values=[logits, labels]): + del weights_fn + num_classes = logits.shape[-1] + predictions = tf.nn.sigmoid(logits) + predictions = tf.argmax(predictions, -1) + predictions = tf.one_hot(predictions, num_classes) + _, precision = tf.metrics.precision(labels=labels, predictions=predictions) + return precision, tf.constant(1.0) + + +def sigmoid_recall_one_hot(logits, labels, weights_fn=None): + """Calculate recall for a set, given one-hot labels and logits.
+ + Predictions are converted to one-hot, + as predictions[example][arg-max(example)] = 1 + + Args: + logits: Tensor of size [batch-size, o=1, p=1, num-classes] + labels: Tensor of size [batch-size, o=1, p=1, num-classes] + weights_fn: Function that takes in labels and weighs examples (unused) + Returns: + recall (scalar), weights + """ + with tf.variable_scope("sigmoid_recall_one_hot", values=[logits, labels]): + del weights_fn + num_classes = logits.shape[-1] + predictions = tf.nn.sigmoid(logits) + predictions = tf.argmax(predictions, -1) + predictions = tf.one_hot(predictions, num_classes) + _, recall = tf.metrics.recall(labels=labels, predictions=predictions) + return recall, tf.constant(1.0) + + +def sigmoid_cross_entropy_one_hot(logits, labels, weights_fn=None): + """Calculate sigmoid cross entropy for one-hot labels and logits. + + Args: + logits: Tensor of size [batch-size, o=1, p=1, num-classes] + labels: Tensor of size [batch-size, o=1, p=1, num-classes] + weights_fn: Function that takes in labels and weighs examples (unused) + Returns: + cross_entropy (scalar), weights + """ + with tf.variable_scope("sigmoid_cross_entropy_one_hot", + values=[logits, labels]): + del weights_fn + cross_entropy = tf.losses.sigmoid_cross_entropy( + multi_class_labels=labels, logits=logits) + return cross_entropy, tf.constant(1.0) + + def create_evaluation_metrics(problems, model_hparams): """Creates the evaluation metrics for the model. @@ -421,6 +508,10 @@ def metric_means(): Metrics.ROUGE_2_F: rouge.rouge_2_fscore, Metrics.ROUGE_L_F: rouge.rouge_l_fscore, Metrics.EDIT_DISTANCE: sequence_edit_distance, + Metrics.SIGMOID_ACCURACY_ONE_HOT: sigmoid_accuracy_one_hot, + Metrics.SIGMOID_RECALL_ONE_HOT: sigmoid_recall_one_hot, + Metrics.SIGMOID_PRECISION_ONE_HOT: sigmoid_precision_one_hot, + Metrics.SIGMOID_CROSS_ENTROPY_ONE_HOT: sigmoid_cross_entropy_one_hot, Metrics.SET_PRECISION: set_precision, Metrics.SET_RECALL: set_recall, Metrics.IMAGE_SUMMARY: image_summary, diff --git a/tensor2tensor/utils/metrics_test.py b/tensor2tensor/utils/metrics_test.py index 2aee025c1..8d37788d4 100644 --- a/tensor2tensor/utils/metrics_test.py +++ b/tensor2tensor/utils/metrics_test.py @@ -107,6 +107,98 @@ def testNegativeLogPerplexity(self): actual = session.run(a) self.assertEqual(actual.shape, ()) + def testSigmoidAccuracyOneHot(self): + logits = np.array([ + [-1., 1.], + [1., -1.], + [-1., 1.], + [1., -1.] + ]) + labels = np.array([ + [0, 1], + [1, 0], + [1, 0], + [0, 1] + ]) + logits = np.expand_dims(np.expand_dims(logits, 1), 1) + labels = np.expand_dims(np.expand_dims(labels, 1), 1) + + with self.test_session() as session: + score, _ = metrics.sigmoid_accuracy_one_hot(logits, labels) + session.run(tf.global_variables_initializer()) + session.run(tf.local_variables_initializer()) + s = session.run(score) + self.assertEqual(s, 0.5) + + def testSigmoidPrecisionOneHot(self): + logits = np.array([ + [-1., 1.], + [1., -1.], + [1., -1.], + [1., -1.] + ]) + labels = np.array([ + [0, 1], + [0, 1], + [0, 1], + [0, 1] + ]) + logits = np.expand_dims(np.expand_dims(logits, 1), 1) + labels = np.expand_dims(np.expand_dims(labels, 1), 1) + + with self.test_session() as session: + score, _ = metrics.sigmoid_precision_one_hot(logits, labels) + session.run(tf.global_variables_initializer()) + session.run(tf.local_variables_initializer()) + s = session.run(score) + self.assertEqual(s, 0.25) + + def testSigmoidRecallOneHot(self): + logits = np.array([ + [-1., 1.], + [1., -1.], + [1., -1.], + [1., -1.]
+ ]) + labels = np.array([ + [0, 1], + [0, 1], + [0, 1], + [0, 1] + ]) + logits = np.expand_dims(np.expand_dims(logits, 1), 1) + labels = np.expand_dims(np.expand_dims(labels, 1), 1) + + with self.test_session() as session: + score, _ = metrics.sigmoid_recall_one_hot(logits, labels) + session.run(tf.global_variables_initializer()) + session.run(tf.local_variables_initializer()) + s = session.run(score) + self.assertEqual(s, 0.25) + + def testSigmoidCrossEntropyOneHot(self): + logits = np.array([ + [-1., 1.], + [1., -1.], + [1., -1.], + [1., -1.] + ]) + labels = np.array([ + [0, 1], + [1, 0], + [0, 0], + [0, 1] + ]) + logits = np.expand_dims(np.expand_dims(logits, 1), 1) + labels = np.expand_dims(np.expand_dims(labels, 1), 1) + + with self.test_session() as session: + score, _ = metrics.sigmoid_cross_entropy_one_hot(logits, labels) + session.run(tf.global_variables_initializer()) + session.run(tf.local_variables_initializer()) + s = session.run(score) + self.assertAlmostEqual(s, 0.688, places=3) + if __name__ == '__main__': tf.test.main() From bc115993677cb0d6503caafa032ae3dd9a97102d Mon Sep 17 00:00:00 2001 From: Etienne Pot Date: Thu, 12 Apr 2018 12:16:22 -0700 Subject: [PATCH 22/29] Cleaning and minor update to transformer_moe model PiperOrigin-RevId: 192649661 --- .../models/research/transformer_moe.py | 176 ++++++++++-------- 1 file changed, 103 insertions(+), 73 deletions(-) diff --git a/tensor2tensor/models/research/transformer_moe.py b/tensor2tensor/models/research/transformer_moe.py index 02a51dc08..7baeeb691 100644 --- a/tensor2tensor/models/research/transformer_moe.py +++ b/tensor2tensor/models/research/transformer_moe.py @@ -68,12 +68,21 @@ def body_sharded(self, sharded_features): hparams = self._hparams dp = self._data_parallelism - targets = sharded_features["targets"] + + # Process input inputs = sharded_features["inputs"] target_space = sharded_features["target_space_id"] + ( + encoder_input, + encoder_self_attention_bias, + encoder_decoder_attention_bias, + ) = dp(self._prepare_encoder, inputs, target_space) - inputs = dp(common_layers.flatten4d3d, inputs) - targets = dp(common_layers.flatten4d3d, targets) + # Process output + targets = sharded_features["targets"] + decoder_input, decoder_self_attention_bias = dp( + self._prepare_decoder, targets + ) def dp_preprocess(x): return dp(common_layers.layer_preprocess, x, hparams) @@ -81,17 +90,6 @@ def dp_preprocess(x): def dp_postprocess(x, y): return dp(common_layers.layer_postprocess, x, y, hparams) - (encoder_input, encoder_self_attention_bias, - encoder_decoder_attention_bias) = dp( - transformer.transformer_prepare_encoder, - inputs, target_space, hparams) - (decoder_input, decoder_self_attention_bias) = dp( - transformer.transformer_prepare_decoder, targets, hparams) - encoder_input = dp(tf.nn.dropout, encoder_input, - 1.0 - hparams.layer_prepostprocess_dropout) - decoder_input = dp(tf.nn.dropout, decoder_input, - 1.0 - hparams.layer_prepostprocess_dropout) - cache = dict(extra_loss=0.0) def prepostprocess(fct): @@ -106,62 +104,7 @@ def decorated(x, *args, **kwargs): # ========= Compute the transformer architecture ========= - def extract_layer_types(layer_types): - """Parse the layer string. - - Args: - layer_types (str): String containing the network architecture. See - top file comment for examples of format. 
- - Returns: - list[tuple[str, str]]: Encoder layers: list of (attention, feed-forward) - list[tuple[str, str, str]]: Decoder layers: list of (self-attention, - enc-dec attention, feed-forward) - """ - # If the architecture has not explicitly been set, we just construct a - # standard transformer with the fallback values - if not layer_types: - layer_types = SEP_LAYER.join( - [hparams.default_att] * hparams.num_hidden_layers) - - # If encoder not explicitly defined, the encoder will have the same - # structure as the decoder - layer_types = layer_types.split(SEP_ENCODEC) - if len(layer_types) == 1: - layer_types *= 2 - - # Some models don't need the encoder (ex: language modeling) - # TODO(epot): What are the other conditions (has_input ?) - if hparams.prepend_mode != "none": - layer_types[0] = "" - - # Extend the blocks and fill them with the default values if not specified - final_layers = ([], []) - for i, blocks_str in enumerate(layer_types): - for blocks_str in blocks_str.split(SEP_LAYER): - if not blocks_str: - continue - blocks_list = blocks_str.split(SEP_FF) - # Eventually use the fallback values for the layer_types. If the - # encoder is empty, do not use the enco-deco attention. - self_att = blocks_list[0] or hparams.default_att - ende_att = hparams.default_att if layer_types[0] else "_" - ff = hparams.default_ff - if len(blocks_list) > 1: - ff = blocks_list[-1] - if len(blocks_list) == 3: - ende_att = blocks_list[1] - if i == 0: # Encoder - blocks_tuple = (self_att, ff) - elif i == 1: # Decoder - blocks_tuple = (self_att, ende_att, ff) - final_layers[i].append(blocks_tuple) - - return final_layers - - # ========= Construct the transformer encoder and decoder ========= - - encoder_layers, decoder_layers = extract_layer_types(hparams.layer_types) + encoder_layers, decoder_layers = self._extract_layer_types() layers = common_attention.get_standardized_layers( hparams=hparams, @@ -179,6 +122,8 @@ def print_layer(name, layers): print_layer("Encoder", encoder_layers) print_layer("Decoder", decoder_layers) + # ========= Construct the transformer encoder and decoder ========= + encoder_outputs = [] x = encoder_input @@ -236,6 +181,91 @@ def print_layer(name, layers): decoder_output = dp(tf.expand_dims, x, 2) return decoder_output, cache["extra_loss"] + @expert_utils.add_name_scope() + def _prepare_encoder(self, inputs, target_space): + """Process the transformer encoder inputs.""" + inputs = common_layers.flatten4d3d(inputs) + + output = transformer.transformer_prepare_encoder( + inputs, + target_space, + self._hparams, + features=None, + ) + enco_input, enco_self_att_bias, enco_deco_att_bias = output + + enco_input = tf.nn.dropout( + enco_input, 1.0 - self._hparams.layer_prepostprocess_dropout) + + return enco_input, enco_self_att_bias, enco_deco_att_bias + + @expert_utils.add_name_scope() + def _prepare_decoder(self, targets): + """Process the transformer decoder input.""" + targets = common_layers.flatten4d3d(targets) + + output = transformer.transformer_prepare_decoder( + targets, self._hparams, features=None, + ) + deco_input, deco_self_attention_bias = output + + deco_input = tf.nn.dropout( + deco_input, 1.0 - self._hparams.layer_prepostprocess_dropout + ) + return deco_input, deco_self_attention_bias + + def _extract_layer_types(self): + """Parse the layer string. 
+ + Returns: + list[tuple[str, str]]: Encoder layers: list of (attention, feed-forward) + list[tuple[str, str, str]]: Decoder layers: list of (self-attention, + enc-dec attention, feed-forward) + """ + hparams = self._hparams + layer_types = hparams.layer_types + + # If the architecture has not explicitly been set, we just construct a + # standard transformer with the fallback values + if not layer_types: + layer_types = SEP_LAYER.join( + [hparams.default_att] * hparams.num_hidden_layers) + + # If encoder not explicitly defined, the encoder will have the same + # structure as the decoder + layer_types = layer_types.split(SEP_ENCODEC) + if len(layer_types) == 1: + layer_types *= 2 + + # Some models don't need the encoder (ex: language modeling) + # TODO(epot): What are the other conditions (has_input ?) + if hparams.prepend_mode != "none": + layer_types[0] = "" + + # Extend the blocks and fill them with the default values if not specified + final_layers = ([], []) + for i, blocks_str in enumerate(layer_types): + for blocks_str in blocks_str.split(SEP_LAYER): + if not blocks_str: + continue + blocks_list = blocks_str.split(SEP_FF) + # Eventually use the fallback values for the layer_types. If the + # encoder is empty, do not use the enco-deco attention. + self_att = blocks_list[0] or hparams.default_att + ende_att = hparams.default_att if layer_types[0] else "_" + ff = hparams.default_ff + if len(blocks_list) > 1: + ff = blocks_list[-1] + if len(blocks_list) == 3: + ende_att = blocks_list[1] + if i == 0: # Encoder + blocks_tuple = (self_att, ff) + elif i == 1: # Decoder + blocks_tuple = (self_att, ende_att, ff) + final_layers[i].append(blocks_tuple) + + return final_layers + @registry.register_hparams def transformer_moe_base(): @@ -252,7 +282,7 @@ def transformer_moe_base(): hparams.optimizer_adam_epsilon = 1e-9 hparams.learning_rate_decay_scheme = "noam" hparams.learning_rate = 0.1 - hparams.learning_rate_warmup_steps = 4000 + hparams.learning_rate_warmup_steps = 2000 hparams.initializer_gain = 1.0 hparams.num_hidden_layers = 5 hparams.initializer = "uniform_unit_scaling" @@ -266,8 +296,8 @@ def transformer_moe_base(): hparams.layer_preprocess_sequence = "n" hparams.layer_postprocess_sequence = "da" + # Hparams used by transformer_prepare_decoder() function hparams.add_hparam("pos", "timing") # timing, none - hparams.add_hparam("nbr_decoder_problems", 1) hparams.add_hparam("proximity_bias", False) hparams = common_attention.add_standard_attention_hparams(hparams) @@ -384,7 +414,7 @@ def transformer_moe_prepend_8k(): hparams = transformer_moe_8k() hparams.prepend_mode = "prepend_inputs_masked_attention" hparams.eval_drop_long_sequences = False - hparams.max_input_seq_length = 7500, + hparams.max_input_seq_length = 7500 hparams.default_ff = "sepm" hparams.layer_types = "locm/redm/locm-moe/redm/locm" hparams.moe_num_experts = 256 From e791b2719b62bf779ca8e3a954a89a50db6c3d58 Mon Sep 17 00:00:00 2001 From: Aurko Roy Date: Thu, 12 Apr 2018 19:39:15 -0700 Subject: [PATCH 23/29] Add the option of not using the decoder to generate the targets PiperOrigin-RevId: 192710402 --- tensor2tensor/layers/discretization.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tensor2tensor/layers/discretization.py b/tensor2tensor/layers/discretization.py index 75519f5a5..78577db89 100644 --- a/tensor2tensor/layers/discretization.py +++ b/tensor2tensor/layers/discretization.py @@ -108,7 +108,6 @@ def nearest_neighbor(x, nearest_hot = tf.exp(-inv_temp * dist) * c_probs nearest_hot /= tf.reduce_sum(nearest_hot, 2, 
keepdims=True) else: - dist = tf.Print(dist, [dist], message="dist=") if random_top_k > 1: _, top_k_idx = tf.nn.top_k(-dist, k=random_top_k) nearest_idx = tf.gather( From cce8d07f88379d053f0067726ec02f4637ebf493 Mon Sep 17 00:00:00 2001 From: T2T Team Date: Fri, 13 Apr 2018 05:57:05 -0700 Subject: [PATCH 24/29] LibrispeechTrainFullTestClean fix PiperOrigin-RevId: 192756763 --- tensor2tensor/data_generators/librispeech.py | 33 ++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/tensor2tensor/data_generators/librispeech.py b/tensor2tensor/data_generators/librispeech.py index 81c532286..978672ad2 100644 --- a/tensor2tensor/data_generators/librispeech.py +++ b/tensor2tensor/data_generators/librispeech.py @@ -21,9 +21,11 @@ # Dependency imports from tensor2tensor.data_generators import generator_utils +from tensor2tensor.data_generators import problem from tensor2tensor.data_generators import speech_recognition from tensor2tensor.utils import registry +import tensorflow as tf _LIBRISPEECH_TRAIN_DATASETS = [ [ @@ -195,6 +197,37 @@ def test_filepaths(self, data_dir, num_shards, shuffled): def generate_data(self, data_dir, tmp_dir, task_id=-1): raise Exception("Generate librispeech and librispeech_clean data.") + def filepattern(self, data_dir, mode, shard=None): + """Get filepattern for data files for mode. + + Matches mode to a suffix. + * DatasetSplit.TRAIN: train + * DatasetSplit.EVAL: dev + * DatasetSplit.TEST: test + * tf.estimator.ModeKeys.PREDICT: dev + + Args: + data_dir: str, data directory. + mode: DatasetSplit + shard: int, if provided, will only read data from the specified shard. + + Returns: + filepattern str + """ + shard_str = "-%05d" % shard if shard is not None else "" + if mode == problem.DatasetSplit.TRAIN: + path = os.path.join(data_dir, "librispeech") + suffix = "train" + elif mode in [problem.DatasetSplit.EVAL, tf.estimator.ModeKeys.PREDICT]: + path = os.path.join(data_dir, "librispeech_clean") + suffix = "dev" + else: + assert mode == problem.DatasetSplit.TEST + path = os.path.join(data_dir, "librispeech_clean") + suffix = "test" + + return "%s-%s%s*" % (path, suffix, shard_str) + @registry.register_problem() class LibrispeechCleanSmall(Librispeech): From 6c629ea6612c10bb25d093c359bd8f1801548069 Mon Sep 17 00:00:00 2001 From: Ryan Sepassi Date: Fri, 13 Apr 2018 11:07:59 -0700 Subject: [PATCH 25/29] Support LM serving in general and for Transformer. 
PiperOrigin-RevId: 192794156 --- tensor2tensor/layers/common_layers.py | 11 ++++++ tensor2tensor/models/transformer.py | 46 +++++++++++++++++--------- tensor2tensor/serving/query.py | 37 +++++++++++++-------- tensor2tensor/serving/serving_utils.py | 15 ++++++--- tensor2tensor/utils/beam_search.py | 4 +-- tensor2tensor/utils/t2t_model.py | 8 +++-- 6 files changed, 84 insertions(+), 37 deletions(-) diff --git a/tensor2tensor/layers/common_layers.py b/tensor2tensor/layers/common_layers.py index 0248acb7d..fc6a84f93 100644 --- a/tensor2tensor/layers/common_layers.py +++ b/tensor2tensor/layers/common_layers.py @@ -236,6 +236,17 @@ def convert_rgb_to_real(x): return x +def expand_squeeze_to_nd(x, n, squeeze_dim=2, expand_dim=-1): + """Make x n-d with squeeze and expand_dims.""" + if len(x.shape) > n: + while len(x.shape) != n: + x = tf.squeeze(x, [squeeze_dim]) + else: + while len(x.shape) != n: + x = tf.expand_dims(x, expand_dim) + return x + + def flatten4d3d(x): """Flatten a 4d-tensor into a 3d-tensor by joining width and height.""" xshape = shape_list(x) diff --git a/tensor2tensor/models/transformer.py b/tensor2tensor/models/transformer.py index bd736f7bb..8497913ee 100644 --- a/tensor2tensor/models/transformer.py +++ b/tensor2tensor/models/transformer.py @@ -301,17 +301,21 @@ def _fast_decode(self, partial_targets = None else: # The problem has no inputs. - # In this case, features["inputs"] contains partial targets. - # We force the outputs to begin with these sequences. encoder_output = None encoder_decoder_attention_bias = None - if len(features["inputs"].shape) >= 4: - partial_targets = tf.squeeze(tf.to_int64(features["inputs"]), [2, 3]) - else: - partial_targets = tf.squeeze(tf.to_int64(features["inputs"]), [2]) - partial_targets_length = common_layers.shape_list(partial_targets)[1] + + # Prepare partial targets. + # In either features["inputs"] or features["targets"]. + # We force the outputs to begin with these sequences. 
+ partial_targets = features.get("inputs") + if partial_targets is None: + partial_targets = features["targets"] + partial_targets = common_layers.expand_squeeze_to_nd(partial_targets, 2) + partial_targets = tf.to_int64(partial_targets) + partial_targets_shape = common_layers.shape_list(partial_targets) + partial_targets_length = partial_targets_shape[1] decode_length += partial_targets_length - batch_size = tf.shape(partial_targets)[0] + batch_size = partial_targets_shape[0] if hparams.pos == "timing": timing_signal = common_attention.get_timing_signal_1d( @@ -397,7 +401,7 @@ def forced_logits(): alpha=alpha, batch_size=batch_size) if partial_targets is not None: - if beam_size <= 1: + if beam_size <= 1 or top_beams <= 1: ret["outputs"] = ret["outputs"][:, partial_targets_length:] else: ret["outputs"] = ret["outputs"][:, :, partial_targets_length:] @@ -482,20 +486,28 @@ def fast_decode(encoder_output, if top_beams == 1: decoded_ids = decoded_ids[:, 0, 1:] + scores = scores[:, 0] else: decoded_ids = decoded_ids[:, :top_beams, 1:] + scores = scores[:, :top_beams] else: # Greedy - def inner_loop(i, finished, next_id, decoded_ids, cache): + def inner_loop(i, finished, next_id, decoded_ids, cache, log_prob): """One step of greedy decoding.""" logits, cache = symbols_to_logits_fn(next_id, i, cache) + log_probs = beam_search.log_prob_from_logits(logits) temperature = (0.0 if hparams.sampling_method == "argmax" else hparams.sampling_temp) next_id = common_layers.sample_with_temperature(logits, temperature) finished |= tf.equal(next_id, eos_id) + + log_prob_indices = tf.stack( + [tf.range(tf.to_int64(batch_size)), next_id], axis=1) + log_prob += tf.gather_nd(log_probs, log_prob_indices) + next_id = tf.expand_dims(next_id, axis=1) decoded_ids = tf.concat([decoded_ids, next_id], axis=1) - return i + 1, finished, next_id, decoded_ids, cache + return i + 1, finished, next_id, decoded_ids, cache, log_prob def is_not_finished(i, finished, *_): return (i < decode_length) & tf.logical_not(tf.reduce_all(finished)) @@ -503,18 +515,22 @@ def is_not_finished(i, finished, *_): decoded_ids = tf.zeros([batch_size, 0], dtype=tf.int64) finished = tf.fill([batch_size], False) next_id = tf.zeros([batch_size, 1], dtype=tf.int64) - _, _, _, decoded_ids, _ = tf.while_loop( + initial_log_prob = tf.zeros([batch_size], dtype=tf.float32) + _, _, _, decoded_ids, _, log_prob = tf.while_loop( is_not_finished, - inner_loop, - [tf.constant(0), finished, next_id, decoded_ids, cache], + inner_loop, [ + tf.constant(0), finished, next_id, decoded_ids, cache, + initial_log_prob + ], shape_invariants=[ tf.TensorShape([]), tf.TensorShape([None]), tf.TensorShape([None, None]), tf.TensorShape([None, None]), nest.map_structure(beam_search.get_state_shape_invariants, cache), + tf.TensorShape([None]), ]) - scores = None + scores = log_prob return {"outputs": decoded_ids, "scores": scores} diff --git a/tensor2tensor/serving/query.py b/tensor2tensor/serving/query.py index ea0721faf..9d8eed092 100644 --- a/tensor2tensor/serving/query.py +++ b/tensor2tensor/serving/query.py @@ -32,12 +32,6 @@ flags = tf.flags FLAGS = flags.FLAGS -flags.DEFINE_string("cloud_mlengine_model_name", None, - "Name of model deployed on Cloud ML Engine.") -flags.DEFINE_string( - "cloud_mlengine_model_version", None, - "Version of the model to use. 
If None, requests will be " - "sent to the default version.") flags.DEFINE_string("server", None, "Address to Tensorflow Serving server.") flags.DEFINE_string("servable_name", None, "Name of served model.") flags.DEFINE_string("problem", None, "Problem name.") @@ -46,6 +40,14 @@ flags.DEFINE_string("inputs_once", None, "Query once with this input.") flags.DEFINE_integer("timeout_secs", 10, "Timeout for query.") +# For Cloud ML Engine predictions. +flags.DEFINE_string("cloud_mlengine_model_name", None, + "Name of model deployed on Cloud ML Engine.") +flags.DEFINE_string( + "cloud_mlengine_model_version", None, + "Version of the model to use. If None, requests will be " + "sent to the default version.") + def validate_flags(): """Validates flags are set to acceptable values.""" @@ -57,24 +59,31 @@ def validate_flags(): assert FLAGS.servable_name -def main(_): - tf.logging.set_verbosity(tf.logging.INFO) - validate_flags() - usr_dir.import_usr_dir(FLAGS.t2t_usr_dir) - problem = registry.problem(FLAGS.problem) - hparams = tf.contrib.training.HParams( - data_dir=os.path.expanduser(FLAGS.data_dir)) - problem.get_hparams(hparams) +def make_request_fn(): + """Returns a request function.""" if FLAGS.cloud_mlengine_model_name: request_fn = serving_utils.make_cloud_mlengine_request_fn( credentials=GoogleCredentials.get_application_default(), model_name=FLAGS.cloud_mlengine_model_name, version=FLAGS.cloud_mlengine_model_version) else: + request_fn = serving_utils.make_grpc_request_fn( servable_name=FLAGS.servable_name, server=FLAGS.server, timeout_secs=FLAGS.timeout_secs) + return request_fn + + +def main(_): + tf.logging.set_verbosity(tf.logging.INFO) + validate_flags() + usr_dir.import_usr_dir(FLAGS.t2t_usr_dir) + problem = registry.problem(FLAGS.problem) + hparams = tf.contrib.training.HParams( + data_dir=os.path.expanduser(FLAGS.data_dir)) + problem.get_hparams(hparams) + request_fn = make_request_fn() while True: inputs = FLAGS.inputs_once if FLAGS.inputs_once else input(">> ") outputs = serving_utils.predict([inputs], problem, request_fn) diff --git a/tensor2tensor/serving/serving_utils.py b/tensor2tensor/serving/serving_utils.py index 805521cbc..e22ddfb2c 100644 --- a/tensor2tensor/serving/serving_utils.py +++ b/tensor2tensor/serving/serving_utils.py @@ -33,6 +33,7 @@ from tensorflow_serving.apis import prediction_service_pb2 + def _make_example(input_ids, feature_name="inputs"): features = { feature_name: @@ -48,9 +49,10 @@ def _create_stub(server): return prediction_service_pb2.beta_create_PredictionService_stub(channel) -def _encode(inputs, encoder): +def _encode(inputs, encoder, add_eos=True): input_ids = encoder.encode(inputs) - input_ids.append(text_encoder.EOS_ID) + if add_eos: + input_ids.append(text_encoder.EOS_ID) return input_ids @@ -58,12 +60,14 @@ def _decode(output_ids, output_decoder): return output_decoder.decode(output_ids) + + def make_grpc_request_fn(servable_name, server, timeout_secs): """Wraps function to make grpc requests with runtime args.""" + stub = _create_stub(server) def _make_grpc_request(examples): """Builds and sends request to TensorFlow model server.""" - stub = _create_stub(server) request = predict_pb2.PredictRequest() request.model_spec.name = servable_name request.inputs["input"].CopyFrom( @@ -107,7 +111,10 @@ def predict(inputs_list, problem, request_fn): assert isinstance(inputs_list, list) fname = "inputs" if problem.has_inputs else "targets" input_encoder = problem.feature_info[fname].encoder - input_ids_list = [_encode(inputs, input_encoder) for inputs 
in inputs_list] + input_ids_list = [ + _encode(inputs, input_encoder, add_eos=problem.has_inputs) + for inputs in inputs_list + ] examples = [_make_example(input_ids, fname) for input_ids in input_ids_list] predictions = request_fn(examples) output_decoder = problem.feature_info["targets"].encoder diff --git a/tensor2tensor/utils/beam_search.py b/tensor2tensor/utils/beam_search.py index f4364550c..c65a1abd3 100644 --- a/tensor2tensor/utils/beam_search.py +++ b/tensor2tensor/utils/beam_search.py @@ -89,8 +89,8 @@ def get_state_shape_invariants(tensor): return tf.TensorShape(shape) -def log_prob_from_logits(logits): - return logits - tf.reduce_logsumexp(logits, axis=2, keep_dims=True) +def log_prob_from_logits(logits, reduce_axis=-1): + return logits - tf.reduce_logsumexp(logits, axis=reduce_axis, keep_dims=True) def compute_batch_indices(batch_size, beam_size): diff --git a/tensor2tensor/utils/t2t_model.py b/tensor2tensor/utils/t2t_model.py index 6efc393e8..29f60cead 100644 --- a/tensor2tensor/utils/t2t_model.py +++ b/tensor2tensor/utils/t2t_model.py @@ -1062,13 +1062,17 @@ def estimator_spec_predict(self, features): outputs = infer_out scores = None + inputs = features.get("inputs") + if inputs is None: + inputs = features["targets"] + batched_problem_choice = ( features["problem_choice"] * tf.ones( - (common_layers.shape_list(features["inputs"])[0],), dtype=tf.int32)) + (common_layers.shape_list(inputs)[0],), dtype=tf.int32)) predictions = { "outputs": outputs, "scores": scores, - "inputs": features.get("inputs"), + "inputs": inputs, "targets": features.get("infer_targets"), "problem_choice": batched_problem_choice, "batch_prediction_key": features.get("batch_prediction_key"), From 7058edac0ea691cf4ef06e5963ca7f81ef4ececf Mon Sep 17 00:00:00 2001 From: Lukasz Kaiser Date: Fri, 13 Apr 2018 11:30:50 -0700 Subject: [PATCH 26/29] VideoProblem API with dynamic frame composition, video modality and making gym problem and one model work. 
PiperOrigin-RevId: 192798025 --- tensor2tensor/data_generators/all_problems.py | 71 +++--- tensor2tensor/data_generators/gym.py | 150 ++++++++++- tensor2tensor/data_generators/image_utils.py | 4 +- tensor2tensor/data_generators/problem.py | 4 +- tensor2tensor/data_generators/video_utils.py | 240 +++++++++++++++++- tensor2tensor/layers/common_layers.py | 13 + tensor2tensor/layers/modalities.py | 85 +++++++ .../models/research/basic_conv_gen.py | 50 ++-- tensor2tensor/utils/metrics.py | 24 +- tensor2tensor/utils/registry.py | 16 +- 10 files changed, 576 insertions(+), 81 deletions(-) diff --git a/tensor2tensor/data_generators/all_problems.py b/tensor2tensor/data_generators/all_problems.py index 495f85fc8..85d4c4abd 100644 --- a/tensor2tensor/data_generators/all_problems.py +++ b/tensor2tensor/data_generators/all_problems.py @@ -20,45 +20,46 @@ import importlib + modules = [ - 'tensor2tensor.data_generators.algorithmic', - 'tensor2tensor.data_generators.algorithmic_math', - 'tensor2tensor.data_generators.audio', - 'tensor2tensor.data_generators.celeba', - 'tensor2tensor.data_generators.cifar', - 'tensor2tensor.data_generators.cipher', - 'tensor2tensor.data_generators.cnn_dailymail', - 'tensor2tensor.data_generators.desc2code', - 'tensor2tensor.data_generators.fsns', - 'tensor2tensor.data_generators.gene_expression', - 'tensor2tensor.data_generators.gym', - 'tensor2tensor.data_generators.ice_parsing', - 'tensor2tensor.data_generators.imagenet', - 'tensor2tensor.data_generators.imdb', - 'tensor2tensor.data_generators.librispeech', - 'tensor2tensor.data_generators.lm1b', - 'tensor2tensor.data_generators.mnist', - 'tensor2tensor.data_generators.mscoco', - 'tensor2tensor.data_generators.multinli', - 'tensor2tensor.data_generators.ocr', - 'tensor2tensor.data_generators.problem_hparams', - 'tensor2tensor.data_generators.ptb', - 'tensor2tensor.data_generators.snli', - 'tensor2tensor.data_generators.squad', - 'tensor2tensor.data_generators.translate_encs', - 'tensor2tensor.data_generators.translate_ende', - 'tensor2tensor.data_generators.translate_enfr', - 'tensor2tensor.data_generators.translate_enmk', - 'tensor2tensor.data_generators.translate_envi', - 'tensor2tensor.data_generators.translate_enzh', - 'tensor2tensor.data_generators.twentybn', - 'tensor2tensor.data_generators.wiki', - 'tensor2tensor.data_generators.wsj_parsing', + "tensor2tensor.data_generators.algorithmic", + "tensor2tensor.data_generators.algorithmic_math", + "tensor2tensor.data_generators.audio", + "tensor2tensor.data_generators.celeba", + "tensor2tensor.data_generators.cifar", + "tensor2tensor.data_generators.cipher", + "tensor2tensor.data_generators.cnn_dailymail", + "tensor2tensor.data_generators.desc2code", + "tensor2tensor.data_generators.fsns", + "tensor2tensor.data_generators.gene_expression", + "tensor2tensor.data_generators.gym", + "tensor2tensor.data_generators.ice_parsing", + "tensor2tensor.data_generators.imagenet", + "tensor2tensor.data_generators.imdb", + "tensor2tensor.data_generators.librispeech", + "tensor2tensor.data_generators.lm1b", + "tensor2tensor.data_generators.mnist", + "tensor2tensor.data_generators.mscoco", + "tensor2tensor.data_generators.multinli", + "tensor2tensor.data_generators.ocr", + "tensor2tensor.data_generators.problem_hparams", + "tensor2tensor.data_generators.ptb", + "tensor2tensor.data_generators.snli", + "tensor2tensor.data_generators.squad", + "tensor2tensor.data_generators.translate_encs", + "tensor2tensor.data_generators.translate_ende", + "tensor2tensor.data_generators.translate_enfr", 
+ "tensor2tensor.data_generators.translate_enmk", + "tensor2tensor.data_generators.translate_envi", + "tensor2tensor.data_generators.translate_enzh", + "tensor2tensor.data_generators.twentybn", + "tensor2tensor.data_generators.wiki", + "tensor2tensor.data_generators.wsj_parsing", ] for module in modules: try: importlib.import_module(module) - except ImportError: - pass + except ImportError as error: + print("Did not import module: %s; Cause: %s" % (module, str(error))) diff --git a/tensor2tensor/data_generators/gym.py b/tensor2tensor/data_generators/gym.py index 6a9756cba..7d346f7ae 100644 --- a/tensor2tensor/data_generators/gym.py +++ b/tensor2tensor/data_generators/gym.py @@ -28,16 +28,17 @@ from tensor2tensor.data_generators import generator_utils from tensor2tensor.data_generators import problem +from tensor2tensor.data_generators import video_utils + from tensor2tensor.models.research import rl from tensor2tensor.rl import collect from tensor2tensor.rl.envs import tf_atari_wrappers as atari from tensor2tensor.rl.envs.utils import batch_env_factory + from tensor2tensor.utils import registry import tensorflow as tf -from tensorflow.contrib.training import HParams - flags = tf.flags FLAGS = flags.FLAGS @@ -45,6 +46,134 @@ flags.DEFINE_string("agent_policy_path", "", "File with model for pong") +class GymDiscreteProblem(video_utils.VideoProblem): + """Gym environment with discrete actions and rewards.""" + + def __init__(self, *args, **kwargs): + super(GymDiscreteProblem, self).__init__(*args, **kwargs) + self._env = None + + @property + def num_input_frames(self): + """Number of frames to batch on one input.""" + return 2 + + @property + def num_target_frames(self): + """Number of frames to batch on one target.""" + return 1 + + @property + def extra_reading_spec(self): + """Additional data fields to store on disk and their decoders.""" + data_fields = { + "action": tf.FixedLenFeature([1], tf.int64), + "reward": tf.FixedLenFeature([1], tf.int64) + } + decoders = { + "action": tf.contrib.slim.tfexample_decoder.Tensor(tensor_key="action"), + "reward": tf.contrib.slim.tfexample_decoder.Tensor(tensor_key="reward"), + } + return data_fields, decoders + + @property + def is_generate_per_split(self): + """Whether we have a train/test split or just hold out data.""" + return False # Just hold out some generated data for evals. 
+ + @property + def env_name(self): + """This is the name of the Gym environment for this problem.""" + raise NotImplementedError() + + @property + def env(self): + if self._env is None: + self._env = gym.make(self.env_name) + return self._env + + @property + def num_actions(self): + raise NotImplementedError() + + @property + def num_rewards(self): + raise NotImplementedError() + + @property + def num_steps(self): + raise NotImplementedError() + + @property + def min_reward(self): + raise NotImplementedError() + + def get_action(self, observation=None): + return self.env.action_space.sample() + + def hparams(self, defaults, unused_model_hparams): + p = defaults + p.input_modality = {"inputs": ("video", 256), + "input_reward": ("symbol", self.num_rewards), + "input_action": ("symbol", self.num_actions)} + p.target_modality = ("video", 256) + p.input_space_id = problem.SpaceID.IMAGE + p.target_space_id = problem.SpaceID.IMAGE + + def generate_samples(self, data_dir, tmp_dir, unused_dataset_split): + self.env.reset() + action = self.get_action() + for _ in range(self.num_steps): + observation, reward, done, _ = self.env.step(action) + action = self.get_action(observation) + yield {"frame": observation, + "action": [action], + "done": [done], + "reward": [int(reward - self.min_reward)]} + + +@registry.register_problem +class GymPongRandom5k(GymDiscreteProblem): + """Pong game, random actions.""" + + @property + def env_name(self): + return "PongDeterministic-v4" + + @property + def frame_height(self): + return 210 + + @property + def frame_width(self): + return 160 + + @property + def num_actions(self): + return 4 + + @property + def min_reward(self): + return -1 + + @property + def num_rewards(self): + return 3 + + @property + def num_steps(self): + return 5000 + + +@registry.register_problem +class GymPongRandom50k(GymPongRandom5k): + """Pong game, random actions.""" + + @property + def num_steps(self): + return 50000 + + def moviepy_editor(): """Access to moviepy that fails gracefully without a moviepy install.""" try: @@ -55,11 +184,11 @@ def moviepy_editor(): @registry.register_problem -class GymDiscreteProblem(problem.Problem): +class GymDiscreteProblemWithAgent(problem.Problem): """Gym environment with discrete actions and rewards.""" def __init__(self, *args, **kwargs): - super(GymDiscreteProblem, self).__init__(*args, **kwargs) + super(GymDiscreteProblemWithAgent, self).__init__(*args, **kwargs) self.num_channels = 3 self.history_size = 2 @@ -68,7 +197,7 @@ def __init__(self, *args, **kwargs): self.in_graph_wrappers = [(atari.MaxAndSkipWrapper, {"skip": 4})] self.collect_hparams = rl.atari_base() self.num_steps = 1000 - self.movies = True + self.movies = False self.movies_fps = 24 self.simulated_environment = None self.warm_up = 70 @@ -76,8 +205,9 @@ def __init__(self, *args, **kwargs): def _setup(self): in_graph_wrappers = [(atari.ShiftRewardWrapper, {"add_value": 2}), (atari.MemoryWrapper, {})] + self.in_graph_wrappers - env_hparams = HParams(in_graph_wrappers=in_graph_wrappers, - simulated_environment=self.simulated_environment) + env_hparams = tf.contrib.training.HParams( + in_graph_wrappers=in_graph_wrappers, + simulated_environment=self.simulated_environment) generator_batch_env = batch_env_factory( self.environment_spec, env_hparams, num_agents=1, xvfb=False) @@ -234,11 +364,11 @@ def generate_data(self, data_dir, tmp_dir, task_id=-1): @registry.register_problem -class GymSimulatedDiscreteProblem(GymDiscreteProblem): +class 
GymSimulatedDiscreteProblemWithAgent(GymDiscreteProblemWithAgent): """Simulated gym environment with discrete actions and rewards.""" def __init__(self, *args, **kwargs): - super(GymSimulatedDiscreteProblem, self).__init__(*args, **kwargs) + super(GymSimulatedDiscreteProblemWithAgent, self).__init__(*args, **kwargs) # TODO(lukaszkaiser): pull it outside self.in_graph_wrappers = [(atari.TimeLimitWrapper, {"timelimit": 150}), (atari.MaxAndSkipWrapper, {"skip": 4})] @@ -246,7 +376,7 @@ def __init__(self, *args, **kwargs): self.movies_fps = 2 def restore_networks(self, sess): - super(GymSimulatedDiscreteProblem, self).restore_networks(sess) + super(GymSimulatedDiscreteProblemWithAgent, self).restore_networks(sess) # TODO(lukaszkaiser): adjust regexp for different models env_model_loader = tf.train.Saver(tf.global_variables(".*basic_conv_gen.*")) diff --git a/tensor2tensor/data_generators/image_utils.py b/tensor2tensor/data_generators/image_utils.py index bb33109c7..d65cfa4ba 100644 --- a/tensor2tensor/data_generators/image_utils.py +++ b/tensor2tensor/data_generators/image_utils.py @@ -158,7 +158,7 @@ def generate_data(self, data_dir, tmp_dir, task_id=-1): self.dev_filepaths(data_dir, self.dev_shards, shuffled=False)) -def _encoded_images(images): +def encode_images_as_png(images): if context.in_eager_mode(): for image in images: yield tf.image.encode_png(image).numpy() @@ -195,7 +195,7 @@ def image_generator(images, labels): if not images: raise ValueError("Must provide some images for the generator.") width, height, _ = images[0].shape - for (enc_image, label) in zip(_encoded_images(images), labels): + for (enc_image, label) in zip(encode_images_as_png(images), labels): yield { "image/encoded": [enc_image], "image/format": ["png"], diff --git a/tensor2tensor/data_generators/problem.py b/tensor2tensor/data_generators/problem.py index a4e56c0ed..43ef66a4d 100644 --- a/tensor2tensor/data_generators/problem.py +++ b/tensor2tensor/data_generators/problem.py @@ -781,9 +781,7 @@ def define_shapes(example): batch_size_means_tokens = False else: tf.logging.warning( - "Shapes are not fully defined. Assuming batch_size means tokens. " - "Override batch_size_means_tokens() " - "in your problem subclass if this is undesired behavior.") + "Shapes are not fully defined. 
Assuming batch_size means tokens.") batch_size_means_tokens = True # Batching diff --git a/tensor2tensor/data_generators/video_utils.py b/tensor2tensor/data_generators/video_utils.py index 136673d8a..7aabacdc3 100644 --- a/tensor2tensor/data_generators/video_utils.py +++ b/tensor2tensor/data_generators/video_utils.py @@ -22,6 +22,7 @@ # Dependency imports from tensor2tensor.data_generators import generator_utils +from tensor2tensor.data_generators import image_utils from tensor2tensor.data_generators import problem from tensor2tensor.data_generators import text_encoder from tensor2tensor.utils import metrics @@ -42,6 +43,242 @@ def resize_video_frames(images, size): class VideoProblem(problem.Problem): """Base class for problems with videos.""" + @property + def num_channels(self): + """Number of color channels in each frame.""" + return 3 + + @property + def frame_height(self): + """Height of each frame.""" + raise NotImplementedError + + @property + def frame_width(self): + """Width of each frame.""" + raise NotImplementedError + + @property + def num_input_frames(self): + """Number of frames to batch on one input.""" + return 1 + + @property + def num_target_frames(self): + """Number of frames to batch on one target.""" + return 1 + + @property + def extra_reading_spec(self): + """Additional data fields to store on disk and their decoders.""" + return {}, {} + + @property + def dataset_splits(self): + """Splits of data to produce and number of output shards for each.""" + return [{ + "split": problem.DatasetSplit.TRAIN, + "shards": 10, + }, { + "split": problem.DatasetSplit.EVAL, + "shards": 1, + }] + + def preprocess_example(self, example, mode, hparams): + """Runtime preprocessing, e.g., resize example["frame"].""" + return example + + @property + def is_generate_per_split(self): + """A single call to `generate_samples` generates for all `dataset_splits`. + + Set to True if you already have distinct subsets of data for each dataset + split specified in `self.dataset_splits`. `self.generate_samples` will be + called once for each split. + + Set to False if you have a unified dataset that you'd like to have split out + into training and evaluation data automatically. `self.generate_samples` + will be called only once and the data will be sharded across the dataset + splits specified in `self.dataset_splits`. + + Returns: + bool + """ + raise NotImplementedError() + + def example_reading_spec(self, label_repr=None): + extra_data_fields, extra_data_items_to_decoders = self.extra_reading_spec + + data_fields = { + "image/encoded": tf.FixedLenFeature((), tf.string), + "image/format": tf.FixedLenFeature((), tf.string), + } + data_fields.update(extra_data_fields) + + data_items_to_decoders = { + "frame": + tf.contrib.slim.tfexample_decoder.Image( + image_key="image/encoded", + format_key="image/format", + shape=[self.frame_height, self.frame_width, self.num_channels], + channels=self.num_channels), + } + data_items_to_decoders.update(extra_data_items_to_decoders) + + return data_fields, data_items_to_decoders + + def preprocess(self, dataset, mode, hparams): + def split_on_batch(x): + """Split x on batch dimension into x[:size, ...] 
and x[size:, ...].""" + length = len(x.get_shape()) + size = self.num_input_frames + if length < 1: + raise ValueError("Batched tensor of length < 1.") + if length == 1: + return x[:size], x[size:] + if length == 2: + return x[:size, :], x[size:, :] + if length == 3: + return x[:size, :, :], x[size:, :, :] + if length == 4: + return x[:size, :, :, :], x[size:, :, :, :] + # TODO(lukaszkaiser): use tf.split for the general case. + raise ValueError("Batch splitting on general dimensions not done yet.") + + def features_from_batch(batched_prefeatures): + """Construct final features from the batched inputs. + + This function gets prefeatures. + + Args: + batched_prefeatures: single-frame features (from disk) as batch tensors. + + Returns: + Features dictionary with joint features per-frame. + """ + features = {} + for k, v in batched_prefeatures.iteritems(): + if k == "frame": # We rename past frames to inputs and targets. + s1, s2 = split_on_batch(v) + # Reshape just to make sure shapes are right and set. + s1 = tf.reshape(s1, [self.num_input_frames, self.frame_height, + self.frame_width, self.num_channels]) + s2 = tf.reshape(s2, [self.num_target_frames, self.frame_height, + self.frame_width, self.num_channels]) + features["inputs"] = s1 + features["targets"] = s2 + else: + s1, s2 = split_on_batch(v) + features["input_%s" % k] = s1 + features["target_%s" % k] = s2 + return features + + # Batch and construct features. + def _preprocess(example): + return self.preprocess_example(example, mode, hparams) + preprocessed_dataset = dataset.map(_preprocess) + + num_frames = self.num_input_frames + self.num_target_frames + # TODO(lukaszkaiser): should jump by a random position at the beginning. + batch_dataset = preprocessed_dataset.apply( + tf.contrib.data.batch_and_drop_remainder(num_frames)) + dataset = batch_dataset.map(features_from_batch).shuffle(8) + return dataset + + def eval_metrics(self): + eval_metrics = [ + metrics.Metrics.ACC, metrics.Metrics.ACC_PER_SEQ, + metrics.Metrics.NEG_LOG_PERPLEXITY, metrics.Metrics.IMAGE_SUMMARY] + return eval_metrics + + def generate_samples(self, data_dir, tmp_dir, dataset_split): + """Generate samples of the frames with possible extra data. + + Args: + data_dir: final data directory. Typically only used in this method to copy + over user-supplied vocab files if there are extra fields needing them. + tmp_dir: temporary directory that you can use for downloading and scratch. + dataset_split: problem.DatasetSplit, which data split to generate samples + for (for example, training and evaluation). You can assume it's TRAIN + if self.is_generate_per_split is False. + + Yields: + Sample: dict; we assume that there is + a "frame" feature with an unencoded frame, which is a numpy array of shape + [frame_height, frame_width, num_channels] and which will be transcoded + into an image format by generate_encoded_samples. + """ + raise NotImplementedError() + + def generate_encoded_samples(self, data_dir, tmp_dir, dataset_split): + """Generate samples of the encoded frames with possible extra data. + + By default this function just encodes the numpy array returned as "frame" + from `self.generate_samples` into a PNG image. Override this function to + get other encodings on disk. + + Args: + data_dir: final data directory. Typically only used in this method to copy + over user-supplied vocab files if there are extra fields needing them. + tmp_dir: temporary directory that you can use for downloading and scratch.
+ dataset_split: problem.DatasetSplit, which data split to generate samples + for (for example, training and evaluation). + + Yields: + Sample: dict which is in disk encoding. + + Raises: + ValueError: if the frame has a different number of channels than required. + """ + for features in self.generate_samples(data_dir, tmp_dir, dataset_split): + unencoded_frame = features.pop("frame") + height, width, channels = unencoded_frame.shape + if channels != self.num_channels: + raise ValueError("Generated frame has %d channels while the class " + "assumes %d channels." % (channels, self.num_channels)) + if height != self.frame_height: + raise ValueError("Generated frame has height %d while the class " + "assumes height %d." % (height, self.frame_height)) + if width != self.frame_width: + raise ValueError("Generated frame has width %d while the class " + "assumes width %d." % (width, self.frame_width)) + encoded_frame = image_utils.encode_images_as_png([unencoded_frame]).next() + features["image/encoded"] = [encoded_frame] + features["image/format"] = ["png"] + features["image/height"] = [height] + features["image/width"] = [width] + yield features + + def generate_data(self, data_dir, tmp_dir, task_id=-1): + """The function generating the data.""" + filepath_fns = { + problem.DatasetSplit.TRAIN: self.training_filepaths, + problem.DatasetSplit.EVAL: self.dev_filepaths, + problem.DatasetSplit.TEST: self.test_filepaths, + } + + # We set shuffled=True as we don't want to shuffle on disk later. + split_paths = [(split["split"], filepath_fns[split["split"]]( + data_dir, split["shards"], shuffled=True)) + for split in self.dataset_splits] + all_paths = [] + for _, paths in split_paths: + all_paths.extend(paths) + + if self.is_generate_per_split: + for split, paths in split_paths: + generator_utils.generate_files( + self.generate_encoded_samples(data_dir, tmp_dir, split), paths) + else: + generator_utils.generate_files( + self.generate_encoded_samples( + data_dir, tmp_dir, problem.DatasetSplit.TRAIN), all_paths) + + +# TODO(lukaszkaiser): remove this version after everything is ported. 
+class VideoProblemOld(problem.Problem):
+  """Base class for problems with videos: previous version."""
+
   @property
   def num_channels(self):
     """Number of color channels."""
@@ -71,7 +308,7 @@ def eval_metrics(self):
     return eval_metrics
 
 
-class Video2ClassProblem(VideoProblem):
+class Video2ClassProblem(VideoProblemOld):
   """Base class for image classification problems."""
 
   @property
@@ -113,7 +350,6 @@ def example_reading_spec(self):
     data_fields, data_items_to_decoders = (
         super(Video2ClassProblem, self).example_reading_spec())
     data_fields[label_key] = tf.FixedLenFeature((1,), tf.int64)
-
     data_items_to_decoders[
         "targets"] = tf.contrib.slim.tfexample_decoder.Tensor(label_key)
     return data_fields, data_items_to_decoders
diff --git a/tensor2tensor/layers/common_layers.py b/tensor2tensor/layers/common_layers.py
index fc6a84f93..b8502dbf3 100644
--- a/tensor2tensor/layers/common_layers.py
+++ b/tensor2tensor/layers/common_layers.py
@@ -247,6 +247,19 @@ def expand_squeeze_to_nd(x, n, squeeze_dim=2, expand_dim=-1):
   return x
 
 
+def standardize_images(x):
+  """Image standardization on batches."""
+  with tf.name_scope("standardize_images", values=[x]):
+    x = tf.to_float(x)
+    x_mean = tf.reduce_mean(x, axis=[1, 2, 3], keep_dims=True)
+    x_variance = tf.reduce_mean(
+        tf.square(x - x_mean), axis=[1, 2, 3], keep_dims=True)
+    x_shape = shape_list(x)
+    num_pixels = tf.to_float(x_shape[1] * x_shape[2] * x_shape[3])
+    x = (x - x_mean) / tf.maximum(tf.sqrt(x_variance), tf.rsqrt(num_pixels))
+    return x
+
+
 def flatten4d3d(x):
   """Flatten a 4d-tensor into a 3d-tensor by joining width and height."""
   xshape = shape_list(x)
diff --git a/tensor2tensor/layers/modalities.py b/tensor2tensor/layers/modalities.py
index 050d5a79e..8bac3bd30 100644
--- a/tensor2tensor/layers/modalities.py
+++ b/tensor2tensor/layers/modalities.py
@@ -455,6 +455,90 @@ def xnet_resblock(x, filters, res_relu, name):
                                   "compress_block_final")
 
 
+@registry.register_video_modality("default")
+class VideoModality(modality.Modality):
+  """Modality for videos, i.e., time-sequences of frames."""
+  PIXEL_EMBEDDING_SIZE = 64
+
+  def bottom(self, inputs):
+    with tf.variable_scope(self.name):
+      inputs_shape = common_layers.shape_list(inputs)
+      if len(inputs_shape) != 5:
+        raise ValueError("Assuming videos given as tensors in the format "
+                         "[batch, time, height, width, channels].")
+      if not context.in_eager_mode():
+        tf.summary.image("inputs", tf.cast(inputs[:, -1, :, :, :], tf.uint8),
+                         max_outputs=1)
+      # Standardize frames.
+      inputs = tf.reshape(inputs, [-1] + inputs_shape[2:])
+      inputs = common_layers.standardize_images(inputs)
+      inputs = tf.reshape(inputs, inputs_shape)
+      # Merge the time dimension into channels so image models can be applied.
+      transposed = tf.transpose(inputs, [0, 2, 3, 1, 4])
+      return tf.reshape(
+          transposed,
+          [inputs_shape[0], inputs_shape[2], inputs_shape[3],
+           inputs_shape[1] * inputs_shape[4]])
+
+  def targets_bottom(self, inputs):
+    with tf.variable_scope(self.name):
+      inputs_shape = common_layers.shape_list(inputs)
+      if len(inputs_shape) != 5:
+        raise ValueError("Assuming videos given as tensors in the format "
+                         "[batch, time, height, width, channels].")
+      if not context.in_eager_mode():
+        tf.summary.image(
+            "targets_bottom", tf.cast(inputs[:, -1, :, :, :], tf.uint8),
+            max_outputs=1)
+      # We embed each of 256=self.top_dimensionality possible pixel values.
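+      # (The one-hot followed by matmul below is equivalent to an embedding
+      # lookup, i.e., tf.gather, on the pixel-embedding matrix.)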
+      embedding_var = tf.get_variable(
+          "pixel_embedding",
+          [self.top_dimensionality, self.PIXEL_EMBEDDING_SIZE])
+      hot_inputs = tf.one_hot(tf.to_int32(inputs), self.top_dimensionality)
+      hot_inputs = tf.reshape(hot_inputs, [-1, self.top_dimensionality])
+      embedded = tf.matmul(hot_inputs, embedding_var)
+      # Let's now merge all channels that were embedded into a single vector.
+      merged_size = self.PIXEL_EMBEDDING_SIZE * inputs_shape[4]
+      embedded = tf.reshape(embedded, inputs_shape[:4] + [merged_size])
+      # Put time dimension on channels and add a dense layer.
+      embedded = tf.transpose(embedded, [0, 2, 3, 1, 4])
+      embedded = tf.reshape(
+          embedded,
+          [inputs_shape[0], inputs_shape[2], inputs_shape[3],
+           inputs_shape[1] * merged_size])
+      merged = tf.layers.dense(embedded, self._body_input_depth,
+                               name="merge_pixel_embedded_frames")
+      return merged
+
+  def top(self, body_output, _):
+    num_channels = self._model_hparams.problem_instances[0].num_channels
+    num_frames = self._model_hparams.problem_instances[0].num_target_frames
+    with tf.variable_scope("rgb_softmax"):
+      body_output_shape = common_layers.shape_list(body_output)
+      reshape_shape = body_output_shape[:3]
+      reshape_shape.extend([num_channels, num_frames, self.top_dimensionality])
+      res = tf.layers.dense(
+          body_output, self.top_dimensionality * num_channels * num_frames)
+      res = tf.reshape(res, reshape_shape)
+      res = tf.transpose(res, [0, 4, 1, 2, 3, 5])
+      if not tf.get_variable_scope().reuse:
+        res_argmax = tf.cast(tf.argmax(res[:, -1, :, :, :, :], axis=-1),
+                             tf.uint8)
+        tf.summary.image("result", res_argmax, max_outputs=1)
+      return res
+
+  def loss(self, logits, targets):
+    """Compute loss numerator and denominator for one shard of output."""
+    logits = tf.reshape(logits, [-1] + common_layers.shape_list(logits)[2:])
+    targets = tf.reshape(targets, [-1] + common_layers.shape_list(targets)[2:])
+    return common_layers.padded_cross_entropy(
+        logits,
+        targets,
+        self._model_hparams.label_smoothing,
+        weights_fn=self.targets_weights_fn,
+        gaussian=True)
+
+
 @registry.register_class_label_modality("default")
 class ClassLabelModality(modality.Modality):
   """Used for label data."""
@@ -499,6 +583,7 @@ def top(self, body_output, _):
 @registry.register_generic_modality("default")
 @registry.register_audio_modality("identity")
 @registry.register_image_modality("identity")
+@registry.register_video_modality("identity")
 @registry.register_class_label_modality("identity")
 @registry.register_real_modality("identity")
 class IdentityModality(modality.Modality):
diff --git a/tensor2tensor/models/research/basic_conv_gen.py b/tensor2tensor/models/research/basic_conv_gen.py
index 129c71f07..a06879b2a 100644
--- a/tensor2tensor/models/research/basic_conv_gen.py
+++ b/tensor2tensor/models/research/basic_conv_gen.py
@@ -37,18 +37,26 @@ def body(self, features):
     filters = hparams.hidden_size
     kernel1, kernel2 = (3, 3), (4, 4)
 
-    # Concat frames and down-stride.
-    cur_frame = tf.to_float(features["inputs"])
-    prev_frame = tf.to_float(features["inputs_prev"])
-    x = tf.concat([cur_frame, prev_frame], axis=-1)
+    # Pad so height and width are divisible by 2**num_compress_steps.
+    x = features["inputs"]
+    inputs_shape = common_layers.shape_list(x)
+    x, _ = common_layers.pad_to_same_length(
+        x, x, final_length_divisible_by=2**hparams.num_compress_steps, axis=1)
+    x, _ = common_layers.pad_to_same_length(
+        x, x, final_length_divisible_by=2**hparams.num_compress_steps, axis=2)
+
+    # Down-stride.
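+    # Each stride-2 convolution below halves height and width, so after
+    # num_compress_steps steps the spatial size shrinks by a factor of
+    # 2**num_compress_steps; hence the padding above.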
     for _ in range(hparams.num_compress_steps):
       x = tf.layers.conv2d(x, filters, kernel2, activation=common_layers.belu,
                            strides=(2, 2), padding="SAME")
       x = common_layers.layer_norm(x)
       filters *= 2
+
     # Add embedded action.
-    action = tf.reshape(features["action"], [-1, 1, 1, hparams.hidden_size])
-    zeros = tf.zeros(common_layers.shape_list(x)[:-1] + [hparams.hidden_size])
+    action = tf.reshape(features["input_action"][:, 1, :],
+                        [-1, 1, 1, hparams.hidden_size])
+    zeros = tf.zeros(common_layers.shape_list(x)[:-1] + [hparams.hidden_size],
+                     dtype=tf.float32)
     x = tf.concat([x, action + zeros], axis=-1)
 
     # Run a stack of convolutions.
@@ -56,10 +64,12 @@ def body(self, features):
       with tf.variable_scope("layer%d" % i):
         y = tf.layers.conv2d(x, filters, kernel1, activation=common_layers.belu,
                              strides=(1, 1), padding="SAME")
+        y = tf.nn.dropout(y, 1.0 - hparams.dropout)
         if i == 0:
           x = y
         else:
           x = common_layers.layer_norm(x + y)
+
     # Up-convolve.
     for _ in range(hparams.num_compress_steps):
       filters //= 2
@@ -67,12 +77,17 @@ def body(self, features):
           x, filters, kernel2, activation=common_layers.belu,
           strides=(2, 2), padding="SAME")
       x = common_layers.layer_norm(x)
+      x = tf.nn.dropout(x, 1.0 - hparams.dropout)
+
+    # Cut down to original size.
+    x = x[:, :inputs_shape[1], :inputs_shape[2], :]
 
     # Reward prediction.
     reward_pred_h1 = tf.reduce_mean(x, axis=[1, 2], keep_dims=True)
-    # Rewards are {-1, 0, 1} so we add 1 to the raw gold ones, predict 3.
+    # Rewards are {-1, 0, 1} so we predict 3 classes.
     reward_pred = tf.layers.dense(reward_pred_h1, 3, name="reward")
-    reward_gold = tf.expand_dims(tf.to_int32(features["reward_raw"]) + 1, 1)
+    reward_gold = tf.expand_dims(tf.to_int32(
+        features["input_reward_raw"][:, 1, :]), axis=1)
     reward_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
         labels=reward_gold, logits=reward_pred, name="reward_loss")
     reward_loss = tf.reduce_mean(reward_loss)
@@ -94,7 +109,8 @@ def basic_conv():
   hparams.initializer = "uniform_unit_scaling"
   hparams.initializer_gain = 1.0
   hparams.weight_decay = 0.0
-  hparams.add_hparam("num_compress_steps", 2)
+  hparams.dropout = 0.1
+  hparams.add_hparam("num_compress_steps", 5)
   return hparams
 
 
@@ -121,18 +137,6 @@ def basic_conv_small_per_image_standardization():
 class MichiganBasicConvGen(t2t_model.T2TModel):
 
   def body(self, features):
-    def standardize_images(x):
-      """Image standardization on batches."""
-      with tf.name_scope("standardize_images", [x]):
-        x = tf.to_float(x)
-        x_mean = tf.reduce_mean(x, axis=[1, 2, 3], keep_dims=True)
-        x_variance = tf.reduce_mean(
-            tf.square(x - x_mean), axis=[1, 2, 3], keep_dims=True)
-        x_shape = common_layers.shape_list(x)
-        num_pixels = tf.to_float(x_shape[1] * x_shape[2] * 3)
-        x = (x - x_mean) / tf.maximum(tf.sqrt(x_variance), tf.rsqrt(num_pixels))
-        return x
-
     def deconv2d(cur, i, kernel_size, output_filters, activation=tf.nn.relu):
       thicker = common_layers.conv(
           cur,
@@ -143,8 +147,8 @@ def deconv2d(cur, i, kernel_size, output_filters, activation=tf.nn.relu):
           name="deconv2d" + str(i))
       return tf.depth_to_space(thicker, 2)
 
-    cur_frame = standardize_images(features["inputs_0"])
-    prev_frame = standardize_images(features["inputs_1"])
+    cur_frame = common_layers.standardize_images(features["inputs_0"])
+    prev_frame = common_layers.standardize_images(features["inputs_1"])
     frames = tf.concat([cur_frame, prev_frame], axis=3)
     frames = tf.reshape(frames, [-1, 210, 160, 6])
diff --git a/tensor2tensor/utils/metrics.py b/tensor2tensor/utils/metrics.py
index 0cba1adda..dc3b71607 100644
--- a/tensor2tensor/utils/metrics.py
+++ b/tensor2tensor/utils/metrics.py
@@ -248,12 +248,12 @@ def set_recall(predictions, labels, weights_fn=common_layers.weights_nonzero):
   return tf.to_float(tf.equal(labels, predictions)), weights
 
 
-def image_summary(predictions, features, hparams):
+def image_summary(predictions, targets, hparams):
   """Reshapes predictions and passes it to tensorboard.
 
   Args:
     predictions : The predicted image (logits).
-    features : The features dictionary with tensors.
+    targets : The ground truth.
     hparams: model hparams.
 
   Returns:
@@ -262,7 +262,7 @@ def image_summary(predictions, targets, hparams):
   """
   del hparams
   results = tf.cast(tf.argmax(predictions, axis=-1), tf.uint8)
-  gold = tf.cast(features["targets"], tf.uint8)
+  gold = tf.cast(targets, tf.uint8)
   summary1 = tf.summary.image("prediction", results, max_outputs=2)
   summary2 = tf.summary.image("data", gold, max_outputs=2)
   summary = tf.summary.merge([summary1, summary2])
@@ -368,13 +368,22 @@ def create_evaluation_metrics(problems, model_hparams):
     ValueError: if the metrics specified by a problem are not recognized (i.e.
       are not defined in the Metrics enum.
   """
+  def reduce_dimensions(predictions, labels):
+    """Reduce dimensions for high-dimensional predictions and labels."""
+    # We will treat the first dimensions as batch; video frames are one
+    # example of such high-dimensional predictions.
+    if len(predictions.get_shape()) > 5:
+      predictions = tf.reshape(
+          predictions, [-1] + common_layers.shape_list(predictions)[-4:])
+    if len(labels.get_shape()) > 4:
+      labels = tf.reshape(
+          labels, [-1] + common_layers.shape_list(labels)[-3:])
+    return predictions, labels
 
   def make_problem_specific_metric_fn(metric_fn, problem_idx, weights_fn):
    """Create a metric fn conditioned on problem_idx."""

    def problem_metric_fn(predictions, features, labels):
      """Metric fn."""
-      # labels = features.get("targets", None)
      problem_choice = features.get("problem_choice", 0)

      # Send along the entire features dict if the metric fn has the kwarg
@@ -384,6 +393,8 @@ def problem_metric_fn(predictions, features, labels):
       if ("features" in args) or keywords:
         kwargs["features"] = features
 
+      predictions, labels = reduce_dimensions(predictions, labels)
+
       def wrapped_metric_fn():
         return metric_fn(predictions, labels, weights_fn=weights_fn, **kwargs)
 
@@ -407,9 +418,12 @@ def wrapped_metric_fn():
               list(METRICS_FNS.keys())))
 
     def image_wrapped_metric_fn(predictions,
+                                features,
                                 labels,
-                                weights_fn=common_layers.weights_nonzero):
+                                weights_fn=common_layers.weights_all):
       del weights_fn
+      del features
+      predictions, labels = reduce_dimensions(predictions, labels)
       return metric_fn(predictions, labels, model_hparams)
 
     tm = problem_instance.get_hparams().target_modality
diff --git a/tensor2tensor/utils/registry.py b/tensor2tensor/utils/registry.py
index 65caa5181..16a6c7437 100644
--- a/tensor2tensor/utils/registry.py
+++ b/tensor2tensor/utils/registry.py
@@ -63,6 +63,7 @@ class Modalities(object):
   SYMBOL = "symbol"
   IMAGE = "image"
   AUDIO = "audio"
+  VIDEO = "video"
   CLASS_LABEL = "class_label"
   GENERIC = "generic"
   REAL = "real"
@@ -72,6 +73,7 @@ class Modalities(object):
     Modalities.SYMBOL: {},
     Modalities.IMAGE: {},
     Modalities.AUDIO: {},
+    Modalities.VIDEO: {},
     Modalities.CLASS_LABEL: {},
     Modalities.GENERIC: {},
     Modalities.REAL: {},
@@ -295,6 +297,11 @@ def generic_modality(name=None):
                                Modalities.GENERIC.capitalize())
 
 
+def video_modality(name=None):
+  return _internal_get_modality(name, _MODALITIES[Modalities.VIDEO],
+                                Modalities.VIDEO.capitalize())
+
+
 def audio_modality(name=None):
   return _internal_get_modality(name, _MODALITIES[Modalities.AUDIO],
                                 Modalities.AUDIO.capitalize())
@@ -365,6 +372,12 @@ def register_image_modality(name=None):
                                      Modalities.IMAGE.capitalize())
 
 
+def register_video_modality(name=None):
+  """Register a video modality. name defaults to class name snake-cased."""
+  return _internal_register_modality(name, _MODALITIES[Modalities.VIDEO],
+                                     Modalities.VIDEO.capitalize())
+
+
 def register_class_label_modality(name=None):
   """Register an image modality. name defaults to class name snake-cased."""
   return _internal_register_modality(name, _MODALITIES[Modalities.CLASS_LABEL],
@@ -406,8 +419,9 @@ def create_modality(modality_spec, model_hparams):
   """
   retrieval_fns = {
       Modalities.SYMBOL: symbol_modality,
-      Modalities.AUDIO: audio_modality,
       Modalities.IMAGE: image_modality,
+      Modalities.AUDIO: audio_modality,
+      Modalities.VIDEO: video_modality,
       Modalities.CLASS_LABEL: class_label_modality,
       Modalities.GENERIC: generic_modality,
       Modalities.REAL: real_modality,

From a7c150e4f4500d1732443268c1ec1b1c61664c93 Mon Sep 17 00:00:00 2001
From: Ryan Sepassi
Date: Fri, 13 Apr 2018 13:03:20 -0700
Subject: [PATCH 27/29] v1.5.7

PiperOrigin-RevId: 192811712
---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 2379d8a66..9f9035efa 100644
--- a/setup.py
+++ b/setup.py
@@ -5,7 +5,7 @@
 
 setup(
     name='tensor2tensor',
-    version='1.5.6',
+    version='1.5.7',
     description='Tensor2Tensor',
     author='Google Inc.',
     author_email='no-reply@google.com',

From 6f1152ca8792ec12792ef25fa002221b50cdc347 Mon Sep 17 00:00:00 2001
From: Ryan Sepassi
Date: Fri, 13 Apr 2018 13:05:32 -0700
Subject: [PATCH 28/29] TransformerScorer model to only score targets on infer

PiperOrigin-RevId: 192812089
---
 tensor2tensor/models/transformer.py      | 53 ++++++++++++++++++++++++
 tensor2tensor/models/transformer_test.py | 49 +++++++++++++++++++++-
 2 files changed, 100 insertions(+), 2 deletions(-)

diff --git a/tensor2tensor/models/transformer.py b/tensor2tensor/models/transformer.py
index 8497913ee..88be60dfd 100644
--- a/tensor2tensor/models/transformer.py
+++ b/tensor2tensor/models/transformer.py
@@ -535,6 +535,59 @@ def is_not_finished(i, finished, *_):
 
     return {"outputs": decoded_ids, "scores": scores}
 
 
+@registry.register_model
+class TransformerScorer(Transformer):
+  """Transformer model, but only scores in PREDICT mode.
+
+  Checkpoints between Transformer and TransformerScorer are interchangeable.
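+
+  In PREDICT mode, infer() does not run beam search; it runs the model on the
+  provided targets and returns them together with the sum of their per-token
+  log probabilities.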
+ """ + + def __init__(self, *args, **kwargs): + super(TransformerScorer, self).__init__(*args, **kwargs) + self._name = "transformer" + self._base_name = "transformer" + + def infer(self, + features=None, + decode_length=50, + beam_size=1, + top_beams=1, + alpha=0.0): + """Returns the targets and their log probabilities.""" + del decode_length, beam_size, top_beams, alpha + assert features is not None + + # Run the model + self.hparams.force_full_predict = True + with tf.variable_scope(self.name): + logits, _ = self.model_fn(features) + assert len(logits.shape) == 5 # [batch, time, 1, 1, vocab] + logits = tf.squeeze(logits, [2, 3]) + + # Compute the log probabilities + log_probs = beam_search.log_prob_from_logits(logits) + + # Slice out the log_probs of the targets + targets = features["targets"] + assert len(targets.shape) == 4 # [batch, time, 1, 1] + targets = tf.squeeze(targets, [2, 3]) + batch_size, timesteps = common_layers.shape_list(targets) + vocab_size = common_layers.shape_list(log_probs)[-1] + flat_targets = tf.reshape(targets, [batch_size * timesteps]) + flat_log_probs = tf.reshape(log_probs, [batch_size * timesteps, vocab_size]) + flat_indices = tf.stack( + [tf.range(tf.to_int64(batch_size) * tf.to_int64(timesteps)), + tf.to_int64(flat_targets)], axis=1) + log_probs = tf.reshape( + tf.gather_nd(flat_log_probs, flat_indices), + [batch_size, timesteps]) + + # Sum over time to get the log_prob of the sequence + scores = tf.reduce_sum(log_probs, axis=1) + + return {"outputs": targets, "scores": scores} + + @registry.register_model class TransformerEncoder(t2t_model.T2TModel): """Transformer, encoder only.""" diff --git a/tensor2tensor/models/transformer_test.py b/tensor2tensor/models/transformer_test.py index 2b2d3a9fa..9b5d6fe4d 100644 --- a/tensor2tensor/models/transformer_test.py +++ b/tensor2tensor/models/transformer_test.py @@ -37,7 +37,10 @@ class TransformerTest(tf.test.TestCase): - def getModel(self, hparams, mode=tf.estimator.ModeKeys.TRAIN, has_input=True): + def getModel(self, hparams=None, mode=tf.estimator.ModeKeys.TRAIN, + has_input=True, model_cls=transformer.Transformer): + if hparams is None: + hparams = transformer.transformer_tiny() hparams.hidden_size = 8 hparams.filter_size = 32 hparams.num_heads = 1 @@ -58,7 +61,7 @@ def getModel(self, hparams, mode=tf.estimator.ModeKeys.TRAIN, has_input=True): "target_space_id": tf.constant(1, dtype=tf.int32) } - return transformer.Transformer(hparams, mode, p_hparams), features + return model_cls(hparams, mode, p_hparams), features def testTransformer(self): model, features = self.getModel(transformer.transformer_small()) @@ -240,5 +243,47 @@ def testTransformerWithEncoderDecoderAttentionLoss(self): self.assertEqual(res.shape, ()) +class TransformerScorerTest(TransformerTest): + + def testReturnsScores(self): + model, features = self.getModel( + mode=tf.estimator.ModeKeys.PREDICT, + model_cls=transformer.TransformerScorer) + infer_out = model.infer(features) + self.assertTrue("outputs" in infer_out) + self.assertTrue("scores" in infer_out) + + with self.test_session() as session: + session.run(tf.global_variables_initializer()) + infer_out = session.run(infer_out) + self.assertEqual((BATCH_SIZE,), infer_out["scores"].shape) + self.assertEqual((BATCH_SIZE, TARGET_LENGTH), infer_out["outputs"].shape) + + def testVarNames(self): + with tf.Graph().as_default(): + model, features = self.getModel( + mode=tf.estimator.ModeKeys.PREDICT, + model_cls=transformer.TransformerScorer) + _ = model.infer(features) + scorer_vars = [v.name 
+    with tf.Graph().as_default():
+      model, features = self.getModel(
+          mode=tf.estimator.ModeKeys.EVAL,
+          model_cls=transformer.TransformerScorer)
+      _ = model(features)
+      scorer_eval_vars = [v.name for v in tf.global_variables()]
+
+    with tf.Graph().as_default():
+      model, features = self.getModel(
+          mode=tf.estimator.ModeKeys.EVAL,
+          model_cls=transformer.Transformer)
+      _ = model(features)
+      transformer_vars = [v.name for v in tf.global_variables()]
+
+    self.assertEqual(sorted(scorer_vars), sorted(transformer_vars))
+    self.assertEqual(sorted(scorer_eval_vars), sorted(transformer_vars))
+
+
 if __name__ == "__main__":
   tf.test.main()

From 95aeb116392040a8bd0b17999e6771c141525ff0 Mon Sep 17 00:00:00 2001
From: Ryan Sepassi
Date: Fri, 13 Apr 2018 13:58:48 -0700
Subject: [PATCH 29/29] Make SRU code Py3 compatible

PiperOrigin-RevId: 192819555
---
 tensor2tensor/layers/common_layers.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tensor2tensor/layers/common_layers.py b/tensor2tensor/layers/common_layers.py
index b8502dbf3..3c1155643 100644
--- a/tensor2tensor/layers/common_layers.py
+++ b/tensor2tensor/layers/common_layers.py
@@ -1951,7 +1951,8 @@ def sru(x, num_layers=2,
   x = tf.transpose(x, [1, 0, 2])  # Scan assumes time on axis 0.
   initial_state = initial_state or tf.zeros([x_shape[0], x_shape[-1]])
   # SRU state manipulation function.
-  def next_state(cur_state, (cur_x_times_one_minus_f, cur_f)):
+  def next_state(cur_state, args_tup):
+    cur_x_times_one_minus_f, cur_f = args_tup
     return cur_f * cur_state + cur_x_times_one_minus_f
   # Calculate SRU on each layer.
   for i in xrange(num_layers):