diff --git a/.gitignore b/.gitignore
index b14c224ec..f2b475d4e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,7 +1,5 @@
 # Compiled python modules.
 *.pyc
-*DS_Store
-
 # Byte-compiled
 __pycache__/
 
@@ -18,3 +16,9 @@ dist/
 # Sublime project files
 *.sublime-project
 *.sublime-workspace
+
+# Tests
+.pytest_cache/
+
+# Other
+*.DS_Store
diff --git a/.travis.yml b/.travis.yml
index 4cf0843a2..ecfcb699a 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -8,14 +8,11 @@ env:
     - T2T_DATA_DIR=/tmp/t2t-data
     - T2T_TRAIN_DIR=/tmp/t2t-train
   matrix:
-    - TF_VERSION="1.4.*"
     - TF_VERSION="1.5.*"
     - TF_VERSION="1.6.*"
     - TF_VERSION="1.7.*"
 matrix:
   exclude:
-    - python: "3.6"
-      env: TF_VERSION="1.4.*"
     - python: "3.6"
       env: TF_VERSION="1.5.*"
     - python: "3.6"
@@ -57,13 +54,13 @@ script:
   # Run data generation, training, and decoding on a dummy problem
   - t2t-datagen --problem=$T2T_PROBLEM --data_dir=$T2T_DATA_DIR
-  - t2t-trainer --problems=$T2T_PROBLEM --data_dir=$T2T_DATA_DIR --model=transformer --hparams_set=transformer_tiny --train_steps=5 --eval_steps=5 --output_dir=$T2T_TRAIN_DIR
-  - t2t-decoder --problems=$T2T_PROBLEM --data_dir=$T2T_DATA_DIR --model=transformer --hparams_set=transformer_tiny --output_dir=$T2T_TRAIN_DIR --decode_hparams='num_samples=10'
+  - t2t-trainer --problem=$T2T_PROBLEM --data_dir=$T2T_DATA_DIR --model=transformer --hparams_set=transformer_tiny --train_steps=5 --eval_steps=5 --output_dir=$T2T_TRAIN_DIR
+  - t2t-decoder --problem=$T2T_PROBLEM --data_dir=$T2T_DATA_DIR --model=transformer --hparams_set=transformer_tiny --output_dir=$T2T_TRAIN_DIR --decode_hparams='num_samples=10'
   # Export and query (on Python 2 only)
   # Bug: https://github.com/tensorflow/serving/issues/819
   #- if [[ "$TRAVIS_PYTHON_VERSION" == "2.7" ]] && [[ "$TF_VERSION" == "1.6.*" ]]; then
-  #    t2t-exporter --problems=$T2T_PROBLEM --data_dir=$T2T_DATA_DIR --model=transformer --hparams_set=transformer_tiny --output_dir=$T2T_TRAIN_DIR;
+  #    t2t-exporter --problem=$T2T_PROBLEM --data_dir=$T2T_DATA_DIR --model=transformer --hparams_set=transformer_tiny --output_dir=$T2T_TRAIN_DIR;
   #    pip install tensorflow-serving-api;
   #    tensorflow_model_server --port=9000 --model_name=my_model --model_base_path=$T2T_TRAIN_DIR/export/Servo &
   #    sleep 10;
diff --git a/README.md b/README.md
index a59f69c98..31b25562f 100644
--- a/README.md
+++ b/README.md
@@ -36,7 +36,7 @@ pip install tensor2tensor && t2t-trainer \
   --generate_data \
   --data_dir=~/t2t_data \
   --output_dir=~/t2t_train/mnist \
-  --problems=image_mnist \
+  --problem=image_mnist \
   --model=shake_shake \
   --hparams_set=shake_shake_quick \
   --train_steps=1000 \
@@ -78,13 +78,13 @@ to modify the hyperparameters if you run on a different setup.
 ### Image Classification
 
 For image classification, we have a number of standard data-sets:
-* ImageNet (a large data-set): `--problems=image_imagenet`, or one
+* ImageNet (a large data-set): `--problem=image_imagenet`, or one
   of the re-scaled versions (`image_imagenet224`, `image_imagenet64`,
   `image_imagenet32`)
-* CIFAR-10: `--problems=image_cifar10` (or
-  `--problems=image_cifar10_plain` to turn off data augmentation)
-* CIFAR-100: `--problems=image_cifar100`
-* MNIST: `--problems=image_mnist`
+* CIFAR-10: `--problem=image_cifar10` (or
+  `--problem=image_cifar10_plain` to turn off data augmentation)
+* CIFAR-100: `--problem=image_cifar100`
+* MNIST: `--problem=image_mnist`
 
 For ImageNet, we suggest to use the ResNet or Xception, i.e., use
 `--model=resnet --hparams_set=resnet_50` or
@@ -99,11 +99,11 @@ close to 97% accuracy on CIFAR-10.
 ### Language Modeling
 
 For language modeling, we have these data-sets in T2T:
-* PTB (a small data-set): `--problems=languagemodel_ptb10k` for
-  word-level modeling and `--problems=languagemodel_ptb_characters`
+* PTB (a small data-set): `--problem=languagemodel_ptb10k` for
+  word-level modeling and `--problem=languagemodel_ptb_characters`
   for character-level modeling.
-* LM1B (a billion-word corpus): `--problems=languagemodel_lm1b32k` for
-  subword-level modeling and `--problems=languagemodel_lm1b_characters`
+* LM1B (a billion-word corpus): `--problem=languagemodel_lm1b32k` for
+  subword-level modeling and `--problem=languagemodel_lm1b_characters`
   for character-level modeling.
 
 We suggest to start with `--model=transformer` on this task and use
@@ -113,7 +113,7 @@ We suggest to start with `--model=transformer` on this task and use
 ### Sentiment Analysis
 
 For the task of recognizing the sentiment of a sentence, use
-* the IMDB data-set: `--problems=sentiment_imdb`
+* the IMDB data-set: `--problem=sentiment_imdb`
 
 We suggest to use `--model=transformer_encoder` here and since it is a
 small data-set, try `--hparams_set=transformer_tiny` and train for
@@ -122,15 +122,15 @@ few steps (e.g., `--train_steps=2000`).
 ### Speech Recognition
 
 For speech-to-text, we have these data-sets in T2T:
-* Librispeech (English speech to text): `--problems=librispeech` for
-  the whole set and `--problems=librispeech_clean` for a smaller
+* Librispeech (English speech to text): `--problem=librispeech` for
+  the whole set and `--problem=librispeech_clean` for a smaller
   but nicely filtered part.
 
 ### Summarization
 
 For summarizing longer text into shorter one we have these data-sets:
 * CNN/DailyMail articles summarized into a few sentences:
-  `--problems=summarize_cnn_dailymail32k`
+  `--problem=summarize_cnn_dailymail32k`
 
 We suggest to use `--model=transformer` and
 `--hparams_set=transformer_prepend` for this task.
@@ -139,15 +139,15 @@ This yields good ROUGE scores.
 ### Translation
 
 There are a number of translation data-sets in T2T:
-* English-German: `--problems=translate_ende_wmt32k`
-* English-French: `--problems=translate_enfr_wmt32k`
-* English-Czech: `--problems=translate_encs_wmt32k`
-* English-Chinese: `--problems=translate_enzh_wmt32k`
-* English-Vietnamese: `--problems=translate_envi_iwslt32k`
+* English-German: `--problem=translate_ende_wmt32k`
+* English-French: `--problem=translate_enfr_wmt32k`
+* English-Czech: `--problem=translate_encs_wmt32k`
+* English-Chinese: `--problem=translate_enzh_wmt32k`
+* English-Vietnamese: `--problem=translate_envi_iwslt32k`
 
 You can get translations in the other direction by appending `_rev` to
 the problem name, e.g., for German-English use
-`--problems=translate_ende_wmt32k_rev`.
+`--problem=translate_ende_wmt32k_rev`.
 
 For all translation problems, we suggest to try the Transformer model:
 `--model=transformer`. At first it is best to try the base setting,
@@ -193,7 +193,7 @@ t2t-datagen \
 # * If you run out of memory, add --hparams='batch_size=1024'.
 t2t-trainer \
   --data_dir=$DATA_DIR \
-  --problems=$PROBLEM \
+  --problem=$PROBLEM \
   --model=$MODEL \
   --hparams_set=$HPARAMS \
   --output_dir=$TRAIN_DIR
@@ -210,7 +210,7 @@ ALPHA=0.6
 t2t-decoder \
   --data_dir=$DATA_DIR \
-  --problems=$PROBLEM \
+  --problem=$PROBLEM \
   --model=$MODEL \
   --hparams_set=$HPARAMS \
   --output_dir=$TRAIN_DIR \
@@ -325,7 +325,7 @@ and hyperparameter set functions can compose other hyperparameter set
 functions.
 
 The **trainer** binary is the main entrypoint for training, evaluation, and inference.
 Users can easily switch between problems, models, and hyperparameter
-sets by using the `--model`, `--problems`, and `--hparams_set` flags. Specific
+sets by using the `--model`, `--problem`, and `--hparams_set` flags. Specific
 hyperparameters can be overridden with the `--hparams` flag. `--schedule` and
 related flags control local and distributed training/evaluation
 ([distributed training documentation](https://github.com/tensorflow/tensor2tensor/tree/master/docs/distributed_training.md)).
diff --git a/docs/cloud_mlengine.md b/docs/cloud_mlengine.md
index 709582f65..25673901e 100644
--- a/docs/cloud_mlengine.md
+++ b/docs/cloud_mlengine.md
@@ -14,7 +14,7 @@ It's the same `t2t-trainer` you know and love with the addition of the
 DATA_DIR=gs://my-bucket/data
 OUTPUT_DIR=gs://my-bucket/train
 t2t-trainer \
-  --problems=translate_ende_wmt32k \
+  --problem=translate_ende_wmt32k \
   --model=transformer \
   --hparams_set=transformer_base \
   --data_dir=$DATA_DIR \
@@ -57,7 +57,7 @@ with `--hparams_range` and the `--autotune_*` flags:
 
 ```
 t2t-trainer \
-  --problems=translate_ende_wmt32k \
+  --problem=translate_ende_wmt32k \
   --model=transformer \
   --hparams_set=transformer_base \
   --data_dir=$DATA_DIR \
diff --git a/docs/cloud_tpu.md b/docs/cloud_tpu.md
index d923ee02e..d508aa125 100644
--- a/docs/cloud_tpu.md
+++ b/docs/cloud_tpu.md
@@ -39,8 +39,6 @@ work on any image classification data-set.
 
 ## Tutorial: Transformer En-De translation on TPU
 
-**Note**: You'll need TensorFlow 1.5+.
-
 Configure the `gcloud` CLI:
 ```
 gcloud components update
@@ -71,7 +69,7 @@ Launch! It's as simple as adding the `--cloud_tpu` flag.
 t2t-trainer \
   --model=transformer \
   --hparams_set=transformer_tpu \
-  --problems=translate_ende_wmt8k \
+  --problem=translate_ende_wmt8k \
   --train_steps=10 \
   --eval_steps=10 \
   --local_eval_frequency=10 \
@@ -109,7 +107,7 @@ For example, to train a shake-shake model on CIFAR you can run this command.
 t2t-trainer \
   --model=shake_shake \
   --hparams_set=shakeshake_tpu \
-  --problems=image_cifar10 \
+  --problem=image_cifar10 \
   --train_steps=180000 \
   --eval_steps=9 \
   --local_eval_frequency=100 \
diff --git a/docs/index.md b/docs/index.md
index 060e10471..9262461c7 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -42,13 +42,13 @@ to modify the hyperparameters if you run on a different setup.
 ### Image Classification
 
 For image classification, we have a number of standard data-sets:
-* ImageNet (a large data-set): `--problems=image_imagenet`, or one
+* ImageNet (a large data-set): `--problem=image_imagenet`, or one
   of the re-scaled versions (`image_imagenet224`, `image_imagenet64`,
   `image_imagenet32`)
-* CIFAR-10: `--problems=image_cifar10` (or
-  `--problems=image_cifar10_plain` to turn off data augmentation)
-* CIFAR-100: `--problems=image_cifar100`
-* MNIST: `--problems=image_mnist`
+* CIFAR-10: `--problem=image_cifar10` (or
+  `--problem=image_cifar10_plain` to turn off data augmentation)
+* CIFAR-100: `--problem=image_cifar100`
+* MNIST: `--problem=image_mnist`
 
 For ImageNet, we suggest to use the ResNet or Xception, i.e., use
 `--model=resnet --hparams_set=resnet_50` or
@@ -63,11 +63,11 @@ close to 97% accuracy on CIFAR-10.
 ### Language Modeling
 
 For language modeling, we have these data-sets in T2T:
-* PTB (a small data-set): `--problems=languagemodel_ptb10k` for
-  word-level modeling and `--problems=languagemodel_ptb_characters`
+* PTB (a small data-set): `--problem=languagemodel_ptb10k` for
+  word-level modeling and `--problem=languagemodel_ptb_characters`
   for character-level modeling.
-* LM1B (a billion-word corpus): `--problems=languagemodel_lm1b32k` for
-  subword-level modeling and `--problems=languagemodel_lm1b_characters`
+* LM1B (a billion-word corpus): `--problem=languagemodel_lm1b32k` for
+  subword-level modeling and `--problem=languagemodel_lm1b_characters`
   for character-level modeling.
 
 We suggest to start with `--model=transformer` on this task and use
@@ -77,7 +77,7 @@ We suggest to start with `--model=transformer` on this task and use
 ### Sentiment Analysis
 
 For the task of recognizing the sentiment of a sentence, use
-* the IMDB data-set: `--problems=sentiment_imdb`
+* the IMDB data-set: `--problem=sentiment_imdb`
 
 We suggest to use `--model=transformer_encoder` here and since it is a
 small data-set, try `--hparams_set=transformer_tiny` and train for
@@ -86,15 +86,15 @@ few steps (e.g., `--train_steps=2000`).
 ### Speech Recognition
 
 For speech-to-text, we have these data-sets in T2T:
-* Librispeech (English speech to text): `--problems=librispeech` for
-  the whole set and `--problems=librispeech_clean` for a smaller
+* Librispeech (English speech to text): `--problem=librispeech` for
+  the whole set and `--problem=librispeech_clean` for a smaller
   but nicely filtered part.
 
 ### Summarization
 
 For summarizing longer text into shorter one we have these data-sets:
 * CNN/DailyMail articles summarized into a few sentences:
-  `--problems=summarize_cnn_dailymail32k`
+  `--problem=summarize_cnn_dailymail32k`
 
 We suggest to use `--model=transformer` and
 `--hparams_set=transformer_prepend` for this task.
@@ -103,15 +103,15 @@ This yields good ROUGE scores.
 ### Translation
 
 There are a number of translation data-sets in T2T:
-* English-German: `--problems=translate_ende_wmt32k`
-* English-French: `--problems=translate_enfr_wmt32k`
-* English-Czech: `--problems=translate_encs_wmt32k`
-* English-Chinese: `--problems=translate_enzh_wmt32k`
-* English-Vietnamese: `--problems=translate_envi_iwslt32k`
+* English-German: `--problem=translate_ende_wmt32k`
+* English-French: `--problem=translate_enfr_wmt32k`
+* English-Czech: `--problem=translate_encs_wmt32k`
+* English-Chinese: `--problem=translate_enzh_wmt32k`
+* English-Vietnamese: `--problem=translate_envi_iwslt32k`
 
 You can get translations in the other direction by appending `_rev` to
 the problem name, e.g., for German-English use
-`--problems=translate_ende_wmt32k_rev`.
+`--problem=translate_ende_wmt32k_rev`.
 
 For all translation problems, we suggest to try the Transformer model:
 `--model=transformer`. At first it is best to try the base setting,
diff --git a/docs/new_problem.md b/docs/new_problem.md
index 371ae3daa..d9a7987fd 100644
--- a/docs/new_problem.md
+++ b/docs/new_problem.md
@@ -239,6 +239,6 @@ clone the repository and install it in developer mode with `pip install -e .`.
 # Train!
 
 You can train exactly as you do in the [walkthrough](walkthrough.md) with flags
-`--problems=poetry_lines` and `--t2t_usr_dir=$USR_DIR`.
+`--problem=poetry_lines` and `--t2t_usr_dir=$USR_DIR`.
 
 All done. Let us know what amazing poetry your model writes!
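
The step above assumes `--t2t_usr_dir` points at a directory whose import triggers problem registration. A minimal sketch of such a directory, assuming the walkthrough's problem lives in a file named `poetry_lines.py` (both names are illustrative):

```python
# $USR_DIR/__init__.py -- imported by t2t-trainer via --t2t_usr_dir.
# Importing the module executes its @registry.register_problem decorator,
# which is what makes --problem=poetry_lines resolvable by name.
from . import poetry_lines  # hypothetical module defining the problem class
```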
diff --git a/docs/tutorials/asr_with_transformer.md b/docs/tutorials/asr_with_transformer.md
index 92c847ba8..728d85c4b 100644
--- a/docs/tutorials/asr_with_transformer.md
+++ b/docs/tutorials/asr_with_transformer.md
@@ -29,7 +29,7 @@ To train a model on GPU set up `OUT_DIR` and run the trainer:
 t2t-trainer \
   --model=transformer \
   --hparams_set=transformer_librispeech \
-  --problems=librispeech \
+  --problem=librispeech \
   --train_steps=120000 \
   --eval_steps=3 \
   --local_eval_frequency=100 \
@@ -48,7 +48,7 @@ To train a model on TPU set up `OUT_DIR` and run the trainer:
 t2t-trainer \
   --model=transformer \
   --hparams_set=transformer_librispeech_tpu \
-  --problems=librispeech \
+  --problem=librispeech \
   --train_steps=120000 \
   --eval_steps=3 \
   --local_eval_frequency=100 \
diff --git a/docs/walkthrough.md b/docs/walkthrough.md
index a59f69c98..31b25562f 100644
--- a/docs/walkthrough.md
+++ b/docs/walkthrough.md
@@ -36,7 +36,7 @@ pip install tensor2tensor && t2t-trainer \
   --generate_data \
   --data_dir=~/t2t_data \
   --output_dir=~/t2t_train/mnist \
-  --problems=image_mnist \
+  --problem=image_mnist \
   --model=shake_shake \
   --hparams_set=shake_shake_quick \
   --train_steps=1000 \
@@ -78,13 +78,13 @@ to modify the hyperparameters if you run on a different setup.
 ### Image Classification
 
 For image classification, we have a number of standard data-sets:
-* ImageNet (a large data-set): `--problems=image_imagenet`, or one
+* ImageNet (a large data-set): `--problem=image_imagenet`, or one
   of the re-scaled versions (`image_imagenet224`, `image_imagenet64`,
   `image_imagenet32`)
-* CIFAR-10: `--problems=image_cifar10` (or
-  `--problems=image_cifar10_plain` to turn off data augmentation)
-* CIFAR-100: `--problems=image_cifar100`
-* MNIST: `--problems=image_mnist`
+* CIFAR-10: `--problem=image_cifar10` (or
+  `--problem=image_cifar10_plain` to turn off data augmentation)
+* CIFAR-100: `--problem=image_cifar100`
+* MNIST: `--problem=image_mnist`
 
 For ImageNet, we suggest to use the ResNet or Xception, i.e., use
 `--model=resnet --hparams_set=resnet_50` or
@@ -99,11 +99,11 @@
 ### Language Modeling
 
 For language modeling, we have these data-sets in T2T:
-* PTB (a small data-set): `--problems=languagemodel_ptb10k` for
-  word-level modeling and `--problems=languagemodel_ptb_characters`
+* PTB (a small data-set): `--problem=languagemodel_ptb10k` for
+  word-level modeling and `--problem=languagemodel_ptb_characters`
   for character-level modeling.
-* LM1B (a billion-word corpus): `--problems=languagemodel_lm1b32k` for
-  subword-level modeling and `--problems=languagemodel_lm1b_characters`
+* LM1B (a billion-word corpus): `--problem=languagemodel_lm1b32k` for
+  subword-level modeling and `--problem=languagemodel_lm1b_characters`
   for character-level modeling.
 
 We suggest to start with `--model=transformer` on this task and use
@@ -113,7 +113,7 @@ We suggest to start with `--model=transformer` on this task and use
 ### Sentiment Analysis
 
 For the task of recognizing the sentiment of a sentence, use
-* the IMDB data-set: `--problems=sentiment_imdb`
+* the IMDB data-set: `--problem=sentiment_imdb`
 
 We suggest to use `--model=transformer_encoder` here and since it is a
 small data-set, try `--hparams_set=transformer_tiny` and train for
@@ -122,15 +122,15 @@ few steps (e.g., `--train_steps=2000`).
 ### Speech Recognition
 
 For speech-to-text, we have these data-sets in T2T:
-* Librispeech (English speech to text): `--problems=librispeech` for
-  the whole set and `--problems=librispeech_clean` for a smaller
+* Librispeech (English speech to text): `--problem=librispeech` for
+  the whole set and `--problem=librispeech_clean` for a smaller
   but nicely filtered part.
 
 ### Summarization
 
 For summarizing longer text into shorter one we have these data-sets:
 * CNN/DailyMail articles summarized into a few sentences:
-  `--problems=summarize_cnn_dailymail32k`
+  `--problem=summarize_cnn_dailymail32k`
 
 We suggest to use `--model=transformer` and
 `--hparams_set=transformer_prepend` for this task.
@@ -139,15 +139,15 @@ This yields good ROUGE scores.
 ### Translation
 
 There are a number of translation data-sets in T2T:
-* English-German: `--problems=translate_ende_wmt32k`
-* English-French: `--problems=translate_enfr_wmt32k`
-* English-Czech: `--problems=translate_encs_wmt32k`
-* English-Chinese: `--problems=translate_enzh_wmt32k`
-* English-Vietnamese: `--problems=translate_envi_iwslt32k`
+* English-German: `--problem=translate_ende_wmt32k`
+* English-French: `--problem=translate_enfr_wmt32k`
+* English-Czech: `--problem=translate_encs_wmt32k`
+* English-Chinese: `--problem=translate_enzh_wmt32k`
+* English-Vietnamese: `--problem=translate_envi_iwslt32k`
 
 You can get translations in the other direction by appending `_rev` to
 the problem name, e.g., for German-English use
-`--problems=translate_ende_wmt32k_rev`.
+`--problem=translate_ende_wmt32k_rev`.
 
 For all translation problems, we suggest to try the Transformer model:
 `--model=transformer`. At first it is best to try the base setting,
@@ -193,7 +193,7 @@ t2t-datagen \
 # * If you run out of memory, add --hparams='batch_size=1024'.
 t2t-trainer \
   --data_dir=$DATA_DIR \
-  --problems=$PROBLEM \
+  --problem=$PROBLEM \
   --model=$MODEL \
   --hparams_set=$HPARAMS \
   --output_dir=$TRAIN_DIR
@@ -210,7 +210,7 @@ ALPHA=0.6
 t2t-decoder \
   --data_dir=$DATA_DIR \
-  --problems=$PROBLEM \
+  --problem=$PROBLEM \
   --model=$MODEL \
   --hparams_set=$HPARAMS \
   --output_dir=$TRAIN_DIR \
@@ -325,7 +325,7 @@ and hyperparameter set functions can compose other hyperparameter set
 functions.
 
 The **trainer** binary is the main entrypoint for training, evaluation,
 and inference. Users can easily switch between problems, models, and hyperparameter
-sets by using the `--model`, `--problems`, and `--hparams_set` flags. Specific
+sets by using the `--model`, `--problem`, and `--hparams_set` flags. Specific
 hyperparameters can be overridden with the `--hparams` flag. `--schedule` and
 related flags control local and distributed training/evaluation
 ([distributed training documentation](https://github.com/tensorflow/tensor2tensor/tree/master/docs/distributed_training.md)).
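
For reference, the `--problem`, `--model`, and `--hparams_set` flags described above resolve through the registry; `registry.problem(...)` is the same call the trainer and decoder changes below use. A minimal sketch, with the data directory as a placeholder path:

```python
from tensor2tensor.utils import registry

# Appending _rev to a translation problem reverses its direction (see above).
problem = registry.problem("translate_ende_wmt32k_rev")  # German -> English
encoders = problem.feature_encoders("~/t2t_data")  # placeholder data_dir
```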
diff --git a/setup.py b/setup.py
index 9f9035efa..cc22c8a0f 100644
--- a/setup.py
+++ b/setup.py
@@ -5,7 +5,7 @@
 setup(
     name='tensor2tensor',
-    version='1.5.7',
+    version='1.6.0',
     description='Tensor2Tensor',
     author='Google Inc.',
     author_email='no-reply@google.com',
@@ -45,8 +45,8 @@
         'six',
     ],
     extras_require={
-        'tensorflow': ['tensorflow>=1.4.1'],
-        'tensorflow_gpu': ['tensorflow-gpu>=1.4.1'],
+        'tensorflow': ['tensorflow>=1.5.0'],
+        'tensorflow_gpu': ['tensorflow-gpu>=1.5.0'],
         'tests': ['pytest', 'h5py', 'mock'],
     },
     classifiers=[
diff --git a/tensor2tensor/bin/t2t-trainer b/tensor2tensor/bin/t2t-trainer
index 01c2dcb56..1d848d04d 100755
--- a/tensor2tensor/bin/t2t-trainer
+++ b/tensor2tensor/bin/t2t-trainer
@@ -7,7 +7,7 @@ For example, to train a shake-shake model on MNIST run this:
 t2t-trainer \
   --generate_data \
-  --problems=image_mnist \
+  --problem=image_mnist \
   --data_dir=~/t2t_data \
   --tmp_dir=~/t2t_data/tmp
   --model=shake_shake \
diff --git a/tensor2tensor/bin/t2t_datagen.py b/tensor2tensor/bin/t2t_datagen.py
index 862abca84..d1dcab834 100644
--- a/tensor2tensor/bin/t2t_datagen.py
+++ b/tensor2tensor/bin/t2t_datagen.py
@@ -70,7 +70,7 @@ flags.DEFINE_integer("task_id_start", -1, "For distributed data generation.")
 flags.DEFINE_integer("task_id_end", -1, "For distributed data generation.")
 flags.DEFINE_integer(
-    "num_concurrent_processes", 10,
+    "num_concurrent_processes", None,
     "Applies only to problems for which multiprocess_generate=True.")
 flags.DEFINE_string("t2t_usr_dir", "",
                     "Path to a Python module that will be imported. The "
diff --git a/tensor2tensor/bin/t2t_decoder.py b/tensor2tensor/bin/t2t_decoder.py
index 25f47eace..08d8c7ee5 100644
--- a/tensor2tensor/bin/t2t_decoder.py
+++ b/tensor2tensor/bin/t2t_decoder.py
@@ -21,7 +21,7 @@
 t2t-decoder \
   --data_dir ~/data \
-  --problems=algorithmic_identity_binary40 \
+  --problem=algorithmic_identity_binary40 \
   --model=transformer
   --hparams_set=transformer_base
@@ -70,7 +70,7 @@ def create_hparams():
       FLAGS.hparams_set,
       FLAGS.hparams,
       data_dir=os.path.expanduser(FLAGS.data_dir),
-      problem_name=FLAGS.problems)
+      problem_name=FLAGS.problem)
 
 
 def create_decode_hparams():
@@ -94,7 +94,7 @@ def decode(estimator, hparams, decode_hp):
   else:
     decoding.decode_from_dataset(
         estimator,
-        FLAGS.problems.split("-"),
+        FLAGS.problem,
        hparams,
        decode_hp,
        decode_to_file=FLAGS.decode_to_file,
@@ -105,7 +105,7 @@ def score_file(filename):
   """Score each line in a file and return the scores."""
   # Prepare model.
   hparams = create_hparams()
-  encoders = registry.problem(FLAGS.problems).feature_encoders(FLAGS.data_dir)
+  encoders = registry.problem(FLAGS.problem).feature_encoders(FLAGS.data_dir)
   has_inputs = "inputs" in encoders
 
   # Prepare features for feeding into the model.
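
The decoder above now reads a single `FLAGS.problem` instead of splitting a dash-separated `FLAGS.problems` list. A hedged sketch of how a downstream script might migrate an old flag value, mirroring the `get_problem_name()` helper that this release deletes from t2t_trainer.py (shown further below):

```python
def migrate_problems_flag(legacy_problems):
  """Map a legacy --problems value onto the new single --problem flag."""
  parts = legacy_problems.split("-")
  # Multi-problem runs were dropped, so anything dash-separated should fail.
  assert len(parts) == 1, "dash-separated problem lists are no longer supported"
  return parts[0]

print(migrate_problems_flag("translate_ende_wmt32k"))  # translate_ende_wmt32k
```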
diff --git a/tensor2tensor/bin/t2t_distill.py b/tensor2tensor/bin/t2t_distill.py
index be31a2ba7..75c14ca55 100644
--- a/tensor2tensor/bin/t2t_distill.py
+++ b/tensor2tensor/bin/t2t_distill.py
@@ -91,5 +91,71 @@ def main(argv):
 # ==========================
 
 
+def create_teacher_experiment(run_config, hparams, argv):
+  """Creates experiment function."""
+  tf.logging.info("training teacher")
+  tf.logging.set_verbosity(tf.logging.INFO)
+  trainer_lib.set_random_seed(FLAGS.random_seed)
+  usr_dir.import_usr_dir(FLAGS.t2t_usr_dir)
+  t2t_trainer.log_registry()
+
+  if FLAGS.cloud_mlengine:
+    return cloud_mlengine.launch()
+
+  if FLAGS.generate_data:
+    t2t_trainer.generate_data()
+
+  if cloud_mlengine.job_dir():
+    FLAGS.output_dir = cloud_mlengine.job_dir()
+
+  if argv:
+    t2t_trainer.set_hparams_from_args(argv[1:])
+
+  with t2t_trainer.maybe_cloud_tpu():
+    hparams.distill_phase = "train"
+    exp_fn = t2t_trainer.create_experiment_fn()
+    exp = exp_fn(run_config, hparams)
+    return exp
+
+
+def create_student_experiment(run_config, hparams, argv):
+  """Creates experiment function."""
+  tf.logging.info("training student")
+  tf.logging.set_verbosity(tf.logging.INFO)
+  trainer_lib.set_random_seed(FLAGS.random_seed)
+  usr_dir.import_usr_dir(FLAGS.t2t_usr_dir)
+  t2t_trainer.log_registry()
+
+  if FLAGS.cloud_mlengine:
+    return cloud_mlengine.launch()
+
+  if FLAGS.generate_data:
+    t2t_trainer.generate_data()
+
+  if cloud_mlengine.job_dir():
+    FLAGS.output_dir = cloud_mlengine.job_dir()
+
+  if argv:
+    t2t_trainer.set_hparams_from_args(argv[1:])
+
+  with t2t_trainer.maybe_cloud_tpu():
+    hparams.add_hparam("teacher_dir", FLAGS.teacher_dir)
+    hparams.distill_phase = "distill"
+    exp_fn = t2t_trainer.create_experiment_fn()
+    exp = exp_fn(run_config, hparams)
+    return exp
+
+
+def create_experiment_fn(argv, train_teacher):
+
+  def teacher_experiment_fn(run_config, hparams):
+    return create_teacher_experiment(run_config, hparams, argv)
+
+  def student_experiment_fn(run_config, hparams):
+    return create_student_experiment(run_config, hparams, argv)
+
+  return teacher_experiment_fn if train_teacher else student_experiment_fn
+
+
 if __name__ == "__main__":
   tf.app.run()
diff --git a/tensor2tensor/bin/t2t_trainer.py b/tensor2tensor/bin/t2t_trainer.py
index 628d7a9a9..87443ad47 100644
--- a/tensor2tensor/bin/t2t_trainer.py
+++ b/tensor2tensor/bin/t2t_trainer.py
@@ -57,10 +57,12 @@ flags.DEFINE_string("tmp_dir", "/tmp/t2t_datagen",
                     "Temporary storage directory, used if --generate_data.")
 flags.DEFINE_bool("profile", False, "Profile performance?")
-flags.DEFINE_integer("inter_op_parallelism_threads", 0, "Number of inter_op_parallelism_threads "
-                     "to use for CPU. See TensorFlow config.proto for details.")
-flags.DEFINE_integer("intra_op_parallelism_threads", 0, "Number of intra_op_parallelism_threads "
-                     "to use for CPU. See TensorFlow config.proto for details.")
+flags.DEFINE_integer("inter_op_parallelism_threads", 0,
+                     "Number of inter_op_parallelism_threads to use for CPU. "
+                     "See TensorFlow config.proto for details.")
+flags.DEFINE_integer("intra_op_parallelism_threads", 0,
+                     "Number of intra_op_parallelism_threads to use for CPU. "
+                     "See TensorFlow config.proto for details.")
 
 # To maintain compatibility with some internal libs, we guard against these flag
 # definitions possibly erring. Apologies for the ugliness.
@@ -115,12 +117,6 @@
                     "during hyperparameter tuning. Overrides --output_dir.")
 
 
-def get_problem_name():
-  problems = FLAGS.problems.split("-")
-  assert len(problems) == 1
-  return problems[0]
-
-
 def set_hparams_from_args(args):
   """Set hparams overrides from unparsed args list."""
   if not args:
@@ -159,7 +155,7 @@ def create_hparams():
 def create_experiment_fn():
   return trainer_lib.create_experiment_fn(
       model_name=FLAGS.model,
-      problem_name=get_problem_name(),
+      problem_name=FLAGS.problem,
       data_dir=os.path.expanduser(FLAGS.data_dir),
       train_steps=FLAGS.train_steps,
       eval_steps=FLAGS.eval_steps,
@@ -178,11 +174,24 @@ def create_experiment_fn():
 
 
 def create_run_config(hp):
+  """Create a run config.
+
+  Args:
+    hp: model hyperparameters
+  Returns:
+    a run config
+  """
   save_ckpt_steps = max(FLAGS.iterations_per_loop, FLAGS.local_eval_frequency)
   save_ckpt_secs = FLAGS.save_checkpoints_secs or None
   if save_ckpt_secs:
     save_ckpt_steps = None
   assert FLAGS.output_dir or FLAGS.checkpoint_path
+  # the various custom getters we have written do not play well together yet.
+  # TODO(noam): ask rsepassi for help here.
+  daisy_chain_variables = (
+      hp.daisy_chain_variables and
+      hp.activation_dtype == "float32" and
+      hp.weight_dtype == "float32")
   return trainer_lib.create_run_config(
       model_dir=os.path.expanduser(FLAGS.output_dir),
       master=FLAGS.master,
@@ -202,7 +211,7 @@ def create_run_config(hp):
       use_tpu=FLAGS.use_tpu,
       schedule=FLAGS.schedule,
       no_data_parallelism=hp.no_data_parallelism,
-      daisy_chain_variables=hp.daisy_chain_variables,
+      daisy_chain_variables=daisy_chain_variables,
       ps_replicas=FLAGS.ps_replicas,
       ps_job=FLAGS.ps_job,
       ps_gpu=FLAGS.ps_gpu,
@@ -222,7 +231,7 @@ def generate_data():
   tf.gfile.MakeDirs(data_dir)
   tf.gfile.MakeDirs(tmp_dir)
 
-  problem_name = get_problem_name()
+  problem_name = FLAGS.problem
   tf.logging.info("Generating data for %s" % problem_name)
   registry.problem(problem_name).generate_data(data_dir, tmp_dir)
 
@@ -281,9 +290,7 @@ def save_metadata(hparams):
   # Save hparams as hparams.json
   hparams_fname = os.path.join(output_dir, "hparams.json")
   with tf.gfile.Open(hparams_fname, "w") as f:
-    # TODO(lukaszkaiser): use the first line once we require TF 1.5+.
-    # f.write(hparams.to_json(indent=0, sort_keys=True))
-    f.write(hparams.to_json())
+    f.write(hparams.to_json(indent=0, sort_keys=True))
 
 
 def execute_schedule(exp):
diff --git a/tensor2tensor/bin/t2t_trainer_test.py b/tensor2tensor/bin/t2t_trainer_test.py
index 1f4569e60..52d58111a 100644
--- a/tensor2tensor/bin/t2t_trainer_test.py
+++ b/tensor2tensor/bin/t2t_trainer_test.py
@@ -36,7 +36,7 @@ def setUpClass(cls):
     trainer_lib_test.TrainerLibTest.setUpClass()
 
   def testTrain(self):
-    FLAGS.problems = "tiny_algo"
+    FLAGS.problem = "tiny_algo"
     FLAGS.model = "transformer"
     FLAGS.hparams_set = "transformer_tiny"
     FLAGS.train_steps = 1
diff --git a/tensor2tensor/bin/t2t_translate_all.py b/tensor2tensor/bin/t2t_translate_all.py
index 553489b61..7041fb8c1 100644
--- a/tensor2tensor/bin/t2t_translate_all.py
+++ b/tensor2tensor/bin/t2t_translate_all.py
@@ -16,7 +16,7 @@
 """Translate a file with all checkpoints in a given directory.
 
 t2t-decoder will be executed with these parameters:
---problems
+--problem
 --data_dir
 --output_dir with the value of --model_dir
 --decode_from_file with the value of --source
@@ -61,7 +61,7 @@
 flags.DEFINE_string("model", "transformer", "see t2t-decoder")
 flags.DEFINE_string("t2t_usr_dir", None, "see t2t-decoder")
 flags.DEFINE_string("data_dir", None, "see t2t-decoder")
-flags.DEFINE_string("problems", None, "see t2t-decoder")
+flags.DEFINE_string("problem", None, "see t2t-decoder")
 flags.DEFINE_string("hparams_set", "transformer_big_single_gpu",
                     "see t2t-decoder")
@@ -73,11 +73,11 @@ def main(_):
   translations_dir = os.path.expanduser(FLAGS.translations_dir)
   source = os.path.expanduser(FLAGS.source)
   tf.gfile.MakeDirs(translations_dir)
-  translated_base_file = os.path.join(translations_dir, FLAGS.problems)
+  translated_base_file = os.path.join(translations_dir, FLAGS.problem)
 
   # Copy flags.txt with the original time, so t2t-bleu can report correct
   # relative time.
-  flags_path = os.path.join(translations_dir, FLAGS.problems + "-flags.txt")
+  flags_path = os.path.join(translations_dir, FLAGS.problem + "-flags.txt")
   if not os.path.exists(flags_path):
     shutil.copy2(os.path.join(model_dir, "flags.txt"), flags_path)
 
@@ -93,7 +93,7 @@ def main(_):
     tf.logging.info("Translating " + out_file)
     params = (
         "--t2t_usr_dir={FLAGS.t2t_usr_dir} --output_dir={model_dir} "
-        "--data_dir={FLAGS.data_dir} --problems={FLAGS.problems} "
+        "--data_dir={FLAGS.data_dir} --problem={FLAGS.problem} "
         "--decode_hparams=beam_size={FLAGS.beam_size},alpha={FLAGS.alpha} "
         "--model={FLAGS.model} --hparams_set={FLAGS.hparams_set} "
        "--checkpoint_path={model.filename} --decode_from_file={source} "
diff --git a/tensor2tensor/data_generators/README.md b/tensor2tensor/data_generators/README.md
index 0ccbfe1c1..e67eac019 100644
--- a/tensor2tensor/data_generators/README.md
+++ b/tensor2tensor/data_generators/README.md
@@ -47,7 +47,7 @@ with an integer denoting the length of the input list.
 
 ```
 def length_generator(nbr_cases):
-  for _ in xrange(nbr_cases):
+  for _ in range(nbr_cases):
     length = np.random.randint(100) + 1
     yield {"inputs": [2] * length, "targets": [length]}
 ```
diff --git a/tensor2tensor/data_generators/algorithmic.py b/tensor2tensor/data_generators/algorithmic.py
index ddb456cb1..25dbb8add 100644
--- a/tensor2tensor/data_generators/algorithmic.py
+++ b/tensor2tensor/data_generators/algorithmic.py
@@ -22,7 +22,7 @@
 
 import numpy as np
 
-from six.moves import xrange  # pylint: disable=redefined-builtin
+from six.moves import range  # pylint: disable=redefined-builtin
 
 from tensor2tensor.data_generators import generator_utils as utils
 from tensor2tensor.data_generators import problem
@@ -113,9 +113,9 @@ def generator(self, nbr_symbols, max_length, nbr_cases):
       A dictionary {"inputs": input-list, "targets": target-list} where
       input-list and target-list are the same.
     """
-    for _ in xrange(nbr_cases):
+    for _ in range(nbr_cases):
       l = np.random.randint(max_length) + 1
-      inputs = [np.random.randint(nbr_symbols) for _ in xrange(l)]
+      inputs = [np.random.randint(nbr_symbols) for _ in range(l)]
       yield {"inputs": inputs, "targets": inputs}
 
@@ -153,9 +153,9 @@ def generator(self, nbr_symbols, max_length, nbr_cases):
       target-list[i] = input-list[i] + shift.
""" shift = 10 - for _ in xrange(nbr_cases): + for _ in range(nbr_cases): l = np.random.randint(max_length) + 1 - inputs = [np.random.randint(nbr_symbols - shift) for _ in xrange(l)] + inputs = [np.random.randint(nbr_symbols - shift) for _ in range(l)] yield {"inputs": inputs, "targets": [i + shift for i in inputs]} @property @@ -187,9 +187,9 @@ def generator(self, nbr_symbols, max_length, nbr_cases): A dictionary {"inputs": input-list, "targets": target-list} where target-list is input-list reversed. """ - for _ in xrange(nbr_cases): + for _ in range(nbr_cases): l = np.random.randint(max_length) + 1 - inputs = [np.random.randint(nbr_symbols) for _ in xrange(l)] + inputs = [np.random.randint(nbr_symbols) for _ in range(l)] yield {"inputs": inputs, "targets": list(reversed(inputs))} @@ -265,7 +265,7 @@ def reverse_generator_nlplike(nbr_symbols, """ std_dev = max_length / scale_std_dev distr_map = zipf_distribution(nbr_symbols, alpha) - for _ in xrange(nbr_cases): + for _ in range(nbr_cases): l = int(abs(np.random.normal(loc=max_length / 2, scale=std_dev)) + 1) inputs = zipf_random_sample(distr_map, l) yield {"inputs": inputs, "targets": list(reversed(inputs))} @@ -321,7 +321,7 @@ def random_number_lower_endian(length, base): """Helper function: generate a random number as a lower-endian digits list.""" if length == 1: # Last digit can be 0 only if length is 1. return [np.random.randint(base)] - prefix = [np.random.randint(base) for _ in xrange(length - 1)] + prefix = [np.random.randint(base) for _ in range(length - 1)] return prefix + [np.random.randint(base - 1) + 1] # Last digit is not 0. @@ -354,7 +354,7 @@ def generator(self, base, max_length, nbr_cases): """ if max_length < 3: raise ValueError("Maximum length must be at least 3.") - for _ in xrange(nbr_cases): + for _ in range(nbr_cases): l1 = np.random.randint(max_length // 2) + 1 l2 = np.random.randint(max_length - l1 - 1) + 1 n1 = random_number_lower_endian(l1, base) @@ -405,7 +405,7 @@ def generator(self, base, max_length, nbr_cases): """ if max_length < 3: raise ValueError("Maximum length must be at least 3.") - for _ in xrange(nbr_cases): + for _ in range(nbr_cases): l1 = np.random.randint(max_length // 2) + 1 l2 = np.random.randint(max_length - l1 - 1) + 1 n1 = random_number_lower_endian(l1, base) diff --git a/tensor2tensor/data_generators/algorithmic_math.py b/tensor2tensor/data_generators/algorithmic_math.py index 689fa4b41..ed96bbfad 100644 --- a/tensor2tensor/data_generators/algorithmic_math.py +++ b/tensor2tensor/data_generators/algorithmic_math.py @@ -28,7 +28,7 @@ # Dependency imports import six -from six.moves import xrange # pylint: disable=redefined-builtin +from six.moves import range # pylint: disable=redefined-builtin import sympy @@ -421,7 +421,7 @@ def math_dataset_init(alphabet_size=26, digits=None, functions=None): raise ValueError("digits cannot must be between 1 and 10. Got %s." 
   vlist = alphabet[:alphabet_size]
   if digits is not None:
-    dlist = [str(d) for d in xrange(digits)]
+    dlist = [str(d) for d in range(digits)]
   else:
     dlist = []
   if functions is None:
@@ -481,7 +481,7 @@ def algebra_inverse(alphabet_size=26, min_depth=0, max_depth=2,
                      "Got max_depth=%s, min_depth=%s" % (max_depth, min_depth))
 
   alg_cfg = math_dataset_init(alphabet_size)
-  for _ in xrange(nbr_cases):
+  for _ in range(nbr_cases):
     sample, target = generate_algebra_inverse_sample(
         alg_cfg.vlist,
         list(alg_cfg.ops.values()), alg_cfg.solve_ops, min_depth, max_depth)
@@ -522,7 +522,7 @@ def algebra_simplify(alphabet_size=26,
                      "Got max_depth=%s, min_depth=%s" % (max_depth, min_depth))
 
   alg_cfg = math_dataset_init(alphabet_size, digits=5)
-  for _ in xrange(nbr_cases):
+  for _ in range(nbr_cases):
     sample, target = generate_algebra_simplify_sample(
         alg_cfg.vlist, list(alg_cfg.ops.values()), min_depth, max_depth)
     yield {
diff --git a/tensor2tensor/data_generators/algorithmic_test.py b/tensor2tensor/data_generators/algorithmic_test.py
index 5d954fd14..2644a3b33 100644
--- a/tensor2tensor/data_generators/algorithmic_test.py
+++ b/tensor2tensor/data_generators/algorithmic_test.py
@@ -21,7 +21,7 @@
 
 # Dependency imports
 
-from six.moves import xrange  # pylint: disable=redefined-builtin
+from six.moves import range  # pylint: disable=redefined-builtin
 
 from tensor2tensor.data_generators import algorithmic
 
@@ -51,7 +51,7 @@ def testZipfDistribution(self):
     # more probable/frequent that the second in rank, three times more prob/freq
     # that the third in rank and so on.
     d = algorithmic.zipf_distribution(10, 1.0001)
-    for i in xrange(len(d[1:])-1):
+    for i in range(len(d[1:])-1):
       self.assertEqual("%.4f" % (abs(d[i+1]-d[i+2])*(i+2)), "%.4f" % d[1])
 
   def testReverseGeneratorNlpLike(self):
diff --git a/tensor2tensor/data_generators/cifar.py b/tensor2tensor/data_generators/cifar.py
index 9a1a80ef3..2c332e6e7 100644
--- a/tensor2tensor/data_generators/cifar.py
+++ b/tensor2tensor/data_generators/cifar.py
@@ -100,10 +100,10 @@ def cifar_generator(cifar_version, tmp_dir, training, how_many, start_from=0):
     num_images = images.shape[0]
     images = images.reshape((num_images, 3, image_size, image_size))
     all_images.extend([
-        np.squeeze(images[j]).transpose((1, 2, 0)) for j in xrange(num_images)
+        np.squeeze(images[j]).transpose((1, 2, 0)) for j in range(num_images)
     ])
     labels = data[label_key]
-    all_labels.extend([labels[j] for j in xrange(num_images)])
+    all_labels.extend([labels[j] for j in range(num_images)])
   return image_utils.image_generator(
       all_images[start_from:start_from + how_many],
       all_labels[start_from:start_from + how_many])
diff --git a/tensor2tensor/data_generators/dna_encoder.py b/tensor2tensor/data_generators/dna_encoder.py
index 9db20de42..a4b2c244b 100644
--- a/tensor2tensor/data_generators/dna_encoder.py
+++ b/tensor2tensor/data_generators/dna_encoder.py
@@ -26,7 +26,7 @@
 import itertools
 # Dependency imports
 
-from six.moves import xrange  # pylint: disable=redefined-builtin
+from six.moves import range  # pylint: disable=redefined-builtin
 
 from tensor2tensor.data_generators import text_encoder
 
@@ -77,7 +77,7 @@ def encode(self, s):
     assert (len(bases) % self._chunk_size) == 0
     num_chunks = len(bases) // self._chunk_size
     ids = []
-    for chunk_idx in xrange(num_chunks):
+    for chunk_idx in range(num_chunks):
       start_idx = chunk_idx * self._chunk_size
       end_idx = start_idx + self._chunk_size
       chunk = tuple(bases[start_idx:end_idx])
diff --git a/tensor2tensor/data_generators/gene_expression.py b/tensor2tensor/data_generators/gene_expression.py
index 29d7819f2..cdd62491f 100644
--- a/tensor2tensor/data_generators/gene_expression.py
+++ b/tensor2tensor/data_generators/gene_expression.py
@@ -44,7 +44,7 @@
 import h5py
 import numpy as np
 
-from six.moves import xrange  # pylint: disable=redefined-builtin
+from six.moves import range  # pylint: disable=redefined-builtin
 
 from tensor2tensor.data_generators import dna_encoder
 from tensor2tensor.data_generators import generator_utils
@@ -130,7 +130,7 @@ def generate_data(self, data_dir, tmp_dir, task_id=-1):
     # Start and wait for processes in batches
     num_batches = int(
         math.ceil(float(len(processes)) / MAX_CONCURRENT_PROCESSES))
-    for i in xrange(num_batches):
+    for i in range(num_batches):
       start = i * MAX_CONCURRENT_PROCESSES
       end = start + MAX_CONCURRENT_PROCESSES
       current = processes[start:end]
@@ -211,7 +211,7 @@ def generate_shard_args(outfiles, num_examples):
   """Generate start and end indices per outfile."""
   num_shards = len(outfiles)
   num_examples_per_shard = num_examples // num_shards
-  start_idxs = [i * num_examples_per_shard for i in xrange(num_shards)]
+  start_idxs = [i * num_examples_per_shard for i in range(num_shards)]
   end_idxs = list(start_idxs)
   end_idxs.pop(0)
   end_idxs.append(num_examples)
@@ -249,7 +249,7 @@ def dataset_generator(filepath,
   if end_idx is None:
     end_idx = inp_data.len()
 
-  for i in xrange(start_idx, end_idx):
+  for i in range(start_idx, end_idx):
     if i % 100 == 0:
       print("Generating example %d for %s" % (i, dataset))
     inputs, mask, outputs = inp_data[i], mask_data[i], out_data[i]
diff --git a/tensor2tensor/data_generators/generator_utils.py b/tensor2tensor/data_generators/generator_utils.py
index 3078f8dfe..6bd069388 100644
--- a/tensor2tensor/data_generators/generator_utils.py
+++ b/tensor2tensor/data_generators/generator_utils.py
@@ -29,7 +29,7 @@
 
 import requests
 import six
-from six.moves import xrange  # pylint: disable=redefined-builtin
+from six.moves import range  # pylint: disable=redefined-builtin
 import six.moves.urllib_request as urllib  # Imports urllib on Python2, urllib.request on Python3
 from tensor2tensor.data_generators import text_encoder
 
@@ -119,7 +119,7 @@ def sharded_name(base_name, shard, total_shards):
 
 def shard_filepath(fname, num_shards):
   return [
-      sharded_name(fname, shard, num_shards) for shard in xrange(num_shards)
+      sharded_name(fname, shard, num_shards) for shard in range(num_shards)
   ]
 
@@ -592,7 +592,7 @@ def pack_examples(examples,
     if chop_long_sequences and len(x) > packed_length:
       assert not has_inputs
       num_fragments = len(x) // packed_length
-      for i in xrange(num_fragments):
+      for i in range(num_fragments):
         yield packer(
             x[packed_length * i:packed_length * (i + 1)], spacing).to_dict()
       x = x[packed_length * num_fragments:]
diff --git a/tensor2tensor/data_generators/gym.py b/tensor2tensor/data_generators/gym.py
index 5b979ee00..6a82f1d4c 100644
--- a/tensor2tensor/data_generators/gym.py
+++ b/tensor2tensor/data_generators/gym.py
@@ -22,11 +22,9 @@
 from collections import deque
 import functools
-import os
 # Dependency imports
 
 import gym
 
-from tensor2tensor.data_generators import generator_utils
 from tensor2tensor.data_generators import problem
 from tensor2tensor.data_generators import video_utils
 
@@ -35,6 +33,7 @@
 from tensor2tensor.rl.envs import tf_atari_wrappers as atari
 from tensor2tensor.rl.envs.utils import batch_env_factory
 
+from tensor2tensor.utils import metrics
 from tensor2tensor.utils import registry
 
 import tensorflow as tf
@@ -63,6 +62,12 @@ def num_target_frames(self):
     """Number of frames to batch on one target."""
     return 1
 
+  def eval_metrics(self):
+    eval_metrics = [
+        metrics.Metrics.ACC, metrics.Metrics.ACC_PER_SEQ,
+        metrics.Metrics.NEG_LOG_PERPLEXITY]
+    return eval_metrics
+
   @property
   def extra_reading_spec(self):
     """Additional data fields to store on disk and their decoders."""
@@ -116,7 +121,8 @@ def hparams(self, defaults, unused_model_hparams):
     p.input_modality = {"inputs": ("video", 256),
                         "input_reward": ("symbol", self.num_rewards),
                         "input_action": ("symbol", self.num_actions)}
-    p.target_modality = ("video", 256)
+    p.target_modality = {"targets": ("video", 256),
+                         "target_reward": ("symbol", self.num_rewards)}
     p.input_space_id = problem.SpaceID.IMAGE
     p.target_space_id = problem.SpaceID.IMAGE
 
@@ -174,119 +180,27 @@ def num_steps(self):
     return 50000
 
 
-def moviepy_editor():
-  """Access to moviepy that fails gracefully without a moviepy install."""
-  try:
-    from moviepy import editor  # pylint: disable=g-import-not-at-top
-  except ImportError:
-    raise ImportError("pip install moviepy to record videos")
-  return editor
-
-
 @registry.register_problem
-class GymDiscreteProblemWithAgent2(GymDiscreteProblem):
-  """Gym environment with discrete actions and rewards."""
-
-  def __init__(self, *args, **kwargs):
-    super(GymDiscreteProblemWithAgent2, self).__init__(*args, **kwargs)
-    self._env = None
-
-  @property
-  def extra_reading_spec(self):
-    """Additional data fields to store on disk and their decoders."""
-    data_fields = {
-        "action": tf.FixedLenFeature([1], tf.int64),
-        "reward": tf.FixedLenFeature([1], tf.int64)
-    }
-    decoders = {
-        "action": tf.contrib.slim.tfexample_decoder.Tensor(tensor_key="action"),
-        "reward": tf.contrib.slim.tfexample_decoder.Tensor(tensor_key="reward"),
-    }
-    return data_fields, decoders
-
-  @property
-  def num_input_frames(self):
-    """Number of frames to batch on one input."""
-    return 4
-
-  @property
-  def env_name(self):
-    """This is the name of the Gym environment for this problem."""
-    return "PongDeterministic-v4"
-
-  @property
-  def num_actions(self):
-    return self.env.action_space.n
-
-  @property
-  def num_rewards(self):
-    return 3
-
-  @property
-  def num_steps(self):
-    return 200
-
-  @property
-  def frame_height(self):
-    return 210
-
-  @property
-  def frame_width(self):
-    return 160
-
-  @property
-  def min_reward(self):
-    return -1
-
-  def get_action(self, observation=None):
-    return self.env.action_space.sample()
-
-  def hparams(self, defaults, unused_model_hparams):
-    p = defaults
-    p.input_modality = {"inputs": ("video", 256),
-                        "input_reward": ("symbol", self.num_rewards),
-                        "input_action": ("symbol", self.num_actions)}
-    # p.input_modality = {"inputs": ("video", 256),
-    #                     "reward": ("symbol", self.num_rewards),
-    #                     "input_action": ("symbol", self.num_actions)}
-    # p.target_modality = ("video", 256)
-    p.target_modality = {"targets": ("video", 256),
-                         "target_reward": ("symbol", self.num_rewards)}
-    # p.target_modality = {"targets": ("image", 256),
-    #                      "reward": ("symbol", self.num_rewards + 1)}  # ("video", 256)
-    p.input_space_id = problem.SpaceID.IMAGE
-    p.target_space_id = problem.SpaceID.IMAGE
-
-  def generate_samples(self, data_dir, tmp_dir, unused_dataset_split):
-    self.env.reset()
-    action = self.get_action()
-    for _ in range(self.num_steps):
-      observation, reward, done, _ = self.env.step(action)
-      action = self.get_action(observation)
-      yield {"frame": observation,
-             "action": [action],
-             "done": [done],
-             "reward": [int(reward - self.min_reward)]}
-
-
-@registry.register_problem
-class GymDiscreteProblemWithAgent(problem.Problem):
-  """Gym environment with discrete actions and rewards."""
+class GymDiscreteProblemWithAgent(GymPongRandom5k):
+  """Gym environment with discrete actions and rewards and an agent."""
 
   def __init__(self, *args, **kwargs):
     super(GymDiscreteProblemWithAgent, self).__init__(*args, **kwargs)
-    self.num_channels = 3
+    self._env = None
     self.history_size = 2
 
     # defaults
-    self.environment_spec = lambda: gym.make("PongNoFrameskip-v4")
+    self.environment_spec = lambda: gym.make("PongDeterministic-v4")
     self.in_graph_wrappers = [(atari.MaxAndSkipWrapper, {"skip": 4})]
     self.collect_hparams = rl.atari_base()
-    self.num_steps = 1000
-    self.movies = True
-    self.movies_fps = 24
+    self.settable_num_steps = 1000
     self.simulated_environment = None
     self.warm_up = 70
 
+  @property
+  def num_steps(self):
+    return self.settable_num_steps
+
   def _setup(self):
     in_graph_wrappers = [(atari.ShiftRewardWrapper, {"add_value": 2}),
                          (atari.MemoryWrapper, {})] + self.in_graph_wrappers
@@ -319,85 +233,23 @@ def _setup(self):
     self.data_get_op = atari.MemoryWrapper.singleton.speculum.dequeue()
     self.history_buffer = deque(maxlen=self.history_size+1)
 
-  def example_reading_spec(self, label_repr=None):
-    data_fields = {
-        "targets_encoded": tf.FixedLenFeature((), tf.string),
-        "image/format": tf.FixedLenFeature((), tf.string),
-        "action": tf.FixedLenFeature([1], tf.int64),
-        "reward": tf.FixedLenFeature([1], tf.int64),
-        # "done": tf.FixedLenFeature([1], tf.int64)
-    }
-
-    for x in range(self.history_size):
-      data_fields["inputs_encoded_{}".format(x)] = tf.FixedLenFeature(
-          (), tf.string)
-
-    data_items_to_decoders = {
-        "targets": tf.contrib.slim.tfexample_decoder.Image(
-            image_key="targets_encoded",
-            format_key="image/format",
-            shape=[210, 160, 3],
-            channels=3),
-        # Just do a pass through.
-        "action": tf.contrib.slim.tfexample_decoder.Tensor(tensor_key="action"),
-        "reward": tf.contrib.slim.tfexample_decoder.Tensor(tensor_key="reward"),
-    }
-
-    for x in range(self.history_size):
-      key = "inputs_{}".format(x)
-      data_items_to_decoders[key] = tf.contrib.slim.tfexample_decoder.Image(
-          image_key="inputs_encoded_{}".format(x),
-          format_key="image/format",
-          shape=[210, 160, 3],
-          channels=3)
-
-    return data_fields, data_items_to_decoders
-
-  @property
-  def num_actions(self):
-    return 4
-
-  @property
-  def num_rewards(self):
-    return 2
-
-  @property
-  def num_shards(self):
-    return 10
-
-  @property
-  def num_dev_shards(self):
-    return 1
-
-  def get_action(self, observation=None):
-    return self.env.action_space.sample()
-
-  def hparams(self, defaults, unused_model_hparams):
-    p = defaults
-    # The hard coded +1 after "symbol" refers to the fact
-    # that 0 is a special symbol meaning padding
-    # when symbols are e.g. 0, 1, 2, 3 we
-    # shift them to 0, 1, 2, 3, 4.
-    p.input_modality = {"action": ("symbol:identity", self.num_actions)}
-
-    for x in range(self.history_size):
-      p.input_modality["inputs_{}".format(x)] = ("image", 256)
-
-    p.target_modality = {"targets": ("image", 256),
-                         "reward": ("symbol", self.num_rewards + 1)}
-
-    p.input_space_id = problem.SpaceID.IMAGE
-    p.target_space_id = problem.SpaceID.IMAGE
-
   def restore_networks(self, sess):
     model_saver = tf.train.Saver(
         tf.global_variables(".*network_parameters.*"))
     if FLAGS.agent_policy_path:
       model_saver.restore(sess, FLAGS.agent_policy_path)
 
-  def generator(self, data_dir, tmp_dir):
+  def generate_encoded_samples(self, data_dir, tmp_dir, unused_dataset_split):
     self._setup()
-    clip_files = []
+
+    # When no agent_policy_path is set, just generate random samples.
+    if not FLAGS.agent_policy_path:
+      for sample in super(GymDiscreteProblemWithAgent,
+                          self).generate_encoded_samples(
+                              data_dir, tmp_dir, unused_dataset_split):
+        yield sample
+      return
+
     with tf.Session() as sess:
       sess.run(tf.global_variables_initializer())
       self.restore_networks(sess)
@@ -409,44 +261,20 @@ def generator(self, data_dir, tmp_dir):
         observ, reward, action, _ = sess.run(self.data_get_op)
         self.history_buffer.append(observ)
 
-        if self.movies and pieces_generated > self.warm_up:
-          file_name = os.path.join(tmp_dir,
-                                   "output_{}.png".format(pieces_generated))
-          clip_files.append(file_name)
-          with open(file_name, "wb") as f:
-            f.write(observ)
-
-        if len(self.history_buffer) == self.history_size+1:
+        if len(self.history_buffer) == self.history_size + 1:
           pieces_generated += 1
-          ret_dict = {
-              "targets_encoded": [observ],
-              "image/format": ["png"],
-              "action": [int(action)],
-              # "done": [bool(done)],
-              "reward": [int(reward)],
-          }
-          for i, v in enumerate(list(self.history_buffer)[:-1]):
-            ret_dict["inputs_encoded_{}".format(i)] = [v]
+          ret_dict = {"image/encoded": [observ],
+                      "image/format": ["png"],
+                      "image/height": [self.frame_height],
+                      "image/width": [self.frame_width],
+                      "action": [int(action)],
+                      "done": [int(False)],
+                      "reward": [int(reward) - self.min_reward]}
           if pieces_generated > self.warm_up:
             yield ret_dict
         else:
           sess.run(self.collect_trigger_op)
 
-    if self.movies:
-      clip = moviepy_editor().ImageSequenceClip(clip_files, fps=self.movies_fps)
-      clip_path = os.path.join(data_dir, "output_{}.mp4".format(self.name))
-      clip.write_videofile(clip_path, fps=self.movies_fps, codec="mpeg4")
-
-  def generate_data(self, data_dir, tmp_dir, task_id=-1):
-    train_paths = self.training_filepaths(
-        data_dir, self.num_shards, shuffled=False)
-    dev_paths = self.dev_filepaths(
-        data_dir, self.num_dev_shards, shuffled=False)
-    all_paths = train_paths + dev_paths
-    generator_utils.generate_files(
-        self.generator(data_dir, tmp_dir), all_paths)
-    generator_utils.shuffle_dataset(all_paths)
-
 
 @registry.register_problem
 class GymSimulatedDiscreteProblemWithAgent(GymDiscreteProblemWithAgent):
@@ -454,16 +282,12 @@ class GymSimulatedDiscreteProblemWithAgent(GymDiscreteProblemWithAgent):
 
   def __init__(self, *args, **kwargs):
     super(GymSimulatedDiscreteProblemWithAgent, self).__init__(*args, **kwargs)
-    # TODO(lukaszkaiser): pull it outside
-    self.in_graph_wrappers = [(atari.TimeLimitWrapper, {"timelimit": 150}),
-                              (atari.MaxAndSkipWrapper, {"skip": 4})]
     self.simulated_environment = True
-    self.movies_fps = 2
+    self.debug_dump_frames_path = "/tmp/t2t_debug_dump_frames"
 
   def restore_networks(self, sess):
     super(GymSimulatedDiscreteProblemWithAgent, self).restore_networks(sess)
-
-    # TODO(lukaszkaiser): adjust regexp for different models
+    # TODO(blazej): adjust regexp for different models.
     env_model_loader = tf.train.Saver(tf.global_variables(".*basic_conv_gen.*"))
     sess = tf.get_default_session()
diff --git a/tensor2tensor/data_generators/image_utils.py b/tensor2tensor/data_generators/image_utils.py
index d65cfa4ba..06061a5ff 100644
--- a/tensor2tensor/data_generators/image_utils.py
+++ b/tensor2tensor/data_generators/image_utils.py
@@ -31,8 +31,6 @@
 
 import tensorflow as tf
 
-from tensorflow.python.eager import context
-
 
 def resize_by_area(img, size):
   """image resize function used by quite a few image problems."""
@@ -159,7 +157,7 @@ def generate_data(self, data_dir, tmp_dir, task_id=-1):
 
 def encode_images_as_png(images):
-  if context.in_eager_mode():
+  if tf.contrib.eager.in_eager_mode():
     for image in images:
       yield tf.image.encode_png(image).numpy()
   else:
diff --git a/tensor2tensor/data_generators/lm1b.py b/tensor2tensor/data_generators/lm1b.py
index 0fb21bff6..b0b2e719a 100644
--- a/tensor2tensor/data_generators/lm1b.py
+++ b/tensor2tensor/data_generators/lm1b.py
@@ -24,7 +24,7 @@
 
 # Dependency imports
 
-from six.moves import xrange  # pylint: disable=redefined-builtin
+from six.moves import range  # pylint: disable=redefined-builtin
 
 from tensor2tensor.data_generators import generator_utils
 from tensor2tensor.data_generators import problem
@@ -79,7 +79,7 @@ def _train_data_filenames(tmp_dir):
       os.path.join(tmp_dir,
                    "1-billion-word-language-modeling-benchmark-r13output",
                    "training-monolingual.tokenized.shuffled",
-                   "news.en-%05d-of-00100" % i) for i in xrange(1, 100)
+                   "news.en-%05d-of-00100" % i) for i in range(1, 100)
   ]
 
diff --git a/tensor2tensor/data_generators/ocr.py b/tensor2tensor/data_generators/ocr.py
index 924483d5a..074686459 100644
--- a/tensor2tensor/data_generators/ocr.py
+++ b/tensor2tensor/data_generators/ocr.py
@@ -69,7 +69,7 @@ def generator(self, data_dir, tmp_dir, is_training):
     num_examples = 2
     ocr_dir = os.path.join(tmp_dir, "ocr/")
     tf.logging.info("Looking for OCR data in %s." % ocr_dir)
-    for i in xrange(num_examples):
+    for i in range(num_examples):
       image_filepath = os.path.join(ocr_dir, "%d.png" % i)
       text_filepath = os.path.join(ocr_dir, "%d.txt" % i)
       with tf.gfile.Open(text_filepath, "rb") as f:
diff --git a/tensor2tensor/data_generators/problem.py b/tensor2tensor/data_generators/problem.py
index 43ef66a4d..80d44ee61 100644
--- a/tensor2tensor/data_generators/problem.py
+++ b/tensor2tensor/data_generators/problem.py
@@ -344,13 +344,9 @@ def _preprocess(example):
       return examples
 
     is_training = mode == tf.estimator.ModeKeys.TRAIN
-    if hasattr(tf.contrib.data, "parallel_interleave"):
-      dataset = dataset.apply(
-          tf.contrib.data.parallel_interleave(
-              _preprocess, sloppy=is_training, cycle_length=8))
-    else:
-      dataset = dataset.interleave(_preprocess, cycle_length=8,
-                                   block_length=16)
+    dataset = dataset.apply(
+        tf.contrib.data.parallel_interleave(
+            _preprocess, sloppy=is_training, cycle_length=8))
 
     return dataset
 
@@ -568,14 +564,9 @@ def _load_records_and_preprocess(filename):
       random.shuffle(data_files)
     dataset = tf.data.Dataset.from_tensor_slices(tf.constant(data_files))
-    if hasattr(tf.contrib.data, "parallel_interleave"):
-      dataset = dataset.apply(
-          tf.contrib.data.parallel_interleave(
-              _load_records_and_preprocess, sloppy=is_training, cycle_length=8))
-    else:
-      dataset = dataset.interleave(_load_records_and_preprocess, cycle_length=8,
-                                   block_length=16)
-
+    dataset = dataset.apply(
+        tf.contrib.data.parallel_interleave(
+            _load_records_and_preprocess, sloppy=is_training, cycle_length=8))
     dataset = dataset.map(
         self.maybe_reverse_and_copy, num_parallel_calls=num_threads)
 
@@ -1067,7 +1058,6 @@ def problem_hparams_to_features(problem_hparams):
     input_space_id = problem_hparams.input_space_id
     target_space_id = problem_hparams.target_space_id
   return {
-      "problem_choice": 0,
       "input_space_id": input_space_id,
       "target_space_id": target_space_id,
   }
diff --git a/tensor2tensor/data_generators/text_encoder.py b/tensor2tensor/data_generators/text_encoder.py
index 5398c3930..cb2a43978 100644
--- a/tensor2tensor/data_generators/text_encoder.py
+++ b/tensor2tensor/data_generators/text_encoder.py
@@ -34,7 +34,7 @@
 
 import numpy as np
 import six
-from six.moves import xrange  # pylint: disable=redefined-builtin
+from six.moves import range  # pylint: disable=redefined-builtin
 from tensor2tensor.data_generators import tokenizer
 
 import tensorflow as tf
@@ -60,26 +60,37 @@
 _ESCAPE_CHARS = set(u"\\_u;0123456789")
 
 
-# Conversion between Unicode and UTF-8, if required (on Python2).
-if six.PY2:
-
-  def native_to_unicode(s):
-    return s if isinstance(s, unicode) else s.decode("utf-8")
+# Unicode utility functions that work with Python 2 and 3
+def native_to_unicode(s):
+  return s if is_unicode(s) else to_unicode(s)
 
-  def unicode_to_native(s):
-    return s.encode("utf-8") if isinstance(s, unicode) else s
-else:  # No conversion required on Python >= 3.
- def native_to_unicode(s): +def unicode_to_native(s): + if six.PY2: + return s.encode("utf-8") if is_unicode(s) else s + else: return s - def unicode_to_native(s): + +def is_unicode(s): + if six.PY2: + if isinstance(s, unicode): + return True + else: + if isinstance(s, str): + return True + return False + + +def to_unicode(s, ignore_errors=False): + if is_unicode(s): return s + error_mode = "ignore" if ignore_errors else "strict" + return s.decode("utf-8", errors=error_mode) def to_unicode_ignore_errors(s): - return (unicode(s, "utf-8", errors="ignore") - if six.PY2 else s.decode("utf-8", "ignore")) + return to_unicode(s, ignore_errors=True) class TextEncoder(object): @@ -374,7 +385,7 @@ def store_to_file(self, filename): filename: Full path of the file to store the vocab to. """ with tf.gfile.Open(filename, "w") as f: - for i in xrange(len(self._id_to_token)): + for i in range(len(self._id_to_token)): f.write(self._id_to_token[i] + "\n") @@ -588,7 +599,7 @@ def _escaped_token_to_subtoken_strings(self, escaped_token): start = 0 token_len = len(escaped_token) while start < token_len: - for end in xrange( + for end in range( min(token_len, start + self._max_subtoken_len), start, -1): subtoken = escaped_token[start:end] if subtoken in self._subtoken_string_to_id: @@ -774,7 +785,7 @@ def build_from_token_counts(self, # with high enough counts for our new vocabulary. if min_count < 1: min_count = 1 - for i in xrange(num_iterations): + for i in range(num_iterations): tf.logging.info("Iteration {0}".format(i)) # Collect all substrings of the encoded token that break along current @@ -789,7 +800,7 @@ def build_from_token_counts(self, if max_subtoken_length is not None: last_position = min(last_position, start + max_subtoken_length) - for end in xrange(start + 1, last_position): + for end in range(start + 1, last_position): new_subtoken = escaped_token[start:end] subtoken_counts[new_subtoken] += count start += len(subtoken) @@ -806,7 +817,7 @@ def build_from_token_counts(self, # Consider the candidates longest to shortest, so that if we accept # a longer subtoken string, we can decrement the counts of its prefixes. new_subtoken_strings = [] - for lsub in xrange(len(len_to_subtoken_strings) - 1, 0, -1): + for lsub in range(len(len_to_subtoken_strings) - 1, 0, -1): subtoken_strings = len_to_subtoken_strings[lsub] for subtoken_string in subtoken_strings: count = subtoken_counts[subtoken_string] @@ -815,7 +826,7 @@ def build_from_token_counts(self, # explicitly, regardless of count. if subtoken_string not in self._alphabet: new_subtoken_strings.append((count, subtoken_string)) - for l in xrange(1, lsub): + for l in range(1, lsub): subtoken_counts[subtoken_string[:l]] -= count # Include the alphabet explicitly to guarantee all strings are encodable. 
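A minimal sketch of the new Unicode helpers' intended behavior (Python 3 semantics shown; assumes the module imports as tensor2tensor.data_generators.text_encoder, per the paths above):

    from tensor2tensor.data_generators import text_encoder

    # is_unicode distinguishes text from bytes on both Python versions.
    assert text_encoder.is_unicode(u"caf\u00e9")
    assert not text_encoder.is_unicode(u"caf\u00e9".encode("utf-8"))
    # native_to_unicode now decodes UTF-8 bytes on Python 3 as well, instead
    # of returning them unchanged as the old Python-3-only branch did.
    assert text_encoder.native_to_unicode(
        u"caf\u00e9".encode("utf-8")) == u"caf\u00e9"
    # Invalid UTF-8 raises in the default strict mode but is dropped here:
    assert text_encoder.to_unicode_ignore_errors(b"caf\xff") == u"caf"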
diff --git a/tensor2tensor/data_generators/text_encoder_test.py b/tensor2tensor/data_generators/text_encoder_test.py index e11607008..b3248a7c4 100644 --- a/tensor2tensor/data_generators/text_encoder_test.py +++ b/tensor2tensor/data_generators/text_encoder_test.py @@ -30,7 +30,7 @@ # Dependency imports import mock import six -from six.moves import xrange # pylint: disable=redefined-builtin +from six.moves import range # pylint: disable=redefined-builtin from tensor2tensor.data_generators import text_encoder import tensorflow as tf @@ -193,7 +193,7 @@ def test_long_tokens(self): long_tokens = [] for _ in range(num_tokens): long_token = "".join([random.choice(string.ascii_uppercase) - for _ in xrange(token_length)]) + for _ in range(token_length)]) long_tokens.append(long_token) corpus = " ".join(long_tokens) diff --git a/tensor2tensor/data_generators/tokenizer.py b/tensor2tensor/data_generators/tokenizer.py index c023627c2..b6c0e3236 100644 --- a/tensor2tensor/data_generators/tokenizer.py +++ b/tensor2tensor/data_generators/tokenizer.py @@ -51,7 +51,7 @@ # Dependency imports import six -from six.moves import xrange # pylint: disable=redefined-builtin +from six.moves import range # pylint: disable=redefined-builtin import tensorflow as tf # Conversion between Unicode and UTF-8, if required (on Python2) @@ -60,7 +60,7 @@ # This set contains all letter and number characters. _ALPHANUMERIC_CHAR_SET = set( - six.unichr(i) for i in xrange(sys.maxunicode) + six.unichr(i) for i in range(sys.maxunicode) if (unicodedata.category(six.unichr(i)).startswith("L") or unicodedata.category(six.unichr(i)).startswith("N"))) @@ -79,7 +79,7 @@ def encode(text): token_start = 0 # Classify each character in the input string is_alnum = [c in _ALPHANUMERIC_CHAR_SET for c in text] - for pos in xrange(1, len(text)): + for pos in range(1, len(text)): if is_alnum[pos] != is_alnum[pos - 1]: token = text[token_start:pos] if token != u" " or token_start == 0: diff --git a/tensor2tensor/data_generators/tokenizer_test.py b/tensor2tensor/data_generators/tokenizer_test.py index ac4cd0cca..e977d1126 100644 --- a/tensor2tensor/data_generators/tokenizer_test.py +++ b/tensor2tensor/data_generators/tokenizer_test.py @@ -26,7 +26,7 @@ # Dependency imports import six -from six.moves import xrange # pylint: disable=redefined-builtin +from six.moves import range # pylint: disable=redefined-builtin from tensor2tensor.data_generators import tokenizer import tensorflow as tf @@ -57,8 +57,8 @@ def test_decode(self): [u"Dude", u" - ", u"that", u"'", u"s", u"so", u"cool", u"."])) def test_invertibility_on_random_strings(self): - for _ in xrange(1000): - s = u"".join(six.unichr(random.randint(0, 65535)) for _ in xrange(10)) + for _ in range(1000): + s = u"".join(six.unichr(random.randint(0, 65535)) for _ in range(10)) self.assertEqual(s, tokenizer.decode(tokenizer.encode(s))) diff --git a/tensor2tensor/data_generators/video_utils.py b/tensor2tensor/data_generators/video_utils.py index 723c192f5..869fad721 100644 --- a/tensor2tensor/data_generators/video_utils.py +++ b/tensor2tensor/data_generators/video_utils.py @@ -19,8 +19,12 @@ from __future__ import division from __future__ import print_function +import os + # Dependency imports +import six + from tensor2tensor.data_generators import generator_utils from tensor2tensor.data_generators import image_utils from tensor2tensor.data_generators import problem @@ -43,6 +47,12 @@ def resize_video_frames(images, size): class VideoProblem(problem.Problem): """Base class for problems with videos.""" 
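+  # Subclasses may set debug_dump_frames_path (added below) to a directory;
+  # generate_encoded_samples_debug will then also write each encoded frame
+  # there as frame_<N>.png. Left empty, no debug frames are written.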
+ def __init__(self, *args, **kwargs): + super(VideoProblem, self).__init__(*args, **kwargs) + # Path to a directory to dump generated frames as png for debugging. + # If empty, no debug frames will be generated. + self.debug_dump_frames_path = "" + @property def num_channels(self): """Number of color channels in each frame.""" @@ -157,7 +167,7 @@ def features_from_batch(batched_prefeatures): Features dictionary with joint features per-frame. """ features = {} - for k, v in batched_prefeatures.items(): + for k, v in six.iteritems(batched_prefeatures): if k == "frame": # We rename past frames to inputs and targets. s1, s2 = split_on_batch(v) # Reshape just to make sure shapes are right and set. @@ -242,13 +252,27 @@ def generate_encoded_samples(self, data_dir, tmp_dir, dataset_split): if width != self.frame_width: raise ValueError("Generated frame has width %d while the class " "assumes width %d." % (width, self.frame_width)) - encoded_frame = image_utils.encode_images_as_png([unencoded_frame]).__next__() + encoded_frame = six.next( + image_utils.encode_images_as_png([unencoded_frame])) features["image/encoded"] = [encoded_frame] features["image/format"] = ["png"] features["image/height"] = [height] features["image/width"] = [width] yield features + def generate_encoded_samples_debug(self, data_dir, tmp_dir, dataset_split): + """Generate samples of the encoded frames and dump for debug if needed.""" + counter = 0 + for sample in self.generate_encoded_samples( + data_dir, tmp_dir, dataset_split): + if self.debug_dump_frames_path: + path = os.path.join(self.debug_dump_frames_path, + "frame_%d.png" % counter) + with tf.gfile.Open(path, "wb") as f: + f.write(sample["image/encoded"][0]) + counter += 1 + yield sample + def generate_data(self, data_dir, tmp_dir, task_id=-1): """The function generating the data.""" filepath_fns = { @@ -268,10 +292,11 @@ def generate_data(self, data_dir, tmp_dir, task_id=-1): if self.is_generate_per_split: for split, paths in split_paths: generator_utils.generate_files( - self.generate_encoded_samples(data_dir, tmp_dir, split), paths) + self.generate_encoded_samples_debug( + data_dir, tmp_dir, split), paths) else: generator_utils.generate_files( - self.generate_encoded_samples( + self.generate_encoded_samples_debug( data_dir, tmp_dir, problem.DatasetSplit.TRAIN), all_paths) diff --git a/tensor2tensor/insights/README.md b/tensor2tensor/insights/README.md index ebed255e1..014bfca81 100644 --- a/tensor2tensor/insights/README.md +++ b/tensor2tensor/insights/README.md @@ -41,7 +41,7 @@ Start guide, a sample configuration would be: "data_dir": "/tmp/t2t/data", "hparams": "", "hparams_set": "transformer_base_single_gpu", - "problems": "translate_ende_wmt32k" + "problem": "translate_ende_wmt32k" }, }] "language": [{ diff --git a/tensor2tensor/insights/transformer_model.py b/tensor2tensor/insights/transformer_model.py index d7ac83a0a..da8cf5fe3 100644 --- a/tensor2tensor/insights/transformer_model.py +++ b/tensor2tensor/insights/transformer_model.py @@ -115,7 +115,7 @@ def __init__(self, processor_configuration): transformer_config["hparams_set"], transformer_config["hparams"], data_dir=data_dir, - problem_name=transformer_config["problems"]) + problem_name=transformer_config["problem"]) decode_hp = decoding.decode_hparams() decode_hp.add_hparam("shards", 1) @@ -129,8 +129,8 @@ def __init__(self, processor_configuration): decode_hparams=decode_hp, use_tpu=False) # Fetch the vocabulary and other helpful variables for decoding. 
- self.source_vocab = self.hparams.problems[0].vocabulary["inputs"] - self.targets_vocab = self.hparams.problems[0].vocabulary["targets"] + self.source_vocab = self.hparams.problem_hparams.vocabulary["inputs"] + self.targets_vocab = self.hparams.problem_hparams.vocabulary["targets"] self.const_array_size = 10000 # Prepare the Transformer's debug data directory. @@ -166,7 +166,6 @@ def server_input_fn(): x += [0] * (self.const_array_size - len(x)) d = { "inputs": np.array(x).astype(np.int32), - "problem_choice": np.array(0).astype(np.int32) } yield d diff --git a/tensor2tensor/layers/common_attention.py b/tensor2tensor/layers/common_attention.py index a6b4f919d..46befcb8d 100644 --- a/tensor2tensor/layers/common_attention.py +++ b/tensor2tensor/layers/common_attention.py @@ -27,7 +27,7 @@ import numpy as np from six.moves import range # pylint: disable=redefined-builtin -from six.moves import xrange # pylint: disable=redefined-builtin +from six.moves import range # pylint: disable=redefined-builtin from six.moves import zip # pylint: disable=redefined-builtin from tensor2tensor.layers import common_layers @@ -540,7 +540,7 @@ def add_timing_signal_nd(x, min_timescale=1.0, max_timescale=1.0e4): (tf.to_float(num_timescales) - 1)) inv_timescales = min_timescale * tf.exp( tf.to_float(tf.range(num_timescales)) * -log_timescale_increment) - for dim in xrange(num_dims): + for dim in range(num_dims): length = common_layers.shape_list(x)[dim + 1] position = tf.to_float(tf.range(length)) scaled_time = tf.expand_dims(position, 1) * tf.expand_dims( @@ -549,9 +549,9 @@ def add_timing_signal_nd(x, min_timescale=1.0, max_timescale=1.0e4): prepad = dim * 2 * num_timescales postpad = channels - (dim + 1) * 2 * num_timescales signal = tf.pad(signal, [[0, 0], [prepad, postpad]]) - for _ in xrange(1 + dim): + for _ in range(1 + dim): signal = tf.expand_dims(signal, 0) - for _ in xrange(num_dims - 1 - dim): + for _ in range(num_dims - 1 - dim): signal = tf.expand_dims(signal, -2) x += signal return x @@ -579,7 +579,7 @@ def add_positional_embedding_nd(x, max_length, name): base_shape = [1] * (num_dims + 1) + [depth] base_start = [0] * (num_dims + 2) base_size = [-1] + [1] * num_dims + [depth] - for i in xrange(num_dims): + for i in range(num_dims): shape = base_shape[:] start = base_start[:] size = base_size[:] @@ -3710,7 +3710,7 @@ def forward_internal(x, wqkv, wo, attention_bias, norm_scale, norm_bias): wqkv_split = tf.unstack(wqkv, num=num_heads) wo_split = tf.unstack(wo, num=num_heads) y = 0 - for h in xrange(num_heads): + for h in range(num_heads): with tf.control_dependencies([y] if h > 0 else []): combined = tf.nn.conv1d(n, wqkv_split[h], 1, "SAME") q, k, v = tf.split(combined, 3, axis=2) @@ -3737,7 +3737,7 @@ def grad_fn(x, wqkv, wo, attention_bias, norm_scale, norm_bias, dy): dwqkvs = [] dwos = [] dn = 0 - for h in xrange(num_heads): + for h in range(num_heads): with tf.control_dependencies(deps): combined = tf.nn.conv1d(n, wqkv_split[h], 1, "SAME") q, k, v = tf.split(combined, 3, axis=2) diff --git a/tensor2tensor/layers/common_hparams.py b/tensor2tensor/layers/common_hparams.py index fffe674f7..230579888 100644 --- a/tensor2tensor/layers/common_hparams.py +++ b/tensor2tensor/layers/common_hparams.py @@ -91,7 +91,6 @@ def basic_params1(): learning_rate=0.1, sampling_method="argmax", # "argmax" or "random" sampling_temp=1.0, # temperature for sampling - problem_choice="adaptive", # "uniform", "adaptive", "distributed" # expand the logits a piece at a time - saves memory. 
factored_logits=False, multiply_embedding_mode="sqrt_depth", @@ -229,12 +228,19 @@ def basic_params1(): force_full_predict=False, # Set this for pure model parallelism. There is only one data shard. no_data_parallelism=False, - # Set this to the dtype used for activation. Variables will still be - # stored in float32. + # dtype used for activations. - "float32" or "bfloat16" + # activation_dtype="bfloat16" currently only works on TPU. + # It lowers activation-memory usage + # and does not appear to affect quality. + # You can train on TPU with activation_dtype="bfloat16" and evaluate + # on CPU/GPU with activation_dtype="float32" activation_dtype="float32", - # Experimental: set weight_dtype="bfloat16" to use bfloat16 for both - # weights and activations. Model quality may be worse. Model quality - # appears to be close to baseline with large batch sizes (>4k). + # dtype used for parameters: "float32" or "bfloat16" + # bfloat16 currently only works with optimizer="adafactor". + # The savings in memory allow for training larger models. + # Weights are encoded as (w*128)^8, using pseudostochastic + # roundoff. Initial experiments show that model quality is similar + # to baseline for about 3M training steps, but worse thereafter. weight_dtype="float32", ) diff --git a/tensor2tensor/layers/common_image_attention.py b/tensor2tensor/layers/common_image_attention.py index f80b04e49..f60fa3711 100644 --- a/tensor2tensor/layers/common_image_attention.py +++ b/tensor2tensor/layers/common_image_attention.py @@ -16,7 +16,7 @@ """Utils for attention mechanism for images.""" # Dependency imports -from six.moves import xrange # pylint: disable=redefined-builtin +from six.moves import range # pylint: disable=redefined-builtin from tensor2tensor.layers import common_attention from tensor2tensor.layers import common_layers @@ -26,6 +26,7 @@ class AttentionType(object): + """Types of attention type used in cia.""" LOCAL_1D = "local_1d" LOCAL_2D = "local_2d" GLOBAL = "global" @@ -33,6 +34,7 @@ class AttentionType(object): DILATED = "dilated" MOE_LOCAL_1D = "moe_local1d" LOCAL_BLOCK = "local_block" + NON_CAUSAL_1D = "local_1d_noncausal" @staticmethod def get_choices(): @@ -44,6 +46,7 @@ def get_choices(): AttentionType.LOCAL_2D, AttentionType.LOCAL_BLOCK, AttentionType.DILATED, + AttentionType.NON_CAUSAL_1D, ] @@ -288,7 +291,7 @@ def transformer_decoder_layers(inputs, x = tf.nn.dropout(x, 1.0 - hparams.layer_prepostprocess_dropout) if attention_type == AttentionType.DILATED: assert len(hparams.gap_sizes) == num_layers - for layer in xrange(num_layers): + for layer in range(num_layers): with tf.variable_scope("%s_layer_%d" % (name, layer)): # self-attention + skip connections if attention_type == AttentionType.LOCAL_2D: @@ -300,6 +303,11 @@ def transformer_decoder_layers(inputs, hparams, attention_type="local_mask_right", q_padding="LEFT", kv_padding="LEFT") + elif attention_type == AttentionType.NON_CAUSAL_1D: + y = local_attention_1d(common_layers.layer_preprocess(x, hparams), + hparams, + attention_type="local_unmasked", + q_padding="VALID", kv_padding="VALID") elif attention_type == AttentionType.LOCAL_BLOCK: y = local_within_block_attention( common_layers.layer_preprocess(x, hparams), @@ -345,7 +353,7 @@ def transformer_encoder_layers(inputs, x = inputs x = tf.nn.dropout(x, 1.0 - hparams.layer_prepostprocess_dropout) - for layer in xrange(num_layers): + for layer in range(num_layers): # attention layers + skip connections with tf.variable_scope("%s_layer_%d" % (name, layer)): if attention_type == 
AttentionType.LOCAL_2D: @@ -433,7 +441,7 @@ def transformer_layers_sharded(dp, expert_fn = expert_utils.ffn_expert_fn( hparams.hidden_size, moe_hidden_sizes, hparams.hidden_size) x = dp(tf.nn.dropout, x, 1.0 - hparams.layer_prepostprocess_dropout) - for layer in xrange(num_layers): + for layer in range(num_layers): with tf.variable_scope("%s_layer_%d" % (name, layer)): # self-attention if attention_type == AttentionType.LOCAL_2D: @@ -632,7 +640,7 @@ def get_channel_embeddings(io_depth, targets, hidden_size, name="channel"): rgb_embedding_var = tf.identity(rgb_embedding_var) rgb_embedding_var *= float(hidden_size)**0.5 channel_target_embs = [] - for i in xrange(io_depth): + for i in range(io_depth): # Adding the channel offsets to get the right embedding since the # embedding tensor has shape 256 * io_depth, hidden_size target_ids = tf.squeeze(targets_split[i], axis=3) + i * 256 diff --git a/tensor2tensor/layers/common_layers.py b/tensor2tensor/layers/common_layers.py index 3c1155643..ca5f3efc8 100644 --- a/tensor2tensor/layers/common_layers.py +++ b/tensor2tensor/layers/common_layers.py @@ -27,12 +27,11 @@ # Dependency imports import numpy as np -from six.moves import xrange # pylint: disable=redefined-builtin +from six.moves import range # pylint: disable=redefined-builtin from tensor2tensor.utils import expert_utils as eu import tensorflow as tf -from tensorflow.python.eager import context as tfe_context from tensorflow.python.framework import function from tensorflow.python.framework import ops @@ -42,7 +41,7 @@ def is_on_tpu(): - # Support TF versions 1.4+ + # Support TF versions 1.5+ try: from tensorflow.python.ops import control_flow_util # pylint: disable=g-import-not-at-top ctxt = tf.get_default_graph()._get_control_flow_context() # pylint: disable=protected-access @@ -51,54 +50,6 @@ def is_on_tpu(): return tf.contrib.framework.get_name_scope().startswith("TPUReplicate") -def bfloat16_var_getter(getter, *args, **kwargs): - """A custom getter function for bfloat16 variables. - - Variables maintain storage in float32. - - Args: - getter: custom getter - *args: arguments - **kwargs: keyword arguments - Returns: - variables with the correct dtype. - Raises: - KeyError: if "dtype" is not provided as a kwarg. - """ - requested_dtype = kwargs["dtype"] - if requested_dtype == tf.bfloat16: - kwargs["dtype"] = tf.float32 - var = getter(*args, **kwargs) - # This if statement is needed to guard the cast, because batch norm - # assigns directly to the return value of this custom getter. The cast - # makes the return value not a variable so it cannot be assigned. Batch - # norm variables are always in fp32 so this if statement is never - # triggered for them. - if var.dtype.base_dtype != requested_dtype: - var = tf.cast(var, requested_dtype) - return var - - -def bfloat16_weights_var_getter(getter, *args, **kwargs): - """A custom getter function for bfloat16 variables. - - Variables maintain storage in bfloat16. - - Args: - getter: A custom getter. - *args: Arguments. - **kwargs: Keyword arguments. - Returns: - Variables with the correct dtype. - Raises: - KeyError: if "dtype" is not provided as a kwarg. - """ - requested_dtype = kwargs["dtype"] - if requested_dtype in (tf.bfloat16, tf.float32): - kwargs["dtype"] = tf.bfloat16 - return getter(*args, **kwargs) - - def dropout_with_broadcast_dims(x, keep_prob, broadcast_dims=None, **kwargs): """Like tf.nn.dropout but takes broadcast_dims instead of noise_shape. 
@@ -120,8 +71,10 @@ def dropout_with_broadcast_dims(x, keep_prob, broadcast_dims=None, **kwargs): if broadcast_dims: shape = tf.shape(x) ndims = len(x.get_shape()) + # Allow dimensions like "-1" as well. + broadcast_dims = [dim + ndims if dim < 0 else dim for dim in broadcast_dims] kwargs["noise_shape"] = [ - 1 if i in broadcast_dims else shape[i] for i in xrange(ndims)] + 1 if i in broadcast_dims else shape[i] for i in range(ndims)] return tf.nn.dropout(x, keep_prob, **kwargs) @@ -311,7 +264,7 @@ def embedding(x, # On the backwards pass, we want to convert the gradient from # an indexed-slices to a regular tensor before sending it back to the # parameter server. This avoids excess computation on the parameter server. - if not tfe_context.in_eager_mode(): + if not tf.contrib.eager.in_eager_mode(): embedding_var = eu.convert_gradient_to_tensor(embedding_var) x = dropout_no_scaling(x, 1.0 - symbol_dropout_rate) emb_x = gather(embedding_var, x, dtype) @@ -377,7 +330,7 @@ def conv_stride2_multistep(x, nbr_steps, output_filters, name=None, reuse=None): out = conv(x, output_filters, (1, 1)) return out, [out] hidden_layers = [x] - for i in xrange(nbr_steps): + for i in range(nbr_steps): hidden_layers.append( conv( hidden_layers[-1], @@ -433,7 +386,7 @@ def deconv2d(cur, i): return tf.depth_to_space(thicker, 2) cur = x - for i in xrange(nbr_steps): + for i in range(nbr_steps): if cur.get_shape()[2] == 1: cur = deconv1d(cur, i) else: @@ -489,7 +442,7 @@ def conv2d_kernel(kernel_size_arg, name_suffix): return conv2d_kernel(kernel_size, "single") -def conv(inputs, filters, kernel_size, dilation_rate=1, **kwargs): +def conv(inputs, filters, kernel_size, dilation_rate=(1, 1), **kwargs): return conv_internal( tf.layers.conv2d, inputs, @@ -575,7 +528,7 @@ def tpu_conv1d(inputs, filters, kernel_size, padding="SAME", name="tpu_conv1d"): last_offset = first_offset + kernel_size - 1 results = [] padded = tf.pad(inputs, [[0, 0], [-first_offset, last_offset], [0, 0]]) - for i in xrange(kernel_size): + for i in range(kernel_size): shifted = tf.slice(padded, [0, i, 0], tf.shape(inputs)) if i else inputs shifted.set_shape(inputs.get_shape()) results.append(dense( @@ -1140,7 +1093,7 @@ def multiscale_conv_and_attention(x, padding, hparams, source=None): x, hparams.hidden_size, [((hparams.kernel_height**i, hparams.kernel_width**i), - (hparams.kernel_height, hparams.kernel_width)) for i in xrange(3)], + (hparams.kernel_height, hparams.kernel_width)) for i in range(3)], "AVG", padding=padding) # For residuals a rescale if necessary if channels differ. @@ -1955,7 +1908,7 @@ def next_state(cur_state, args_tup): cur_x_times_one_minus_f, cur_f = args_tup return cur_f * cur_state + cur_x_times_one_minus_f # Calculate SRU on each layer. - for i in xrange(num_layers): + for i in range(num_layers): # The parallel part of the SRU. x_orig = x x, f, r = tf.split(tf.layers.dense(x, 3 * x_shape[-1], @@ -2158,7 +2111,7 @@ def approximate_split(x, num_splits, axis=0): a list of num_splits Tensors. 
""" size = shape_list(x)[axis] - size_splits = [tf.div(size + i, num_splits) for i in xrange(num_splits)] + size_splits = [tf.div(size + i, num_splits) for i in range(num_splits)] return tf.split(x, size_splits, axis=axis) @@ -2225,7 +2178,7 @@ def smoothing_cross_entropy_factored_grad(op, dy): b_grad = None a_grad_parts = [] deps = [] - for part in xrange(num_splits): + for part in range(num_splits): with tf.control_dependencies(deps): logits = tf.matmul(a[part], b, transpose_b=True) output_part = smoothing_cross_entropy(logits, labels[part], vocab_size, @@ -2266,7 +2219,7 @@ def smoothing_cross_entropy_factored(a, b, labels, confidence): labels = approximate_split(labels, num_splits) a = approximate_split(a, num_splits) parts = [] - for part in xrange(num_splits): + for part in range(num_splits): with tf.control_dependencies(parts[-1:]): logits = tf.matmul(a[part], b, transpose_b=True) parts.append( @@ -2442,7 +2395,7 @@ def forward_internal(x, f1, f2, scale, bias): x_flat = tf.reshape(x, [-1, 1, shape_list(x)[2]]) xs = approximate_split(x_flat, num_splits) ys = [] - for i in xrange(num_splits): + for i in range(num_splits): with tf.control_dependencies(ys[-1:]): n = layer_norm_compute_python(xs[i], epsilon, scale, bias) y = tf.nn.conv1d(n, f1, 1, "SAME") @@ -2476,7 +2429,7 @@ def grad_fn(x, f1, f2, scale, bias, dy): dscale = 0 dbias = 0 deps = [] - for i in xrange(num_splits): + for i in range(num_splits): with tf.control_dependencies(deps): n = layer_norm_compute_python(xs[i], epsilon, scale, bias) y = tf.nn.conv1d(n, f1, 1, "SAME") @@ -2530,7 +2483,7 @@ def shape_list(x): shape = tf.shape(x) ret = [] - for i in xrange(len(static)): + for i in range(len(static)): dim = static[i] if dim is None: dim = shape[i] @@ -2587,7 +2540,7 @@ def ones_matrix_band_part(rows, cols, num_lower, num_upper, out_shape=None): def reshape_like_all_dims(a, b): """Reshapes a to match the shape of b.""" ret = tf.reshape(a, tf.shape(b)) - if not tfe_context.in_eager_mode(): + if not tf.contrib.eager.in_eager_mode(): ret.set_shape(b.get_shape()) return ret @@ -2637,7 +2590,7 @@ def expand_by_device(original_parallelism, device_parallelism, data): """ device_to_datum = { device_parallelism.devices[i]: data[i] - for i in xrange(device_parallelism.n)} + for i in range(device_parallelism.n)} return [device_to_datum[d] for d in original_parallelism.devices] @@ -2684,7 +2637,7 @@ def _step(source_replica, target_replica, x_split, op="plus_eq"): x_split: a list of lists of tensors op: a string """ - for shard in xrange(parallelism.n): + for shard in range(parallelism.n): source_device = (shard + source_replica) % parallelism.n target_device = (shard + target_replica) % parallelism.n source = x_split[source_device][shard] @@ -2702,10 +2655,10 @@ def _step(source_replica, target_replica, x_split, op="plus_eq"): # accumulate everything towards the center. for i in range(center, parallelism.n - 1)[::-1]: _step(i + 1, i, x_split, op="plus_eq") - for i in xrange(center): + for i in range(center): _step(i, i + 1, x_split, op="plus_eq") # copy everything away from the center. 
- for i in xrange(center, parallelism.n - 1): + for i in range(center, parallelism.n - 1): _step(i, i + 1, x_split, op="copy") for i in range(center)[::-1]: _step(i + 1, i, x_split, op="copy") @@ -2774,11 +2727,7 @@ def grad_fn(inputs, variables, outputs, output_grads): @fn_with_custom_grad(grad_fn) def fn_with_recompute(*args): cached_vs.append(tf.get_variable_scope()) - # TODO(rsepassi): Rm conditional in TF 1.5 - if hasattr(tf.contrib.framework, "current_arg_scope"): - cached_arg_scope.append(tf.contrib.framework.current_arg_scope()) - else: - cached_arg_scope.append({}) + cached_arg_scope.append(tf.contrib.framework.current_arg_scope()) return fn(*args) return fn_with_recompute(*args) diff --git a/tensor2tensor/layers/discretization.py b/tensor2tensor/layers/discretization.py index 78577db89..f0fc57391 100644 --- a/tensor2tensor/layers/discretization.py +++ b/tensor2tensor/layers/discretization.py @@ -67,9 +67,7 @@ def nearest_neighbor(x, block_v_size, random_top_k=1, soft_em=False, - soft_em_startup_steps=10000, - inv_temp=1.0, - ema_count=None): + num_samples=1): """Find the nearest element in means to elements in x. Args: @@ -79,11 +77,7 @@ def nearest_neighbor(x, block_v_size: Number of table entries per block. random_top_k: Noisy top-k if this is bigger than 1 (Default: 1). soft_em: If True then use soft EM rather than hard EM (Default: False). - soft_em_startup_steps: Number of steps before soft_em activates - (Default: 10000). - inv_temp: Inverse temperature for soft EM (Default: 1.) - ema_count: Table of counts for each embedding corresponding to how many - examples in a batch it was the closest to (Default: None). + num_samples: Number of samples to take in soft EM (Default: 1). Returns: Tensor with nearest element in mean encoded in one-hot notation. @@ -98,15 +92,15 @@ def nearest_neighbor(x, # computing cluster probabilities if soft_em: - ema_count = tf.expand_dims(ema_count, 0) - c_probs = ema_count / tf.reduce_sum(ema_count, 2, keepdims=True) - c_probs = tf.where( - tf.less(tf.to_int32(tf.train.get_global_step()), soft_em_startup_steps), - tf.ones_like(c_probs, dtype=tf.float32), c_probs) - mask = common_layers.inverse_lin_decay(2 * soft_em_startup_steps) - c_probs = mask * c_probs + (1 - mask) * tf.ones_like(c_probs) - nearest_hot = tf.exp(-inv_temp * dist) * c_probs - nearest_hot /= tf.reduce_sum(nearest_hot, 2, keepdims=True) + num_blocks = common_layers.shape_list(dist)[1] + nearest_idx = tf.stack( + [ + tf.multinomial(-dist[:, i, :], num_samples=num_samples) + for i in range(num_blocks) + ], + axis=1) + nearest_hot = tf.one_hot(nearest_idx, depth=block_v_size) + nearest_hot = tf.reduce_mean(nearest_hot, axis=-2) else: if random_top_k > 1: _, top_k_idx = tf.nn.top_k(-dist, k=random_top_k) @@ -127,9 +121,7 @@ def embedding_lookup(x, block_v_size, random_top_k=1, soft_em=False, - soft_em_startup_steps=10000, - inv_temp=1.0, - ema_count=None): + num_samples=1): """Compute nearest neighbors and loss for training the embeddings via DVQ. Args: @@ -140,11 +132,7 @@ def embedding_lookup(x, block_v_size: Number of table entries per block. random_top_k: Noisy top-k if this is bigger than 1 (Default: 1). soft_em: If True then use soft EM rather than hard EM (Default: False). - soft_em_startup_steps: Number of steps before soft_em activates - (Default: 10000). - inv_temp: Inverse temperature for soft EM (Default: 1.) - ema_count: Table of counts for each embedding corresponding to how many - examples in a batch it was the closest to (Default: None). 
+ num_samples: Number of samples to use for soft EM (Default: 1). Returns: The nearest neighbor in one hot form, the nearest neighbor itself, the @@ -156,14 +144,9 @@ def embedding_lookup(x, block_v_size, random_top_k, soft_em=soft_em, - soft_em_startup_steps=soft_em_startup_steps, - inv_temp=inv_temp, - ema_count=ema_count) + num_samples=num_samples) x_means_hot_flat = tf.reshape(x_means_hot, [-1, num_blocks, block_v_size]) - x_means_idx = tf.argmax(x_means_hot_flat, axis=-1) - x_means = tf.matmul( - tf.transpose(tf.one_hot(x_means_idx, block_v_size), perm=[1, 0, 2]), - means) + x_means = tf.matmul(tf.transpose(x_means_hot_flat, perm=[1, 0, 2]), means) x_means = tf.transpose(x_means, [1, 0, 2]) q_loss = tf.reduce_mean(tf.square((tf.stop_gradient(x) - x_means))) e_loss = tf.reduce_mean(tf.square(x - tf.stop_gradient(x_means))) @@ -428,8 +411,7 @@ def discrete_bottleneck(x, discrete_mix=0.5, random_top_k=1, soft_em=False, - soft_em_startup_steps=10000, - inv_temp=1.0, + num_samples=1, epsilon=1e-5, softmax_k=0, kl_warmup_steps=150000, @@ -467,9 +449,7 @@ def discrete_bottleneck(x, (Default: 0.5). random_top_k: Noisy top-k for DVQ (Default: 1). soft_em: If True then use soft EM rather than hard EM (Default: False). - soft_em_startup_steps: Number of steps before soft_em activates - (Default: 10000). - inv_temp: Inverse temperature for soft EM (Default: 1.) + num_samples: Number of samples for soft EM (Default: 1). epsilon: Epsilon parameter for DVQ (Default: 1e-5). softmax_k: If > 1 then do top-k softmax (Default: 0). kl_warmup_steps: Number of steps for kl warmup (Default: 150000). @@ -488,6 +468,7 @@ def discrete_bottleneck(x, ValueError: If projection_tensors is None for reshape_method project, or ema_count or ema_means is None if we are using ema, or unknown args. """ + tf.logging.info("Shape of x = {}".format(common_layers.shape_list(x))) block_v_size = None if bottleneck_kind == "dvq": # Define the dvq parameters @@ -577,7 +558,7 @@ def discrete_bottleneck(x, for i in range(num_residuals): x_means_hot_res, x_means_res, q_loss_res, e_loss_res = embedding_lookup( x_res, means[i], num_blocks, block_v_size, random_top_k, soft_em, - soft_em_startup_steps, inv_temp, ema_count[i]) + num_samples) # Update the ema variables if ema: diff --git a/tensor2tensor/layers/modalities.py b/tensor2tensor/layers/modalities.py index 8bac3bd30..5dba2c6f9 100644 --- a/tensor2tensor/layers/modalities.py +++ b/tensor2tensor/layers/modalities.py @@ -20,7 +20,7 @@ # Dependency imports -from six.moves import xrange # pylint: disable=redefined-builtin +from six.moves import range # pylint: disable=redefined-builtin from tensor2tensor.layers import common_layers from tensor2tensor.utils import expert_utils as eu @@ -29,8 +29,6 @@ import tensorflow as tf -from tensorflow.python.eager import context - @registry.register_symbol_modality("default") class SymbolModality(modality.Modality): @@ -84,7 +82,7 @@ def _get_weights(self, hidden_dim=None): hidden_dim = self._body_input_depth num_shards = self._model_hparams.symbol_modality_num_shards shards = [] - for i in xrange(num_shards): + for i in range(num_shards): shard_size = (self._vocab_size // num_shards) + ( 1 if i < self._vocab_size % num_shards else 0) var_name = "weights_%d" % i @@ -97,7 +95,7 @@ def _get_weights(self, hidden_dim=None): else: ret = tf.concat(shards, 0) # Convert ret to tensor. 
- if not context.in_eager_mode(): + if not tf.contrib.eager.in_eager_mode(): ret = eu.convert_gradient_to_tensor(ret) return ret @@ -211,13 +209,13 @@ class ImageModality(modality.Modality): def bottom(self, inputs): with tf.variable_scope(self.name): inputs = tf.to_float(inputs) - if not context.in_eager_mode(): + if not tf.contrib.eager.in_eager_mode(): tf.summary.image("inputs", inputs, max_outputs=2) return inputs def targets_bottom(self, inputs): with tf.variable_scope(self.name): - if not context.in_eager_mode(): + if not tf.contrib.eager.in_eager_mode(): tf.summary.image("targets_bottom", tf.cast(inputs, tf.uint8), max_outputs=1) inputs_shape = common_layers.shape_list(inputs) @@ -240,7 +238,7 @@ def targets_bottom(self, inputs): def top(self, body_output, _): # TODO(lukaszkaiser): is this a universal enough way to get channels? - num_channels = self._model_hparams.problem_instances[0].num_channels + num_channels = self._model_hparams.problem.num_channels with tf.variable_scope("rgb_softmax"): body_output_shape = common_layers.shape_list(body_output) reshape_shape = body_output_shape[:3] @@ -258,8 +256,7 @@ def loss(self, logits, targets): logits, targets, self._model_hparams.label_smoothing, - weights_fn=self.targets_weights_fn, - gaussian=True) + weights_fn=self.targets_weights_fn) @registry.register_image_modality("image_channel_compress") @@ -338,7 +335,7 @@ def get_channel_embeddings(self, io_depth, targets, hidden_size, rgb_embedding_var = tf.identity(rgb_embedding_var) rgb_embedding_var *= float(hidden_size)**0.5 channel_target_embs = [] - for i in xrange(io_depth): + for i in range(io_depth): # Adding the channel offsets to get the right embedding since the # embedding tensor has shape 256 * io_depth, hidden_size target_ids = tf.squeeze(targets_split[i], axis=3) + i * 256 @@ -405,7 +402,7 @@ def xnet_resblock(x, filters, res_relu, name): x = tf.to_float(inputs) / 255. x.set_shape([None, None, None, 1]) - for i in xrange(self._model_hparams.audio_compression): + for i in range(self._model_hparams.audio_compression): x = xnet_resblock(x, 2**(i + 1), True, "compress_block_%d" % i) return xnet_resblock(x, self._body_input_depth, False, "compress_block_final") @@ -449,7 +446,7 @@ def xnet_resblock(x, filters, res_relu, name): # Bitcast back from int32 x = tf.bitcast(inputs, tf.float32) x.set_shape([None, None, None, 1]) - for i in xrange(self._model_hparams.audio_compression): + for i in range(self._model_hparams.audio_compression): x = xnet_resblock(x, 2**(i + 1), True, "compress_block_%d" % i) return xnet_resblock(x, self._body_input_depth, False, "compress_block_final") @@ -465,8 +462,9 @@ def bottom(self, inputs): inputs_shape = common_layers.shape_list(inputs) if len(inputs_shape) != 5: raise ValueError("Assuming videos given as tensors in the format " - "[batch, time, height, width, channels].") - if not context.in_eager_mode(): + "[batch, time, height, width, channels] but got one " + "of shape: %s" % str(inputs_shape)) + if not tf.contrib.eager.in_eager_mode(): tf.summary.image("inputs", tf.cast(inputs[:, -1, :, :, :], tf.uint8), max_outputs=1) # Standardize frames. 
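For reference, an illustrative sketch of the 5-D layout that VideoModality.bottom and targets_bottom validate (sizes here are hypothetical):

    import tensorflow as tf

    # Videos enter the modality as [batch, time, height, width, channels];
    # any other rank now raises a ValueError that reports the actual shape.
    frames = tf.zeros([8, 4, 64, 64, 3], dtype=tf.int32)
    assert len(frames.shape.as_list()) == 5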
@@ -485,8 +483,9 @@ def targets_bottom(self, inputs): inputs_shape = common_layers.shape_list(inputs) if len(inputs_shape) != 5: raise ValueError("Assuming videos given as tensors in the format " - "[batch, time, height, width, channels].") - if not context.in_eager_mode(): + "[batch, time, height, width, channels] but got one " + "of shape: %s" % str(inputs_shape)) + if not tf.contrib.eager.in_eager_mode(): tf.summary.image( "targets_bottom", tf.cast(inputs[:, -1, :, :, :], tf.uint8), max_outputs=1) @@ -511,8 +510,8 @@ def targets_bottom(self, inputs): return merged def top(self, body_output, _): - num_channels = self._model_hparams.problem_instances[0].num_channels - num_frames = self._model_hparams.problem_instances[0].num_target_frames + num_channels = self._model_hparams.problem.num_channels + num_frames = self._model_hparams.problem.num_target_frames with tf.variable_scope("rgb_softmax"): body_output_shape = common_layers.shape_list(body_output) reshape_shape = body_output_shape[:3] @@ -535,8 +534,7 @@ def loss(self, logits, targets): logits, targets, self._model_hparams.label_smoothing, - weights_fn=self.targets_weights_fn, - gaussian=True) + weights_fn=self.targets_weights_fn) @registry.register_class_label_modality("default") @@ -684,7 +682,7 @@ def loss(self, top_out, targets): return loss_scale, loss_denom -@registry.register_class_label_modality("sigmoid_pooling") +@registry.register_class_label_modality("sigmoid_max_pooling") class SigmoidMaxPoolingClassLabelModality(ClassLabelModality): """Sigmoid cross-entropy applied on max-pooling over timesteps.""" diff --git a/tensor2tensor/layers/rev_block.py b/tensor2tensor/layers/rev_block.py index aaacf0c5d..a6e462f7b 100644 --- a/tensor2tensor/layers/rev_block.py +++ b/tensor2tensor/layers/rev_block.py @@ -27,7 +27,7 @@ # Dependency imports -from six.moves import xrange # pylint: disable=redefined-builtin +from six.moves import range # pylint: disable=redefined-builtin from tensor2tensor.layers import common_layers import tensorflow as tf @@ -117,7 +117,7 @@ def _rev_block_forward(x1, gate_outputs=False): """Forward for a series of reversible layers.""" out = (x1, x2) - for i in xrange(num_layers): + for i in range(num_layers): out = _rev_layer_forward( out, f[i], g[i], f_side_input, g_side_input, gate_outputs=gate_outputs) @@ -216,7 +216,7 @@ def _efficient_grad_fn(self, inputs, variables, ys, grad_ys): f.reverse() g.reverse() - for i in xrange(self.num_layers): + for i in range(self.num_layers): ys, grad_ys, f_ret, g_ret = _rev_layer_backward( ys, grad_ys, f[i], g[i], f_vars[i], self.f_side_input, g_vars[i], self.g_side_input) @@ -286,7 +286,7 @@ def backward(self, y1, y2): f.reverse() g.reverse() - for i in xrange(self.num_layers): + for i in range(self.num_layers): gy1 = g[i](y1, self.g_side_input) if self.g_side_input else g[i](y1) x2 = y2 - gy1 fx2 = f[i](x2, self.f_side_input) if self.f_side_input else f[i](x2) diff --git a/tensor2tensor/models/basic.py b/tensor2tensor/models/basic.py index ec65e68b2..d6fdc6101 100644 --- a/tensor2tensor/models/basic.py +++ b/tensor2tensor/models/basic.py @@ -37,7 +37,7 @@ def body(self, features): x = features["inputs"] shape = common_layers.shape_list(x) x = tf.reshape(x, [-1, shape[1] * shape[2] * shape[3]]) - for i in xrange(hparams.num_hidden_layers): + for i in range(hparams.num_hidden_layers): x = tf.layers.dense(x, hparams.hidden_size, name="layer_%d" % i) x = tf.nn.dropout(x, keep_prob=1.0 - hparams.dropout) x = tf.nn.relu(x) @@ -74,7 +74,7 @@ def encoder(self, x): hparams = 
self._hparams kernel, strides = self._get_kernel_and_strides() # Down-convolutions. - for i in xrange(hparams.num_hidden_layers): + for i in range(hparams.num_hidden_layers): x = tf.layers.conv2d( x, hparams.hidden_size * 2**(i + 1), kernel, strides=strides, padding="SAME", activation=common_layers.belu, name="conv_%d" % i) @@ -86,7 +86,7 @@ def decoder(self, x): hparams = self._hparams kernel, strides = self._get_kernel_and_strides() # Up-convolutions. - for i in xrange(hparams.num_hidden_layers): + for i in range(hparams.num_hidden_layers): j = hparams.num_hidden_layers - i - 1 x = tf.layers.conv2d_transpose( x, hparams.hidden_size * 2**j, kernel, strides=strides, @@ -159,7 +159,7 @@ def infer(self, features=None, decode_length=50, beam_size=1, top_beams=1, # Sample and decode. # TODO(lukaszkaiser): is this a universal enough way to get channels? try: - num_channels = self._hparams.problem_instances[0].num_channels + num_channels = self._hparams.problem.num_channels except AttributeError: num_channels = 1 features["targets"] = tf.zeros( @@ -206,7 +206,7 @@ def basic_autoencoder(): hparams.learning_rate_constant = 0.0002 hparams.learning_rate_warmup_steps = 500 hparams.learning_rate_schedule = "constant * linear_warmup" - hparams.label_smoothing = 0.05 + hparams.label_smoothing = 0.0 hparams.batch_size = 128 hparams.hidden_size = 64 hparams.num_hidden_layers = 5 diff --git a/tensor2tensor/models/basic_test.py b/tensor2tensor/models/basic_test.py new file mode 100644 index 000000000..5a07a5502 --- /dev/null +++ b/tensor2tensor/models/basic_test.py @@ -0,0 +1,70 @@ +# coding=utf-8 +# Copyright 2018 The Tensor2Tensor Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Basic nets tests.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +# Dependency imports + +import numpy as np + +from tensor2tensor.data_generators import mnist # pylint: disable=unused-import +from tensor2tensor.models import basic +from tensor2tensor.utils import trainer_lib + +import tensorflow as tf + + +class BasicTest(tf.test.TestCase): + + def testBasicFcRelu(self): + x = np.random.random_integers(0, high=255, size=(1, 28, 28, 1)) + y = np.random.random_integers(0, high=9, size=(1, 1)) + hparams = trainer_lib.create_hparams( + "basic_fc_small", problem_name="image_mnist", data_dir=".") + with self.test_session() as session: + features = { + "inputs": tf.constant(x, dtype=tf.int32), + "targets": tf.constant(y, dtype=tf.int32), + } + model = basic.BasicFcRelu(hparams, tf.estimator.ModeKeys.TRAIN) + logits, _ = model(features) + session.run(tf.global_variables_initializer()) + res = session.run(logits) + self.assertEqual(res.shape, (1, 1, 1, 1, 10)) + + def testBasicAutoencoder(self): + x = np.random.random_integers(0, high=255, size=(1, 28, 28, 1)) + y = np.random.random_integers(0, high=9, size=(1, 1)) + hparams = trainer_lib.create_hparams( + "basic_autoencoder", problem_name="image_mnist_rev", data_dir=".") + with self.test_session() as session: + features = { + "targets": tf.constant(x, dtype=tf.int32), + "inputs": tf.constant(y, dtype=tf.int32), + } + tf.train.create_global_step() + model = basic.BasicAutoencoder(hparams, tf.estimator.ModeKeys.TRAIN) + logits, _ = model(features) + session.run(tf.global_variables_initializer()) + res = session.run(logits) + self.assertEqual(res.shape, (1, 28, 28, 1, 256)) + + +if __name__ == "__main__": + tf.test.main() diff --git a/tensor2tensor/models/bytenet.py b/tensor2tensor/models/bytenet.py index cf576a0b3..74f46c27c 100644 --- a/tensor2tensor/models/bytenet.py +++ b/tensor2tensor/models/bytenet.py @@ -21,7 +21,7 @@ # Dependency imports -from six.moves import xrange # pylint: disable=redefined-builtin +from six.moves import range # pylint: disable=redefined-builtin from tensor2tensor.layers import common_hparams from tensor2tensor.layers import common_layers @@ -36,8 +36,8 @@ def residual_dilated_conv(x, repeat, padding, name, hparams): with tf.variable_scope(name): k = (hparams.kernel_height, hparams.kernel_width) dilations_and_kernels = [((2**i, 1), k) - for i in xrange(hparams.num_hidden_layers)] - for i in xrange(repeat): + for i in range(hparams.num_hidden_layers)] + for i in range(repeat): with tf.variable_scope("repeat_%d" % i): y = common_layers.conv_block( common_layers.layer_norm(x, hparams.hidden_size, name="lnorm"), diff --git a/tensor2tensor/models/neural_gpu.py b/tensor2tensor/models/neural_gpu.py index 1d1875743..7d6433b92 100644 --- a/tensor2tensor/models/neural_gpu.py +++ b/tensor2tensor/models/neural_gpu.py @@ -21,7 +21,7 @@ # Dependency imports -from six.moves import xrange # pylint: disable=redefined-builtin +from six.moves import range # pylint: disable=redefined-builtin from tensor2tensor.layers import common_hparams from tensor2tensor.layers import common_layers @@ -37,7 +37,7 @@ def neural_gpu_body(inputs, hparams, name=None): def step(state, inp): # pylint: disable=missing-docstring x = tf.nn.dropout(state, 1.0 - hparams.dropout) - for layer in xrange(hparams.num_hidden_layers): + for layer in range(hparams.num_hidden_layers): x = common_layers.conv_gru( x, (hparams.kernel_height, hparams.kernel_width), hparams.hidden_size, @@ -70,7 +70,7 
@@ def step(state_tup, inp): """Single step of the improved Neural GPU.""" state, _ = state_tup x = state - for layer in xrange(hparams.num_hidden_layers): + for layer in range(hparams.num_hidden_layers): x, new_loss = common_layers.diagonal_conv_gru( x, (hparams.kernel_height, hparams.kernel_width), hparams.hidden_size, diff --git a/tensor2tensor/models/research/adafactor_experiments.py b/tensor2tensor/models/research/adafactor_experiments.py index c06c3f0cc..d7d3d4e2c 100644 --- a/tensor2tensor/models/research/adafactor_experiments.py +++ b/tensor2tensor/models/research/adafactor_experiments.py @@ -218,3 +218,12 @@ def afx_small_p8(): hparams = afx_small() hparams.add_hparam("simulated_parameter_quantize_bits", 8) return hparams + + +@registry.register_hparams +def afx_small_bfloat16(): + """Small transformer model with small batch size for fast step times.""" + hparams = afx_small() + hparams.weight_dtype = "bfloat16" + hparams.activation_dtype = "bfloat16" + return hparams diff --git a/tensor2tensor/models/research/attention_lm.py b/tensor2tensor/models/research/attention_lm.py index cbc45c4e7..bf7315f07 100644 --- a/tensor2tensor/models/research/attention_lm.py +++ b/tensor2tensor/models/research/attention_lm.py @@ -27,7 +27,7 @@ # Dependency imports -from six.moves import xrange # pylint: disable=redefined-builtin +from six.moves import range # pylint: disable=redefined-builtin from tensor2tensor.layers import common_attention from tensor2tensor.layers import common_hparams @@ -104,7 +104,7 @@ def attention_lm_decoder(decoder_input, """ x = decoder_input with tf.variable_scope(name): - for layer in xrange(hparams.num_hidden_layers): + for layer in range(hparams.num_hidden_layers): with tf.variable_scope("layer_%d" % layer): with tf.variable_scope("self_attention"): y = common_attention.multihead_attention( diff --git a/tensor2tensor/models/research/attention_lm_moe.py b/tensor2tensor/models/research/attention_lm_moe.py index 49ca3d20f..14b633495 100644 --- a/tensor2tensor/models/research/attention_lm_moe.py +++ b/tensor2tensor/models/research/attention_lm_moe.py @@ -29,7 +29,7 @@ # Dependency imports -from six.moves import xrange # pylint: disable=redefined-builtin +from six.moves import range # pylint: disable=redefined-builtin from tensor2tensor.layers import common_attention from tensor2tensor.layers import common_hparams @@ -182,7 +182,7 @@ def print_shape(x, suffix, debug=False): num_hidden_layers = ( len(hparams.attention_layers) or hparams.num_hidden_layers) - for layer in xrange(num_hidden_layers): + for layer in range(num_hidden_layers): with tf.variable_scope("layer_%d" % layer): # Use the layer type defined in attention_layers diff --git a/tensor2tensor/models/research/autoencoders.py b/tensor2tensor/models/research/autoencoders.py index a7c696499..d9c852742 100644 --- a/tensor2tensor/models/research/autoencoders.py +++ b/tensor2tensor/models/research/autoencoders.py @@ -30,7 +30,53 @@ @registry.register_model -class ResidualAutoencoder(basic.BasicAutoencoder): +class AutoencoderAutoregressive(basic.BasicAutoencoder): + """Autoencoder with an autoregressive part.""" + + def body(self, features): + hparams = self._hparams + shape = common_layers.shape_list(features["targets"]) + # Run the basic autoencoder part first. + basic_result, losses = super(AutoencoderAutoregressive, self).body(features) + # Prepare inputs for autoregressive modes. 
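+    # The target path is dropped out jointly across channels (the dropout
+    # mask is broadcast over the last dim) and shifted right, so the
+    # prediction at step t never sees target t itself, only earlier targets.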
+ targets_keep_prob = 1.0 - hparams.autoregressive_dropout + targets_dropout = common_layers.dropout_with_broadcast_dims( + features["targets"], targets_keep_prob, broadcast_dims=[-1]) + targets1d = tf.reshape(targets_dropout, [shape[0], -1, shape[3]]) + targets_shifted = common_layers.shift_right_3d(targets1d) + basic1d = tf.reshape(basic_result, [shape[0], -1, shape[3]]) + concat1d = tf.concat([basic1d, targets_shifted], axis=-1) + # The forget_base hparam sets purely-autoregressive mode, no autoencoder. + if hparams.autoregressive_forget_base: + concat1d = tf.reshape(features["targets"], [shape[0], -1, shape[3]]) + concat1d = common_layers.shift_right_3d(concat1d) + # The autoregressive part depends on the mode. + if hparams.autoregressive_mode == "none": + assert not hparams.autoregressive_forget_base + return basic_result, losses + if hparams.autoregressive_mode == "conv3": + res = common_layers.conv1d(concat1d, shape[3], 3, padding="LEFT", + activation=common_layers.belu, + name="autoregressive_conv3") + return tf.reshape(res, shape), losses + if hparams.autoregressive_mode == "conv5": + res = common_layers.conv1d(concat1d, shape[3], 5, padding="LEFT", + activation=common_layers.belu, + name="autoregressive_conv5") + return tf.reshape(res, shape), losses + if hparams.autoregressive_mode == "sru": + res = common_layers.conv1d(concat1d, shape[3], 3, padding="LEFT", + activation=common_layers.belu, + name="autoregressive_sru_conv3") + res = common_layers.sru(res) + return tf.reshape(res, shape), losses + + raise ValueError("Unsupported autoregressive mode: %s" + % hparams.autoregressive_mode) + + +@registry.register_model +class AutoencoderResidual(AutoencoderAutoregressive): """Residual autoencoder.""" def encoder(self, x): @@ -45,7 +91,7 @@ def encoder(self, x): if hparams.residual_use_separable_conv: residual_conv = tf.layers.separable_conv2d # Down-convolutions. - for i in xrange(hparams.num_hidden_layers): + for i in range(hparams.num_hidden_layers): with tf.variable_scope("layer_%d" % i): x = tf.nn.dropout(x, 1.0 - hparams.dropout) filters = hparams.hidden_size * 2**(i + 1) @@ -54,7 +100,7 @@ def encoder(self, x): x, filters, kernel, strides=strides, padding="SAME", activation=common_layers.belu, name="strided") y = x - for r in xrange(hparams.num_residual_layers): + for r in range(hparams.num_residual_layers): residual_filters = filters if r < hparams.num_residual_layers - 1: residual_filters = int( @@ -79,7 +125,7 @@ def decoder(self, x): if hparams.residual_use_separable_conv: residual_conv = tf.layers.separable_conv2d # Up-convolutions. 
- for i in xrange(hparams.num_hidden_layers): + for i in range(hparams.num_hidden_layers): x = tf.nn.dropout(x, 1.0 - hparams.dropout) j = hparams.num_hidden_layers - i - 1 filters = hparams.hidden_size * 2**j @@ -91,7 +137,7 @@ def decoder(self, x): x, filters, kernel, strides=strides, padding="SAME", activation=common_layers.belu, name="strided") y = x - for r in xrange(hparams.num_residual_layers): + for r in range(hparams.num_residual_layers): residual_filters = filters if r < hparams.num_residual_layers - 1: residual_filters = int( @@ -106,7 +152,7 @@ def decoder(self, x): @registry.register_model -class BasicDiscreteAutoencoder(basic.BasicAutoencoder): +class AutoencoderBasicDiscrete(AutoencoderAutoregressive): """Discrete autoencoder.""" def bottleneck(self, x): @@ -132,7 +178,7 @@ def sample(self): @registry.register_model -class ResidualDiscreteAutoencoder(ResidualAutoencoder): +class AutoencoderResidualDiscrete(AutoencoderResidual): """Discrete residual autoencoder.""" def bottleneck(self, x, bottleneck_size=None): @@ -160,13 +206,15 @@ def sample(self): size = [hp.batch_size, hp.sample_height // div_x, hp.sample_width // div_y, hp.bottleneck_size] rand = tf.random_uniform(size) - res1 = 2.0 * tf.to_float(tf.less(0.5, rand)) - 1.0 - res2 = tf.zeros_like(rand) - 1.0 - return tf.concat([res2[:, :, :, :2], res1[:, :, :, 2:]], axis=-1) + res = 2.0 * tf.to_float(tf.less(0.5, rand)) - 1.0 + # If you want to set some first bits to a fixed value, do this: + # fixed = tf.zeros_like(rand) - 1.0 + # res = tf.concat([fixed[:, :, :, :2], res[:, :, :, 2:]], axis=-1) + return res @registry.register_model -class OrderedDiscreteAutoencoder(ResidualDiscreteAutoencoder): +class AutoencoderOrderedDiscrete(AutoencoderResidualDiscrete): """Ordered discrete autoencoder.""" def bottleneck(self, x): @@ -195,7 +243,7 @@ def bottleneck(self, x): @registry.register_model -class StackedAutoencoder(ResidualDiscreteAutoencoder): +class AutoencoderStacked(AutoencoderResidualDiscrete): """A stacked autoencoder.""" def stack(self, b, size, bottleneck_size, name): @@ -290,9 +338,19 @@ def body(self, features): @registry.register_hparams -def residual_autoencoder(): - """Residual autoencoder model.""" +def autoencoder_autoregressive(): + """Autoregressive autoencoder model.""" hparams = basic.basic_autoencoder() + hparams.add_hparam("autoregressive_forget_base", False) + hparams.add_hparam("autoregressive_mode", "conv3") + hparams.add_hparam("autoregressive_dropout", 0.4) + return hparams + + +@registry.register_hparams +def autoencoder_residual(): + """Residual autoencoder model.""" + hparams = autoencoder_autoregressive() hparams.optimizer = "Adam" hparams.learning_rate_constant = 0.0001 hparams.learning_rate_warmup_steps = 500 @@ -311,9 +369,9 @@ def residual_autoencoder(): @registry.register_hparams -def basic_discrete_autoencoder(): +def autoencoder_basic_discrete(): """Basic autoencoder model.""" - hparams = basic.basic_autoencoder() + hparams = autoencoder_autoregressive() hparams.num_hidden_layers = 5 hparams.hidden_size = 64 hparams.bottleneck_size = 4096 @@ -324,9 +382,9 @@ def basic_discrete_autoencoder(): @registry.register_hparams -def residual_discrete_autoencoder(): +def autoencoder_residual_discrete(): """Residual discrete autoencoder model.""" - hparams = residual_autoencoder() + hparams = autoencoder_residual() hparams.bottleneck_size = 4096 hparams.bottleneck_noise = 0.1 hparams.bottleneck_warmup_steps = 3000 @@ -339,9 +397,9 @@ def residual_discrete_autoencoder(): @registry.register_hparams -def 
residual_discrete_autoencoder_big(): +def autoencoder_residual_discrete_big(): """Residual discrete autoencoder model, big version.""" - hparams = residual_discrete_autoencoder() + hparams = autoencoder_residual_discrete() hparams.hidden_size = 128 hparams.max_hidden_size = 4096 hparams.bottleneck_noise = 0.1 @@ -351,15 +409,15 @@ def residual_discrete_autoencoder_big(): @registry.register_hparams -def ordered_discrete_autoencoder(): +def autoencoder_ordered_discrete(): """Basic autoencoder model.""" - hparams = residual_discrete_autoencoder() + hparams = autoencoder_residual_discrete() return hparams @registry.register_hparams -def stacked_autoencoder(): +def autoencoder_stacked(): """Stacked autoencoder model.""" - hparams = residual_discrete_autoencoder() + hparams = autoencoder_residual_discrete() hparams.bottleneck_size = 128 return hparams diff --git a/tensor2tensor/models/research/autoencoders_test.py b/tensor2tensor/models/research/autoencoders_test.py new file mode 100644 index 000000000..9cdcd139a --- /dev/null +++ b/tensor2tensor/models/research/autoencoders_test.py @@ -0,0 +1,84 @@ +# coding=utf-8 +# Copyright 2018 The Tensor2Tensor Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Autoencoders tests.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +# Dependency imports + +import numpy as np + +from tensor2tensor.data_generators import mnist # pylint: disable=unused-import +from tensor2tensor.models.research import autoencoders # pylint: disable=unused-import +from tensor2tensor.utils import registry +from tensor2tensor.utils import trainer_lib + +import tensorflow as tf + + +class AutoencoderTest(tf.test.TestCase): + + def getMnistRandomOutput(self, model_name, hparams_set=None, + mode=tf.estimator.ModeKeys.TRAIN): + hparams_set = hparams_set or model_name + x = np.random.random_integers(0, high=255, size=(1, 28, 28, 1)) + y = np.random.random_integers(0, high=9, size=(1, 1)) + hparams = trainer_lib.create_hparams( + hparams_set, problem_name="image_mnist_rev", data_dir=".") + with self.test_session() as session: + features = { + "targets": tf.constant(x, dtype=tf.int32), + "inputs": tf.constant(y, dtype=tf.int32), + } + tf.train.create_global_step() + model = registry.model(model_name)(hparams, mode) + logits, _ = model(features) + session.run(tf.global_variables_initializer()) + res = session.run(logits) + return res + + @property + def mnistOutputShape(self): + return (1, 28, 28, 1, 256) + + def testAutoencoderAutoregressive(self): + res = self.getMnistRandomOutput("autoencoder_autoregressive") + self.assertEqual(res.shape, self.mnistOutputShape) + + def testAutoencoderResidual(self): + res = self.getMnistRandomOutput("autoencoder_residual") + self.assertEqual(res.shape, self.mnistOutputShape) + + def testAutoencoderBasicDiscrete(self): + res = self.getMnistRandomOutput("autoencoder_basic_discrete") + self.assertEqual(res.shape, self.mnistOutputShape) + + def 
testAutoencoderResidualDiscrete(self): + res = self.getMnistRandomOutput("autoencoder_residual_discrete") + self.assertEqual(res.shape, self.mnistOutputShape) + + def testAutoencoderOrderedDiscrete(self): + res = self.getMnistRandomOutput("autoencoder_ordered_discrete") + self.assertEqual(res.shape, self.mnistOutputShape) + + def testAutoencoderStacked(self): + res = self.getMnistRandomOutput("autoencoder_stacked") + self.assertEqual(res.shape, self.mnistOutputShape) + +if __name__ == "__main__": + tf.test.main() diff --git a/tensor2tensor/models/research/basic_conv_gen.py b/tensor2tensor/models/research/basic_conv_gen.py index cd98cde77..f35509237 100644 --- a/tensor2tensor/models/research/basic_conv_gen.py +++ b/tensor2tensor/models/research/basic_conv_gen.py @@ -21,6 +21,8 @@ # Dependency imports +import six + from tensor2tensor.layers import common_hparams from tensor2tensor.layers import common_layers from tensor2tensor.utils import registry @@ -31,6 +33,7 @@ @registry.register_model class BasicConvGen(t2t_model.T2TModel): + """Basic convolutional next-frame model.""" def body(self, features): hparams = self.hparams @@ -46,11 +49,12 @@ def body(self, features): x, x, final_length_divisible_by=2**hparams.num_compress_steps, axis=2) # Down-stride. - for _ in range(hparams.num_compress_steps): - x = tf.layers.conv2d(x, filters, kernel2, activation=common_layers.belu, - strides=(2, 2), padding="SAME") - x = common_layers.layer_norm(x) - filters *= 2 + for i in range(hparams.num_compress_steps): + with tf.variable_scope("downstride%d" % i): + x = tf.layers.conv2d(x, filters, kernel2, activation=common_layers.belu, + strides=(2, 2), padding="SAME") + x = common_layers.layer_norm(x) + filters *= 2 # Add embedded action. action = tf.reshape(features["input_action"][:, 1, :], @@ -71,29 +75,62 @@ def body(self, features): x = common_layers.layer_norm(x + y) # Up-convolve. - for _ in range(hparams.num_compress_steps): - filters //= 2 - x = tf.layers.conv2d_transpose( - x, filters, kernel2, activation=common_layers.belu, - strides=(2, 2), padding="SAME") - x = common_layers.layer_norm(x) - x = tf.nn.dropout(x, 1.0 - hparams.dropout) + for i in range(hparams.num_compress_steps): + with tf.variable_scope("upstride%d" % i): + filters //= 2 + x = tf.layers.conv2d_transpose( + x, filters, kernel2, activation=common_layers.belu, + strides=(2, 2), padding="SAME") + x = common_layers.layer_norm(x) + x = tf.nn.dropout(x, 1.0 - hparams.dropout) # Cut down to original size. x = x[:, :inputs_shape[1], :inputs_shape[2], :] # Reward prediction. - reward_pred_h1 = tf.reduce_mean(x, axis=[1, 2], keep_dims=True) - # Rewards are {-1, 0, 1} so we predict 3. - reward_pred = tf.layers.dense(reward_pred_h1, 3, name="reward") - reward_gold = tf.expand_dims(tf.to_int32( - features["input_reward_raw"][:, 1, :]), axis=1) - reward_loss = tf.nn.sparse_softmax_cross_entropy_with_logits( - labels=reward_gold, logits=reward_pred, name="reward_loss") - reward_loss = tf.reduce_mean(reward_loss) - return {"targets": x, "target_reward": reward_pred_h1} - # return x, {"reward": reward_loss} - # return x + reward_pred = tf.reduce_mean(x, axis=[1, 2], keep_dims=True) + return {"targets": x, "target_reward": reward_pred} + + def infer(self, features=None, decode_length=50, beam_size=1, top_beams=1, + alpha=0.0): + """Produce predictions from the model by running it.""" + # Inputs and features preparation needed to handle edge cases. 
+ if not features: + features = {} + inputs_old = None + if "inputs" in features and len(features["inputs"].shape) < 4: + inputs_old = features["inputs"] + features["inputs"] = tf.expand_dims(features["inputs"], 2) + + # Get predictions. + try: + num_channels = self._hparams.problem.num_channels + except AttributeError: + num_channels = 1 + features["targets"] = tf.zeros( + [self._hparams.batch_size, 1, 1, 1, num_channels], dtype=tf.int32) + features["target_reward"] = tf.zeros( + [self._hparams.batch_size, 1, 1], dtype=tf.int32) + logits, _ = self(features) # pylint: disable=not-callable + if isinstance(logits, dict): + results = {} + for k, v in six.iteritems(logits): + # Argmax in TF doesn't handle more than 5 dimensions yet. + v_shape = common_layers.shape_list(v) + argmax = tf.argmax(tf.reshape(v, [-1, v_shape[-1]]), axis=-1) + results[k] = tf.reshape(argmax, v_shape[:-1]) + else: + # Argmax in TF doesn't handle more than 5 dimensions yet. + logits_shape = common_layers.shape_list(logits) + argmax = tf.argmax(tf.reshape(logits, [-1, logits_shape[-1]]), axis=-1) + results = tf.reshape(argmax, logits_shape[:-1]) + + # Restore inputs to not confuse Estimator in edge cases. + if inputs_old is not None: + features["inputs"] = inputs_old + + # Return results. + return results @registry.register_hparams @@ -149,11 +186,11 @@ def deconv2d(cur, i, kernel_size, output_filters, activation=tf.nn.relu): name="deconv2d" + str(i)) return tf.depth_to_space(thicker, 2) - # cur_frame = common_layers.standardize_images(features["inputs_0"]) - # prev_frame = common_layers.standardize_images(features["inputs_1"]) - # frames = tf.concat([cur_frame, prev_frame], axis=3) - # frames = tf.reshape(frames, [-1, 210, 160, 6]) - frames = common_layers.standardize_images(features["inputs"]) + cur_frame = common_layers.standardize_images(features["inputs_0"]) + prev_frame = common_layers.standardize_images(features["inputs_1"]) + + frames = tf.concat([cur_frame, prev_frame], axis=3) + frames = tf.reshape(frames, [-1, 210, 160, 6]) h1 = tf.layers.conv2d(frames, filters=64, strides=2, kernel_size=(8, 8), padding="SAME", activation=tf.nn.relu) diff --git a/tensor2tensor/models/research/gene_expression.py b/tensor2tensor/models/research/gene_expression.py index 0235774ab..abe0a4834 100644 --- a/tensor2tensor/models/research/gene_expression.py +++ b/tensor2tensor/models/research/gene_expression.py @@ -20,7 +20,7 @@ # Dependency imports -from six.moves import xrange # pylint: disable=redefined-builtin +from six.moves import range # pylint: disable=redefined-builtin from tensor2tensor.layers import common_hparams from tensor2tensor.layers import common_layers @@ -59,7 +59,7 @@ def body(self, features): # Conv layers assert hp.num_conv_layers == len(hp.pooling_windows) - for i in xrange(hp.num_conv_layers): + for i in range(hp.num_conv_layers): out = conv_layer( out, hp.hidden_size, @@ -71,7 +71,7 @@ def body(self, features): name="conv_%d" % (i + 1)) # Dense dilated conv layers - for i in xrange(hp.num_dconv_layers): + for i in range(hp.num_dconv_layers): dilation_rate = 2**(i + 1) dconv_out = conv_layer( out, diff --git a/tensor2tensor/models/research/gene_expression_test.py b/tensor2tensor/models/research/gene_expression_test.py index 899defadb..70403935c 100644 --- a/tensor2tensor/models/research/gene_expression_test.py +++ b/tensor2tensor/models/research/gene_expression_test.py @@ -54,7 +54,7 @@ def _testModel(self, hparams, model_cls): "inputs": tf.constant(inputs, dtype=tf.int32), "targets": tf.constant(targets, 
dtype=tf.float32), } - p_hparams, = hparams.problems + p_hparams = hparams.problem_hparams logits, _ = model_cls( hparams, tf.estimator.ModeKeys.TRAIN, p_hparams)(features) @@ -70,7 +70,7 @@ def testGeneExpressionModels(self): for model_cls, hparams in models_hparams: hparams.add_hparam("data_dir", None) p_hparams = gene_data.GenomicsExpressionCage10().get_hparams(hparams) - hparams.problems = [p_hparams] + hparams.problem_hparams = p_hparams self._testModel(hparams, model_cls) diff --git a/tensor2tensor/models/research/lm_experiments.py b/tensor2tensor/models/research/lm_experiments.py index a8d68583d..4e34673c2 100644 --- a/tensor2tensor/models/research/lm_experiments.py +++ b/tensor2tensor/models/research/lm_experiments.py @@ -77,3 +77,25 @@ def lmx_h2k_f8k(): hparams.filter_size = 8192 return hparams + +@registry.register_hparams +def lmx_h3k_f12k(): + """HParams for training languagemodel_lm1b32k_packed. 880M Params.""" + hparams = lmx_base() + hparams.hidden_size = 3072 + hparams.filter_size = 12288 + hparams.batch_size = 2048 + hparams.weight_dtype = "bfloat16" + return hparams + + +@registry.register_hparams +def lmx_h4k_f16k(): + """HParams for training languagemodel_lm1b32k_packed. 1470M Params.""" + hparams = lmx_base() + hparams.hidden_size = 4096 + hparams.filter_size = 16384 + hparams.batch_size = 1024 + hparams.weight_dtype = "bfloat16" + return hparams + diff --git a/tensor2tensor/models/research/multimodel.py b/tensor2tensor/models/research/multimodel.py index 88ab3950b..4b3d93445 100644 --- a/tensor2tensor/models/research/multimodel.py +++ b/tensor2tensor/models/research/multimodel.py @@ -20,7 +20,7 @@ # Dependency imports -from six.moves import xrange # pylint: disable=redefined-builtin +from six.moves import range # pylint: disable=redefined-builtin from tensor2tensor.layers import common_attention from tensor2tensor.layers import common_hparams @@ -138,7 +138,7 @@ def flatten(inputs): inputs_mask = dp(lambda x: 1.0 - x, inputs_pad) inputs_encoded = dp(common_layers.add_timing_signal, inputs) expert_loss = 0.0 - for i in xrange(hparams.num_hidden_layers): + for i in range(hparams.num_hidden_layers): with tf.variable_scope("enc_layer_%d" % i): inputs_encoded, moe_loss = conv_experts(inputs_encoded, hparams, dp, self._ps_devices, "SAME", @@ -168,7 +168,7 @@ def flatten(inputs): expert_fn = expert_utils.ffn_expert_fn( hparams.hidden_size, moe_hidden_sizes, hparams.hidden_size) x = dp(tf.nn.dropout, decoder_input, 1.0 - hparams.dropout) - for layer in xrange(hparams.num_hidden_layers): + for layer in range(hparams.num_hidden_layers): with tf.variable_scope("dec_layer_%d" % layer): with tf.variable_scope("attention"): y = dp( diff --git a/tensor2tensor/models/research/multimodel_test.py b/tensor2tensor/models/research/multimodel_test.py index ef8e30138..c480d23e1 100644 --- a/tensor2tensor/models/research/multimodel_test.py +++ b/tensor2tensor/models/research/multimodel_test.py @@ -39,7 +39,7 @@ def testMultiModel(self): hparams.add_hparam("data_dir", "") problem = registry.problem("image_cifar10") p_hparams = problem.get_hparams(hparams) - hparams.problems = [p_hparams] + hparams.problem_hparams = p_hparams with self.test_session() as session: features = { "inputs": tf.constant(x, dtype=tf.int32), diff --git a/tensor2tensor/models/research/super_lm.py b/tensor2tensor/models/research/super_lm.py index caaef23e6..40bfb7f64 100644 --- a/tensor2tensor/models/research/super_lm.py +++ b/tensor2tensor/models/research/super_lm.py @@ -30,7 +30,7 @@ # Dependency imports -from 
six.moves import xrange # pylint: disable=redefined-builtin +from six.moves import range # pylint: disable=redefined-builtin from tensor2tensor.layers import common_attention from tensor2tensor.layers import common_hparams @@ -56,7 +56,7 @@ def body(self, features): assert hparams.num_model_shards % len(ps_devices) == 0 shards_per_device = hparams.num_model_shards // len(ps_devices) model_devices = [ps_devices[i // shards_per_device] - for i in xrange(hparams.num_model_shards)] + for i in range(hparams.num_model_shards)] print("model_devices = %s" % model_devices) mp = expert_utils.Parallelism(model_devices, reuse=False) vocab_size = self._problem_hparams.vocabulary["targets"].vocab_size diff --git a/tensor2tensor/models/research/transformer_revnet_test.py b/tensor2tensor/models/research/transformer_revnet_test.py index b3d4c0812..89e075c12 100644 --- a/tensor2tensor/models/research/transformer_revnet_test.py +++ b/tensor2tensor/models/research/transformer_revnet_test.py @@ -47,7 +47,7 @@ def testTransformer(self): vocab_size = 9 hparams = transformer_revnet_test() p_hparams = problem_hparams.test_problem_hparams(vocab_size, vocab_size) - hparams.problems = [p_hparams] + hparams.problem_hparams = p_hparams inputs = -1 + np.random.random_integers( vocab_size, size=(batch_size, input_length, 1, 1)) targets = -1 + np.random.random_integers( diff --git a/tensor2tensor/models/research/transformer_symshard.py b/tensor2tensor/models/research/transformer_symshard.py index 64b9fed97..e3c541a07 100644 --- a/tensor2tensor/models/research/transformer_symshard.py +++ b/tensor2tensor/models/research/transformer_symshard.py @@ -46,7 +46,7 @@ # Dependency imports -from six.moves import xrange # pylint: disable=redefined-builtin +from six.moves import range # pylint: disable=redefined-builtin from tensor2tensor.layers import common_attention from tensor2tensor.layers import common_hparams @@ -69,7 +69,7 @@ def body(self, features): assert hparams.num_model_shards % len(ps_devices) == 0 shards_per_device = hparams.num_model_shards // len(ps_devices) model_devices = [ps_devices[i // shards_per_device] - for i in xrange(hparams.num_model_shards)] + for i in range(hparams.num_model_shards)] print("model_devices = %s" % model_devices) mp = expert_utils.Parallelism(model_devices, reuse=False) targets_vocab_size = self._problem_hparams.vocabulary["targets"].vocab_size diff --git a/tensor2tensor/models/research/transformer_vae.py b/tensor2tensor/models/research/transformer_vae.py index ec2966b6a..6d7b35b3e 100644 --- a/tensor2tensor/models/research/transformer_vae.py +++ b/tensor2tensor/models/research/transformer_vae.py @@ -25,7 +25,7 @@ # Dependency imports -from six.moves import xrange # pylint: disable=redefined-builtin +from six.moves import range # pylint: disable=redefined-builtin from tensor2tensor.layers import common_attention from tensor2tensor.layers import common_image_attention as cia @@ -46,8 +46,8 @@ def residual_conv(x, repeat, k, hparams, name, reuse=None): """A stack of convolution blocks with residual connections.""" with tf.variable_scope(name, reuse=reuse): - dilations_and_kernels = [((1, 1), k) for _ in xrange(3)] - for i in xrange(repeat): + dilations_and_kernels = [((1, 1), k) for _ in range(3)] + for i in range(repeat): with tf.variable_scope("repeat_%d" % i): y = common_layers.conv_block( common_layers.layer_norm(x, hparams.hidden_size, name="lnorm"), @@ -122,7 +122,7 @@ def compress(x, c, is_2d, hparams, name): cur = residual_conv(cur, hparams.num_compress_steps, k1, hparams, "rc") if c 
is not None and hparams.do_attend_compress: cur = attend(cur, c, hparams, "compress_attend") - for i in xrange(hparams.num_compress_steps): + for i in range(hparams.num_compress_steps): if hparams.do_residual_compress: cur = residual_conv(cur, hparams.num_compress_steps, k1, hparams, "rc_%d" % i) @@ -242,12 +242,12 @@ def ae_latent_softmax(latents_pred, latents_discrete, hparams): latents_logits = [ tf.layers.dense( latents_pred, block_vocab_size, name="extra_logits_%d" % i) - for i in xrange(hparams.num_decode_blocks) + for i in range(hparams.num_decode_blocks) ] loss = None if latents_discrete is not None: losses = [] - for i in xrange(hparams.num_decode_blocks): + for i in range(hparams.num_decode_blocks): d = tf.floormod(tf.floordiv(latents_discrete, block_vocab_size**i), block_vocab_size) losses.append(tf.nn.sparse_softmax_cross_entropy_with_logits( @@ -309,7 +309,7 @@ def next_bit(latents_discrete, i): return tf.concat([latents_discrete_prev[:, :(i+1), :], latents_discrete[:, (i+1):, :]], axis=1) - for i in xrange(iters): + for i in range(iters): latents_discrete = next_bit(latents_discrete, i) return latents_discrete @@ -450,7 +450,7 @@ def bn_inputs(): mask = tf.less(masking, tf.random_uniform( common_layers.shape_list(targets)[:-1])) mask = tf.expand_dims(tf.to_float(mask), 3) - for i in xrange(hparams.num_compress_steps): + for i in range(hparams.num_compress_steps): j = hparams.num_compress_steps - i - 1 d = residual_conv(d, 1, (3, 1), hparams, "decompress_rc_%d" % j) if hparams.do_attend_decompress: @@ -514,8 +514,7 @@ def __init__(self, *args, **kwargs): discrete_mix=self._hparams.d_mix, random_top_k=self._hparams.random_top_k, soft_em=self.hparams.soft_em, - soft_em_startup_steps=self.hparams.soft_em_startup_steps, - inv_temp=self.hparams.inv_temp, + num_samples=self.hparams.num_samples, epsilon=self._hparams.epsilon, softmax_k=self._hparams.softmax_k, kl_warmup_steps=self._hparams.kl_warmup_steps, @@ -562,7 +561,7 @@ def __init__(self, *args, **kwargs): ema_means = None if self._hparams.ema: ema_count = [] - for i in xrange(self._hparams.num_residuals): + for i in range(self._hparams.num_residuals): ema_count_i = tf.get_variable( "ema_count_{}".format(i), [self._hparams.num_blocks, block_v_size], @@ -571,7 +570,7 @@ def __init__(self, *args, **kwargs): ema_count.append(ema_count_i) with tf.colocate_with(means): ema_means = [] - for i in xrange(self._hparams.num_residuals): + for i in range(self._hparams.num_residuals): ema_means_i = tf.get_variable( "ema_means_{}".format(i), initializer=means.initialized_value()[i], @@ -649,7 +648,7 @@ def infer(self, features=None, decode_length=50, beam_size=1, top_beams=1, # More steps. self.predict_mask = 0.0 # Use the provided targets this time. how_many_more_steps = 0 # Set to 1 or more for Gibbs-like sampling. 
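# Each extra step re-runs the model with variable reuse, feeding the
# previous samples back in as "targets" while predict_mask stays at 0.0,
# so the discrete latents are re-estimated from the last sample, Gibbs-style.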
- for _ in xrange(how_many_more_steps): + for _ in range(how_many_more_steps): with tf.variable_scope(tf.get_variable_scope(), reuse=True): features["targets"] = samples logits, _ = self(features) # pylint: disable=not-callable @@ -716,7 +715,7 @@ def transformer_ae_small(): hparams.add_hparam("ema", True) hparams.add_hparam("random_top_k", 1) hparams.add_hparam("soft_em", False) - hparams.add_hparam("soft_em_startup_steps", 10000) + hparams.add_hparam("num_samples", 10) hparams.add_hparam("inv_temp", 1.0) hparams.kl_warmup_steps = 150000 hparams.force_full_predict = True diff --git a/tensor2tensor/models/research/transformer_vae_test.py b/tensor2tensor/models/research/transformer_vae_test.py index 3c73a4da6..ae08f6dc3 100644 --- a/tensor2tensor/models/research/transformer_vae_test.py +++ b/tensor2tensor/models/research/transformer_vae_test.py @@ -34,7 +34,7 @@ def testTransformerAEOnDVQ(self): hparams.bottleneck_kind = "dvq" hparams.dp_strength = 0 p_hparams = problem_hparams.test_problem_hparams(vocab_size, vocab_size) - hparams.problems = [p_hparams] + hparams.problem_hparams = p_hparams inputs = -1 + np.random.random_integers( vocab_size, size=(batch_size, input_length, 1, 1)) targets = -1 + np.random.random_integers( diff --git a/tensor2tensor/models/resnet.py b/tensor2tensor/models/resnet.py index d889ba328..e953ba6a0 100644 --- a/tensor2tensor/models/resnet.py +++ b/tensor2tensor/models/resnet.py @@ -420,6 +420,9 @@ def body(self, features): data_format, is_training=hp.mode == tf.estimator.ModeKeys.TRAIN) + if hp.use_nchw: + out = tf.transpose(out, [0, 2, 3, 1]) + return out @@ -473,6 +476,23 @@ def resnet_18(): return hp +@registry.register_hparams +def resnet_imagenet_34(): + """Set of hyperparameters.""" + hp = resnet_base() + hp.block_fn = "residual" + hp.layer_sizes = [2, 4, 8, 2] + + return hp + + +@registry.register_hparams +def resnet_imagenet_102(): + hp = resnet_imagenet_34() + hp.layer_sizes = [3, 8, 36, 3] + return hp + + @registry.register_hparams def resnet_cifar_15(): """Set of hyperparameters.""" diff --git a/tensor2tensor/models/slicenet.py b/tensor2tensor/models/slicenet.py index 0410ff7d1..be6c51e4d 100644 --- a/tensor2tensor/models/slicenet.py +++ b/tensor2tensor/models/slicenet.py @@ -20,7 +20,7 @@ # Dependency imports -from six.moves import xrange # pylint: disable=redefined-builtin +from six.moves import range # pylint: disable=redefined-builtin from six.moves import zip # pylint: disable=redefined-builtin from tensor2tensor.layers import common_attention @@ -117,7 +117,7 @@ def norm_fn(x, name): return common_layers.apply_norm( x, hparams.norm_type, hparams.hidden_size, hparams.norm_epsilon) - for layer in xrange(layers): + for layer in range(layers): with tf.variable_scope("layer_%d" % layer): y = common_layers.subseparable_conv_block( x, @@ -187,19 +187,6 @@ def norm_fn(x, name): target_space_emb = tf.tile(target_space_emb, [tf.shape(targets_flat)[0], 1, 1, 1]) - # Calculate similarity loss (but don't run if not needed). 
- if len(hparams.problems) > 1 and hparams.sim_loss_mult > 0.00001: - targets_timed = common_layers.add_timing_signal(targets_flat) - extra_layers = int(hparams.num_hidden_layers * 1.5) - with tf.variable_scope(tf.get_variable_scope(), reuse=True): - targets_encoded = multi_conv_res(targets_timed, "SAME", "encoder", - extra_layers, hparams) - with tf.variable_scope("similarity_loss"): - similarity_loss = similarity_cost(inputs_encoded, targets_encoded) - similarity_loss *= hparams.sim_loss_mult - else: - similarity_loss = 0.0 - # Use attention from each target to look at input and retrieve. targets_shifted = common_layers.shift_right( targets_flat, pad_value=target_space_emb) @@ -224,7 +211,7 @@ def norm_fn(x, name): separability=4, name="targets_merge") - return targets_merged, similarity_loss + return targets_merged, 0.0 def embed_target_space(target_space_id, hidden_size): diff --git a/tensor2tensor/models/slicenet_test.py b/tensor2tensor/models/slicenet_test.py index 2aceb4d5e..299944b6b 100644 --- a/tensor2tensor/models/slicenet_test.py +++ b/tensor2tensor/models/slicenet_test.py @@ -40,7 +40,7 @@ def testSliceNet(self): hparams.add_hparam("data_dir", "") problem = registry.problem("image_cifar10") p_hparams = problem.get_hparams(hparams) - hparams.problems = [p_hparams] + hparams.problem_hparams = p_hparams with self.test_session() as session: features = { "inputs": tf.constant(x, dtype=tf.int32), diff --git a/tensor2tensor/models/transformer.py b/tensor2tensor/models/transformer.py index 88be60dfd..4fb89db61 100644 --- a/tensor2tensor/models/transformer.py +++ b/tensor2tensor/models/transformer.py @@ -29,7 +29,7 @@ # Dependency imports -from six.moves import xrange # pylint: disable=redefined-builtin +from six.moves import range # pylint: disable=redefined-builtin from tensor2tensor.data_generators import librispeech from tensor2tensor.layers import common_attention @@ -310,6 +310,7 @@ def _fast_decode(self, partial_targets = features.get("inputs") if partial_targets is None: partial_targets = features["targets"] + assert partial_targets is not None partial_targets = common_layers.expand_squeeze_to_nd(partial_targets, 2) partial_targets = tf.to_int64(partial_targets) partial_targets_shape = common_layers.shape_list(partial_targets) @@ -664,8 +665,8 @@ def transformer_prepare_encoder(inputs, target_space, hparams, features=None): 32, ishape_static[-1], name="target_space_embedding", - dtype=tf.bfloat16 if hparams.activation_dtype == "bfloat16" or - hparams.weight_dtype == "bfloat16" else tf.float32) + dtype=tf.bfloat16 if hparams.activation_dtype == "bfloat16" + else tf.float32) emb_target_space = tf.reshape(emb_target_space, [1, 1, -1]) encoder_input += emb_target_space if hparams.pos == "timing": @@ -674,8 +675,7 @@ def transformer_prepare_encoder(inputs, target_space, hparams, features=None): encoder_input, inputs_position) else: encoder_input = common_attention.add_timing_signal_1d(encoder_input) - if (hparams.activation_dtype == "bfloat16" or - hparams.weight_dtype == "bfloat16"): + if hparams.activation_dtype == "bfloat16": encoder_self_attention_bias = tf.cast(encoder_self_attention_bias, tf.bfloat16) encoder_decoder_attention_bias = tf.cast(encoder_decoder_attention_bias, @@ -724,8 +724,7 @@ def transformer_prepare_decoder(targets, hparams, features=None): decoder_input, targets_position) else: decoder_input = common_attention.add_timing_signal_1d(decoder_input) - if (hparams.activation_dtype == "bfloat16" or - hparams.weight_dtype == "bfloat16"): + if 
hparams.activation_dtype == "bfloat16": decoder_self_attention_bias = tf.cast(decoder_self_attention_bias, tf.bfloat16) return (decoder_input, decoder_self_attention_bias) @@ -774,8 +773,8 @@ def transformer_encoder(encoder_input, pad_remover = None if hparams.use_pad_remover and not common_layers.is_on_tpu(): pad_remover = expert_utils.PadRemover(padding) - for layer in xrange(hparams.num_encoder_layers or - hparams.num_hidden_layers): + for layer in range(hparams.num_encoder_layers or + hparams.num_hidden_layers): with tf.variable_scope("layer_%d" % layer): with tf.variable_scope("self_attention"): y = common_attention.multihead_attention( @@ -845,8 +844,8 @@ def transformer_decoder(decoder_input, common_layers.comma_separated_string_to_integer_list( getattr(hparams, "attention_dropout_broadcast_dims", ""))) with tf.variable_scope(name): - for layer in xrange(hparams.num_decoder_layers or - hparams.num_hidden_layers): + for layer in range(hparams.num_decoder_layers or + hparams.num_hidden_layers): layer_name = "layer_%d" % layer layer_cache = cache[layer_name] if cache is not None else None with tf.variable_scope(layer_name): @@ -1592,10 +1591,15 @@ def transformer_supervised_attention(): @registry.register_hparams def transformer_tpu_1b(): - """Hparams for training with 1B parameters.""" + """Hparams for machine translation with ~1.1B parameters.""" hparams = transformer_tpu() hparams.hidden_size = 2048 hparams.filter_size = 8192 hparams.num_hidden_layers = 8 + # smaller batch size to avoid OOM hparams.batch_size = 1024 + hparams.activation_dtype = "bfloat16" + hparams.weight_dtype = "bfloat16" + # maximize number of parameters relative to computation by not sharing. + hparams.shared_embedding_and_softmax_weights = False return hparams diff --git a/tensor2tensor/models/transformer_test.py b/tensor2tensor/models/transformer_test.py index 9b5d6fe4d..7eaf0e285 100644 --- a/tensor2tensor/models/transformer_test.py +++ b/tensor2tensor/models/transformer_test.py @@ -35,36 +35,38 @@ VOCAB_SIZE = 10 -class TransformerTest(tf.test.TestCase): +def get_model(hparams=None, mode=tf.estimator.ModeKeys.TRAIN, + has_input=True, model_cls=transformer.Transformer): + if hparams is None: + hparams = transformer.transformer_tiny() + hparams.hidden_size = 8 + hparams.filter_size = 32 + hparams.num_heads = 1 + hparams.layer_prepostprocess_dropout = 0.0 + + p_hparams = problem_hparams.test_problem_hparams(VOCAB_SIZE, VOCAB_SIZE) + if not has_input: + p_hparams.input_modality = {} + hparams.problem_hparams = p_hparams + + inputs = -1 + np.random.random_integers( + VOCAB_SIZE, size=(BATCH_SIZE, INPUT_LENGTH, 1, 1)) + targets = -1 + np.random.random_integers( + VOCAB_SIZE, size=(BATCH_SIZE, TARGET_LENGTH, 1, 1)) + features = { + "targets": tf.constant(targets, dtype=tf.int32, name="targets"), + "target_space_id": tf.constant(1, dtype=tf.int32) + } + if has_input: + features["inputs"] = tf.constant(inputs, dtype=tf.int32, name="inputs") + + return model_cls(hparams, mode, p_hparams), features - def getModel(self, hparams=None, mode=tf.estimator.ModeKeys.TRAIN, - has_input=True, model_cls=transformer.Transformer): - if hparams is None: - hparams = transformer.transformer_tiny() - hparams.hidden_size = 8 - hparams.filter_size = 32 - hparams.num_heads = 1 - hparams.layer_prepostprocess_dropout = 0.0 - - p_hparams = problem_hparams.test_problem_hparams(VOCAB_SIZE, VOCAB_SIZE) - if not has_input: - p_hparams.input_modality = {} - hparams.problems = [p_hparams] - - inputs = -1 + np.random.random_integers( - VOCAB_SIZE, 
size=(BATCH_SIZE, INPUT_LENGTH, 1, 1)) - targets = -1 + np.random.random_integers( - VOCAB_SIZE, size=(BATCH_SIZE, TARGET_LENGTH, 1, 1)) - features = { - "inputs": tf.constant(inputs, dtype=tf.int32, name="inputs"), - "targets": tf.constant(targets, dtype=tf.int32, name="targets"), - "target_space_id": tf.constant(1, dtype=tf.int32) - } - return model_cls(hparams, mode, p_hparams), features +class TransformerTest(tf.test.TestCase): def testTransformer(self): - model, features = self.getModel(transformer.transformer_small()) + model, features = get_model(transformer.transformer_small()) logits, _ = model(features) with self.test_session() as session: session.run(tf.global_variables_initializer()) @@ -72,17 +74,17 @@ def testTransformer(self): self.assertEqual(res.shape, (BATCH_SIZE, TARGET_LENGTH, 1, 1, VOCAB_SIZE)) def testTransformerRelative(self): - model, features = self.getModel(transformer.transformer_relative_tiny()) + model, features = get_model(transformer.transformer_relative_tiny()) logits, _ = model(features) with self.test_session() as session: session.run(tf.global_variables_initializer()) res = session.run(logits) self.assertEqual(res.shape, (BATCH_SIZE, TARGET_LENGTH, 1, 1, VOCAB_SIZE)) - def testGreedyVsFast(self): - model, features = self.getModel(transformer.transformer_small()) + def testSlowVsFast(self): + model, features = get_model(transformer.transformer_small()) - decode_length = 2 + decode_length = 3 out_logits, _ = model(features) out_logits = tf.squeeze(out_logits, axis=[2, 3]) @@ -114,10 +116,10 @@ def testGreedyVsFast(self): self.assertAllClose(greedy_res, fast_res) def testSlowVsFastNoInput(self): - model, features = self.getModel( + model, features = get_model( transformer.transformer_small(), has_input=False) - decode_length = 2 + decode_length = 3 out_logits, _ = model(features) out_logits = tf.squeeze(out_logits, axis=[2, 3]) @@ -145,12 +147,12 @@ def testSlowVsFastNoInput(self): slow_res = slow_result.eval() fast_res = fast_result.eval() - self.assertEqual(fast_res.shape, (BATCH_SIZE, decode_length)) + self.assertEqual(slow_res.shape, (BATCH_SIZE, decode_length)) self.assertAllClose(slow_res, fast_res) def testBeamDecodeWithRelativeAttention(self): decode_length = 2 - model, features = self.getModel(transformer.transformer_relative_tiny()) + model, features = get_model(transformer.transformer_relative_tiny()) model(features) model.set_mode(tf.estimator.ModeKeys.PREDICT) @@ -166,7 +168,7 @@ def testBeamDecodeWithRelativeAttention(self): self.assertEqual(beam_res.shape, (BATCH_SIZE, INPUT_LENGTH + decode_length)) def testBeamVsFast(self): - model, features = self.getModel(transformer.transformer_small()) + model, features = get_model(transformer.transformer_small()) decode_length = 2 @@ -204,9 +206,8 @@ def testBeamVsFast(self): beam_res = beam_result.eval() fast_res = fast_result.eval() - # TODO(rsepassi): Fix decode length. Broken by cl/190537320. 
- # self.assertEqual(fast_res.shape, - # (BATCH_SIZE, INPUT_LENGTH + decode_length)) + self.assertEqual(fast_res.shape, + (BATCH_SIZE, INPUT_LENGTH + decode_length)) self.assertAllClose(beam_res, fast_res) def testTransformerWithoutProblem(self): @@ -230,7 +231,7 @@ def testTransformerWithoutProblem(self): [BATCH_SIZE, TARGET_LENGTH, 1, hparams.hidden_size]) def testTransformerWithEncoderDecoderAttentionLoss(self): - model, features = self.getModel( + model, features = get_model( transformer.transformer_supervised_attention()) expected_attention_weights = np.random.random_sample( size=(BATCH_SIZE, TARGET_LENGTH, INPUT_LENGTH)) @@ -243,10 +244,10 @@ def testTransformerWithEncoderDecoderAttentionLoss(self): self.assertEqual(res.shape, ()) -class TransformerScorerTest(TransformerTest): +class TransformerScorerTest(tf.test.TestCase): def testReturnsScores(self): - model, features = self.getModel( + model, features = get_model( mode=tf.estimator.ModeKeys.PREDICT, model_cls=transformer.TransformerScorer) infer_out = model.infer(features) @@ -261,21 +262,21 @@ def testReturnsScores(self): def testVarNames(self): with tf.Graph().as_default(): - model, features = self.getModel( + model, features = get_model( mode=tf.estimator.ModeKeys.PREDICT, model_cls=transformer.TransformerScorer) _ = model.infer(features) scorer_vars = [v.name for v in tf.global_variables()] with tf.Graph().as_default(): - model, features = self.getModel( + model, features = get_model( mode=tf.estimator.ModeKeys.EVAL, model_cls=transformer.TransformerScorer) _ = model(features) scorer_eval_vars = [v.name for v in tf.global_variables()] with tf.Graph().as_default(): - model, features = self.getModel( + model, features = get_model( mode=tf.estimator.ModeKeys.EVAL, model_cls=transformer.Transformer) _ = model(features) diff --git a/tensor2tensor/models/xception.py b/tensor2tensor/models/xception.py index d56321049..bec758687 100644 --- a/tensor2tensor/models/xception.py +++ b/tensor2tensor/models/xception.py @@ -23,7 +23,7 @@ # Dependency imports -from six.moves import xrange # pylint: disable=redefined-builtin +from six.moves import range # pylint: disable=redefined-builtin from tensor2tensor.layers import common_hparams from tensor2tensor.layers import common_layers @@ -36,7 +36,7 @@ def residual_block(x, hparams): """A stack of convolution blocks with residual connection.""" k = (hparams.kernel_height, hparams.kernel_width) - dilations_and_kernels = [((1, 1), k) for _ in xrange(3)] + dilations_and_kernels = [((1, 1), k) for _ in range(3)] y = common_layers.subseparable_conv_block( x, hparams.hidden_size, @@ -66,7 +66,7 @@ def xception_internal(inputs, hparams): force2d=True, name="small_image_conv") - for i in xrange(hparams.num_hidden_layers): + for i in range(hparams.num_hidden_layers): with tf.variable_scope("layer_%d" % i): cur = residual_block(cur, hparams) diff --git a/tensor2tensor/notebooks/hello_t2t.ipynb b/tensor2tensor/notebooks/hello_t2t.ipynb index 7d56dbede..820744500 100644 --- a/tensor2tensor/notebooks/hello_t2t.ipynb +++ b/tensor2tensor/notebooks/hello_t2t.ipynb @@ -783,7 +783,7 @@ "\n", "def to_tokens(ids):\n", " ids = np.squeeze(ids)\n", - " subtokenizer = hparams.problems[0].vocabulary['targets']\n", + " subtokenizer = hparams.problem_hparams.vocabulary['targets']\n", " tokens = []\n", " for _id in ids:\n", " if _id == 0:\n", diff --git a/tensor2tensor/rl/README.md b/tensor2tensor/rl/README.md index b163a16a5..7c723126f 100644 --- a/tensor2tensor/rl/README.md +++ b/tensor2tensor/rl/README.md @@ -13,7 +13,7 
@@ Currently the only supported algorithm is Proximal Policy Optimization - PPO. ``` python rl/t2t_rl_trainer.py \ - --problems=Pendulum-v0 \ + --problem=Pendulum-v0 \ --hparams_set continuous_action_base \ [--output_dir dir_location] ``` @@ -45,7 +45,7 @@ python tensor2tensor/bin/t2t-trainer \ --generate_data \ --data_dir=~/t2t_data \ --output_dir=~/t2t_data/output \ - --problems=gym_pong_random5k \ + --problem=gym_pong_random5k \ --model=basic_conv_gen \ --hparams_set=basic_conv_small \ --train_steps=1000 \ diff --git a/tensor2tensor/rl/envs/simulated_batch_env.py b/tensor2tensor/rl/envs/simulated_batch_env.py index 69dfcff94..5d4a7e066 100644 --- a/tensor2tensor/rl/envs/simulated_batch_env.py +++ b/tensor2tensor/rl/envs/simulated_batch_env.py @@ -22,12 +22,12 @@ from __future__ import division from __future__ import print_function +import os + # Dependency imports import gym -import pkg_resources - from tensor2tensor.rl.envs.in_graph_batch_env import InGraphBatchEnv from tensor2tensor.utils import registry from tensor2tensor.utils import trainer_lib @@ -39,6 +39,9 @@ FLAGS = flags.FLAGS +flags.DEFINE_string("frames_path", "", "Path to the first frames.") + + class SimulatedBatchEnv(InGraphBatchEnv): """Batch of environments inside the TensorFlow graph. @@ -52,7 +55,7 @@ def __init__(self, length, observ_shape, observ_dtype, action_shape, """Batch of environments inside the TensorFlow graph.""" self.length = length hparams = trainer_lib.create_hparams( - FLAGS.hparams_set, problem_name=FLAGS.problems, data_dir="UNUSED") + FLAGS.hparams_set, problem_name=FLAGS.problem, data_dir="UNUSED") hparams.force_full_predict = True self._model = registry.model(FLAGS.model)( hparams, tf.estimator.ModeKeys.PREDICT) @@ -60,12 +63,10 @@ def __init__(self, length, observ_shape, observ_dtype, action_shape, self.action_shape = action_shape self.action_dtype = action_dtype - with open(pkg_resources.resource_filename( - "tensor2tensor.rl.envs", "frame1.png"), "rb") as f: + with open(os.path.join(FLAGS.frames_path, "frame1.png"), "rb") as f: png_frame_1_raw = f.read() - with open(pkg_resources.resource_filename( - "tensor2tensor.rl.envs", "frame2.png"), "rb") as f: + with open(os.path.join(FLAGS.frames_path, "frame2.png"), "rb") as f: png_frame_2_raw = f.read() self.frame_1 = tf.expand_dims(tf.cast(tf.image.decode_png(png_frame_1_raw), @@ -81,13 +82,6 @@ def __init__(self, length, observ_shape, observ_dtype, action_shape, trainable=False) observ_dtype = tf.int64 - self._observ_not_sure_why_we_need_this = tf.Variable( - tf.zeros((self.length,) + observ_shape, observ_dtype), - name="observ_new", trainable=False) - - self._reward_not_sure_why_we_need_this = tf.Variable( - tf.zeros((self.length, 1), observ_dtype), - name="reward_new", trainable=False) @property def action_space(self): @@ -99,15 +93,19 @@ def __len__(self): def simulate(self, action): with tf.name_scope("environment/simulate"): - inputs = {"inputs_0": self._prev_observ.read_value(), - "inputs_1": self._observ.read_value(), - "action": action, - "targets": self._observ_not_sure_why_we_need_this, - "reward": self._reward_not_sure_why_we_need_this} - model_output = self._model(inputs) - observ_expaned = model_output[0]["targets"] - observ = tf.cast(tf.argmax(observ_expaned, axis=-1), tf.float32) - reward = tf.constant(0, tf.float32, shape=(self.length,)) + input0 = self._prev_observ.read_value() + input1 = self._observ.read_value() + # Note: the merging below must be consistent with video_utils format. 
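+ # Concretely: the two frames are stacked along a leading (time) axis, the
+ # action is duplicated so each frame has a matching time step, and a batch
+ # axis is added to both before calling infer(). The model returns argmax
+ # frames and a reward class index; the "- 1" below shifts that index so
+ # the predicted reward lands in the raw {-1, 0, 1} range.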
+ inputs_merged = tf.concat([input0, input1], axis=0) + action = tf.expand_dims(action, axis=0) # Action needs time too. + action = tf.concat([action, action], axis=0) + inputs = {"inputs": tf.expand_dims(inputs_merged, axis=0), # Add batch. + "input_action": tf.expand_dims(action, axis=0)} + model_output = self._model.infer(inputs) + observ = model_output["targets"] + observ = tf.cast(observ[:, 0, :, :, :], tf.float32) + reward = model_output["target_reward"][:, 0, 0, 0] - 1 + reward = tf.cast(reward, tf.float32) done = tf.constant(False, tf.bool, shape=(self.length,)) with tf.control_dependencies([observ]): diff --git a/tensor2tensor/rl/envs/tf_atari_wrappers.py b/tensor2tensor/rl/envs/tf_atari_wrappers.py index 61bff7ab2..cfe25caa2 100644 --- a/tensor2tensor/rl/envs/tf_atari_wrappers.py +++ b/tensor2tensor/rl/envs/tf_atari_wrappers.py @@ -53,7 +53,7 @@ class TransformWrapper(WrapperBase): def __init__(self, batch_env, transform_observation=None, transform_reward=tf.identity, transform_done=tf.identity): - super().__init__(batch_env) + super(TransformWrapper, self).__init__(batch_env) if transform_observation is not None: _, observ_shape, observ_dtype = transform_observation # pylint: disable=unpacking-non-sequence self._observ = tf.Variable( @@ -88,7 +88,7 @@ def __init__(self, batch_env): nature_transform = lambda o: tf.image.rgb_to_grayscale( # pylint: disable=g-long-lambda tf.image.resize_images(o, dims)) - super().__init__(batch_env, transform_observation=( + super(WarpFrameWrapper, self).__init__(batch_env, transform_observation=( nature_transform, dims, tf.float32)) @@ -97,14 +97,15 @@ class ShiftRewardWrapper(TransformWrapper): def __init__(self, batch_env, add_value): shift_reward = lambda r: tf.add(r, add_value) - super().__init__(batch_env, transform_reward=shift_reward) + super(ShiftRewardWrapper, self).__init__( + batch_env, transform_reward=shift_reward) class MaxAndSkipWrapper(WrapperBase): """Max and skip wrapper.""" def __init__(self, batch_env, skip=4): - super().__init__(batch_env) + super(MaxAndSkipWrapper, self).__init__(batch_env) self.skip = skip self._observ = None observs_shape = batch_env.observ.shape @@ -141,7 +142,7 @@ class TimeLimitWrapper(WrapperBase): # TODO(lukaszkaiser): Check if TimeLimitWrapper does what it's supposed to do. 
def __init__(self, batch_env, timelimit=100): - super().__init__(batch_env) + super(TimeLimitWrapper, self).__init__(batch_env) self.timelimit = timelimit self._time_elapsed = tf.Variable(tf.zeros((len(self),), tf.int32), trainable=False) @@ -167,7 +168,7 @@ class MemoryWrapper(WrapperBase): """Memory wrapper.""" def __init__(self, batch_env): - super().__init__(batch_env) + super(MemoryWrapper, self).__init__(batch_env) MemoryWrapper.singleton = self assert self._length == 1, "We support only one environment" infinity = 10000000 diff --git a/tensor2tensor/rl/model_rl_experiment.py b/tensor2tensor/rl/model_rl_experiment.py index 8b691a203..139cf6f46 100644 --- a/tensor2tensor/rl/model_rl_experiment.py +++ b/tensor2tensor/rl/model_rl_experiment.py @@ -22,11 +22,11 @@ # Dependency imports -from tensor2tensor import problems from tensor2tensor.bin import t2t_trainer from tensor2tensor.rl import rl_trainer_lib from tensor2tensor.rl.envs.tf_atari_wrappers import ShiftRewardWrapper from tensor2tensor.rl.envs.tf_atari_wrappers import TimeLimitWrapper +from tensor2tensor.utils import registry from tensor2tensor.utils import trainer_lib import tensorflow as tf @@ -52,11 +52,10 @@ def train(hparams, output_dir): time_delta = time.time() - start_time print(line+"Step {}.1. - generate data from policy. " "Time: {}".format(iloop, str(datetime.timedelta(seconds=time_delta)))) - # FLAGS.problems = "gym_discrete_problem_with_agent" - FLAGS.problems = "gym_discrete_problem_with_agent2" + FLAGS.problem = "gym_discrete_problem_with_agent" FLAGS.agent_policy_path = last_model - gym_problem = problems.problem(FLAGS.problems) - # gym_problem.num_steps = hparams.true_env_generator_num_steps + gym_problem = registry.problem(FLAGS.problem) + gym_problem.settable_num_steps = hparams.true_env_generator_num_steps iter_data_dir = os.path.join(data_dir, str(iloop)) tf.gfile.MakeDirs(iter_data_dir) gym_problem.generate_data(iter_data_dir, tmp_dir) @@ -67,23 +66,25 @@ def train(hparams, output_dir): # 2. generate env model FLAGS.data_dir = iter_data_dir FLAGS.output_dir = output_dir - # FLAGS.model = hparams.generative_model - FLAGS.model = "basic_conv_gen" - # FLAGS.model = "michigan_basic_conv_gen" + FLAGS.model = hparams.generative_model FLAGS.hparams_set = hparams.generative_model_params - # FLAGS.train_steps = hparams.model_train_steps + FLAGS.train_steps = hparams.model_train_steps FLAGS.train_steps = 1 FLAGS.eval_steps = 1 t2t_trainer.main([]) + # Dump frames from env model. time_delta = time.time() - start_time print(line+"Step {}.3. - evaluate env model. " "Time: {}".format(iloop, str(datetime.timedelta(seconds=time_delta)))) - gym_simulated_problem = problems.problem("gym_simulated_discrete_problem_with_agent") - gym_simulated_problem.num_steps = hparams.simulated_env_generator_num_steps + gym_simulated_problem = registry.problem( + "gym_simulated_discrete_problem_with_agent") + sim_steps = hparams.simulated_env_generator_num_steps + gym_simulated_problem.settable_num_steps = sim_steps gym_simulated_problem.generate_data(iter_data_dir, tmp_dir) - # time_delta = time.time() - start_time + # PPO. + time_delta = time.time() - start_time print(line + "Step {}.4. - train PPO in model env." 
" Time: {}".format(iloop, str(datetime.timedelta(seconds=time_delta)))) @@ -100,7 +101,8 @@ def train(hparams, output_dir): (ShiftRewardWrapper, {"add_value": -2})] in_graph_wrappers += gym_problem.in_graph_wrappers ppo_hparams.add_hparam("in_graph_wrappers", in_graph_wrappers) - rl_trainer_lib.train(ppo_hparams, "PongNoFrameskip-v4", ppo_dir) + ppo_hparams.num_agents = 1 + rl_trainer_lib.train(ppo_hparams, "PongDeterministic-v4", ppo_dir) last_model = ppo_dir + "/model{}.ckpt".format(ppo_epochs_num) @@ -109,14 +111,14 @@ def main(_): hparams = tf.contrib.training.HParams( epochs=100, true_env_generator_num_steps=100, - generative_model="static_basic_conv_gen", - generative_model_params="basic_conv_small", - model_train_steps=80, + generative_model="basic_conv_gen", + generative_model_params="basic_conv", + model_train_steps=5000, simulated_env_generator_num_steps=300, ppo_epochs_num=2, ppo_epoch_length=300, ) - train(hparams, tempfile.mkdtemp()) + train(hparams, FLAGS.output_dir) if __name__ == "__main__": diff --git a/tensor2tensor/rl/t2t_rl_trainer.py b/tensor2tensor/rl/t2t_rl_trainer.py index bd3780a9b..188433789 100644 --- a/tensor2tensor/rl/t2t_rl_trainer.py +++ b/tensor2tensor/rl/t2t_rl_trainer.py @@ -36,7 +36,7 @@ def main(_): hparams = trainer_lib.create_hparams(FLAGS.hparams_set, FLAGS.hparams) - rl_trainer_lib.train(hparams, FLAGS.problems, FLAGS.output_dir) + rl_trainer_lib.train(hparams, FLAGS.problem, FLAGS.output_dir) if __name__ == "__main__": diff --git a/tensor2tensor/serving/README.md b/tensor2tensor/serving/README.md index 633479132..8bb35da27 100644 --- a/tensor2tensor/serving/README.md +++ b/tensor2tensor/serving/README.md @@ -3,8 +3,6 @@ Tensor2Tensor and the TensorFlow ecosystem make it easy to serve a model once trained. -**Note**: Requires TF 1.5+. - ## 1. 
Export for Serving First, export it for serving: @@ -13,7 +11,7 @@ First, export it for serving: t2t-exporter \ --model=transformer \ --hparams_set=transformer_tiny \ - --problems=translate_ende_wmt8k \ + --problem=translate_ende_wmt8k \ --data_dir=~/t2t/data \ --output_dir=/tmp/t2t_train ``` diff --git a/tensor2tensor/serving/export.py b/tensor2tensor/serving/export.py index 5fffdbff6..5b5dccf5b 100644 --- a/tensor2tensor/serving/export.py +++ b/tensor2tensor/serving/export.py @@ -45,7 +45,7 @@ def create_hparams(): FLAGS.hparams_set, FLAGS.hparams, data_dir=os.path.expanduser(FLAGS.data_dir), - problem_name=FLAGS.problems) + problem_name=FLAGS.problem) def main(_): @@ -61,7 +61,7 @@ def main(_): estimator = create_estimator(run_config, hparams) - problem = hparams.problem_instances[0] + problem = hparams.problem strategy = trainer_lib.create_export_strategy(problem, hparams) export_dir = os.path.join(ckpt_dir, "export", strategy.name) diff --git a/tensor2tensor/serving/query.py b/tensor2tensor/serving/query.py index 9d8eed092..1af0e9f2d 100644 --- a/tensor2tensor/serving/query.py +++ b/tensor2tensor/serving/query.py @@ -87,14 +87,16 @@ def main(_): while True: inputs = FLAGS.inputs_once if FLAGS.inputs_once else input(">> ") outputs = serving_utils.predict([inputs], problem, request_fn) + outputs, = outputs + output, score = outputs print_str = """ Input: {inputs} -Output: -{outputs} +Output (Score {score:.3f}): +{output} """ - print(print_str.format(inputs=inputs, outputs=outputs[0])) + print(print_str.format(inputs=inputs, output=output, score=score)) if FLAGS.inputs_once: break diff --git a/tensor2tensor/serving/serving_utils.py b/tensor2tensor/serving/serving_utils.py index e22ddfb2c..5bb2fe724 100644 --- a/tensor2tensor/serving/serving_utils.py +++ b/tensor2tensor/serving/serving_utils.py @@ -119,7 +119,8 @@ def predict(inputs_list, problem, request_fn): predictions = request_fn(examples) output_decoder = problem.feature_info["targets"].encoder outputs = [ - _decode(prediction["outputs"], output_decoder) + (_decode(prediction["outputs"], output_decoder), + prediction["scores"]) for prediction in predictions ] return outputs diff --git a/tensor2tensor/test_data/transformer_test_ckpt/flags.txt b/tensor2tensor/test_data/transformer_test_ckpt/flags.txt index 2587e3e2d..2ecee8328 100644 --- a/tensor2tensor/test_data/transformer_test_ckpt/flags.txt +++ b/tensor2tensor/test_data/transformer_test_ckpt/flags.txt @@ -27,7 +27,7 @@ --ps_job=/job:ps --tmp_dir=/tmp/t2t_datagen --schedule=continuous_train_and_eval ---problems=translate_ende_wmt8k +--problem=translate_ende_wmt8k --hparams= --use_tpu=False --eval_early_stopping_metric_delta=0.1 diff --git a/tensor2tensor/test_data/transformer_test_ckpt/hparams.json b/tensor2tensor/test_data/transformer_test_ckpt/hparams.json index b07ac9486..df9a654c0 100644 --- a/tensor2tensor/test_data/transformer_test_ckpt/hparams.json +++ b/tensor2tensor/test_data/transformer_test_ckpt/hparams.json @@ -1 +1 @@ -{"daisy_chain_variables": true, "optimizer_adam_beta1": 0.9, "scheduled_sampling_prob": 0.0, "num_hidden_layers": 2, "moe_loss_coef": 0.01, "max_target_seq_length": 0, "clip_grad_norm": 0.0, "pos": "timing", "scheduled_sampling_gold_mixin_prob": 0.5, "initializer": "uniform_unit_scaling", "grad_noise_scale": 0.0, "optimizer_momentum_momentum": 0.9, "nbr_decoder_problems": 1, "attention_key_channels": 0, "eval_drop_long_sequences": false, "learning_rate_cosine_cycle_steps": 250000, "prepend_mode": "none", "weight_decay": 0.0, "symbol_modality_skip_top": 
false, "weight_noise": 0.0, "target_modality": "default", "attention_dropout": 0.1, "parameter_attention_value_channels": 0, "factored_logits": false, "relu_dropout": 0.1, "no_data_parallelism": false, "layer_preprocess_sequence": "n", "sampling_method": "argmax", "learning_rate": 0.2, "num_heads": 2, "max_length": 256, "summarize_grads": false, "attention_value_channels": 0, "num_encoder_layers": 0, "label_smoothing": 0.1, "use_fixed_batch_size": false, "optimizer": "Adam", "moe_k": 2, "self_attention_type": "dot_product", "learning_rate_decay_scheme": "noam", "sampling_temp": 1.0, "kernel_height": 3, "use_pad_remover": true, "batch_size": 4096, "problem_choice": "adaptive", "max_relative_position": 0, "force_full_predict": false, "min_length_bucket": 8, "layer_prepostprocess_dropout": 0.1, "eval_run_autoregressive": false, "shared_embedding_and_softmax_weights": true, "symbol_modality_num_shards": 16, "dropout": 0.2, "compress_steps": 0, "parameter_attention_key_channels": 0, "length_bucket_step": 1.1, "kernel_width": 1, "hidden_size": 16, "num_decoder_layers": 0, "input_modalities": "default", "filter_size": 8, "optimizer_adam_beta2": 0.98, "scheduled_sampling_warmup_steps": 50000, "norm_type": "layer", "min_length": 0, "moe_num_experts": 64, "multiply_embedding_mode": "sqrt_depth", "max_input_seq_length": 0, "learning_rate_warmup_steps": 8000, "proximity_bias": false, "ffn_layer": "dense_relu_dense", "initializer_gain": 1.0, "layer_postprocess_sequence": "da", "moe_hidden_sizes": "2048", "optimizer_adam_epsilon": 1e-09, "norm_epsilon": 1e-06} +{"daisy_chain_variables": true, "optimizer_adam_beta1": 0.9, "scheduled_sampling_prob": 0.0, "num_hidden_layers": 2, "moe_loss_coef": 0.01, "max_target_seq_length": 0, "clip_grad_norm": 0.0, "pos": "timing", "scheduled_sampling_gold_mixin_prob": 0.5, "initializer": "uniform_unit_scaling", "grad_noise_scale": 0.0, "optimizer_momentum_momentum": 0.9, "nbr_decoder_problems": 1, "attention_key_channels": 0, "eval_drop_long_sequences": false, "learning_rate_cosine_cycle_steps": 250000, "prepend_mode": "none", "weight_decay": 0.0, "symbol_modality_skip_top": false, "weight_noise": 0.0, "target_modality": "default", "attention_dropout": 0.1, "parameter_attention_value_channels": 0, "factored_logits": false, "relu_dropout": 0.1, "no_data_parallelism": false, "layer_preprocess_sequence": "n", "sampling_method": "argmax", "learning_rate": 0.2, "num_heads": 2, "max_length": 256, "summarize_grads": false, "attention_value_channels": 0, "num_encoder_layers": 0, "label_smoothing": 0.1, "use_fixed_batch_size": false, "optimizer": "Adam", "moe_k": 2, "self_attention_type": "dot_product", "learning_rate_decay_scheme": "noam", "sampling_temp": 1.0, "kernel_height": 3, "use_pad_remover": true, "batch_size": 4096, "max_relative_position": 0, "force_full_predict": false, "min_length_bucket": 8, "layer_prepostprocess_dropout": 0.1, "eval_run_autoregressive": false, "shared_embedding_and_softmax_weights": true, "symbol_modality_num_shards": 16, "dropout": 0.2, "compress_steps": 0, "parameter_attention_key_channels": 0, "length_bucket_step": 1.1, "kernel_width": 1, "hidden_size": 16, "num_decoder_layers": 0, "input_modalities": "default", "filter_size": 8, "optimizer_adam_beta2": 0.98, "scheduled_sampling_warmup_steps": 50000, "norm_type": "layer", "min_length": 0, "moe_num_experts": 64, "multiply_embedding_mode": "sqrt_depth", "max_input_seq_length": 0, "learning_rate_warmup_steps": 8000, "proximity_bias": false, "ffn_layer": "dense_relu_dense", "initializer_gain": 
1.0, "layer_postprocess_sequence": "da", "moe_hidden_sizes": "2048", "optimizer_adam_epsilon": 1e-09, "norm_epsilon": 1e-06} diff --git a/tensor2tensor/utils/adafactor.py b/tensor2tensor/utils/adafactor.py index 11daaf890..69a240733 100644 --- a/tensor2tensor/utils/adafactor.py +++ b/tensor2tensor/utils/adafactor.py @@ -19,6 +19,7 @@ from __future__ import print_function # Dependency imports +from tensor2tensor.utils import quantization import tensorflow as tf @@ -28,7 +29,6 @@ class AdafactorOptimizer(tf.train.Optimizer): Adafactor is described in https://arxiv.org/abs/1804.04235. - Adafactor is most similar to Adam (Kingma and Ba), the major differences are: 1. For a two-dimensional AxB weight matrix, Adafactor uses only A+B auxiliary @@ -108,6 +108,7 @@ def __init__(self, clipping_threshold=1.0, factored=True, simulated_quantize_bits=None, + parameter_encoding=None, use_locking=False, name="Adafactor"): """Construct a new Adafactor optimizer. @@ -124,6 +125,8 @@ def __init__(self, for 2d variables simulated_quantize_bits: train with simulated quantized parameters (experimental) + parameter_encoding: a ParameterEncoding object to use in the case of + bfloat16 variables. use_locking: If True use locks for update operations. name: Optional name for the operations created when applying gradients. Defaults to "AdafactorOptimizer". @@ -144,8 +147,8 @@ def __init__(self, self._clipping_threshold = clipping_threshold self._factored = factored self._simulated_quantize_bits = simulated_quantize_bits - if self._simulated_quantize_bits: - self._quantization_noise = _quantization_noise_from_step_num() + self._parameter_encoding = parameter_encoding + self._quantization_noise = quantization.noise_from_step_num() def _should_use_factored_second_moment_estimate(self, shape): """Should we use a factored second moment estimator. @@ -201,8 +204,11 @@ def _resource_apply_dense(self, grad, var): grad_squared_mean = tf.reduce_mean(grad_squared) decay_rate = self._decay_rate update_scale = self._learning_rate + old_val = var + if var.dtype.base_dtype == tf.bfloat16: + old_val = tf.to_float(self._parameter_encoding.decode(old_val)) if self._multiply_by_parameter_scale: - update_scale *= tf.to_float(self._parameter_scale(var)) + update_scale *= tf.to_float(self._parameter_scale(old_val)) # HACK: Make things dependent on grad. # This confounds the XLA rewriter and keeps it from fusing computations # across different variables. 
This fusion is bad for HBM usage, since @@ -243,11 +249,12 @@ def _resource_apply_dense(self, grad, var): subtrahend = new_m new_m = tf.cast(new_m, var.dtype) updates.append(tf.assign(m, new_m, use_locking=self._use_locking)) - new_val = tf.to_float(var) - subtrahend - if var.dtype == tf.bfloat16: - new_val = _to_bfloat16_unbiased(new_val) + new_val = tf.to_float(old_val) - subtrahend + if var.dtype.base_dtype == tf.bfloat16: + new_val = self._parameter_encoding.encode( + new_val, self._quantization_noise) if self._simulated_quantize_bits: - new_val = _simulated_quantize( + new_val = quantization.simulated_quantize( var - subtrahend, self._simulated_quantize_bits, self._quantization_noise) var_update = tf.assign(var, new_val, use_locking=self._use_locking) @@ -312,6 +319,10 @@ def adafactor_optimizer_from_hparams(hparams, lr): hparams.optimizer_adafactor_memory_exponent) else: raise ValueError("unknown optimizer_adafactor_decay_type") + if hparams.weight_dtype == "bfloat16": + parameter_encoding = quantization.EighthPowerEncoding() + else: + parameter_encoding = None return AdafactorOptimizer( multiply_by_parameter_scale=( hparams.optimizer_adafactor_multiply_by_parameter_scale), @@ -322,145 +333,10 @@ def adafactor_optimizer_from_hparams(hparams, lr): hparams.optimizer_adafactor_factored, simulated_quantize_bits=getattr( hparams, "simulated_parameter_quantize_bits", 0), + parameter_encoding=parameter_encoding, use_locking=False, name="Adafactor") def reduce_rms(x): return tf.sqrt(tf.reduce_mean(tf.square(x))) - - -def _simulated_quantize(x, num_bits, quantization_noise): - """Simulate quantization to num_bits bits, with externally-stored scale. - - num_bits is the number of bits used to store each value. - quantization_noise is a float32 Tensor containing values in [0, 1). - Each value in quantization_noise should take different values across - different steps, approximating a uniform distribution over [0, 1). - In the case of replicated TPU training, quantization_noise should be identical - across replicas in order to keep the parameters identical across replicas. - - The natural choice for quantization_noise would be tf.random_uniform(), - but this is not possible for TPU, since there is currently no way to seed - the different cores to produce identical values across replicas. Instead we - use _quantization_noise_from_step_num() (see below). - - The quantization scheme is as follows: - - Compute the maximum absolute value by row (call this max_abs). - Store this either in an auxiliary variable or in an extra column. - - Divide the parameters by (max_abs / (2^(num_bits-1)-1)). This gives a - float32 value in the range [-2^(num_bits-1)-1, 2^(num_bits-1)-1] - - Unbiased randomized roundoff by adding quantization_noise and rounding down. - - This produces a signed integer with num_bits bits which can then be stored. - - Args: - x: a float32 Tensor - num_bits: an integer between 1 and 22 - quantization_noise: a float Tensor broadcastable to the shape of x. - - Returns: - a float32 Tensor - """ - shape = x.get_shape().as_list() - if not (len(shape) >= 2 and shape[-1] > 1): - return x - max_abs = tf.reduce_max(tf.abs(x), -1, keep_dims=True) + 1e-9 - max_int = 2 ** (num_bits - 1) - 1 - scale = max_abs / max_int - x /= scale - x = tf.floor(x + quantization_noise) - # dequantize before storing (since this is a simulation) - x *= scale - return x - - -def _quantization_noise_from_step_num(): - """A quantization noise equal to (phi * (step_num + 1)) mod 1.0. - - See _simulated_quantize. 
- - Returns: - a float32 scalar - """ - step = tf.to_int32(tf.train.get_or_create_global_step()) + 1 - phi = ((5 ** 0.5) - 1) / 2 - # Naive computation tf.mod(phi * step, 1.0) in float32 would be disastrous - # due to loss of precision when the step number gets large. - # Computation in doubles does not work on TPU, so we use this complicated - # alternative computation which does not suffer from these roundoff errors. - ret = 0.0 - for i in xrange(30): - ret += (((phi * (2 ** i)) % 1.0) # double-precision computation in python - * tf.to_float(tf.mod(step // (2 ** i), 2))) - return tf.mod(ret, 1.0) - - -def _randomized_roundoff_to_bfloat16(x, quantization_noise, cand1, cand2): - """Round-off x to cand1 or to cand2 in an unbiased way. - - Cand1 and cand2 are the same shape as x. - For every element of x, the corresponding elements of cand1 and cand2 should - be the two closest bfloat16 values to x. Order does not matter. - cand1 and cand2 must differ from each other. - - Args: - x: A float32 Tensor. - quantization_noise: A Tensor broadcastable to the shape of x containing - random uniform values in [0.0, 1.0]. - cand1: A bfloat16 Tensor the same shape as x. - cand2: A bfloat16 Tensor the same shape as x. - - Returns: - A bfloat16 Tensor. - """ - cand1_f = tf.to_float(cand1) - cand2_f = tf.to_float(cand2) - step_size = cand2_f - cand1_f - fpart = (x - cand1_f) / step_size - ret = tf.where(tf.greater(fpart, quantization_noise), cand2, cand1) - return ret - - -def _to_bfloat16_unbiased(x): - """Convert a float32 to a bfloat16 using randomized roundoff. - - Note: If this ever produces worse results than using float32 all the way - through, we should try to diagnose and fix it. There are several things - to try: - - 1. Encode parameter x for storage purposes as - _to_bfloat16_unbiased(tf.pow(x, 5)) . This gives 5x the - resolution while incurring overflow and underflow at 10^9 and 10^-9 - instead of 10^37 and 10^-37. Comes at a cost of extracting fifth roots - to decode parameters. Or use some other such scheme. - - 2. In this function, use actual random numbers, different for each parameter - as opposed to the same for every parameter in the graph. - - 3. Look for bugs in this function. - - Args: - x: A float32 Tensor. - Returns: - A float32 Tensor. - """ - # Not using random_uniform here due to a problem on TPU in that random seeds - # are not respected, which may cause the parameters on different replicas - # to go out-of-sync. - quantization_noise = _quantization_noise_from_step_num() - x_sign = tf.sign(x) - # Make sure x is positive. If it is zero, the two candidates are identical. - x = x * x_sign + 1e-30 - cand1 = tf.to_bfloat16(x) - cand1_f = tf.to_float(cand1) - # This relies on the fact that for a positive bfloat16 b, - # b * 1.005 gives you the next higher bfloat16 and b*0.995 gives you the - # next lower one. Both 1.005 and 0.995 are ballpark estimation. 
- cand2 = tf.to_bfloat16( - tf.where(tf.greater(x, cand1_f), cand1_f * 1.005, cand1_f * 0.995)) - ret = _randomized_roundoff_to_bfloat16(x, quantization_noise, cand1, cand2) - return ret * tf.to_bfloat16(x_sign) diff --git a/tensor2tensor/utils/beam_search.py b/tensor2tensor/utils/beam_search.py index 11d455c17..e42fd9621 100644 --- a/tensor2tensor/utils/beam_search.py +++ b/tensor2tensor/utils/beam_search.py @@ -356,11 +356,8 @@ def grow_topk(i, alive_seq, alive_log_probs, states): lambda t: _unmerge_beam_dim(t, batch_size, beam_size), flat_states) else: flat_logits = symbols_to_logits_fn(flat_ids) - if len(flat_logits.shape)>=3: - logits = tf.reshape(flat_logits, [batch_size, beam_size, -1]) - elif len(flat_logits.shape)<3: - logits = tf.reshape(flat_logits, [beam_size, batch_size, -1]) - logits = tf.transpose(logits, perm=[1, 0, 2]) + + logits = tf.reshape(flat_logits, [batch_size, beam_size, -1]) # Convert logits to normalized log probs candidate_log_probs = log_prob_from_logits(logits) diff --git a/tensor2tensor/utils/bleu_hook.py b/tensor2tensor/utils/bleu_hook.py index fa3d25213..e18b72c0b 100644 --- a/tensor2tensor/utils/bleu_hook.py +++ b/tensor2tensor/utils/bleu_hook.py @@ -25,18 +25,18 @@ import sys import time import unicodedata -#To fix issue:#706 -import io # Dependency imports import numpy as np import six # pylint: disable=redefined-builtin -from six.moves import xrange +from six.moves import range from six.moves import zip # pylint: enable=redefined-builtin +from tensor2tensor.data_generators import text_encoder + import tensorflow as tf @@ -53,8 +53,8 @@ def _get_ngrams(segment, max_order): with a count of how many times each n-gram occurred. """ ngram_counts = collections.Counter() - for order in xrange(1, max_order + 1): - for i in xrange(0, len(segment) - order + 1): + for order in range(1, max_order + 1): + for i in range(0, len(segment) - order + 1): ngram = tuple(segment[i:i + order]) ngram_counts[ngram] += 1 return ngram_counts @@ -102,7 +102,7 @@ def compute_bleu(reference_corpus, possible_matches_by_order[len(ngram)-1] += translation_ngram_counts[ngram] precisions = [0] * max_order smooth = 1.0 - for i in xrange(0, max_order): + for i in range(0, max_order): if possible_matches_by_order[i] > 0: precisions[i] = matches_by_order[i] / possible_matches_by_order[i] if matches_by_order[i] > 0: @@ -196,9 +196,10 @@ def bleu_tokenize(string): def bleu_wrapper(ref_filename, hyp_filename, case_sensitive=False): """Compute BLEU for two files (reference and hypothesis translation).""" - #To fix the issue #706 - ref_lines = io.open(ref_filename, 'rt', encoding='utf-8').read().splitlines() - hyp_lines = io.open(hyp_filename, 'rt', encoding='utf-8').read().splitlines() + ref_lines = text_encoder.native_to_unicode( + tf.gfile.Open(ref_filename, "r").read()).splitlines() + hyp_lines = text_encoder.native_to_unicode( + tf.gfile.Open(hyp_filename, "r").read()).splitlines() assert len(ref_lines) == len(hyp_lines) if not case_sensitive: ref_lines = [x.lower() for x in ref_lines] diff --git a/tensor2tensor/utils/cloud_mlengine.py b/tensor2tensor/utils/cloud_mlengine.py index 703f1fa0d..36e3bfcb5 100755 --- a/tensor2tensor/utils/cloud_mlengine.py +++ b/tensor2tensor/utils/cloud_mlengine.py @@ -23,6 +23,8 @@ from googleapiclient import discovery from oauth2client.client import GoogleCredentials + +from tensor2tensor.data_generators import text_encoder from tensor2tensor.layers import common_hparams from tensor2tensor.utils import cloud_tpu as cloud from tensor2tensor.utils import 
registry @@ -31,7 +33,7 @@ FLAGS = tf.flags.FLAGS -CONSOLE_URL = 'https://console.cloud.google.com/mlengine/jobs/' +CONSOLE_URL = "https://console.cloud.google.com/mlengine/jobs/" # TODO(rsepassi): # * Enable multi-machine sync/async training @@ -44,8 +46,8 @@ def get_setup_file(name, packages=None): from setuptools import find_packages from setuptools import setup setup( - name='{name}', - version='0.1', + name="{name}", + version="0.1", packages=find_packages(), install_requires={pypi_packages} ) @@ -54,53 +56,53 @@ def get_setup_file(name, packages=None): def job_dir(): # The flag --job-dir is parsed differently before and after switching to absl - return getattr(FLAGS, 'job-dir', '') or getattr(FLAGS, 'job_dir', '') + return getattr(FLAGS, "job-dir", "") or getattr(FLAGS, "job_dir", "") def get_requirements(usr_dir): - requirements_file = os.path.join(usr_dir, 'requirements.txt') + requirements_file = os.path.join(usr_dir, "requirements.txt") if not tf.gfile.Exists(requirements_file): return [] with tf.gfile.Open(requirements_file) as f: pkg_list = f.readlines() - return [pkg.strip() for pkg in pkg_list if 'tensor2tensor' not in pkg] + return [pkg.strip() for pkg in pkg_list if "tensor2tensor" not in pkg] def flags_as_args(): """Convert FLAGS to list of args suitable for passing on cmd line.""" - if hasattr(FLAGS, 'flag_values_dict'): + if hasattr(FLAGS, "flag_values_dict"): args_dict = FLAGS.flag_values_dict() else: - args_dict = dict(FLAGS.__dict__['__flags']) - del args_dict['cloud_mlengine'] + args_dict = dict(FLAGS.__dict__["__flags"]) + del args_dict["cloud_mlengine"] # Configured later - del args_dict['t2t_usr_dir'] - args_dict.pop('h', None) - args_dict.pop('helpfull', None) - args_dict.pop('helpshort', None) - args_dict.pop('help', None) + del args_dict["t2t_usr_dir"] + args_dict.pop("h", None) + args_dict.pop("helpfull", None) + args_dict.pop("helpshort", None) + args_dict.pop("help", None) args = [] for name, val in args_dict.items(): if val is None: continue - if name.startswith('autotune'): + if name.startswith("autotune"): continue - args.extend(['--%s' % name, str(val)]) + args.extend(["--%s" % name, str(val)]) return args def get_default_master_type(num_gpus=1, use_tpu=False): """Returns master_type for trainingInput.""" if use_tpu: - return 'standard_tpu' + return "standard_tpu" elif num_gpus <= 0: - return 'standard' + return "standard" elif num_gpus == 1: - return 'standard_p100' + return "standard_p100" elif num_gpus == 4: - return 'complex_model_m_p100' + return "complex_model_m_p100" elif num_gpus == 8: - return 'complex_model_l_gpu' + return "complex_model_l_gpu" assert False @@ -109,20 +111,20 @@ def configure_job(): # See documentation: # https://cloud.google.com/ml-engine/reference/rest/v1/projects.jobs#traininginput training_input = { - 'pythonModule': 'tensor2tensor.bin.t2t_trainer', - 'args': flags_as_args(), - 'region': cloud.default_region().decode('utf-8'), - 'runtimeVersion': '1.5', - 'pythonVersion': '3.5' if sys.version_info.major == 3 else '2.7', - 'jobDir': FLAGS.output_dir, - 'scaleTier': 'CUSTOM', - 'masterType': FLAGS.cloud_mlengine_master_type or get_default_master_type( + "pythonModule": "tensor2tensor.bin.t2t_trainer", + "args": flags_as_args(), + "region": text_encoder.native_to_unicode(cloud.default_region()), + "runtimeVersion": "1.5", + "pythonVersion": "3.5" if sys.version_info.major == 3 else "2.7", + "jobDir": FLAGS.output_dir, + "scaleTier": "CUSTOM", + "masterType": FLAGS.cloud_mlengine_master_type or get_default_master_type( 
num_gpus=FLAGS.worker_gpu, use_tpu=FLAGS.use_tpu) } if FLAGS.hparams_range: - tf.logging.info('Configuring hyperparameter tuning.') - training_input['hyperparameters'] = configure_autotune( + tf.logging.info("Configuring hyperparameter tuning.") + training_input["hyperparameters"] = configure_autotune( FLAGS.hparams_range, FLAGS.autotune_objective, FLAGS.autotune_maximize, @@ -130,17 +132,18 @@ def configure_job(): FLAGS.autotune_parallel_trials, ) - timestamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S') - job_name = '%s_%s_t2t_%s' % (FLAGS.model, FLAGS.problems, timestamp) - job_spec = {'jobId': job_name, 'trainingInput': training_input} + timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") + job_name = "%s_%s_t2t_%s" % (FLAGS.model, FLAGS.problem, timestamp) + job_spec = {"jobId": job_name, "trainingInput": training_input} return job_spec def launch_job(job_spec): """Launch job on ML Engine.""" - project_id = 'projects/{}'.format(cloud.default_project().decode('utf-8')) + project_id = "projects/{}".format( + text_encoder.native_to_unicode(cloud.default_project())) credentials = GoogleCredentials.get_application_default() - cloudml = discovery.build('ml', 'v1', credentials=credentials, + cloudml = discovery.build("ml", "v1", credentials=credentials, cache_discovery=False) request = cloudml.projects().jobs().create(body=job_spec, parent=project_id) request.execute() @@ -148,19 +151,19 @@ def launch_job(job_spec): def _tar_and_copy(src_dir, target_dir): """Tar and gzip src_dir and copy to GCS target_dir.""" - src_dir = src_dir.rstrip('/') - target_dir = target_dir.rstrip('/') - tmp_dir = tempfile.gettempdir().rstrip('/') + src_dir = src_dir.rstrip("/") + target_dir = target_dir.rstrip("/") + tmp_dir = tempfile.gettempdir().rstrip("/") src_base = os.path.basename(src_dir) cloud.shell_run( - 'tar -zcf {tmp_dir}/{src_base}.tar.gz -C {src_dir} .', + "tar -zcf {tmp_dir}/{src_base}.tar.gz -C {src_dir} .", src_dir=src_dir, src_base=src_base, tmp_dir=tmp_dir) - final_destination = '%s/%s.tar.gz' % (target_dir, src_base) + final_destination = "%s/%s.tar.gz" % (target_dir, src_base) cloud.shell_run( - ('gsutil cp {tmp_dir}/{src_base}.tar.gz ' - '{final_destination}'), + ("gsutil cp {tmp_dir}/{src_base}.tar.gz " + "{final_destination}"), tmp_dir=tmp_dir, src_base=src_base, final_destination=final_destination) @@ -169,38 +172,39 @@ def _tar_and_copy(src_dir, target_dir): def tar_and_copy_t2t(train_dir): """Tar Tensor2Tensor and cp to train_dir.""" - tf.logging.info('Tarring and pushing local Tensor2Tensor package.') + tf.logging.info("Tarring and pushing local Tensor2Tensor package.") - output = cloud.shell_output('pip show tensor2tensor').decode('utf-8').split('\n') - assert output[1].startswith('Version') - assert output[7].startswith('Location') - t2t_version = output[1].split(':')[1].strip() - t2t_dir = output[7].split(':')[1].strip() + output = text_encoder.native_to_unicode(cloud.shell_output( + "pip show tensor2tensor")).split("\n") + assert output[1].startswith("Version") + assert output[7].startswith("Location") + t2t_version = output[1].split(":")[1].strip() + t2t_dir = output[7].split(":")[1].strip() # A local installation cloned from GitHub will have a setup.py file and a docs # folder is_local_t2t = all([ tf.gfile.Exists(os.path.join(t2t_dir, fname)) - for fname in ['setup.py', 'docs/cloud_mlengine.md'] + for fname in ["setup.py", "docs/cloud_mlengine.md"] ]) if is_local_t2t: - tf.logging.info('Found local T2T installation. 
Tarring directory %s', + tf.logging.info("Found local T2T installation. Tarring directory %s", t2t_dir) else: # PyPI installation # Create a folder with just a setup.py file pointing to the right version - tf.logging.info('Found PyPI T2T installation. Launching tensor2tensor==%s', + tf.logging.info("Found PyPI T2T installation. Launching tensor2tensor==%s", t2t_version) - t2t_dir = os.path.join(tempfile.gettempdir(), 'tensor2tensor_tmp') + t2t_dir = os.path.join(tempfile.gettempdir(), "tensor2tensor_tmp") shutil.rmtree(t2t_dir, ignore_errors=True) os.mkdir(t2t_dir) - setup_fname = os.path.join(t2t_dir, 'setup.py') + setup_fname = os.path.join(t2t_dir, "setup.py") setup_file_str = get_setup_file( - name='DummyT2TPackage', - packages=['tensor2tensor==%s' % t2t_version] + name="DummyT2TPackage", + packages=["tensor2tensor==%s" % t2t_version] ) - with tf.gfile.Open(setup_fname, 'w') as f: + with tf.gfile.Open(setup_fname, "w") as f: f.write(setup_file_str) t2t_tar = _tar_and_copy(t2t_dir, train_dir) return t2t_tar @@ -208,20 +212,20 @@ def tar_and_copy_t2t(train_dir): def tar_and_copy_usr_dir(usr_dir, train_dir): """Package, tar, and copy usr_dir to GCS train_dir.""" - tf.logging.info('Tarring and pushing t2t_usr_dir.') + tf.logging.info("Tarring and pushing t2t_usr_dir.") usr_dir = os.path.abspath(os.path.expanduser(usr_dir)) # Copy usr dir to a temp location - top_dir = os.path.join(tempfile.gettempdir(), 't2t_usr_container') + top_dir = os.path.join(tempfile.gettempdir(), "t2t_usr_container") tmp_usr_dir = os.path.join(top_dir, usr_dir_lib.INTERNAL_USR_DIR_PACKAGE) shutil.rmtree(top_dir, ignore_errors=True) shutil.copytree(usr_dir, tmp_usr_dir) # Insert setup.py if one does not exist - top_setup_fname = os.path.join(top_dir, 'setup.py') + top_setup_fname = os.path.join(top_dir, "setup.py") setup_file_str = get_setup_file( - name='DummyUsrDirPackage', + name="DummyUsrDirPackage", packages=get_requirements(usr_dir) ) - with tf.gfile.Open(top_setup_fname, 'w') as f: + with tf.gfile.Open(top_setup_fname, "w") as f: f.write(setup_file_str) usr_tar = _tar_and_copy(top_dir, train_dir) return usr_tar @@ -230,7 +234,7 @@ def tar_and_copy_usr_dir(usr_dir, train_dir): def autotune_paramspecs(hparams_range): rhp = common_hparams.RangedHParams() registry.ranged_hparams(hparams_range)(rhp) - return rhp.to_parameter_specs(name_prefix='hp_') + return rhp.to_parameter_specs(name_prefix="hp_") def configure_autotune(hparams_range, @@ -239,32 +243,32 @@ def configure_autotune(hparams_range, max_trials=10, parallel_trials=1): return { - 'goal': 'MAXIMIZE' if maximize else 'MINIMIZE', - 'params': autotune_paramspecs(hparams_range), - 'maxTrials': max_trials, - 'maxParallelTrials': parallel_trials, - 'hyperparameterMetricTag': objective, + "goal": "MAXIMIZE" if maximize else "MINIMIZE", + "params": autotune_paramspecs(hparams_range), + "maxTrials": max_trials, + "maxParallelTrials": parallel_trials, + "hyperparameterMetricTag": objective, } def configure_trainer_package(job_spec, t2t_tar): - assert t2t_tar.startswith('gs://') - job_spec['trainingInput']['packageUris'] = [t2t_tar] + assert t2t_tar.startswith("gs://") + job_spec["trainingInput"]["packageUris"] = [t2t_tar] def configure_usr_dir(job_spec, usr_tar): - assert usr_tar.startswith('gs://') - job_spec['trainingInput']['packageUris'].append(usr_tar) - usr_args = ['--t2t_usr_dir', usr_dir_lib.INTERNAL_USR_DIR_PACKAGE] - job_spec['trainingInput']['args'].extend(usr_args) + assert usr_tar.startswith("gs://") + 
job_spec["trainingInput"]["packageUris"].append(usr_tar) + usr_args = ["--t2t_usr_dir", usr_dir_lib.INTERNAL_USR_DIR_PACKAGE] + job_spec["trainingInput"]["args"].extend(usr_args) def validate_flags(): """Validates flags are set to acceptable values for CloudML Engine runs.""" assert not FLAGS.cloud_tpu assert not job_dir() - assert FLAGS.output_dir.startswith('gs://') - assert FLAGS.data_dir.startswith('gs://') + assert FLAGS.output_dir.startswith("gs://") + assert FLAGS.data_dir.startswith("gs://") assert FLAGS.worker_replicas <= 1 assert FLAGS.ps_replicas <= 0 if FLAGS.hparams_range: @@ -273,29 +277,29 @@ def validate_flags(): assert FLAGS.worker_gpu in [1, 4, 8] if FLAGS.cloud_mlengine_master_type: if FLAGS.use_tpu: - assert FLAGS.cloud_mlengine_master_type == 'standard_tpu' + assert FLAGS.cloud_mlengine_master_type == "standard_tpu" elif FLAGS.worker_gpu: if FLAGS.worker_gpu == 1: - assert FLAGS.cloud_mlengine_master_type in ['standard_gpu', - 'standard_p100'] + assert FLAGS.cloud_mlengine_master_type in ["standard_gpu", + "standard_p100"] elif FLAGS.worker_gpu == 4: - assert FLAGS.cloud_mlengine_master_type in ['complex_model_m_gpu', - 'complex_model_m_p100'] + assert FLAGS.cloud_mlengine_master_type in ["complex_model_m_gpu", + "complex_model_m_p100"] else: - assert FLAGS.cloud_mlengine_master_type == 'complex_model_l_gpu' + assert FLAGS.cloud_mlengine_master_type == "complex_model_l_gpu" else: - assert FLAGS.cloud_mlengine_master_type in ['standard', 'large_model', - 'complex_model_s', - 'complex_model_m', - 'complex_model_l'] + assert FLAGS.cloud_mlengine_master_type in ["standard", "large_model", + "complex_model_s", + "complex_model_m", + "complex_model_l"] def launch(): """Launch t2t_trainer on Cloud ML Engine.""" validate_flags() job_spec = configure_job() - job_name = job_spec['jobId'] - tf.logging.info('Launching job %s with ML Engine spec:\n%s', job_name, + job_name = job_spec["jobId"] + tf.logging.info("Launching job %s with ML Engine spec:\n%s", job_name, job_spec) assert cloud.confirm() train_dir = FLAGS.output_dir @@ -305,5 +309,5 @@ def launch(): usr_tar = tar_and_copy_usr_dir(FLAGS.t2t_usr_dir, train_dir) configure_usr_dir(job_spec, usr_tar) launch_job(job_spec) - tf.logging.info('Launched %s. See console to track: %s.', job_name, + tf.logging.info("Launched %s. 
See console to track: %s.", job_name, CONSOLE_URL) diff --git a/tensor2tensor/utils/cloud_tpu.py b/tensor2tensor/utils/cloud_tpu.py index 1518e69ae..d1ea417be 100644 --- a/tensor2tensor/utils/cloud_tpu.py +++ b/tensor2tensor/utils/cloud_tpu.py @@ -29,6 +29,7 @@ import time from six.moves import input # pylint: disable=redefined-builtin +from tensor2tensor.data_generators import text_encoder import tensorflow as tf TPU_IP = "10.240.%d.2" @@ -216,7 +217,7 @@ def shell_background(cmd_, **kwargs): def shell_output(cmd_, **kwargs): - return sp.check_output(format_cmd(cmd_, **kwargs)) + return text_encoder.to_unicode(sp.check_output(format_cmd(cmd_, **kwargs))) def shell_run(cmd_, **kwargs): diff --git a/tensor2tensor/utils/data_reader.py b/tensor2tensor/utils/data_reader.py index d7be24e7e..9ad3a712f 100644 --- a/tensor2tensor/utils/data_reader.py +++ b/tensor2tensor/utils/data_reader.py @@ -23,7 +23,7 @@ import numpy as np import six -from six.moves import xrange # pylint: disable=redefined-builtin +from six.moves import range # pylint: disable=redefined-builtin import tensorflow as tf @@ -191,7 +191,7 @@ def _batching_scheme(batch_size, ] window_size = max( [i for i in highly_composite_numbers if i <= 3 * max_batch_size]) - divisors = [i for i in xrange(1, window_size + 1) if window_size % i == 0] + divisors = [i for i in range(1, window_size + 1) if window_size % i == 0] batch_sizes = [max([d for d in divisors if d <= bs]) for bs in batch_sizes] window_size *= shard_multiplier batch_sizes = [bs * shard_multiplier for bs in batch_sizes] diff --git a/tensor2tensor/utils/data_reader_test.py b/tensor2tensor/utils/data_reader_test.py index db039b799..ec5897092 100644 --- a/tensor2tensor/utils/data_reader_test.py +++ b/tensor2tensor/utils/data_reader_test.py @@ -25,7 +25,7 @@ # Dependency imports import numpy as np -from six.moves import xrange # pylint: disable=redefined-builtin +from six.moves import range # pylint: disable=redefined-builtin from tensor2tensor.data_generators import generator_utils from tensor2tensor.data_generators import problem as problem_mod @@ -39,7 +39,7 @@ class TestProblem(problem_mod.Problem): def generator(self, data_dir, tmp_dir, is_training): - for i in xrange(30): + for i in range(30): yield {"inputs": [i] * (i + 1), "targets": [i], "floats": [i + 0.5]} def generate_data(self, data_dir, tmp_dir, task_id=-1): @@ -98,7 +98,7 @@ def testBasicExampleReading(self): with tf.train.MonitoredSession() as sess: # Check that there are multiple examples that have the right fields of the # right type (lists of int/float). 
- for _ in xrange(10): + for _ in range(10): ex_val = sess.run(examples) inputs, targets, floats = (ex_val["inputs"], ex_val["targets"], ex_val["floats"]) @@ -130,7 +130,7 @@ def testLengthFilter(self): examples = dataset.make_one_shot_iterator().get_next() with tf.train.MonitoredSession() as sess: ex_lens = [] - for _ in xrange(max_len): + for _ in range(max_len): ex_lens.append(len(sess.run(examples)["inputs"])) self.assertAllEqual(list(range(1, max_len + 1)), sorted(ex_lens)) diff --git a/tensor2tensor/utils/decoding.py b/tensor2tensor/utils/decoding.py index 65616191c..7daa12b21 100644 --- a/tensor2tensor/utils/decoding.py +++ b/tensor2tensor/utils/decoding.py @@ -28,8 +28,8 @@ from six.moves import input # pylint: disable=redefined-builtin +from tensor2tensor.data_generators import problem as problem_lib from tensor2tensor.data_generators import text_encoder -from tensor2tensor.data_generators.problem import problem_hparams_to_features import tensorflow as tf FLAGS = tf.flags.FLAGS @@ -43,7 +43,6 @@ def decode_hparams(overrides=""): hp = tf.contrib.training.HParams( save_images=False, log_targets=True, - problem_idx=0, extra_length=100, batch_size=0, beam_size=4, @@ -102,14 +101,14 @@ def log_decode_results(inputs, def decode_from_dataset(estimator, - problem_names, + problem_name, hparams, decode_hp, decode_to_file=None, dataset_split=None): """Perform decoding from dataset.""" tf.logging.info("Performing local inference from dataset for %s.", - str(problem_names)) + str(problem_name)) # We assume that worker_id corresponds to shard number. shard = decode_hp.shard_id if decode_hp.shards > 1 else None @@ -123,76 +122,59 @@ def decode_from_dataset(estimator, "dataset_split": dataset_split, } - for problem_idx, problem_name in enumerate(problem_names): - # Build the inference input function - problem = hparams.problem_instances[problem_idx] - infer_input_fn = problem.make_estimator_input_fn( - tf.estimator.ModeKeys.PREDICT, hparams, dataset_kwargs=dataset_kwargs) + # Build the inference input function + problem = hparams.problem + infer_input_fn = problem.make_estimator_input_fn( + tf.estimator.ModeKeys.PREDICT, hparams, dataset_kwargs=dataset_kwargs) - # Get the predictions as an iterable - predictions = estimator.predict(infer_input_fn) + # Get the predictions as an iterable + predictions = estimator.predict(infer_input_fn) - # Prepare output file writers if decode_to_file passed - if decode_to_file: - if decode_hp.shards > 1: - decode_filename = decode_to_file + ("%.2d" % decode_hp.shard_id) - else: - decode_filename = decode_to_file - output_filepath = _decode_filename(decode_filename, problem_name, - decode_hp) - parts = output_filepath.split(".") - parts[-1] = "targets" - target_filepath = ".".join(parts) - parts[-1] = "inputs" - input_filepath = ".".join(parts) - - output_file = tf.gfile.Open(output_filepath, "w") - target_file = tf.gfile.Open(target_filepath, "w") - input_file = tf.gfile.Open(input_filepath, "w") - - problem_hparams = hparams.problems[problem_idx] - # Inputs vocabulary is set to targets if there are no inputs in the problem, - # e.g., for language models where the inputs are just a prefix of targets. 
- has_input = "inputs" in problem_hparams.vocabulary - inputs_vocab_key = "inputs" if has_input else "targets" - inputs_vocab = problem_hparams.vocabulary[inputs_vocab_key] - targets_vocab = problem_hparams.vocabulary["targets"] - for num_predictions, prediction in enumerate(predictions): - num_predictions += 1 - inputs = prediction["inputs"] - targets = prediction["targets"] - outputs = prediction["outputs"] - - # Log predictions - decoded_outputs = [] - decoded_scores = [] - if decode_hp.return_beams: - output_beams = np.split(outputs, decode_hp.beam_size, axis=0) - scores = None - if "scores" in prediction: - scores = np.split(prediction["scores"], decode_hp.beam_size, axis=0) - for i, beam in enumerate(output_beams): - tf.logging.info("BEAM %d:" % i) - score = scores and scores[i] - decoded = log_decode_results( - inputs, - beam, - problem_name, - num_predictions, - inputs_vocab, - targets_vocab, - save_images=decode_hp.save_images, - model_dir=estimator.model_dir, - identity_output=decode_hp.identity_output, - targets=targets, - log_targets=decode_hp.log_targets) - decoded_outputs.append(decoded) - if decode_hp.write_beam_scores: - decoded_scores.append(score) - else: + # Prepare output file writers if decode_to_file passed + if decode_to_file: + if decode_hp.shards > 1: + decode_filename = decode_to_file + ("%.2d" % decode_hp.shard_id) + else: + decode_filename = decode_to_file + output_filepath = _decode_filename(decode_filename, problem_name, + decode_hp) + parts = output_filepath.split(".") + parts[-1] = "targets" + target_filepath = ".".join(parts) + parts[-1] = "inputs" + input_filepath = ".".join(parts) + + output_file = tf.gfile.Open(output_filepath, "w") + target_file = tf.gfile.Open(target_filepath, "w") + input_file = tf.gfile.Open(input_filepath, "w") + + problem_hparams = hparams.problem_hparams + # Inputs vocabulary is set to targets if there are no inputs in the problem, + # e.g., for language models where the inputs are just a prefix of targets. 
+ has_input = "inputs" in problem_hparams.vocabulary + inputs_vocab_key = "inputs" if has_input else "targets" + inputs_vocab = problem_hparams.vocabulary[inputs_vocab_key] + targets_vocab = problem_hparams.vocabulary["targets"] + for num_predictions, prediction in enumerate(predictions): + num_predictions += 1 + inputs = prediction["inputs"] + targets = prediction["targets"] + outputs = prediction["outputs"] + + # Log predictions + decoded_outputs = [] + decoded_scores = [] + if decode_hp.return_beams: + output_beams = np.split(outputs, decode_hp.beam_size, axis=0) + scores = None + if "scores" in prediction: + scores = np.split(prediction["scores"], decode_hp.beam_size, axis=0) + for i, beam in enumerate(output_beams): + tf.logging.info("BEAM %d:" % i) + score = scores and scores[i] decoded = log_decode_results( inputs, - outputs, + beam, problem_name, num_predictions, inputs_vocab, @@ -203,28 +185,44 @@ def decode_from_dataset(estimator, targets=targets, log_targets=decode_hp.log_targets) decoded_outputs.append(decoded) + if decode_hp.write_beam_scores: + decoded_scores.append(score) + else: + decoded = log_decode_results( + inputs, + outputs, + problem_name, + num_predictions, + inputs_vocab, + targets_vocab, + save_images=decode_hp.save_images, + model_dir=estimator.model_dir, + identity_output=decode_hp.identity_output, + targets=targets, + log_targets=decode_hp.log_targets) + decoded_outputs.append(decoded) + + # Write out predictions if decode_to_file passed + if decode_to_file: + for i, (d_input, d_output, d_target) in enumerate(decoded_outputs): + beam_score_str = "" + if decode_hp.write_beam_scores: + beam_score_str = "\t%.2f" % decoded_scores[i] + output_file.write( + str(d_output) + beam_score_str + decode_hp.delimiter) + target_file.write(str(d_target) + decode_hp.delimiter) + input_file.write(str(d_input) + decode_hp.delimiter) - # Write out predictions if decode_to_file passed - if decode_to_file: - for i, (d_input, d_output, d_target) in enumerate(decoded_outputs): - beam_score_str = "" - if decode_hp.write_beam_scores: - beam_score_str = "\t%.2f" % decoded_scores[i] - output_file.write( - str(d_output) + beam_score_str + decode_hp.delimiter) - target_file.write(str(d_target) + decode_hp.delimiter) - input_file.write(str(d_input) + decode_hp.delimiter) - - if (decode_hp.num_samples >= 0 and - num_predictions >= decode_hp.num_samples): - break + if (decode_hp.num_samples >= 0 and + num_predictions >= decode_hp.num_samples): + break - if decode_to_file: - output_file.close() - target_file.close() - input_file.close() + if decode_to_file: + output_file.close() + target_file.close() + input_file.close() - tf.logging.info("Completed inference on %d samples." % num_predictions) # pylint: disable=undefined-loop-variable + tf.logging.info("Completed inference on %d samples." % num_predictions) # pylint: disable=undefined-loop-variable def decode_from_file(estimator, @@ -239,14 +237,14 @@ def decode_from_file(estimator, tf.logging.info( "decode_hp.batch_size not specified; default=%d" % decode_hp.batch_size) - problem_id = decode_hp.problem_idx # Inputs vocabulary is set to targets if there are no inputs in the problem, # e.g., for language models where the inputs are just a prefix of targets. 
- has_input = "inputs" in hparams.problems[problem_id].vocabulary + p_hp = hparams.problem_hparams + has_input = "inputs" in p_hp.vocabulary inputs_vocab_key = "inputs" if has_input else "targets" - inputs_vocab = hparams.problems[problem_id].vocabulary[inputs_vocab_key] - targets_vocab = hparams.problems[problem_id].vocabulary["targets"] - problem_name = FLAGS.problems.split("-")[problem_id] + inputs_vocab = p_hp.vocabulary[inputs_vocab_key] + targets_vocab = p_hp.vocabulary["targets"] + problem_name = FLAGS.problem tf.logging.info("Performing decoding from a file.") sorted_inputs, sorted_keys = _get_sorted_inputs(filename, decode_hp.shards, decode_hp.delimiter) @@ -254,7 +252,7 @@ def decode_from_file(estimator, def input_fn(): input_gen = _decode_batch_input_fn( - problem_id, num_decode_batches, sorted_inputs, inputs_vocab, + num_decode_batches, sorted_inputs, inputs_vocab, decode_hp.batch_size, decode_hp.max_input_size) gen_fn = make_input_fn_from_generator(input_gen) example = gen_fn() @@ -355,9 +353,8 @@ def input_fn(): result_iter = estimator.predict(input_fn, checkpoint_path=checkpoint_path) for result in result_iter: - problem_idx = result["problem_choice"] is_image = False # TODO(lukaszkaiser): find out from problem id / class. - targets_vocab = hparams.problems[problem_idx].vocabulary["targets"] + targets_vocab = hparams.problem_hparams.vocabulary["targets"] if decode_hp.return_beams: beams = np.split(result["outputs"], decode_hp.beam_size, axis=0) @@ -379,7 +376,7 @@ def input_fn(): targets_vocab.decode(_save_until_eos(result["outputs"], is_image))) -def _decode_batch_input_fn(problem_id, num_decode_batches, sorted_inputs, +def _decode_batch_input_fn(num_decode_batches, sorted_inputs, vocabulary, batch_size, max_input_size): tf.logging.info(" batch %d" % num_decode_batches) # First reverse all the input sentences so that if you're going to get OOMs, @@ -406,7 +403,6 @@ def _decode_batch_input_fn(problem_id, num_decode_batches, sorted_inputs, yield { "inputs": np.array(final_batch_inputs).astype(np.int32), - "problem_choice": np.array(problem_id).astype(np.int32), } @@ -432,8 +428,7 @@ def _interactive_input_fn(hparams, decode_hp): num_samples = decode_hp.num_samples if decode_hp.num_samples > 0 else 1 decode_length = decode_hp.extra_length input_type = "text" - problem_id = 0 - p_hparams = hparams.problems[problem_id] + p_hparams = hparams.problem_hparams has_input = "inputs" in p_hparams.input_modality vocabulary = p_hparams.vocabulary["inputs" if has_input else "targets"] # This should be longer than the longest input. 
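Note on the single-problem migration in the decoding.py hunks above: every hparams.problems[problem_id] lookup becomes a direct hparams.problem_hparams attribute access, and the inputs vocabulary still falls back to the targets vocabulary for problems without inputs (language models). A minimal runnable sketch of that selection logic, using a hypothetical namedtuple stand-in rather than the real HParams/Problem classes:

import collections

# Hypothetical stand-in for the real problem hparams; only the two fields
# read by the decoding code are modeled here.
ProblemHParams = collections.namedtuple(
    "ProblemHParams", ["input_modality", "vocabulary"])

# A language-model-style problem: no "inputs" modality, targets only.
p_hparams = ProblemHParams(
    input_modality={},
    vocabulary={"targets": "targets_vocab_object"})

has_input = "inputs" in p_hparams.input_modality
# Falls back to the targets vocabulary when the problem has no inputs.
vocabulary = p_hparams.vocabulary["inputs" if has_input else "targets"]
print(vocabulary)  # -> targets_vocab_object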
@@ -447,9 +442,6 @@ def _interactive_input_fn(hparams, decode_hp): prompt = ("INTERACTIVE MODE num_samples=%d decode_length=%d \n" " it= ('text' or 'image' or 'label', default: " "text)\n" - " pr= (set the problem number, default: 0)\n" - " in= (set the input problem number)\n" - " ou= (set the output problem number)\n" " ns= (changes number of samples, default: 1)\n" " dl= (changes decode length, default: 100)\n" " <%s> (decode)\n" @@ -459,19 +451,6 @@ def _interactive_input_fn(hparams, decode_hp): input_string = input(prompt) if input_string == "q": return - elif input_string[:3] == "pr=": - problem_id = int(input_string[3:]) - p_hparams = hparams.problems[problem_id] - has_input = "inputs" in p_hparams.input_modality - vocabulary = p_hparams.vocabulary["inputs" if has_input else "targets"] - elif input_string[:3] == "in=": - problem = int(input_string[3:]) - p_hparams.input_modality = hparams.problems[problem].input_modality - p_hparams.input_space_id = hparams.problems[problem].input_space_id - elif input_string[:3] == "ou=": - problem = int(input_string[3:]) - p_hparams.target_modality = hparams.problems[problem].target_modality - p_hparams.target_space_id = hparams.problems[problem].target_space_id elif input_string[:3] == "ns=": num_samples = int(input_string[3:]) elif input_string[:3] == "dl=": @@ -503,7 +482,8 @@ def _interactive_input_fn(hparams, decode_hp): } else: raise Exception("Unsupported input type.") - for k, v in six.iteritems(problem_hparams_to_features(p_hparams)): + for k, v in six.iteritems( + problem_lib.problem_hparams_to_features(p_hparams)): features[k] = np.array(v).astype(np.int32) yield features @@ -574,8 +554,7 @@ def _interactive_input_tensor_to_features_dict(feature_map, hparams): """Convert the interactive input format (see above) to a dictionary. Args: - feature_map: a dictionary with keys `problem_choice` and `input` containing - Tensors. + feature_map: dict with inputs. hparams: model hyperparameters Returns: @@ -584,31 +563,26 @@ def _interactive_input_tensor_to_features_dict(feature_map, hparams): inputs = tf.convert_to_tensor(feature_map["inputs"]) input_is_image = False if len(inputs.get_shape()) < 3 else True - def input_fn(problem_choice, x=inputs): # pylint: disable=missing-docstring - if input_is_image: - x = tf.image.resize_images(x, [299, 299]) - x = tf.reshape(x, [1, 299, 299, -1]) - x = tf.to_int32(x) - else: - # Remove the batch dimension. - num_samples = x[0] - length = x[2] - x = tf.slice(x, [3], tf.to_int32([length])) - x = tf.reshape(x, [1, -1, 1, 1]) - # Transform into a batch of size num_samples to get that many random - # decodes. - x = tf.tile(x, tf.to_int32([num_samples, 1, 1, 1])) - - p_hparams = hparams.problems[problem_choice] - return (tf.constant(p_hparams.input_space_id), tf.constant( - p_hparams.target_space_id), x) - - input_space_id, target_space_id, x = cond_on_index( - input_fn, feature_map["problem_choice"], len(hparams.problems) - 1) + x = inputs + if input_is_image: + x = tf.image.resize_images(x, [299, 299]) + x = tf.reshape(x, [1, 299, 299, -1]) + x = tf.to_int32(x) + else: + # Remove the batch dimension. + num_samples = x[0] + length = x[2] + x = tf.slice(x, [3], tf.to_int32([length])) + x = tf.reshape(x, [1, -1, 1, 1]) + # Transform into a batch of size num_samples to get that many random + # decodes. 
+ x = tf.tile(x, tf.to_int32([num_samples, 1, 1, 1])) + + p_hparams = hparams.problem_hparams + input_space_id = tf.constant(p_hparams.input_space_id) + target_space_id = tf.constant(p_hparams.target_space_id) features = {} - features["problem_choice"] = tf.convert_to_tensor( - feature_map["problem_choice"]) features["input_space_id"] = input_space_id features["target_space_id"] = target_space_id features["decode_length"] = ( @@ -621,8 +595,7 @@ def _decode_input_tensor_to_features_dict(feature_map, hparams): """Convert the interactive input format (see above) to a dictionary. Args: - feature_map: a dictionary with keys `problem_choice` and `input` containing - Tensors. + feature_map: dict with inputs. hparams: model hyperparameters Returns: @@ -631,34 +604,18 @@ def _decode_input_tensor_to_features_dict(feature_map, hparams): inputs = tf.convert_to_tensor(feature_map["inputs"]) input_is_image = False - def input_fn(problem_choice, x=inputs): # pylint: disable=missing-docstring - p_hparams = hparams.problems[problem_choice] - # Add a third empty dimension - x = tf.expand_dims(x, axis=[2]) - x = tf.to_int32(x) - return (tf.constant(p_hparams.input_space_id), tf.constant( - p_hparams.target_space_id), x) - - input_space_id, target_space_id, x = cond_on_index( - input_fn, feature_map["problem_choice"], len(hparams.problems) - 1) + x = inputs + p_hparams = hparams.problem_hparams + # Add a third empty dimension + x = tf.expand_dims(x, axis=[2]) + x = tf.to_int32(x) + input_space_id = tf.constant(p_hparams.input_space_id) + target_space_id = tf.constant(p_hparams.target_space_id) features = {} - features["problem_choice"] = feature_map["problem_choice"] features["input_space_id"] = input_space_id features["target_space_id"] = target_space_id features["decode_length"] = ( IMAGE_DECODE_LENGTH if input_is_image else tf.shape(x)[1] + 50) features["inputs"] = x return features - - -def cond_on_index(fn, index_tensor, max_idx, cur_idx=0): - """Call fn(index_tensor) using tf.cond in [cur_id, max_idx].""" - if cur_idx == max_idx: - return fn(cur_idx) - - return tf.cond( - tf.equal(index_tensor, cur_idx), - lambda: fn(cur_idx), - lambda: cond_on_index(fn, index_tensor, max_idx, cur_idx + 1) - ) diff --git a/tensor2tensor/utils/expert_utils.py b/tensor2tensor/utils/expert_utils.py index 2bfd35f01..a83616500 100644 --- a/tensor2tensor/utils/expert_utils.py +++ b/tensor2tensor/utils/expert_utils.py @@ -29,11 +29,10 @@ # Dependency imports import six -from six.moves import xrange # pylint: disable=redefined-builtin +from six.moves import range # pylint: disable=redefined-builtin from six.moves import zip # pylint: disable=redefined-builtin import tensorflow as tf -from tensorflow.python.eager import context from tensorflow.python.framework import function DEFAULT_DEV_STRING = "existing_device" @@ -117,7 +116,7 @@ class Parallelism(object): e = [] f = [] - for i in xrange(len(devices)): + for i in range(len(devices)): with tf.device(devices[i]): e_, f_ = func(a[i], b[i], c) e.append(e_) @@ -177,11 +176,11 @@ def __call__(self, fn, *args, **kwargs): my_args = transpose_list_of_lists( [self._maybe_repeat(arg) for arg in args]) else: - my_args = [[] for _ in xrange(self.n)] - my_kwargs = [{} for _ in xrange(self.n)] + my_args = [[] for _ in range(self.n)] + my_kwargs = [{} for _ in range(self.n)] for k, v in six.iteritems(kwargs): vals = self._maybe_repeat(v) - for i in xrange(self.n): + for i in range(self.n): my_kwargs[i][k] = vals[i] # Construct lists of functions. 
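For context on the cond_on_index helper deleted from decoding.py a few hunks above: it dispatched on a runtime problem-index tensor by chaining tf.cond ops, which single-problem mode makes unnecessary. A pure-Python model of its control flow (a sketch of the semantics only; the deleted code compared a scalar tensor with tf.equal and recursed in the false branch):

def cond_on_index(fn, index, max_idx, cur_idx=0):
  # Mirrors the deleted recursion: try cur_idx, else move to the next index.
  if cur_idx == max_idx or index == cur_idx:
    return fn(cur_idx)
  return cond_on_index(fn, index, max_idx, cur_idx + 1)

print(cond_on_index(lambda i: "problem_%d" % i, 2, 4))  # -> problem_2

With exactly one problem per run the chain always resolves to fn(0), which is why the callers now inline the single call.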
@@ -191,7 +190,7 @@ def __call__(self, fn, *args, **kwargs): outputs = [] cache = {} tensor_to_var = {} - for i in xrange(self.n): + for i in range(self.n): def daisy_chain_getter(getter, name, *args, **kwargs): """Get a variable and cache in a daisy chain.""" @@ -427,7 +426,7 @@ def _my_top_k(x, k): values = [] indices = [] depth = tf.shape(x)[1] - for i in xrange(k): + for i in range(k): values.append(tf.reduce_max(x, 1)) argmax = tf.argmax(x, 1) indices.append(argmax) @@ -560,7 +559,7 @@ def remove(self, x): x, indices=self.nonpad_ids, ) - if not context.in_eager_mode(): + if not tf.contrib.eager.in_eager_mode(): # This is a hack but for some reason, gather_nd return a tensor of # undefined shape, so the shape is set up manually x.set_shape([None] + x_shape[1:]) @@ -895,7 +894,7 @@ def ffn_expert_fn(input_size, """ def my_fn(x): layer_sizes = [input_size] + hidden_sizes + [output_size] - for i in xrange(1 + len(hidden_sizes)): + for i in range(1 + len(hidden_sizes)): w = tf.get_variable("w_%d" % i, layer_sizes[i:i+2], tf.float32) x = tf.matmul(x, w) if i < len(hidden_sizes): @@ -909,7 +908,7 @@ def my_fn(x): def reshape_like(a, b): """Reshapes a to match the shape of b in all but the last dimension.""" ret = tf.reshape(a, tf.concat([tf.shape(b)[:-1], tf.shape(a)[-1:]], 0)) - if not context.in_eager_mode(): + if not tf.contrib.eager.in_eager_mode(): ret.set_shape(b.get_shape().as_list()[:-1] + a.get_shape().as_list()[-1:]) return ret @@ -917,7 +916,7 @@ def reshape_like(a, b): def flatten_all_but_last(a): """Flatten all dimensions of a except the last.""" ret = tf.reshape(a, [-1, tf.shape(a)[-1]]) - if not context.in_eager_mode(): + if not tf.contrib.eager.in_eager_mode(): ret.set_shape([None] + a.get_shape().as_list()[-1:]) return ret @@ -962,7 +961,7 @@ def distributed_moe(data_parallelism, # We use the default of reuse=False. Otherwise, the experts would all # use the same variables. ep = Parallelism( - [expert_devices[i % len(expert_devices)] for i in xrange(num_experts)], + [expert_devices[i % len(expert_devices)] for i in range(num_experts)], reuse=None) # Experts expect 2d input tensors, so flatten the batch dimension and all # spatial dimensions together. diff --git a/tensor2tensor/utils/flags.py b/tensor2tensor/utils/flags.py index 08b40efdf..20827e69a 100644 --- a/tensor2tensor/utils/flags.py +++ b/tensor2tensor/utils/flags.py @@ -43,8 +43,7 @@ hyperparameters or when using Vizier. If a hyperparameter setting is specified by this flag then it must be a valid hyperparameter name for the model.""") -flags.DEFINE_string("problems", None, "Dash separated list of problems to " - "solve.") +flags.DEFINE_string("problem", None, "Problem name.") # data_dir is a common flag name - catch conflicts and define it once. try: diff --git a/tensor2tensor/utils/metrics.py b/tensor2tensor/utils/metrics.py index dc3b71607..864ff49d5 100644 --- a/tensor2tensor/utils/metrics.py +++ b/tensor2tensor/utils/metrics.py @@ -362,7 +362,7 @@ def create_evaluation_metrics(problems, model_hparams): Returns: dict. The metric functions have signature (Tensor predictions, features) -> (metric Tensor, update op), where features - is a dict with keys {targets, problem_choice}. + is a dict with keys {targets}. Raises: ValueError: if the metrics specified by a problem are not recognized (i.e. 
@@ -379,13 +379,11 @@ def reduce_dimensions(predictions, labels): labels, [-1] + common_layers.shape_list(labels)[-3:]) return predictions, labels - def make_problem_specific_metric_fn(metric_fn, problem_idx, weights_fn): - """Create a metric fn conditioned on problem_idx.""" + def make_problem_specific_metric_fn(metric_fn, weights_fn): + """Create a metric fn.""" def problem_metric_fn(predictions, features, labels): """Metric fn.""" - problem_choice = features.get("problem_choice", 0) - # Send along the entire features dict if the metric fn has the kwarg # "features". kwargs = {} @@ -395,19 +393,14 @@ def problem_metric_fn(predictions, features, labels): predictions, labels = reduce_dimensions(predictions, labels) - def wrapped_metric_fn(): - return metric_fn(predictions, labels, weights_fn=weights_fn, **kwargs) - - (scores, weights) = tf.cond( - tf.equal(problem_idx, problem_choice), wrapped_metric_fn, - lambda: (tf.constant(0.0), tf.constant(0.0))) - # The tf.metrics.mean function assures correct aggregation. + scores, weights = metric_fn(predictions, labels, + weights_fn=weights_fn, **kwargs) return tf.metrics.mean(scores, weights) return problem_metric_fn eval_metrics = dict() - for problem_idx, problem_instance in enumerate(problems): + for problem_instance in problems: problem_name = problem_instance.name metrics = problem_instance.eval_metrics() if not all([m in METRICS_FNS for m in metrics]): @@ -440,7 +433,7 @@ def image_wrapped_metric_fn(predictions, eval_metrics[metric_name] = image_wrapped_metric_fn else: problem_metric_fn = make_problem_specific_metric_fn( - metric_fn, problem_idx, weights_fn) + metric_fn, weights_fn) eval_metrics[metric_name] = problem_metric_fn else: if isinstance(tm, tuple): @@ -454,7 +447,7 @@ def image_wrapped_metric_fn(predictions, eval_metrics[metric_name] = image_wrapped_metric_fn else: problem_metric_fn = make_problem_specific_metric_fn( - metric_fn, problem_idx, weights_fn) + metric_fn, weights_fn) eval_metrics[metric_name] = problem_metric_fn return eval_metrics diff --git a/tensor2tensor/utils/quantization.py b/tensor2tensor/utils/quantization.py new file mode 100644 index 000000000..339bcf9f5 --- /dev/null +++ b/tensor2tensor/utils/quantization.py @@ -0,0 +1,279 @@ +# coding=utf-8 +# Copyright 2018 The Tensor2Tensor Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Utilities related to using bfloat16 activations and/or parameters.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +# Dependency imports + +import tensorflow as tf + +from tensorflow.python.framework import function + + +def bfloat16_activations_var_getter(getter, *args, **kwargs): + """A custom getter function for float32 parameters and bfloat16 activations. + + Args: + getter: custom getter + *args: arguments + **kwargs: keyword arguments + Returns: + variables with the correct dtype. + Raises: + KeyError: if "dtype" is not provided as a kwarg. 
+ """ + requested_dtype = kwargs["dtype"] + if requested_dtype == tf.bfloat16: + kwargs["dtype"] = tf.float32 + var = getter(*args, **kwargs) + # This if statement is needed to guard the cast, because batch norm + # assigns directly to the return value of this custom getter. The cast + # makes the return value not a variable so it cannot be assigned. Batch + # norm variables are always in fp32 so this if statement is never + # triggered for them. + if var.dtype.base_dtype != requested_dtype: + var = tf.cast(var, requested_dtype) + return var + + +def simulated_quantize(x, num_bits, noise): + """Simulate quantization to num_bits bits, with externally-stored scale. + + num_bits is the number of bits used to store each value. + noise is a float32 Tensor containing values in [0, 1). + Each value in noise should take different values across + different steps, approximating a uniform distribution over [0, 1). + In the case of replicated TPU training, noise should be identical + across replicas in order to keep the parameters identical across replicas. + + The natural choice for noise would be tf.random_uniform(), + but this is not possible for TPU, since there is currently no way to seed + the different cores to produce identical values across replicas. Instead we + use noise_from_step_num() (see below). + + The quantization scheme is as follows: + + Compute the maximum absolute value by row (call this max_abs). + Store this either in an auxiliary variable or in an extra column. + + Divide the parameters by (max_abs / (2^(num_bits-1)-1)). This gives a + float32 value in the range [-2^(num_bits-1)-1, 2^(num_bits-1)-1] + + Unbiased randomized roundoff by adding noise and rounding down. + + This produces a signed integer with num_bits bits which can then be stored. + + Args: + x: a float32 Tensor + num_bits: an integer between 1 and 22 + noise: a float Tensor broadcastable to the shape of x. + + Returns: + a float32 Tensor + """ + shape = x.get_shape().as_list() + if not (len(shape) >= 2 and shape[-1] > 1): + return x + max_abs = tf.reduce_max(tf.abs(x), -1, keep_dims=True) + 1e-9 + max_int = 2 ** (num_bits - 1) - 1 + scale = max_abs / max_int + x /= scale + x = tf.floor(x + noise) + # dequantize before storing (since this is a simulation) + x *= scale + return x + + +def noise_from_step_num(): + """Quantization noise equal to (phi * (step_num + 1)) mod 1.0. + + Not using random_uniform here due to a problem on TPU in that random seeds + are not respected, which may cause the parameters on different replicas + to go out-of-sync. + + Returns: + a float32 scalar + """ + step = tf.to_int32(tf.train.get_or_create_global_step()) + 1 + phi = ((5 ** 0.5) - 1) / 2 + # Naive computation tf.mod(phi * step, 1.0) in float32 would be disastrous + # due to loss of precision when the step number gets large. + # Computation in doubles does not work on TPU, so we use this complicated + # alternative computation which does not suffer from these roundoff errors. + ret = 0.0 + for i in range(30): + ret += (((phi * (2 ** i)) % 1.0) # double-precision computation in python + * tf.to_float(tf.mod(step // (2 ** i), 2))) + return tf.mod(ret, 1.0) + + +def _randomized_roundoff_to_bfloat16(x, noise, cand1, cand2): + """Round-off x to cand1 or to cand2 in an unbiased way. + + Cand1 and cand2 are the same shape as x. + For every element of x, the corresponding elements of cand1 and cand2 should + be the two closest bfloat16 values to x. Order does not matter. + cand1 and cand2 must differ from each other. 
+
+  Args:
+    x: A float32 Tensor.
+    noise: A Tensor broadcastable to the shape of x containing
+     random uniform values in [0.0, 1.0].
+    cand1: A bfloat16 Tensor the same shape as x.
+    cand2: A bfloat16 Tensor the same shape as x.
+
+  Returns:
+    A bfloat16 Tensor.
+  """
+  cand1_f = tf.to_float(cand1)
+  cand2_f = tf.to_float(cand2)
+  step_size = cand2_f - cand1_f
+  fpart = (x - cand1_f) / step_size
+  ret = tf.where(tf.greater(fpart, noise), cand2, cand1)
+  return ret
+
+
+def _to_bfloat16_unbiased(x, noise):
+  """Convert a float32 to a bfloat16 using randomized roundoff.
+
+  Args:
+    x: A float32 Tensor.
+    noise: a float32 Tensor with values in [0, 1), broadcastable to tf.shape(x)
+  Returns:
+    A bfloat16 Tensor.
+  """
+  x_sign = tf.sign(x)
+  # Make sure x is positive. If it is zero, the two candidates are identical.
+  x = x * x_sign + 1e-30
+  cand1 = tf.to_bfloat16(x)
+  cand1_f = tf.to_float(cand1)
+  # This relies on the fact that for a positive bfloat16 b,
+  # b * 1.005 gives you the next higher bfloat16 and b*0.995 gives you the
+  # next lower one. Both 1.005 and 0.995 are ballpark estimates.
+  cand2 = tf.to_bfloat16(
+      tf.where(tf.greater(x, cand1_f), cand1_f * 1.005, cand1_f * 0.995))
+  ret = _randomized_roundoff_to_bfloat16(x, noise, cand1, cand2)
+  return ret * tf.to_bfloat16(x_sign)
+
+
+class ParameterEncoding(object):
+  """Helper class for encoding weights as bfloat16.
+
+  For now, the parameters are always stored (encoded) as bfloat16 and decoded
+  to float32. Confusingly, the custom getter then converts the float32 back
+  to a bfloat16 to use as an activation, assuming that we use bfloat16 for
+  activations.
+
+  TODO(noam): Add options for activation dtype=float32, and for different
+  storage dtypes.
+  """
+
+  def encode(self, x, noise):
+    """Encode float32 to bfloat16.
+
+    Args:
+      x: a float32 Tensor
+      noise: a float32 Tensor with values in [0, 1), broadcastable to shape(x)
+
+    Returns:
+      a bfloat16 Tensor
+    """
+    raise NotImplementedError("encode not implemented")
+
+  def decode(self, x):
+    """Decode bfloat16 to float32."""
+    raise NotImplementedError("decode not implemented")
+
+  def _decode_with_identity_gradient(self, x):
+    # identity backprop through the decoder.
+    # This means that the optimizer must call encode when updating weights.
+    @function.Defun(python_grad_func=lambda op, dy: dy,
+                    shape_func=lambda op: [op.inputs[0].get_shape()])
+    def my_fn(x):
+      return self.decode(x)
+    return my_fn(x)
+
+  def custom_getter(self, activation_dtype=tf.bfloat16):
+    """A custom getter that uses the encoding for bfloat16 and float32 vars.
+
+    When a bfloat16 or float32 variable is requested, an encoded bfloat16
+    variable is created, which is then decoded and cast to a bfloat16
+    activation.
+
+    Args:
+      activation_dtype: a dtype to which to convert the decoded value.
+
+    Returns:
+      a function.
+    """
+    def getter_fn(getter, *args, **kwargs):
+      requested_dtype = kwargs["dtype"]
+      if requested_dtype in (tf.bfloat16, tf.float32):
+        kwargs["dtype"] = tf.bfloat16
+        kwargs["initializer"] = _EncodingInitializer(
+            kwargs["initializer"], self)
+        ret = self._decode_with_identity_gradient(getter(*args, **kwargs))
+        return tf.cast(ret, activation_dtype)
+      return getter(*args, **kwargs)
+    return getter_fn
+
+
+class _EncodingInitializer(object):
+  """Helper class for ParameterEncoding.
+
+  Initializes variables by calling base initializer, then encoding.
+ """ + + def __init__(self, base_initializer, parameter_encoding): + self._base_initializer = base_initializer + self._parameter_encoding = parameter_encoding + + def __call__(self, shape, dtype, partition_info=None): + if self._base_initializer is None: + # mimic default initialization in tf.get_variable() + if dtype.is_floating: + ret = tf.glorot_uniform_initializer()(shape, dtype) + else: + ret = tf.zeros(shape, dtype) + else: + ret = self._base_initializer(shape, dtype, partition_info=partition_info) + noise = 0.0 # no random noise in the initializer. + return tf.cast(self._parameter_encoding.encode(ret, noise), dtype) + + +class EighthPowerEncoding(ParameterEncoding): + """enc(x) = sign(x) * (abs(x)*128)^8. + + This provides less range and more resolution. + The range of representable positive values is approximately [2^-23, 2^9] + Resolution is 8x better than bfloat16. + """ + + def encode(self, x, noise): + x = tf.to_float(x) + # we can't use tf.pow(..., 8.0) because of a high-error approximation + # on TPU. Instead we square three times. + x = tf.sign(x) * tf.square(tf.square(tf.square(tf.abs(x) * 128.0))) + x = _to_bfloat16_unbiased(x, noise) + return x + + def decode(self, x): + x = tf.to_float(x) + # we can't use tf.pow(..., 0.125) because of a high-error approximation + # on TPU. Instead we sqrt three times. + return tf.sign(x) * (tf.sqrt(tf.sqrt(tf.sqrt(tf.abs(x)))) / 128.0) diff --git a/tensor2tensor/utils/registry.py b/tensor2tensor/utils/registry.py index 16a6c7437..ef0a6cfc0 100644 --- a/tensor2tensor/utils/registry.py +++ b/tensor2tensor/utils/registry.py @@ -245,8 +245,7 @@ def parse_problem_name(problem_name): """Determines if problem_name specifies a copy and/or reversal. Args: - problem_name: A string containing a single problem name from - FLAGS.problems. + problem_name: str, problem name, possibly with suffixes. Returns: base_name: A string with the base problem name. diff --git a/tensor2tensor/utils/t2t_model.py b/tensor2tensor/utils/t2t_model.py index 577b47af0..50d036f33 100644 --- a/tensor2tensor/utils/t2t_model.py +++ b/tensor2tensor/utils/t2t_model.py @@ -21,6 +21,7 @@ import collections import contextlib import copy +import functools import math import time @@ -31,17 +32,18 @@ from tensor2tensor.data_generators import text_encoder from tensor2tensor.data_generators.problem import problem_hparams_to_features from tensor2tensor.layers import common_layers +from tensor2tensor.layers import modalities # pylint: disable=unused-import from tensor2tensor.utils import beam_search from tensor2tensor.utils import decoding from tensor2tensor.utils import expert_utils as eu from tensor2tensor.utils import learning_rate from tensor2tensor.utils import metrics from tensor2tensor.utils import optimize +from tensor2tensor.utils import quantization from tensor2tensor.utils import registry import tensorflow as tf -from tensorflow.python.eager import context from tensorflow.python.layers import base from tensorflow.python.ops import variable_scope @@ -73,7 +75,7 @@ def __init__(self, hparams: tf.contrib.training.HParams, model hyperparameters. mode: tf.estimator.ModeKeys, the execution mode. problem_hparams: tf.contrib.training.HParams, hyperparameters for the - Problem. If provided here or in hparams.problems, the model will + Problem. If provided here or in hparams.problem_hparams, the model will automatically determine bottom, top, and loss methods. If not provided, calling the model will only invoke body. 
diff --git a/tensor2tensor/utils/t2t_model.py b/tensor2tensor/utils/t2t_model.py
index 577b47af0..50d036f33 100644
--- a/tensor2tensor/utils/t2t_model.py
+++ b/tensor2tensor/utils/t2t_model.py
@@ -21,6 +21,7 @@
 import collections
 import contextlib
 import copy
+import functools
 import math
 import time
 
@@ -31,17 +32,18 @@
 from tensor2tensor.data_generators import text_encoder
 from tensor2tensor.data_generators.problem import problem_hparams_to_features
 from tensor2tensor.layers import common_layers
+from tensor2tensor.layers import modalities  # pylint: disable=unused-import
 from tensor2tensor.utils import beam_search
 from tensor2tensor.utils import decoding
 from tensor2tensor.utils import expert_utils as eu
 from tensor2tensor.utils import learning_rate
 from tensor2tensor.utils import metrics
 from tensor2tensor.utils import optimize
+from tensor2tensor.utils import quantization
 from tensor2tensor.utils import registry
 
 import tensorflow as tf
 
-from tensorflow.python.eager import context
 from tensorflow.python.layers import base
 from tensorflow.python.ops import variable_scope
 
@@ -73,7 +75,7 @@ def __init__(self,
       hparams: tf.contrib.training.HParams, model hyperparameters.
       mode: tf.estimator.ModeKeys, the execution mode.
       problem_hparams: tf.contrib.training.HParams, hyperparameters for the
-        Problem. If provided here or in hparams.problems, the model will
+        Problem. If provided here or in hparams.problem_hparams, the model will
         automatically determine bottom, top, and loss methods. If not
         provided, calling the model will only invoke body.
       data_parallelism: a expert_utils.Parallelism object,
@@ -90,8 +92,8 @@ def __init__(self,
     super(T2TModel, self).__init__(
         trainable=mode == tf.estimator.ModeKeys.TRAIN, name=name)
-    if not problem_hparams and hasattr(hparams, "problems"):
-      problem_hparams = hparams.problems[0]
+    if not problem_hparams and hasattr(hparams, "problem_hparams"):
+      problem_hparams = hparams.problem_hparams
     self._problem_hparams = problem_hparams
 
     # Setup hparams
@@ -129,13 +131,22 @@ def has_input(self):
     else:
       return True
 
-  def call(self, features):
-    custom_getter = None
-    if self.hparams.activation_dtype == "bfloat16":
-      custom_getter = common_layers.bfloat16_var_getter
+  @property
+  def _custom_getter(self):
     if self.hparams.weight_dtype == "bfloat16":
-      custom_getter = common_layers.bfloat16_weights_var_getter
-      tf.get_variable_scope().set_custom_getter(custom_getter)
+      if self.hparams.optimizer != "Adafactor":
+        raise NotImplementedError(
+            "weight_dtype=bfloat16 only implemented with Adafactor optimizer")
+      return quantization.EighthPowerEncoding().custom_getter(
+          activation_dtype=tf.bfloat16
+          if self.hparams.activation_dtype == "bfloat16" else tf.float32)
+    elif self.hparams.activation_dtype == "bfloat16":
+      return quantization.bfloat16_activations_var_getter
+    else:
+      return None
+
+  def call(self, features):
+    set_custom_getter_compose(self._custom_getter)
     tf.get_variable_scope().set_initializer(
         optimize.get_variable_initializer(self.hparams))
     with self._eager_var_store.as_default():
@@ -222,8 +233,7 @@ def model_fn_sharded(self, sharded_features):
 
   def model_fn(self, features):
     transformed_features = self.bottom(features)
-    if (self.hparams.activation_dtype == "bfloat16" or
-        self.hparams.weight_dtype == "bfloat16"):
+    if self.hparams.activation_dtype == "bfloat16":
       for k, v in six.iteritems(transformed_features):
         if v.dtype == tf.float32:
           transformed_features[k] = tf.cast(v, tf.bfloat16)
@@ -239,7 +249,9 @@ def model_fn(self, features):
       logits = output
     else:
       logits = self.top(output, features)
-    losses["training"] = self.loss(logits, features)
+    losses["training"] = 0.0
+    if self._hparams.mode != tf.estimator.ModeKeys.PREDICT:
+      losses["training"] = self.loss(logits, features)
     return logits, losses
 
@@ -338,9 +350,9 @@ def top(self, body_output, features):
       target_modality = self._problem_hparams.target_modality
     else:
       target_modality = {k: None for k in body_output.keys()}
-    # assert set(body_output.keys()) == set(target_modality.keys()), (
-    #     "The keys of model_body's returned logits dict must match the keys "
-    #     "of problem_hparams.target_modality's dict.")
+    assert set(body_output.keys()) == set(target_modality.keys()), (
+        "The keys of model_body's returned logits dict must match the keys "
+        "of problem_hparams.target_modality's dict.")
     logits = {}
     for k, v in six.iteritems(body_output):
       with tf.variable_scope(k):  # TODO(aidangomez): share variables here?
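In the activation-only branch, `quantization.bfloat16_activations_var_getter` keeps master weights in float32 while handing bfloat16 reads to the graph. A simplified sketch of that getter pattern under TF 1.x variable scopes (an approximation of the idea, not the exact implementation):

```python
import tensorflow as tf

def bfloat16_activations_getter_sketch(getter, *args, **kwargs):
  # Store the variable in float32 so optimizer updates stay precise...
  requested_dtype = kwargs["dtype"]
  if requested_dtype == tf.bfloat16:
    kwargs["dtype"] = tf.float32
  var = getter(*args, **kwargs)
  # ...but return a bfloat16 view for use in activations.
  if var.dtype.base_dtype != requested_dtype:
    var = tf.cast(var, requested_dtype)
  return var

with tf.variable_scope(
    "body", custom_getter=bfloat16_activations_getter_sketch):
  w = tf.get_variable("w", [16, 16], dtype=tf.bfloat16)  # float32 storage
```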
@@ -351,9 +363,9 @@ def top(self, body_output, features):
       target_modality = self._problem_hparams.target_modality
     else:
       target_modality = None
-    # assert not isinstance(target_modality, dict), (
-    #     "model_body must return a dictionary of logits when "
-    #     "problem_hparams.target_modality is a dict.")
+    assert not isinstance(target_modality, dict), (
+        "model_body must return a dictionary of logits when "
+        "problem_hparams.target_modality is a dict.")
     return self._top_single(body_output, target_modality, features)
 
   def _loss_single(self, logits, target_modality, feature):
@@ -517,6 +529,7 @@ def infer(self,
           "losses": a dictionary: {loss-name (string): floating point `Scalar`
       }
     """
+    set_custom_getter_compose(self._custom_getter)
     with self._eager_var_store.as_default():
       # TODO(rsepassi): Make decoding work with real-valued model outputs
       # (i.e. if the target modality is RealModality).
@@ -693,7 +706,13 @@ def _slow_greedy_infer(self, features, decode_length):
       inputs_old = features["inputs"]
       features["inputs"] = tf.expand_dims(features["inputs"], 2)
     if not self.has_input:
-      features["partial_targets"] = tf.to_int64(features["inputs"])
+      # Prepare partial targets: taken from either features["inputs"] or
+      # features["targets"], the decoder output is forced to begin with
+      # these sequences.
+      partial_targets = features.get("inputs")
+      if partial_targets is None:
+        partial_targets = features["targets"]
+      features["partial_targets"] = tf.to_int64(partial_targets)
     # Save the targets in a var and reassign it after the tf.while loop to avoid
     # having targets being in a 'while' frame. This ensures targets when used
     # in metric functions stays in the same frame as other vars.
@@ -703,7 +722,7 @@ def _slow_greedy_infer(self, features, decode_length):
 
     def infer_step(recent_output, recent_logits, unused_loss):
       """Inference step."""
-      if not context.in_eager_mode():
+      if not tf.contrib.eager.in_eager_mode():
         recent_output.set_shape([None, None, None, 1])
       padded = tf.pad(recent_output, [[0, 0], [0, 1], [0, 0], [0, 0]])
       features["targets"] = padded
@@ -719,7 +738,7 @@ def infer_step(recent_output, recent_logits, unused_loss):
           common_layers.shape_list(recent_output)[1], :, :]
       cur_sample = tf.to_int64(tf.expand_dims(cur_sample, axis=1))
       samples = tf.concat([recent_output, cur_sample], axis=1)
-      if not context.in_eager_mode():
+      if not tf.contrib.eager.in_eager_mode():
        samples.set_shape([None, None, None, 1])
 
       # Assuming we have one shard for logits.
@@ -745,13 +764,19 @@ def infer_step(recent_output, recent_logits, unused_loss):
     if target_modality.is_class_modality:
       decode_length = 1
     else:
-      decode_length = common_layers.shape_list(
-          features["inputs"])[1] + decode_length
+      if "partial_targets" in features:
+        prefix_length = common_layers.shape_list(
+            features["partial_targets"])[1]
+      else:
+        prefix_length = common_layers.shape_list(
+            features["inputs"])[1]
+      decode_length = prefix_length + decode_length
+
     # Initial values of result, logits and loss.
     result = initial_output
     # tensor of shape [batch_size, time, 1, 1, vocab_size]
     logits = tf.zeros((batch_size, 0, 1, 1,
                        target_modality.top_dimensionality))
-    if not context.in_eager_mode():
+    if not tf.contrib.eager.in_eager_mode():
       logits.set_shape([None, None, None, None, None])
     loss = 0.0
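Because the forced prefix must be reproduced before any new tokens are emitted, the decoding budget becomes the prefix length plus the requested `decode_length`. A toy check of that arithmetic (illustrative only):

```python
def total_decode_steps(prefix_length, decode_length, is_class_modality=False):
  # Class modalities emit a single label; everything else decodes
  # prefix + decode_length positions, mirroring the logic above.
  return 1 if is_class_modality else prefix_length + decode_length

assert total_decode_steps(prefix_length=7, decode_length=50) == 57
assert total_decode_steps(0, 50, is_class_modality=True) == 1
```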
@@ -1002,10 +1027,10 @@ def estimator_spec_eval(self, features, logits, labels, loss,
                           losses_dict):
     """Construct EstimatorSpec for EVAL mode."""
     hparams = self.hparams
 
-    if not hasattr(hparams, "problem_instances"):
+    if not hasattr(hparams, "problem"):
       raise NotImplementedError(_no_problem_err("estimator_spec_eval"))
 
-    problem = hparams.problem_instances[0]
+    problem = hparams.problem
     if common_layers.is_on_tpu():
       _remove_summaries()
       if isinstance(logits, dict):
@@ -1066,15 +1091,11 @@ def estimator_spec_predict(self, features):
     if inputs is None:
       inputs = features["targets"]
 
-    batched_problem_choice = (
-        features["problem_choice"] * tf.ones(
-            (common_layers.shape_list(inputs)[0],), dtype=tf.int32))
     predictions = {
         "outputs": outputs,
         "scores": scores,
         "inputs": inputs,
         "targets": features.get("infer_targets"),
-        "problem_choice": batched_problem_choice,
         "batch_prediction_key": features.get("batch_prediction_key"),
     }
     _del_dict_nones(predictions)
@@ -1294,7 +1315,7 @@ def as_default(self):
 
 
 def create_eager_var_store():
-  if context.in_eager_mode():
+  if tf.contrib.eager.in_eager_mode():
     return variable_scope.EagerVariableStore()
   else:
     return DummyVariableStore()
@@ -1395,7 +1416,7 @@ def summarize_features(features, num_shards=1):
 
 
 def _eager_log(level, *args):
-  if context.in_eager_mode() and args in _already_logged:
+  if tf.contrib.eager.in_eager_mode() and args in _already_logged:
     return
   _already_logged.add(args)
   getattr(tf.logging, level)(*args)
@@ -1407,3 +1428,42 @@ def log_info(*args):
 
 def log_warn(*args):
   _eager_log("warn", *args)
+
+
+def _compose_custom_getters(getter_a, getter_b):
+  """Compose two custom getters.
+
+  Example use:
+    tf.get_variable_scope().set_custom_getter(_compose_custom_getters(
+        tf.get_variable_scope().custom_getter, new_getter))
+
+  This composes getters in the same way as creating a new variable scope with
+  the new_getter, but it does not actually create a new variable scope.
+
+  Args:
+    getter_a: a custom getter, generally from the existing variable scope.
+    getter_b: a custom getter.
+
+  Returns:
+    a custom getter.
+  """
+  if not getter_a:
+    return getter_b
+  if not getter_b:
+    return getter_a
+  def getter_fn(getter, *args, **kwargs):
+    return getter_b(functools.partial(getter_a, getter), *args, **kwargs)
+  return getter_fn
+
+
+def set_custom_getter_compose(custom_getter):
+  """Set a custom getter in the current variable scope.
+
+  Does not overwrite the existing custom getter; composes with it instead.
+
+  Args:
+    custom_getter: a custom getter.
+  """
+  tf.get_variable_scope().set_custom_getter(
+      _compose_custom_getters(
+          tf.get_variable_scope().custom_getter, custom_getter))
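Composition matters because `call()` and `infer()` may run inside a scope that already has a getter installed; overwriting it would silently drop that behavior. A self-contained demonstration of the same wrapping trick with plain functions (toy getters; `base` stands in for TensorFlow's real variable creator):

```python
import functools

def compose(getter_a, getter_b):
  # Wrap getter_b around getter_a, mimicking nested variable scopes.
  if not getter_a:
    return getter_b
  if not getter_b:
    return getter_a
  def composed(getter, *args, **kwargs):
    return getter_b(functools.partial(getter_a, getter), *args, **kwargs)
  return composed

# Two toy getters that tag the variable name.
def tag_a(getter, name, **kwargs):
  return getter(name + ".a", **kwargs)

def tag_b(getter, name, **kwargs):
  return getter(name + ".b", **kwargs)

base = lambda name, **kwargs: name
print(compose(tag_a, tag_b)(base, "w"))  # -> "w.b.a"
```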
+ """ + tf.get_variable_scope().set_custom_getter( + _compose_custom_getters( + tf.get_variable_scope().custom_getter, custom_getter)) diff --git a/tensor2tensor/utils/trainer_lib.py b/tensor2tensor/utils/trainer_lib.py index 4a6e5f2b6..9cfd1264a 100644 --- a/tensor2tensor/utils/trainer_lib.py +++ b/tensor2tensor/utils/trainer_lib.py @@ -185,7 +185,7 @@ def create_estimator(model_name, model_name, hparams, decode_hparams=decode_hparams, use_tpu=use_tpu) if use_tpu: - problem = hparams.problem_instances[0] + problem = hparams.problem batch_size = (problem.tpu_batch_size_per_shard(hparams) * run_config.tpu_config.num_shards) return tf.contrib.tpu.TPUEstimator( @@ -217,12 +217,7 @@ def create_hooks(use_tfdbg=False, use_dbgprofile=False, dbgprofile_kwargs=None, tf.logging.info("Using ProfilerHook") defaults = dict(save_steps=10, show_dataflow=True, show_memory=True) defaults.update(dbgprofile_kwargs) - # To handle different versions of TF - if hasattr(tf.train, "ProfilerHook"): - hook_mod = tf.train - else: - hook_mod = tf.contrib.hooks - train_monitors.append(hook_mod.ProfilerHook(**defaults)) + train_monitors.append(tf.train.ProfilerHook(**defaults)) if use_validation_monitor: tf.logging.info("Using ValidationMonitor") @@ -276,7 +271,7 @@ def create_experiment(run_config, use_tpu=use_tpu) # Input fns from Problem - problem = hparams.problem_instances[0] + problem = hparams.problem train_input_fn = problem.make_estimator_input_fn( tf.estimator.ModeKeys.TRAIN, hparams) eval_input_fn = problem.make_estimator_input_fn( @@ -358,8 +353,8 @@ def add_problem_hparams(hparams, problem_name): problem = registry.problem(problem_name) p_hparams = problem.get_hparams(hparams) - hparams.problem_instances = [problem] - hparams.problems = [p_hparams] + hparams.problem = problem + hparams.problem_hparams = p_hparams def set_random_seed(seed): diff --git a/tensor2tensor/utils/trainer_lib_test.py b/tensor2tensor/utils/trainer_lib_test.py index a9a6e692c..6ae599721 100644 --- a/tensor2tensor/utils/trainer_lib_test.py +++ b/tensor2tensor/utils/trainer_lib_test.py @@ -80,7 +80,7 @@ def testModel(self): "transformer_tiny", data_dir=self.data_dir, problem_name="tiny_algo") # Dataset - problem = hparams.problem_instances[0] + problem = hparams.problem dataset = problem.dataset(tf.estimator.ModeKeys.TRAIN, self.data_dir) dataset = dataset.repeat(None).padded_batch(10, dataset.output_shapes) features = dataset.make_one_shot_iterator().get_next() @@ -105,15 +105,15 @@ def testMultipleTargetModalities(self): # HParams hparams = trainer_lib.create_hparams( "transformer_tiny", data_dir=self.data_dir, problem_name="tiny_algo") - tm = hparams.problem_instances[0].get_hparams().target_modality - hparams.problem_instances[0].get_hparams().target_modality = { + tm = hparams.problem.get_hparams().target_modality + hparams.problem.get_hparams().target_modality = { "targets": tm, "A": tm, "B": tm } # Dataset - problem = hparams.problem_instances[0] + problem = hparams.problem dataset = problem.dataset(tf.estimator.ModeKeys.TRAIN, self.data_dir) dataset = dataset.repeat(None).padded_batch(10, dataset.output_shapes) features = dataset.make_one_shot_iterator().get_next()