diff --git a/.gitignore b/.gitignore
index b14c224ec..f2b475d4e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,7 +1,5 @@
 # Compiled python modules.
 *.pyc
-*DS_Store
-
 # Byte-compiled
 __pycache__/
 
@@ -18,3 +16,9 @@ dist/
 # Sublime project files
 *.sublime-project
 *.sublime-workspace
+
+# Tests
+.pytest_cache/
+
+# Other
+*.DS_Store
diff --git a/.travis.yml b/.travis.yml
index 4cf0843a2..ecfcb699a 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -8,14 +8,11 @@ env:
     - T2T_DATA_DIR=/tmp/t2t-data
     - T2T_TRAIN_DIR=/tmp/t2t-train
   matrix:
-    - TF_VERSION="1.4.*"
     - TF_VERSION="1.5.*"
     - TF_VERSION="1.6.*"
     - TF_VERSION="1.7.*"
 matrix:
   exclude:
-    - python: "3.6"
-      env: TF_VERSION="1.4.*"
     - python: "3.6"
       env: TF_VERSION="1.5.*"
     - python: "3.6"
@@ -57,13 +54,13 @@ script:
   # Run data generation, training, and decoding on a dummy problem
   - t2t-datagen --problem=$T2T_PROBLEM --data_dir=$T2T_DATA_DIR
-  - t2t-trainer --problems=$T2T_PROBLEM --data_dir=$T2T_DATA_DIR --model=transformer --hparams_set=transformer_tiny --train_steps=5 --eval_steps=5 --output_dir=$T2T_TRAIN_DIR
-  - t2t-decoder --problems=$T2T_PROBLEM --data_dir=$T2T_DATA_DIR --model=transformer --hparams_set=transformer_tiny --output_dir=$T2T_TRAIN_DIR --decode_hparams='num_samples=10'
+  - t2t-trainer --problem=$T2T_PROBLEM --data_dir=$T2T_DATA_DIR --model=transformer --hparams_set=transformer_tiny --train_steps=5 --eval_steps=5 --output_dir=$T2T_TRAIN_DIR
+  - t2t-decoder --problem=$T2T_PROBLEM --data_dir=$T2T_DATA_DIR --model=transformer --hparams_set=transformer_tiny --output_dir=$T2T_TRAIN_DIR --decode_hparams='num_samples=10'
   # Export and query (on Python 2 only)
   # Bug: https://github.com/tensorflow/serving/issues/819
   #- if [[ "$TRAVIS_PYTHON_VERSION" == "2.7" ]] && [[ "$TF_VERSION" == "1.6.*" ]]; then
-  #    t2t-exporter --problems=$T2T_PROBLEM --data_dir=$T2T_DATA_DIR --model=transformer --hparams_set=transformer_tiny --output_dir=$T2T_TRAIN_DIR;
+  #    t2t-exporter --problem=$T2T_PROBLEM --data_dir=$T2T_DATA_DIR --model=transformer --hparams_set=transformer_tiny --output_dir=$T2T_TRAIN_DIR;
   #    pip install tensorflow-serving-api;
   #    tensorflow_model_server --port=9000 --model_name=my_model --model_base_path=$T2T_TRAIN_DIR/export/Servo &
   #    sleep 10;
diff --git a/README.md b/README.md
index a59f69c98..31b25562f 100644
--- a/README.md
+++ b/README.md
@@ -36,7 +36,7 @@ pip install tensor2tensor && t2t-trainer \
   --generate_data \
   --data_dir=~/t2t_data \
   --output_dir=~/t2t_train/mnist \
-  --problems=image_mnist \
+  --problem=image_mnist \
   --model=shake_shake \
   --hparams_set=shake_shake_quick \
   --train_steps=1000 \
@@ -78,13 +78,13 @@ to modify the hyperparameters if you run on a different setup.
 ### Image Classification
 
 For image classification, we have a number of standard data-sets:
-* ImageNet (a large data-set): `--problems=image_imagenet`, or one
+* ImageNet (a large data-set): `--problem=image_imagenet`, or one
   of the re-scaled versions (`image_imagenet224`, `image_imagenet64`,
   `image_imagenet32`)
-* CIFAR-10: `--problems=image_cifar10` (or
-  `--problems=image_cifar10_plain` to turn off data augmentation)
-* CIFAR-100: `--problems=image_cifar100`
-* MNIST: `--problems=image_mnist`
+* CIFAR-10: `--problem=image_cifar10` (or
+  `--problem=image_cifar10_plain` to turn off data augmentation)
+* CIFAR-100: `--problem=image_cifar100`
+* MNIST: `--problem=image_mnist`
 
 For ImageNet, we suggest to use the ResNet or Xception, i.e., use
 `--model=resnet --hparams_set=resnet_50` or
@@ -99,11 +99,11 @@ close to 97% accuracy on CIFAR-10.
 ### Language Modeling
 
 For language modeling, we have these data-sets in T2T:
-* PTB (a small data-set): `--problems=languagemodel_ptb10k` for
-  word-level modeling and `--problems=languagemodel_ptb_characters`
+* PTB (a small data-set): `--problem=languagemodel_ptb10k` for
+  word-level modeling and `--problem=languagemodel_ptb_characters`
   for character-level modeling.
-* LM1B (a billion-word corpus): `--problems=languagemodel_lm1b32k` for
-  subword-level modeling and `--problems=languagemodel_lm1b_characters`
+* LM1B (a billion-word corpus): `--problem=languagemodel_lm1b32k` for
+  subword-level modeling and `--problem=languagemodel_lm1b_characters`
   for character-level modeling.
 
 We suggest to start with `--model=transformer` on this task and use
@@ -113,7 +113,7 @@ We suggest to start with `--model=transformer` on this task and use
 ### Sentiment Analysis
 
 For the task of recognizing the sentiment of a sentence, use
-* the IMDB data-set: `--problems=sentiment_imdb`
+* the IMDB data-set: `--problem=sentiment_imdb`
 
 We suggest to use `--model=transformer_encoder` here and since it is a
 small data-set, try `--hparams_set=transformer_tiny` and train for
@@ -122,15 +122,15 @@ few steps (e.g., `--train_steps=2000`).
 ### Speech Recognition
 
 For speech-to-text, we have these data-sets in T2T:
-* Librispeech (English speech to text): `--problems=librispeech` for
-  the whole set and `--problems=librispeech_clean` for a smaller
+* Librispeech (English speech to text): `--problem=librispeech` for
+  the whole set and `--problem=librispeech_clean` for a smaller
   but nicely filtered part.
 
 ### Summarization
 
 For summarizing longer text into shorter one we have these data-sets:
 * CNN/DailyMail articles summarized into a few sentences:
-  `--problems=summarize_cnn_dailymail32k`
+  `--problem=summarize_cnn_dailymail32k`
 
 We suggest to use `--model=transformer` and
 `--hparams_set=transformer_prepend` for this task.
@@ -139,15 +139,15 @@ This yields good ROUGE scores.
 ### Translation
 
 There are a number of translation data-sets in T2T:
-* English-German: `--problems=translate_ende_wmt32k`
-* English-French: `--problems=translate_enfr_wmt32k`
-* English-Czech: `--problems=translate_encs_wmt32k`
-* English-Chinese: `--problems=translate_enzh_wmt32k`
-* English-Vietnamese: `--problems=translate_envi_iwslt32k`
+* English-German: `--problem=translate_ende_wmt32k`
+* English-French: `--problem=translate_enfr_wmt32k`
+* English-Czech: `--problem=translate_encs_wmt32k`
+* English-Chinese: `--problem=translate_enzh_wmt32k`
+* English-Vietnamese: `--problem=translate_envi_iwslt32k`
 
 You can get translations in the other direction by appending `_rev` to
 the problem name, e.g., for German-English use
-`--problems=translate_ende_wmt32k_rev`.
+`--problem=translate_ende_wmt32k_rev`.
 
 For all translation problems, we suggest to try the Transformer model:
 `--model=transformer`. At first it is best to try the base setting,
@@ -193,7 +193,7 @@ t2t-datagen \
 # * If you run out of memory, add --hparams='batch_size=1024'.
 t2t-trainer \
   --data_dir=$DATA_DIR \
-  --problems=$PROBLEM \
+  --problem=$PROBLEM \
   --model=$MODEL \
   --hparams_set=$HPARAMS \
   --output_dir=$TRAIN_DIR
@@ -210,7 +210,7 @@ ALPHA=0.6
 t2t-decoder \
   --data_dir=$DATA_DIR \
-  --problems=$PROBLEM \
+  --problem=$PROBLEM \
   --model=$MODEL \
   --hparams_set=$HPARAMS \
   --output_dir=$TRAIN_DIR \
@@ -325,7 +325,7 @@ and hyperparameter set functions can compose other hyperparameter set
 functions.
 
 The **trainer** binary is the main entrypoint for training, evaluation, and inference.
 Users can easily switch between problems, models, and hyperparameter
-sets by using the `--model`, `--problems`, and `--hparams_set` flags. Specific
+sets by using the `--model`, `--problem`, and `--hparams_set` flags. Specific
 hyperparameters can be overridden with the `--hparams` flag. `--schedule` and
 related flags control local and distributed training/evaluation
 ([distributed training documentation](https://github.com/tensorflow/tensor2tensor/tree/master/docs/distributed_training.md)).
diff --git a/docs/cloud_mlengine.md b/docs/cloud_mlengine.md
index 709582f65..25673901e 100644
--- a/docs/cloud_mlengine.md
+++ b/docs/cloud_mlengine.md
@@ -14,7 +14,7 @@ It's the same `t2t-trainer` you know and love with the addition of the
 DATA_DIR=gs://my-bucket/data
 OUTPUT_DIR=gs://my-bucket/train
 t2t-trainer \
-  --problems=translate_ende_wmt32k \
+  --problem=translate_ende_wmt32k \
   --model=transformer \
   --hparams_set=transformer_base \
   --data_dir=$DATA_DIR \
@@ -57,7 +57,7 @@ with `--hparams_range` and the `--autotune_*` flags:
 
 ```
 t2t-trainer \
-  --problems=translate_ende_wmt32k \
+  --problem=translate_ende_wmt32k \
   --model=transformer \
   --hparams_set=transformer_base \
   --data_dir=$DATA_DIR \
diff --git a/docs/cloud_tpu.md b/docs/cloud_tpu.md
index d923ee02e..d508aa125 100644
--- a/docs/cloud_tpu.md
+++ b/docs/cloud_tpu.md
@@ -39,8 +39,6 @@ work on any image classification data-set.
 
 ## Tutorial: Transformer En-De translation on TPU
 
-**Note**: You'll need TensorFlow 1.5+.
-
 Configure the `gcloud` CLI:
 ```
 gcloud components update
@@ -71,7 +69,7 @@ Launch! It's as simple as adding the `--cloud_tpu` flag.
 t2t-trainer \
   --model=transformer \
   --hparams_set=transformer_tpu \
-  --problems=translate_ende_wmt8k \
+  --problem=translate_ende_wmt8k \
   --train_steps=10 \
   --eval_steps=10 \
   --local_eval_frequency=10 \
@@ -109,7 +107,7 @@ For example, to train a shake-shake model on CIFAR you can run this command.
 t2t-trainer \
   --model=shake_shake \
   --hparams_set=shakeshake_tpu \
-  --problems=image_cifar10 \
+  --problem=image_cifar10 \
   --train_steps=180000 \
   --eval_steps=9 \
   --local_eval_frequency=100 \
diff --git a/docs/index.md b/docs/index.md
index 060e10471..9262461c7 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -42,13 +42,13 @@ to modify the hyperparameters if you run on a different setup.
 ### Image Classification
 
 For image classification, we have a number of standard data-sets:
-* ImageNet (a large data-set): `--problems=image_imagenet`, or one
+* ImageNet (a large data-set): `--problem=image_imagenet`, or one
   of the re-scaled versions (`image_imagenet224`, `image_imagenet64`,
   `image_imagenet32`)
-* CIFAR-10: `--problems=image_cifar10` (or
-  `--problems=image_cifar10_plain` to turn off data augmentation)
-* CIFAR-100: `--problems=image_cifar100`
-* MNIST: `--problems=image_mnist`
+* CIFAR-10: `--problem=image_cifar10` (or
+  `--problem=image_cifar10_plain` to turn off data augmentation)
+* CIFAR-100: `--problem=image_cifar100`
+* MNIST: `--problem=image_mnist`
 
 For ImageNet, we suggest to use the ResNet or Xception, i.e., use
 `--model=resnet --hparams_set=resnet_50` or
@@ -63,11 +63,11 @@ close to 97% accuracy on CIFAR-10.
 ### Language Modeling
 
 For language modeling, we have these data-sets in T2T:
-* PTB (a small data-set): `--problems=languagemodel_ptb10k` for
-  word-level modeling and `--problems=languagemodel_ptb_characters`
+* PTB (a small data-set): `--problem=languagemodel_ptb10k` for
+  word-level modeling and `--problem=languagemodel_ptb_characters`
   for character-level modeling.
-* LM1B (a billion-word corpus): `--problems=languagemodel_lm1b32k` for
-  subword-level modeling and `--problems=languagemodel_lm1b_characters`
+* LM1B (a billion-word corpus): `--problem=languagemodel_lm1b32k` for
+  subword-level modeling and `--problem=languagemodel_lm1b_characters`
   for character-level modeling.
 
 We suggest to start with `--model=transformer` on this task and use
@@ -77,7 +77,7 @@ We suggest to start with `--model=transformer` on this task and use
 ### Sentiment Analysis
 
 For the task of recognizing the sentiment of a sentence, use
-* the IMDB data-set: `--problems=sentiment_imdb`
+* the IMDB data-set: `--problem=sentiment_imdb`
 
 We suggest to use `--model=transformer_encoder` here and since it is a
 small data-set, try `--hparams_set=transformer_tiny` and train for
@@ -86,15 +86,15 @@ few steps (e.g., `--train_steps=2000`).
 ### Speech Recognition
 
 For speech-to-text, we have these data-sets in T2T:
-* Librispeech (English speech to text): `--problems=librispeech` for
-  the whole set and `--problems=librispeech_clean` for a smaller
+* Librispeech (English speech to text): `--problem=librispeech` for
+  the whole set and `--problem=librispeech_clean` for a smaller
   but nicely filtered part.
 
 ### Summarization
 
 For summarizing longer text into shorter one we have these data-sets:
 * CNN/DailyMail articles summarized into a few sentences:
-  `--problems=summarize_cnn_dailymail32k`
+  `--problem=summarize_cnn_dailymail32k`
 
 We suggest to use `--model=transformer` and
 `--hparams_set=transformer_prepend` for this task.
@@ -103,15 +103,15 @@ This yields good ROUGE scores.
 ### Translation
 
 There are a number of translation data-sets in T2T:
-* English-German: `--problems=translate_ende_wmt32k`
-* English-French: `--problems=translate_enfr_wmt32k`
-* English-Czech: `--problems=translate_encs_wmt32k`
-* English-Chinese: `--problems=translate_enzh_wmt32k`
-* English-Vietnamese: `--problems=translate_envi_iwslt32k`
+* English-German: `--problem=translate_ende_wmt32k`
+* English-French: `--problem=translate_enfr_wmt32k`
+* English-Czech: `--problem=translate_encs_wmt32k`
+* English-Chinese: `--problem=translate_enzh_wmt32k`
+* English-Vietnamese: `--problem=translate_envi_iwslt32k`
 
 You can get translations in the other direction by appending `_rev` to
 the problem name, e.g., for German-English use
-`--problems=translate_ende_wmt32k_rev`.
+`--problem=translate_ende_wmt32k_rev`.
 
 For all translation problems, we suggest to try the Transformer model:
 `--model=transformer`. At first it is best to try the base setting,
diff --git a/docs/new_problem.md b/docs/new_problem.md
index 371ae3daa..d9a7987fd 100644
--- a/docs/new_problem.md
+++ b/docs/new_problem.md
@@ -239,6 +239,6 @@ clone the repository and install it in developer mode with `pip install -e .`.
 # Train!
 
 You can train exactly as you do in the [walkthrough](walkthrough.md) with flags
-`--problems=poetry_lines` and `--t2t_usr_dir=$USR_DIR`.
+`--problem=poetry_lines` and `--t2t_usr_dir=$USR_DIR`.
 
 All done. Let us know what amazing poetry your model writes!
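
The step above assumes `--t2t_usr_dir` points at a directory whose import triggers problem registration. A minimal sketch of such a directory, assuming the walkthrough's problem lives in a file named `poetry_lines.py` (both names are illustrative):

```python
# $USR_DIR/__init__.py -- imported by t2t-trainer via --t2t_usr_dir.
# Importing the module executes its @registry.register_problem decorator,
# which is what makes --problem=poetry_lines resolvable by name.
from . import poetry_lines  # hypothetical module defining the problem class
```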
diff --git a/docs/tutorials/asr_with_transformer.md b/docs/tutorials/asr_with_transformer.md
index 92c847ba8..728d85c4b 100644
--- a/docs/tutorials/asr_with_transformer.md
+++ b/docs/tutorials/asr_with_transformer.md
@@ -29,7 +29,7 @@ To train a model on GPU set up `OUT_DIR` and run the trainer:
 t2t-trainer \
   --model=transformer \
   --hparams_set=transformer_librispeech \
-  --problems=librispeech \
+  --problem=librispeech \
   --train_steps=120000 \
   --eval_steps=3 \
   --local_eval_frequency=100 \
@@ -48,7 +48,7 @@ To train a model on TPU set up `OUT_DIR` and run the trainer:
 t2t-trainer \
   --model=transformer \
   --hparams_set=transformer_librispeech_tpu \
-  --problems=librispeech \
+  --problem=librispeech \
   --train_steps=120000 \
   --eval_steps=3 \
   --local_eval_frequency=100 \
diff --git a/docs/walkthrough.md b/docs/walkthrough.md
index a59f69c98..31b25562f 100644
--- a/docs/walkthrough.md
+++ b/docs/walkthrough.md
@@ -36,7 +36,7 @@ pip install tensor2tensor && t2t-trainer \
   --generate_data \
   --data_dir=~/t2t_data \
   --output_dir=~/t2t_train/mnist \
-  --problems=image_mnist \
+  --problem=image_mnist \
   --model=shake_shake \
   --hparams_set=shake_shake_quick \
   --train_steps=1000 \
@@ -78,13 +78,13 @@ to modify the hyperparameters if you run on a different setup.
 ### Image Classification
 
 For image classification, we have a number of standard data-sets:
-* ImageNet (a large data-set): `--problems=image_imagenet`, or one
+* ImageNet (a large data-set): `--problem=image_imagenet`, or one
   of the re-scaled versions (`image_imagenet224`, `image_imagenet64`,
   `image_imagenet32`)
-* CIFAR-10: `--problems=image_cifar10` (or
-  `--problems=image_cifar10_plain` to turn off data augmentation)
-* CIFAR-100: `--problems=image_cifar100`
-* MNIST: `--problems=image_mnist`
+* CIFAR-10: `--problem=image_cifar10` (or
+  `--problem=image_cifar10_plain` to turn off data augmentation)
+* CIFAR-100: `--problem=image_cifar100`
+* MNIST: `--problem=image_mnist`
 
 For ImageNet, we suggest to use the ResNet or Xception, i.e., use
 `--model=resnet --hparams_set=resnet_50` or
@@ -99,11 +99,11 @@
 ### Language Modeling
 
 For language modeling, we have these data-sets in T2T:
-* PTB (a small data-set): `--problems=languagemodel_ptb10k` for
-  word-level modeling and `--problems=languagemodel_ptb_characters`
+* PTB (a small data-set): `--problem=languagemodel_ptb10k` for
+  word-level modeling and `--problem=languagemodel_ptb_characters`
   for character-level modeling.
-* LM1B (a billion-word corpus): `--problems=languagemodel_lm1b32k` for
-  subword-level modeling and `--problems=languagemodel_lm1b_characters`
+* LM1B (a billion-word corpus): `--problem=languagemodel_lm1b32k` for
+  subword-level modeling and `--problem=languagemodel_lm1b_characters`
   for character-level modeling.
 
 We suggest to start with `--model=transformer` on this task and use
@@ -113,7 +113,7 @@ We suggest to start with `--model=transformer` on this task and use
 ### Sentiment Analysis
 
 For the task of recognizing the sentiment of a sentence, use
-* the IMDB data-set: `--problems=sentiment_imdb`
+* the IMDB data-set: `--problem=sentiment_imdb`
 
 We suggest to use `--model=transformer_encoder` here and since it is a
 small data-set, try `--hparams_set=transformer_tiny` and train for
@@ -122,15 +122,15 @@ few steps (e.g., `--train_steps=2000`).
 ### Speech Recognition
 
 For speech-to-text, we have these data-sets in T2T:
-* Librispeech (English speech to text): `--problems=librispeech` for
-  the whole set and `--problems=librispeech_clean` for a smaller
+* Librispeech (English speech to text): `--problem=librispeech` for
+  the whole set and `--problem=librispeech_clean` for a smaller
   but nicely filtered part.
 
 ### Summarization
 
 For summarizing longer text into shorter one we have these data-sets:
 * CNN/DailyMail articles summarized into a few sentences:
-  `--problems=summarize_cnn_dailymail32k`
+  `--problem=summarize_cnn_dailymail32k`
 
 We suggest to use `--model=transformer` and
 `--hparams_set=transformer_prepend` for this task.
@@ -139,15 +139,15 @@ This yields good ROUGE scores.
 ### Translation
 
 There are a number of translation data-sets in T2T:
-* English-German: `--problems=translate_ende_wmt32k`
-* English-French: `--problems=translate_enfr_wmt32k`
-* English-Czech: `--problems=translate_encs_wmt32k`
-* English-Chinese: `--problems=translate_enzh_wmt32k`
-* English-Vietnamese: `--problems=translate_envi_iwslt32k`
+* English-German: `--problem=translate_ende_wmt32k`
+* English-French: `--problem=translate_enfr_wmt32k`
+* English-Czech: `--problem=translate_encs_wmt32k`
+* English-Chinese: `--problem=translate_enzh_wmt32k`
+* English-Vietnamese: `--problem=translate_envi_iwslt32k`
 
 You can get translations in the other direction by appending `_rev` to
 the problem name, e.g., for German-English use
-`--problems=translate_ende_wmt32k_rev`.
+`--problem=translate_ende_wmt32k_rev`.
 
 For all translation problems, we suggest to try the Transformer model:
 `--model=transformer`. At first it is best to try the base setting,
@@ -193,7 +193,7 @@ t2t-datagen \
 # * If you run out of memory, add --hparams='batch_size=1024'.
 t2t-trainer \
   --data_dir=$DATA_DIR \
-  --problems=$PROBLEM \
+  --problem=$PROBLEM \
   --model=$MODEL \
   --hparams_set=$HPARAMS \
   --output_dir=$TRAIN_DIR
@@ -210,7 +210,7 @@ ALPHA=0.6
 t2t-decoder \
   --data_dir=$DATA_DIR \
-  --problems=$PROBLEM \
+  --problem=$PROBLEM \
   --model=$MODEL \
   --hparams_set=$HPARAMS \
   --output_dir=$TRAIN_DIR \
@@ -325,7 +325,7 @@ and hyperparameter set functions can compose other hyperparameter set
 functions.
 
 The **trainer** binary is the main entrypoint for training, evaluation,
 and inference. Users can easily switch between problems, models, and hyperparameter
-sets by using the `--model`, `--problems`, and `--hparams_set` flags. Specific
+sets by using the `--model`, `--problem`, and `--hparams_set` flags. Specific
 hyperparameters can be overridden with the `--hparams` flag. `--schedule` and
 related flags control local and distributed training/evaluation
 ([distributed training documentation](https://github.com/tensorflow/tensor2tensor/tree/master/docs/distributed_training.md)).
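
For reference, the `--problem`, `--model`, and `--hparams_set` flags described above resolve through the registry; `registry.problem(...)` is the same call the trainer and decoder changes below use. A minimal sketch, with the data directory as a placeholder path:

```python
from tensor2tensor.utils import registry

# Appending _rev to a translation problem reverses its direction (see above).
problem = registry.problem("translate_ende_wmt32k_rev")  # German -> English
encoders = problem.feature_encoders("~/t2t_data")  # placeholder data_dir
```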
diff --git a/setup.py b/setup.py
index 9f9035efa..cc22c8a0f 100644
--- a/setup.py
+++ b/setup.py
@@ -5,7 +5,7 @@
 setup(
     name='tensor2tensor',
-    version='1.5.7',
+    version='1.6.0',
     description='Tensor2Tensor',
     author='Google Inc.',
     author_email='no-reply@google.com',
@@ -45,8 +45,8 @@
         'six',
     ],
     extras_require={
-        'tensorflow': ['tensorflow>=1.4.1'],
-        'tensorflow_gpu': ['tensorflow-gpu>=1.4.1'],
+        'tensorflow': ['tensorflow>=1.5.0'],
+        'tensorflow_gpu': ['tensorflow-gpu>=1.5.0'],
         'tests': ['pytest', 'h5py', 'mock'],
     },
     classifiers=[
diff --git a/tensor2tensor/bin/t2t-trainer b/tensor2tensor/bin/t2t-trainer
index 01c2dcb56..1d848d04d 100755
--- a/tensor2tensor/bin/t2t-trainer
+++ b/tensor2tensor/bin/t2t-trainer
@@ -7,7 +7,7 @@ For example, to train a shake-shake model on MNIST run this:
 t2t-trainer \
   --generate_data \
-  --problems=image_mnist \
+  --problem=image_mnist \
   --data_dir=~/t2t_data \
   --tmp_dir=~/t2t_data/tmp
   --model=shake_shake \
diff --git a/tensor2tensor/bin/t2t_datagen.py b/tensor2tensor/bin/t2t_datagen.py
index 862abca84..d1dcab834 100644
--- a/tensor2tensor/bin/t2t_datagen.py
+++ b/tensor2tensor/bin/t2t_datagen.py
@@ -70,7 +70,7 @@ flags.DEFINE_integer("task_id_start", -1, "For distributed data generation.")
 flags.DEFINE_integer("task_id_end", -1, "For distributed data generation.")
 flags.DEFINE_integer(
-    "num_concurrent_processes", 10,
+    "num_concurrent_processes", None,
     "Applies only to problems for which multiprocess_generate=True.")
 flags.DEFINE_string("t2t_usr_dir", "",
                     "Path to a Python module that will be imported. The "
diff --git a/tensor2tensor/bin/t2t_decoder.py b/tensor2tensor/bin/t2t_decoder.py
index 25f47eace..08d8c7ee5 100644
--- a/tensor2tensor/bin/t2t_decoder.py
+++ b/tensor2tensor/bin/t2t_decoder.py
@@ -21,7 +21,7 @@
 t2t-decoder \
   --data_dir ~/data \
-  --problems=algorithmic_identity_binary40 \
+  --problem=algorithmic_identity_binary40 \
   --model=transformer
   --hparams_set=transformer_base
@@ -70,7 +70,7 @@ def create_hparams():
       FLAGS.hparams_set,
       FLAGS.hparams,
       data_dir=os.path.expanduser(FLAGS.data_dir),
-      problem_name=FLAGS.problems)
+      problem_name=FLAGS.problem)
 
 
 def create_decode_hparams():
@@ -94,7 +94,7 @@ def decode(estimator, hparams, decode_hp):
   else:
     decoding.decode_from_dataset(
         estimator,
-        FLAGS.problems.split("-"),
+        FLAGS.problem,
        hparams,
        decode_hp,
        decode_to_file=FLAGS.decode_to_file,
@@ -105,7 +105,7 @@ def score_file(filename):
   """Score each line in a file and return the scores."""
   # Prepare model.
   hparams = create_hparams()
-  encoders = registry.problem(FLAGS.problems).feature_encoders(FLAGS.data_dir)
+  encoders = registry.problem(FLAGS.problem).feature_encoders(FLAGS.data_dir)
   has_inputs = "inputs" in encoders
 
   # Prepare features for feeding into the model.
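
The decoder above now reads a single `FLAGS.problem` instead of splitting a dash-separated `FLAGS.problems` list. A hedged sketch of how a downstream script might migrate an old flag value, mirroring the `get_problem_name()` helper that this release deletes from t2t_trainer.py (shown further below):

```python
def migrate_problems_flag(legacy_problems):
  """Map a legacy --problems value onto the new single --problem flag."""
  parts = legacy_problems.split("-")
  # Multi-problem runs were dropped, so anything dash-separated should fail.
  assert len(parts) == 1, "dash-separated problem lists are no longer supported"
  return parts[0]

print(migrate_problems_flag("translate_ende_wmt32k"))  # translate_ende_wmt32k
```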
diff --git a/tensor2tensor/bin/t2t_distill.py b/tensor2tensor/bin/t2t_distill.py
index be31a2ba7..75c14ca55 100644
--- a/tensor2tensor/bin/t2t_distill.py
+++ b/tensor2tensor/bin/t2t_distill.py
@@ -91,5 +91,71 @@ def main(argv):
 # ==========================
 
 
+def create_teacher_experiment(run_config, hparams, argv):
+  """Creates experiment function."""
+  tf.logging.info("training teacher")
+  tf.logging.set_verbosity(tf.logging.INFO)
+  trainer_lib.set_random_seed(FLAGS.random_seed)
+  usr_dir.import_usr_dir(FLAGS.t2t_usr_dir)
+  t2t_trainer.log_registry()
+
+  if FLAGS.cloud_mlengine:
+    return cloud_mlengine.launch()
+
+  if FLAGS.generate_data:
+    t2t_trainer.generate_data()
+
+  if cloud_mlengine.job_dir():
+    FLAGS.output_dir = cloud_mlengine.job_dir()
+
+  if argv:
+    t2t_trainer.set_hparams_from_args(argv[1:])
+
+  with t2t_trainer.maybe_cloud_tpu():
+    hparams.distill_phase = "train"
+    exp_fn = t2t_trainer.create_experiment_fn()
+    exp = exp_fn(run_config, hparams)
+    return exp
+
+
+def create_student_experiment(run_config, hparams, argv):
+  """Creates experiment function."""
+  tf.logging.info("training student")
+  tf.logging.set_verbosity(tf.logging.INFO)
+  trainer_lib.set_random_seed(FLAGS.random_seed)
+  usr_dir.import_usr_dir(FLAGS.t2t_usr_dir)
+  t2t_trainer.log_registry()
+
+  if FLAGS.cloud_mlengine:
+    return cloud_mlengine.launch()
+
+  if FLAGS.generate_data:
+    t2t_trainer.generate_data()
+
+  if cloud_mlengine.job_dir():
+    FLAGS.output_dir = cloud_mlengine.job_dir()
+
+  if argv:
+    t2t_trainer.set_hparams_from_args(argv[1:])
+
+  with t2t_trainer.maybe_cloud_tpu():
+    hparams.add_hparam("teacher_dir", FLAGS.teacher_dir)
+    hparams.distill_phase = "distill"
+    exp_fn = t2t_trainer.create_experiment_fn()
+    exp = exp_fn(run_config, hparams)
+    return exp
+
+
+def create_experiment_fn(argv, train_teacher):
+
+  def teacher_experiment_fn(run_config, hparams):
+    return create_teacher_experiment(run_config, hparams, argv)
+
+  def student_experiment_fn(run_config, hparams):
+    return create_student_experiment(run_config, hparams, argv)
+
+  return teacher_experiment_fn if train_teacher else student_experiment_fn
+
+
 if __name__ == "__main__":
   tf.app.run()
diff --git a/tensor2tensor/bin/t2t_trainer.py b/tensor2tensor/bin/t2t_trainer.py
index 628d7a9a9..87443ad47 100644
--- a/tensor2tensor/bin/t2t_trainer.py
+++ b/tensor2tensor/bin/t2t_trainer.py
@@ -57,10 +57,12 @@ flags.DEFINE_string("tmp_dir", "/tmp/t2t_datagen",
                     "Temporary storage directory, used if --generate_data.")
 flags.DEFINE_bool("profile", False, "Profile performance?")
-flags.DEFINE_integer("inter_op_parallelism_threads", 0, "Number of inter_op_parallelism_threads "
-                     "to use for CPU. See TensorFlow config.proto for details.")
-flags.DEFINE_integer("intra_op_parallelism_threads", 0, "Number of intra_op_parallelism_threads "
-                     "to use for CPU. See TensorFlow config.proto for details.")
+flags.DEFINE_integer("inter_op_parallelism_threads", 0,
+                     "Number of inter_op_parallelism_threads to use for CPU. "
+                     "See TensorFlow config.proto for details.")
+flags.DEFINE_integer("intra_op_parallelism_threads", 0,
+                     "Number of intra_op_parallelism_threads to use for CPU. "
+                     "See TensorFlow config.proto for details.")
 
 # To maintain compatibility with some internal libs, we guard against these flag
 # definitions possibly erring. Apologies for the ugliness.
@@ -115,12 +117,6 @@
                     "during hyperparameter tuning. Overrides --output_dir.")
 
 
-def get_problem_name():
-  problems = FLAGS.problems.split("-")
-  assert len(problems) == 1
-  return problems[0]
-
-
 def set_hparams_from_args(args):
   """Set hparams overrides from unparsed args list."""
   if not args:
@@ -159,7 +155,7 @@ def create_hparams():
 def create_experiment_fn():
   return trainer_lib.create_experiment_fn(
       model_name=FLAGS.model,
-      problem_name=get_problem_name(),
+      problem_name=FLAGS.problem,
       data_dir=os.path.expanduser(FLAGS.data_dir),
       train_steps=FLAGS.train_steps,
       eval_steps=FLAGS.eval_steps,
@@ -178,11 +174,24 @@ def create_experiment_fn():
 
 
 def create_run_config(hp):
+  """Create a run config.
+
+  Args:
+    hp: model hyperparameters
+  Returns:
+    a run config
+  """
   save_ckpt_steps = max(FLAGS.iterations_per_loop, FLAGS.local_eval_frequency)
   save_ckpt_secs = FLAGS.save_checkpoints_secs or None
   if save_ckpt_secs:
     save_ckpt_steps = None
   assert FLAGS.output_dir or FLAGS.checkpoint_path
+  # the various custom getters we have written do not play well together yet.
+  # TODO(noam): ask rsepassi for help here.
+  daisy_chain_variables = (
+      hp.daisy_chain_variables and
+      hp.activation_dtype == "float32" and
+      hp.weight_dtype == "float32")
   return trainer_lib.create_run_config(
       model_dir=os.path.expanduser(FLAGS.output_dir),
       master=FLAGS.master,
@@ -202,7 +211,7 @@ def create_run_config(hp):
       use_tpu=FLAGS.use_tpu,
       schedule=FLAGS.schedule,
       no_data_parallelism=hp.no_data_parallelism,
-      daisy_chain_variables=hp.daisy_chain_variables,
+      daisy_chain_variables=daisy_chain_variables,
       ps_replicas=FLAGS.ps_replicas,
       ps_job=FLAGS.ps_job,
       ps_gpu=FLAGS.ps_gpu,
@@ -222,7 +231,7 @@ def generate_data():
   tf.gfile.MakeDirs(data_dir)
   tf.gfile.MakeDirs(tmp_dir)
 
-  problem_name = get_problem_name()
+  problem_name = FLAGS.problem
   tf.logging.info("Generating data for %s" % problem_name)
   registry.problem(problem_name).generate_data(data_dir, tmp_dir)
 
@@ -281,9 +290,7 @@ def save_metadata(hparams):
   # Save hparams as hparams.json
   hparams_fname = os.path.join(output_dir, "hparams.json")
   with tf.gfile.Open(hparams_fname, "w") as f:
-    # TODO(lukaszkaiser): use the first line once we require TF 1.5+.
-    # f.write(hparams.to_json(indent=0, sort_keys=True))
-    f.write(hparams.to_json())
+    f.write(hparams.to_json(indent=0, sort_keys=True))
 
 
 def execute_schedule(exp):
diff --git a/tensor2tensor/bin/t2t_trainer_test.py b/tensor2tensor/bin/t2t_trainer_test.py
index 1f4569e60..52d58111a 100644
--- a/tensor2tensor/bin/t2t_trainer_test.py
+++ b/tensor2tensor/bin/t2t_trainer_test.py
@@ -36,7 +36,7 @@ def setUpClass(cls):
     trainer_lib_test.TrainerLibTest.setUpClass()
 
   def testTrain(self):
-    FLAGS.problems = "tiny_algo"
+    FLAGS.problem = "tiny_algo"
     FLAGS.model = "transformer"
     FLAGS.hparams_set = "transformer_tiny"
     FLAGS.train_steps = 1
diff --git a/tensor2tensor/bin/t2t_translate_all.py b/tensor2tensor/bin/t2t_translate_all.py
index 553489b61..7041fb8c1 100644
--- a/tensor2tensor/bin/t2t_translate_all.py
+++ b/tensor2tensor/bin/t2t_translate_all.py
@@ -16,7 +16,7 @@
 """Translate a file with all checkpoints in a given directory.
 
 t2t-decoder will be executed with these parameters:
---problems
+--problem
 --data_dir
 --output_dir with the value of --model_dir
 --decode_from_file with the value of --source
@@ -61,7 +61,7 @@
 flags.DEFINE_string("model", "transformer", "see t2t-decoder")
 flags.DEFINE_string("t2t_usr_dir", None, "see t2t-decoder")
 flags.DEFINE_string("data_dir", None, "see t2t-decoder")
-flags.DEFINE_string("problems", None, "see t2t-decoder")
+flags.DEFINE_string("problem", None, "see t2t-decoder")
 flags.DEFINE_string("hparams_set", "transformer_big_single_gpu",
                     "see t2t-decoder")
@@ -73,11 +73,11 @@ def main(_):
   translations_dir = os.path.expanduser(FLAGS.translations_dir)
   source = os.path.expanduser(FLAGS.source)
   tf.gfile.MakeDirs(translations_dir)
-  translated_base_file = os.path.join(translations_dir, FLAGS.problems)
+  translated_base_file = os.path.join(translations_dir, FLAGS.problem)
 
   # Copy flags.txt with the original time, so t2t-bleu can report correct
   # relative time.
-  flags_path = os.path.join(translations_dir, FLAGS.problems + "-flags.txt")
+  flags_path = os.path.join(translations_dir, FLAGS.problem + "-flags.txt")
   if not os.path.exists(flags_path):
     shutil.copy2(os.path.join(model_dir, "flags.txt"), flags_path)
 
@@ -93,7 +93,7 @@ def main(_):
     tf.logging.info("Translating " + out_file)
     params = (
         "--t2t_usr_dir={FLAGS.t2t_usr_dir} --output_dir={model_dir} "
-        "--data_dir={FLAGS.data_dir} --problems={FLAGS.problems} "
+        "--data_dir={FLAGS.data_dir} --problem={FLAGS.problem} "
         "--decode_hparams=beam_size={FLAGS.beam_size},alpha={FLAGS.alpha} "
         "--model={FLAGS.model} --hparams_set={FLAGS.hparams_set} "
        "--checkpoint_path={model.filename} --decode_from_file={source} "
diff --git a/tensor2tensor/data_generators/README.md b/tensor2tensor/data_generators/README.md
index 0ccbfe1c1..e67eac019 100644
--- a/tensor2tensor/data_generators/README.md
+++ b/tensor2tensor/data_generators/README.md
@@ -47,7 +47,7 @@ with an integer denoting the length of the input list.
 
 ```
 def length_generator(nbr_cases):
-  for _ in xrange(nbr_cases):
+  for _ in range(nbr_cases):
     length = np.random.randint(100) + 1
     yield {"inputs": [2] * length, "targets": [length]}
 ```
diff --git a/tensor2tensor/data_generators/algorithmic.py b/tensor2tensor/data_generators/algorithmic.py
index ddb456cb1..25dbb8add 100644
--- a/tensor2tensor/data_generators/algorithmic.py
+++ b/tensor2tensor/data_generators/algorithmic.py
@@ -22,7 +22,7 @@
 
 import numpy as np
 
-from six.moves import xrange  # pylint: disable=redefined-builtin
+from six.moves import range  # pylint: disable=redefined-builtin
 
 from tensor2tensor.data_generators import generator_utils as utils
 from tensor2tensor.data_generators import problem
@@ -113,9 +113,9 @@ def generator(self, nbr_symbols, max_length, nbr_cases):
       A dictionary {"inputs": input-list, "targets": target-list} where
       input-list and target-list are the same.
     """
-    for _ in xrange(nbr_cases):
+    for _ in range(nbr_cases):
       l = np.random.randint(max_length) + 1
-      inputs = [np.random.randint(nbr_symbols) for _ in xrange(l)]
+      inputs = [np.random.randint(nbr_symbols) for _ in range(l)]
       yield {"inputs": inputs, "targets": inputs}
 
@@ -153,9 +153,9 @@ def generator(self, nbr_symbols, max_length, nbr_cases):
       target-list[i] = input-list[i] + shift.
""" shift = 10 - for _ in xrange(nbr_cases): + for _ in range(nbr_cases): l = np.random.randint(max_length) + 1 - inputs = [np.random.randint(nbr_symbols - shift) for _ in xrange(l)] + inputs = [np.random.randint(nbr_symbols - shift) for _ in range(l)] yield {"inputs": inputs, "targets": [i + shift for i in inputs]} @property @@ -187,9 +187,9 @@ def generator(self, nbr_symbols, max_length, nbr_cases): A dictionary {"inputs": input-list, "targets": target-list} where target-list is input-list reversed. """ - for _ in xrange(nbr_cases): + for _ in range(nbr_cases): l = np.random.randint(max_length) + 1 - inputs = [np.random.randint(nbr_symbols) for _ in xrange(l)] + inputs = [np.random.randint(nbr_symbols) for _ in range(l)] yield {"inputs": inputs, "targets": list(reversed(inputs))} @@ -265,7 +265,7 @@ def reverse_generator_nlplike(nbr_symbols, """ std_dev = max_length / scale_std_dev distr_map = zipf_distribution(nbr_symbols, alpha) - for _ in xrange(nbr_cases): + for _ in range(nbr_cases): l = int(abs(np.random.normal(loc=max_length / 2, scale=std_dev)) + 1) inputs = zipf_random_sample(distr_map, l) yield {"inputs": inputs, "targets": list(reversed(inputs))} @@ -321,7 +321,7 @@ def random_number_lower_endian(length, base): """Helper function: generate a random number as a lower-endian digits list.""" if length == 1: # Last digit can be 0 only if length is 1. return [np.random.randint(base)] - prefix = [np.random.randint(base) for _ in xrange(length - 1)] + prefix = [np.random.randint(base) for _ in range(length - 1)] return prefix + [np.random.randint(base - 1) + 1] # Last digit is not 0. @@ -354,7 +354,7 @@ def generator(self, base, max_length, nbr_cases): """ if max_length < 3: raise ValueError("Maximum length must be at least 3.") - for _ in xrange(nbr_cases): + for _ in range(nbr_cases): l1 = np.random.randint(max_length // 2) + 1 l2 = np.random.randint(max_length - l1 - 1) + 1 n1 = random_number_lower_endian(l1, base) @@ -405,7 +405,7 @@ def generator(self, base, max_length, nbr_cases): """ if max_length < 3: raise ValueError("Maximum length must be at least 3.") - for _ in xrange(nbr_cases): + for _ in range(nbr_cases): l1 = np.random.randint(max_length // 2) + 1 l2 = np.random.randint(max_length - l1 - 1) + 1 n1 = random_number_lower_endian(l1, base) diff --git a/tensor2tensor/data_generators/algorithmic_math.py b/tensor2tensor/data_generators/algorithmic_math.py index 689fa4b41..ed96bbfad 100644 --- a/tensor2tensor/data_generators/algorithmic_math.py +++ b/tensor2tensor/data_generators/algorithmic_math.py @@ -28,7 +28,7 @@ # Dependency imports import six -from six.moves import xrange # pylint: disable=redefined-builtin +from six.moves import range # pylint: disable=redefined-builtin import sympy @@ -421,7 +421,7 @@ def math_dataset_init(alphabet_size=26, digits=None, functions=None): raise ValueError("digits cannot must be between 1 and 10. Got %s." 
   vlist = alphabet[:alphabet_size]
   if digits is not None:
-    dlist = [str(d) for d in xrange(digits)]
+    dlist = [str(d) for d in range(digits)]
   else:
     dlist = []
   if functions is None:
@@ -481,7 +481,7 @@ def algebra_inverse(alphabet_size=26, min_depth=0, max_depth=2,
                      "Got max_depth=%s, min_depth=%s" % (max_depth, min_depth))
 
   alg_cfg = math_dataset_init(alphabet_size)
-  for _ in xrange(nbr_cases):
+  for _ in range(nbr_cases):
     sample, target = generate_algebra_inverse_sample(
         alg_cfg.vlist,
         list(alg_cfg.ops.values()), alg_cfg.solve_ops, min_depth, max_depth)
@@ -522,7 +522,7 @@ def algebra_simplify(alphabet_size=26,
                      "Got max_depth=%s, min_depth=%s" % (max_depth, min_depth))
 
   alg_cfg = math_dataset_init(alphabet_size, digits=5)
-  for _ in xrange(nbr_cases):
+  for _ in range(nbr_cases):
     sample, target = generate_algebra_simplify_sample(
         alg_cfg.vlist, list(alg_cfg.ops.values()), min_depth, max_depth)
     yield {
diff --git a/tensor2tensor/data_generators/algorithmic_test.py b/tensor2tensor/data_generators/algorithmic_test.py
index 5d954fd14..2644a3b33 100644
--- a/tensor2tensor/data_generators/algorithmic_test.py
+++ b/tensor2tensor/data_generators/algorithmic_test.py
@@ -21,7 +21,7 @@
 
 # Dependency imports
 
-from six.moves import xrange  # pylint: disable=redefined-builtin
+from six.moves import range  # pylint: disable=redefined-builtin
 
 from tensor2tensor.data_generators import algorithmic
 
@@ -51,7 +51,7 @@ def testZipfDistribution(self):
     # more probable/frequent that the second in rank, three times more prob/freq
     # that the third in rank and so on.
     d = algorithmic.zipf_distribution(10, 1.0001)
-    for i in xrange(len(d[1:])-1):
+    for i in range(len(d[1:])-1):
       self.assertEqual("%.4f" % (abs(d[i+1]-d[i+2])*(i+2)), "%.4f" % d[1])
 
   def testReverseGeneratorNlpLike(self):
diff --git a/tensor2tensor/data_generators/cifar.py b/tensor2tensor/data_generators/cifar.py
index 9a1a80ef3..2c332e6e7 100644
--- a/tensor2tensor/data_generators/cifar.py
+++ b/tensor2tensor/data_generators/cifar.py
@@ -100,10 +100,10 @@ def cifar_generator(cifar_version, tmp_dir, training, how_many, start_from=0):
     num_images = images.shape[0]
     images = images.reshape((num_images, 3, image_size, image_size))
     all_images.extend([
-        np.squeeze(images[j]).transpose((1, 2, 0)) for j in xrange(num_images)
+        np.squeeze(images[j]).transpose((1, 2, 0)) for j in range(num_images)
     ])
     labels = data[label_key]
-    all_labels.extend([labels[j] for j in xrange(num_images)])
+    all_labels.extend([labels[j] for j in range(num_images)])
   return image_utils.image_generator(
       all_images[start_from:start_from + how_many],
       all_labels[start_from:start_from + how_many])
diff --git a/tensor2tensor/data_generators/dna_encoder.py b/tensor2tensor/data_generators/dna_encoder.py
index 9db20de42..a4b2c244b 100644
--- a/tensor2tensor/data_generators/dna_encoder.py
+++ b/tensor2tensor/data_generators/dna_encoder.py
@@ -26,7 +26,7 @@
 import itertools
 # Dependency imports
 
-from six.moves import xrange  # pylint: disable=redefined-builtin
+from six.moves import range  # pylint: disable=redefined-builtin
 
 from tensor2tensor.data_generators import text_encoder
 
@@ -77,7 +77,7 @@ def encode(self, s):
     assert (len(bases) % self._chunk_size) == 0
     num_chunks = len(bases) // self._chunk_size
     ids = []
-    for chunk_idx in xrange(num_chunks):
+    for chunk_idx in range(num_chunks):
       start_idx = chunk_idx * self._chunk_size
       end_idx = start_idx + self._chunk_size
       chunk = tuple(bases[start_idx:end_idx])
diff --git a/tensor2tensor/data_generators/gene_expression.py b/tensor2tensor/data_generators/gene_expression.py
index 29d7819f2..cdd62491f 100644
--- a/tensor2tensor/data_generators/gene_expression.py
+++ b/tensor2tensor/data_generators/gene_expression.py
@@ -44,7 +44,7 @@
 import h5py
 import numpy as np
 
-from six.moves import xrange  # pylint: disable=redefined-builtin
+from six.moves import range  # pylint: disable=redefined-builtin
 
 from tensor2tensor.data_generators import dna_encoder
 from tensor2tensor.data_generators import generator_utils
@@ -130,7 +130,7 @@ def generate_data(self, data_dir, tmp_dir, task_id=-1):
     # Start and wait for processes in batches
     num_batches = int(
         math.ceil(float(len(processes)) / MAX_CONCURRENT_PROCESSES))
-    for i in xrange(num_batches):
+    for i in range(num_batches):
       start = i * MAX_CONCURRENT_PROCESSES
       end = start + MAX_CONCURRENT_PROCESSES
       current = processes[start:end]
@@ -211,7 +211,7 @@ def generate_shard_args(outfiles, num_examples):
   """Generate start and end indices per outfile."""
   num_shards = len(outfiles)
   num_examples_per_shard = num_examples // num_shards
-  start_idxs = [i * num_examples_per_shard for i in xrange(num_shards)]
+  start_idxs = [i * num_examples_per_shard for i in range(num_shards)]
   end_idxs = list(start_idxs)
   end_idxs.pop(0)
   end_idxs.append(num_examples)
@@ -249,7 +249,7 @@ def dataset_generator(filepath,
   if end_idx is None:
     end_idx = inp_data.len()
 
-  for i in xrange(start_idx, end_idx):
+  for i in range(start_idx, end_idx):
     if i % 100 == 0:
       print("Generating example %d for %s" % (i, dataset))
     inputs, mask, outputs = inp_data[i], mask_data[i], out_data[i]
diff --git a/tensor2tensor/data_generators/generator_utils.py b/tensor2tensor/data_generators/generator_utils.py
index 3078f8dfe..6bd069388 100644
--- a/tensor2tensor/data_generators/generator_utils.py
+++ b/tensor2tensor/data_generators/generator_utils.py
@@ -29,7 +29,7 @@
 
 import requests
 import six
-from six.moves import xrange  # pylint: disable=redefined-builtin
+from six.moves import range  # pylint: disable=redefined-builtin
 import six.moves.urllib_request as urllib  # Imports urllib on Python2, urllib.request on Python3
 from tensor2tensor.data_generators import text_encoder
 
@@ -119,7 +119,7 @@ def sharded_name(base_name, shard, total_shards):
 
 def shard_filepath(fname, num_shards):
   return [
-      sharded_name(fname, shard, num_shards) for shard in xrange(num_shards)
+      sharded_name(fname, shard, num_shards) for shard in range(num_shards)
   ]
 
@@ -592,7 +592,7 @@ def pack_examples(examples,
     if chop_long_sequences and len(x) > packed_length:
       assert not has_inputs
       num_fragments = len(x) // packed_length
-      for i in xrange(num_fragments):
+      for i in range(num_fragments):
         yield packer(
             x[packed_length * i:packed_length * (i + 1)], spacing).to_dict()
       x = x[packed_length * num_fragments:]
diff --git a/tensor2tensor/data_generators/gym.py b/tensor2tensor/data_generators/gym.py
index 5b979ee00..6a82f1d4c 100644
--- a/tensor2tensor/data_generators/gym.py
+++ b/tensor2tensor/data_generators/gym.py
@@ -22,11 +22,9 @@
 from collections import deque
 import functools
-import os
 # Dependency imports
 
 import gym
 
-from tensor2tensor.data_generators import generator_utils
 from tensor2tensor.data_generators import problem
 from tensor2tensor.data_generators import video_utils
 
@@ -35,6 +33,7 @@
 from tensor2tensor.rl.envs import tf_atari_wrappers as atari
 from tensor2tensor.rl.envs.utils import batch_env_factory
 
+from tensor2tensor.utils import metrics
 from tensor2tensor.utils import registry
 
 import tensorflow as tf
@@ -63,6 +62,12 @@ def num_target_frames(self):
     """Number of frames to batch on one target."""
     return 1
 
+  def eval_metrics(self):
+    eval_metrics = [
+        metrics.Metrics.ACC, metrics.Metrics.ACC_PER_SEQ,
+        metrics.Metrics.NEG_LOG_PERPLEXITY]
+    return eval_metrics
+
   @property
   def extra_reading_spec(self):
     """Additional data fields to store on disk and their decoders."""
@@ -116,7 +121,8 @@ def hparams(self, defaults, unused_model_hparams):
     p.input_modality = {"inputs": ("video", 256),
                         "input_reward": ("symbol", self.num_rewards),
                         "input_action": ("symbol", self.num_actions)}
-    p.target_modality = ("video", 256)
+    p.target_modality = {"targets": ("video", 256),
+                         "target_reward": ("symbol", self.num_rewards)}
     p.input_space_id = problem.SpaceID.IMAGE
     p.target_space_id = problem.SpaceID.IMAGE
 
@@ -174,119 +180,27 @@ def num_steps(self):
     return 50000
 
 
-def moviepy_editor():
-  """Access to moviepy that fails gracefully without a moviepy install."""
-  try:
-    from moviepy import editor  # pylint: disable=g-import-not-at-top
-  except ImportError:
-    raise ImportError("pip install moviepy to record videos")
-  return editor
-
-
 @registry.register_problem
-class GymDiscreteProblemWithAgent2(GymDiscreteProblem):
-  """Gym environment with discrete actions and rewards."""
-
-  def __init__(self, *args, **kwargs):
-    super(GymDiscreteProblemWithAgent2, self).__init__(*args, **kwargs)
-    self._env = None
-
-  @property
-  def extra_reading_spec(self):
-    """Additional data fields to store on disk and their decoders."""
-    data_fields = {
-        "action": tf.FixedLenFeature([1], tf.int64),
-        "reward": tf.FixedLenFeature([1], tf.int64)
-    }
-    decoders = {
-        "action": tf.contrib.slim.tfexample_decoder.Tensor(tensor_key="action"),
-        "reward": tf.contrib.slim.tfexample_decoder.Tensor(tensor_key="reward"),
-    }
-    return data_fields, decoders
-
-  @property
-  def num_input_frames(self):
-    """Number of frames to batch on one input."""
-    return 4
-
-  @property
-  def env_name(self):
-    """This is the name of the Gym environment for this problem."""
-    return "PongDeterministic-v4"
-
-  @property
-  def num_actions(self):
-    return self.env.action_space.n
-
-  @property
-  def num_rewards(self):
-    return 3
-
-  @property
-  def num_steps(self):
-    return 200
-
-  @property
-  def frame_height(self):
-    return 210
-
-  @property
-  def frame_width(self):
-    return 160
-
-  @property
-  def min_reward(self):
-    return -1
-
-  def get_action(self, observation=None):
-    return self.env.action_space.sample()
-
-  def hparams(self, defaults, unused_model_hparams):
-    p = defaults
-    p.input_modality = {"inputs": ("video", 256),
-                        "input_reward": ("symbol", self.num_rewards),
-                        "input_action": ("symbol", self.num_actions)}
-    # p.input_modality = {"inputs": ("video", 256),
-    #                     "reward": ("symbol", self.num_rewards),
-    #                     "input_action": ("symbol", self.num_actions)}
-    # p.target_modality = ("video", 256)
-    p.target_modality = {"targets": ("video", 256),
-                         "target_reward": ("symbol", self.num_rewards)}
-    # p.target_modality = {"targets": ("image", 256),
-    #                      "reward": ("symbol", self.num_rewards + 1)}  # ("video", 256)
-    p.input_space_id = problem.SpaceID.IMAGE
-    p.target_space_id = problem.SpaceID.IMAGE
-
-  def generate_samples(self, data_dir, tmp_dir, unused_dataset_split):
-    self.env.reset()
-    action = self.get_action()
-    for _ in range(self.num_steps):
-      observation, reward, done, _ = self.env.step(action)
-      action = self.get_action(observation)
-      yield {"frame": observation,
-             "action": [action],
-             "done": [done],
-             "reward": [int(reward - self.min_reward)]}
-
-
-@registry.register_problem
-class GymDiscreteProblemWithAgent(problem.Problem):
-  """Gym environment with discrete actions and rewards."""
+class GymDiscreteProblemWithAgent(GymPongRandom5k):
+  """Gym environment with discrete actions and rewards and an agent."""
 
   def __init__(self, *args, **kwargs):
     super(GymDiscreteProblemWithAgent, self).__init__(*args, **kwargs)
-    self.num_channels = 3
+    self._env = None
     self.history_size = 2
 
     # defaults
-    self.environment_spec = lambda: gym.make("PongNoFrameskip-v4")
+    self.environment_spec = lambda: gym.make("PongDeterministic-v4")
     self.in_graph_wrappers = [(atari.MaxAndSkipWrapper, {"skip": 4})]
     self.collect_hparams = rl.atari_base()
-    self.num_steps = 1000
-    self.movies = True
-    self.movies_fps = 24
+    self.settable_num_steps = 1000
     self.simulated_environment = None
     self.warm_up = 70
 
+  @property
+  def num_steps(self):
+    return self.settable_num_steps
+
   def _setup(self):
     in_graph_wrappers = [(atari.ShiftRewardWrapper, {"add_value": 2}),
                          (atari.MemoryWrapper, {})] + self.in_graph_wrappers
@@ -319,85 +233,23 @@ def _setup(self):
     self.data_get_op = atari.MemoryWrapper.singleton.speculum.dequeue()
     self.history_buffer = deque(maxlen=self.history_size+1)
 
-  def example_reading_spec(self, label_repr=None):
-    data_fields = {
-        "targets_encoded": tf.FixedLenFeature((), tf.string),
-        "image/format": tf.FixedLenFeature((), tf.string),
-        "action": tf.FixedLenFeature([1], tf.int64),
-        "reward": tf.FixedLenFeature([1], tf.int64),
-        # "done": tf.FixedLenFeature([1], tf.int64)
-    }
-
-    for x in range(self.history_size):
-      data_fields["inputs_encoded_{}".format(x)] = tf.FixedLenFeature(
-          (), tf.string)
-
-    data_items_to_decoders = {
-        "targets": tf.contrib.slim.tfexample_decoder.Image(
-            image_key="targets_encoded",
-            format_key="image/format",
-            shape=[210, 160, 3],
-            channels=3),
-        # Just do a pass through.
-        "action": tf.contrib.slim.tfexample_decoder.Tensor(tensor_key="action"),
-        "reward": tf.contrib.slim.tfexample_decoder.Tensor(tensor_key="reward"),
-    }
-
-    for x in range(self.history_size):
-      key = "inputs_{}".format(x)
-      data_items_to_decoders[key] = tf.contrib.slim.tfexample_decoder.Image(
-          image_key="inputs_encoded_{}".format(x),
-          format_key="image/format",
-          shape=[210, 160, 3],
-          channels=3)
-
-    return data_fields, data_items_to_decoders
-
-  @property
-  def num_actions(self):
-    return 4
-
-  @property
-  def num_rewards(self):
-    return 2
-
-  @property
-  def num_shards(self):
-    return 10
-
-  @property
-  def num_dev_shards(self):
-    return 1
-
-  def get_action(self, observation=None):
-    return self.env.action_space.sample()
-
-  def hparams(self, defaults, unused_model_hparams):
-    p = defaults
-    # The hard coded +1 after "symbol" refers to the fact
-    # that 0 is a special symbol meaning padding
-    # when symbols are e.g. 0, 1, 2, 3 we
-    # shift them to 0, 1, 2, 3, 4.
-    p.input_modality = {"action": ("symbol:identity", self.num_actions)}
-
-    for x in range(self.history_size):
-      p.input_modality["inputs_{}".format(x)] = ("image", 256)
-
-    p.target_modality = {"targets": ("image", 256),
-                         "reward": ("symbol", self.num_rewards + 1)}
-
-    p.input_space_id = problem.SpaceID.IMAGE
-    p.target_space_id = problem.SpaceID.IMAGE
-
   def restore_networks(self, sess):
     model_saver = tf.train.Saver(
         tf.global_variables(".*network_parameters.*"))
     if FLAGS.agent_policy_path:
       model_saver.restore(sess, FLAGS.agent_policy_path)
 
-  def generator(self, data_dir, tmp_dir):
+  def generate_encoded_samples(self, data_dir, tmp_dir, unused_dataset_split):
     self._setup()
-    clip_files = []
+
+    # When no agent_policy_path is set, just generate random samples.
+    if not FLAGS.agent_policy_path:
+      for sample in super(GymDiscreteProblemWithAgent,
+                          self).generate_encoded_samples(
+                              data_dir, tmp_dir, unused_dataset_split):
+        yield sample
+      return
+
     with tf.Session() as sess:
       sess.run(tf.global_variables_initializer())
       self.restore_networks(sess)
@@ -409,44 +261,20 @@ def generator(self, data_dir, tmp_dir):
         observ, reward, action, _ = sess.run(self.data_get_op)
         self.history_buffer.append(observ)
 
-        if self.movies and pieces_generated > self.warm_up:
-          file_name = os.path.join(tmp_dir,
-                                   "output_{}.png".format(pieces_generated))
-          clip_files.append(file_name)
-          with open(file_name, "wb") as f:
-            f.write(observ)
-
-        if len(self.history_buffer) == self.history_size+1:
+        if len(self.history_buffer) == self.history_size + 1:
           pieces_generated += 1
-          ret_dict = {
-              "targets_encoded": [observ],
-              "image/format": ["png"],
-              "action": [int(action)],
-              # "done": [bool(done)],
-              "reward": [int(reward)],
-          }
-          for i, v in enumerate(list(self.history_buffer)[:-1]):
-            ret_dict["inputs_encoded_{}".format(i)] = [v]
+          ret_dict = {"image/encoded": [observ],
+                      "image/format": ["png"],
+                      "image/height": [self.frame_height],
+                      "image/width": [self.frame_width],
+                      "action": [int(action)],
+                      "done": [int(False)],
+                      "reward": [int(reward) - self.min_reward]}
           if pieces_generated > self.warm_up:
             yield ret_dict
         else:
           sess.run(self.collect_trigger_op)
 
-    if self.movies:
-      clip = moviepy_editor().ImageSequenceClip(clip_files, fps=self.movies_fps)
-      clip_path = os.path.join(data_dir, "output_{}.mp4".format(self.name))
-      clip.write_videofile(clip_path, fps=self.movies_fps, codec="mpeg4")
-
-  def generate_data(self, data_dir, tmp_dir, task_id=-1):
-    train_paths = self.training_filepaths(
-        data_dir, self.num_shards, shuffled=False)
-    dev_paths = self.dev_filepaths(
-        data_dir, self.num_dev_shards, shuffled=False)
-    all_paths = train_paths + dev_paths
-    generator_utils.generate_files(
-        self.generator(data_dir, tmp_dir), all_paths)
-    generator_utils.shuffle_dataset(all_paths)
-
 
 @registry.register_problem
 class GymSimulatedDiscreteProblemWithAgent(GymDiscreteProblemWithAgent):
@@ -454,16 +282,12 @@ class GymSimulatedDiscreteProblemWithAgent(GymDiscreteProblemWithAgent):
 
   def __init__(self, *args, **kwargs):
     super(GymSimulatedDiscreteProblemWithAgent, self).__init__(*args, **kwargs)
-    # TODO(lukaszkaiser): pull it outside
-    self.in_graph_wrappers = [(atari.TimeLimitWrapper, {"timelimit": 150}),
-                              (atari.MaxAndSkipWrapper, {"skip": 4})]
     self.simulated_environment = True
-    self.movies_fps = 2
+    self.debug_dump_frames_path = "/tmp/t2t_debug_dump_frames"
 
   def restore_networks(self, sess):
     super(GymSimulatedDiscreteProblemWithAgent, self).restore_networks(sess)
-
-    # TODO(lukaszkaiser): adjust regexp for different models
+    # TODO(blazej): adjust regexp for different models.
     env_model_loader = tf.train.Saver(tf.global_variables(".*basic_conv_gen.*"))
     sess = tf.get_default_session()
diff --git a/tensor2tensor/data_generators/image_utils.py b/tensor2tensor/data_generators/image_utils.py
index d65cfa4ba..06061a5ff 100644
--- a/tensor2tensor/data_generators/image_utils.py
+++ b/tensor2tensor/data_generators/image_utils.py
@@ -31,8 +31,6 @@
 
 import tensorflow as tf
 
-from tensorflow.python.eager import context
-
 
 def resize_by_area(img, size):
   """image resize function used by quite a few image problems."""
@@ -159,7 +157,7 @@ def generate_data(self, data_dir, tmp_dir, task_id=-1):
 
 def encode_images_as_png(images):
-  if context.in_eager_mode():
+  if tf.contrib.eager.in_eager_mode():
     for image in images:
       yield tf.image.encode_png(image).numpy()
   else:
diff --git a/tensor2tensor/data_generators/lm1b.py b/tensor2tensor/data_generators/lm1b.py
index 0fb21bff6..b0b2e719a 100644
--- a/tensor2tensor/data_generators/lm1b.py
+++ b/tensor2tensor/data_generators/lm1b.py
@@ -24,7 +24,7 @@
 
 # Dependency imports
 
-from six.moves import xrange  # pylint: disable=redefined-builtin
+from six.moves import range  # pylint: disable=redefined-builtin
 
 from tensor2tensor.data_generators import generator_utils
 from tensor2tensor.data_generators import problem
@@ -79,7 +79,7 @@ def _train_data_filenames(tmp_dir):
       os.path.join(tmp_dir,
                    "1-billion-word-language-modeling-benchmark-r13output",
                    "training-monolingual.tokenized.shuffled",
-                   "news.en-%05d-of-00100" % i) for i in xrange(1, 100)
+                   "news.en-%05d-of-00100" % i) for i in range(1, 100)
   ]
 
diff --git a/tensor2tensor/data_generators/ocr.py b/tensor2tensor/data_generators/ocr.py
index 924483d5a..074686459 100644
--- a/tensor2tensor/data_generators/ocr.py
+++ b/tensor2tensor/data_generators/ocr.py
@@ -69,7 +69,7 @@ def generator(self, data_dir, tmp_dir, is_training):
     num_examples = 2
     ocr_dir = os.path.join(tmp_dir, "ocr/")
     tf.logging.info("Looking for OCR data in %s." % ocr_dir)
-    for i in xrange(num_examples):
+    for i in range(num_examples):
       image_filepath = os.path.join(ocr_dir, "%d.png" % i)
       text_filepath = os.path.join(ocr_dir, "%d.txt" % i)
       with tf.gfile.Open(text_filepath, "rb") as f:
diff --git a/tensor2tensor/data_generators/problem.py b/tensor2tensor/data_generators/problem.py
index 43ef66a4d..80d44ee61 100644
--- a/tensor2tensor/data_generators/problem.py
+++ b/tensor2tensor/data_generators/problem.py
@@ -344,13 +344,9 @@ def _preprocess(example):
       return examples
 
     is_training = mode == tf.estimator.ModeKeys.TRAIN
-    if hasattr(tf.contrib.data, "parallel_interleave"):
-      dataset = dataset.apply(
-          tf.contrib.data.parallel_interleave(
-              _preprocess, sloppy=is_training, cycle_length=8))
-    else:
-      dataset = dataset.interleave(_preprocess, cycle_length=8,
-                                   block_length=16)
+    dataset = dataset.apply(
+        tf.contrib.data.parallel_interleave(
+            _preprocess, sloppy=is_training, cycle_length=8))
 
     return dataset
 
@@ -568,14 +564,9 @@ def _load_records_and_preprocess(filename):
       random.shuffle(data_files)
     dataset = tf.data.Dataset.from_tensor_slices(tf.constant(data_files))
-    if hasattr(tf.contrib.data, "parallel_interleave"):
-      dataset = dataset.apply(
-          tf.contrib.data.parallel_interleave(
-              _load_records_and_preprocess, sloppy=is_training, cycle_length=8))
-    else:
-      dataset = dataset.interleave(_load_records_and_preprocess, cycle_length=8,
-                                   block_length=16)
-
+    dataset = dataset.apply(
+        tf.contrib.data.parallel_interleave(
+            _load_records_and_preprocess, sloppy=is_training, cycle_length=8))
     dataset = dataset.map(
         self.maybe_reverse_and_copy, num_parallel_calls=num_threads)
 
@@ -1067,7 +1058,6 @@ def problem_hparams_to_features(problem_hparams):
     input_space_id = problem_hparams.input_space_id
     target_space_id = problem_hparams.target_space_id
   return {
-      "problem_choice": 0,
       "input_space_id": input_space_id,
       "target_space_id": target_space_id,
   }
diff --git a/tensor2tensor/data_generators/text_encoder.py b/tensor2tensor/data_generators/text_encoder.py
index 5398c3930..cb2a43978 100644
--- a/tensor2tensor/data_generators/text_encoder.py
+++ b/tensor2tensor/data_generators/text_encoder.py
@@ -34,7 +34,7 @@
 
 import numpy as np
 import six
-from six.moves import xrange  # pylint: disable=redefined-builtin
+from six.moves import range  # pylint: disable=redefined-builtin
 from tensor2tensor.data_generators import tokenizer
 
 import tensorflow as tf
@@ -60,26 +60,37 @@
 _ESCAPE_CHARS = set(u"\\_u;0123456789")
 
 
-# Conversion between Unicode and UTF-8, if required (on Python2).
-if six.PY2:
-
-  def native_to_unicode(s):
-    return s if isinstance(s, unicode) else s.decode("utf-8")
+# Unicode utility functions that work with Python 2 and 3
+def native_to_unicode(s):
+  return s if is_unicode(s) else to_unicode(s)
 
-  def unicode_to_native(s):
-    return s.encode("utf-8") if isinstance(s, unicode) else s
-else:  # No conversion required on Python >= 3.
- def native_to_unicode(s): +def unicode_to_native(s): + if six.PY2: + return s.encode("utf-8") if is_unicode(s) else s + else: return s - def unicode_to_native(s): + +def is_unicode(s): + if six.PY2: + if isinstance(s, unicode): + return True + else: + if isinstance(s, str): + return True + return False + + +def to_unicode(s, ignore_errors=False): + if is_unicode(s): return s + error_mode = "ignore" if ignore_errors else "strict" + return s.decode("utf-8", errors=error_mode) def to_unicode_ignore_errors(s): - return (unicode(s, "utf-8", errors="ignore") - if six.PY2 else s.decode("utf-8", "ignore")) + return to_unicode(s, ignore_errors=True) class TextEncoder(object): @@ -374,7 +385,7 @@ def store_to_file(self, filename): filename: Full path of the file to store the vocab to. """ with tf.gfile.Open(filename, "w") as f: - for i in xrange(len(self._id_to_token)): + for i in range(len(self._id_to_token)): f.write(self._id_to_token[i] + "\n") @@ -588,7 +599,7 @@ def _escaped_token_to_subtoken_strings(self, escaped_token): start = 0 token_len = len(escaped_token) while start < token_len: - for end in xrange( + for end in range( min(token_len, start + self._max_subtoken_len), start, -1): subtoken = escaped_token[start:end] if subtoken in self._subtoken_string_to_id: @@ -774,7 +785,7 @@ def build_from_token_counts(self, # with high enough counts for our new vocabulary. if min_count < 1: min_count = 1 - for i in xrange(num_iterations): + for i in range(num_iterations): tf.logging.info("Iteration {0}".format(i)) # Collect all substrings of the encoded token that break along current @@ -789,7 +800,7 @@ def build_from_token_counts(self, if max_subtoken_length is not None: last_position = min(last_position, start + max_subtoken_length) - for end in xrange(start + 1, last_position): + for end in range(start + 1, last_position): new_subtoken = escaped_token[start:end] subtoken_counts[new_subtoken] += count start += len(subtoken) @@ -806,7 +817,7 @@ def build_from_token_counts(self, # Consider the candidates longest to shortest, so that if we accept # a longer subtoken string, we can decrement the counts of its prefixes. new_subtoken_strings = [] - for lsub in xrange(len(len_to_subtoken_strings) - 1, 0, -1): + for lsub in range(len(len_to_subtoken_strings) - 1, 0, -1): subtoken_strings = len_to_subtoken_strings[lsub] for subtoken_string in subtoken_strings: count = subtoken_counts[subtoken_string] @@ -815,7 +826,7 @@ def build_from_token_counts(self, # explicitly, regardless of count. if subtoken_string not in self._alphabet: new_subtoken_strings.append((count, subtoken_string)) - for l in xrange(1, lsub): + for l in range(1, lsub): subtoken_counts[subtoken_string[:l]] -= count # Include the alphabet explicitly to guarantee all strings are encodable. 
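A minimal sketch of the new Unicode helpers' intended behavior (Python 3 semantics shown; assumes the module imports as tensor2tensor.data_generators.text_encoder, per the paths above):

    from tensor2tensor.data_generators import text_encoder

    # is_unicode distinguishes text from bytes on both Python versions.
    assert text_encoder.is_unicode(u"caf\u00e9")
    assert not text_encoder.is_unicode(u"caf\u00e9".encode("utf-8"))
    # native_to_unicode now decodes UTF-8 bytes on Python 3 as well, instead
    # of returning them unchanged as the old Python-3-only branch did.
    assert text_encoder.native_to_unicode(
        u"caf\u00e9".encode("utf-8")) == u"caf\u00e9"
    # Invalid UTF-8 raises in the default strict mode but is dropped here:
    assert text_encoder.to_unicode_ignore_errors(b"caf\xff") == u"caf"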
diff --git a/tensor2tensor/data_generators/text_encoder_test.py b/tensor2tensor/data_generators/text_encoder_test.py index e11607008..b3248a7c4 100644 --- a/tensor2tensor/data_generators/text_encoder_test.py +++ b/tensor2tensor/data_generators/text_encoder_test.py @@ -30,7 +30,7 @@ # Dependency imports import mock import six -from six.moves import xrange # pylint: disable=redefined-builtin +from six.moves import range # pylint: disable=redefined-builtin from tensor2tensor.data_generators import text_encoder import tensorflow as tf @@ -193,7 +193,7 @@ def test_long_tokens(self): long_tokens = [] for _ in range(num_tokens): long_token = "".join([random.choice(string.ascii_uppercase) - for _ in xrange(token_length)]) + for _ in range(token_length)]) long_tokens.append(long_token) corpus = " ".join(long_tokens) diff --git a/tensor2tensor/data_generators/tokenizer.py b/tensor2tensor/data_generators/tokenizer.py index c023627c2..b6c0e3236 100644 --- a/tensor2tensor/data_generators/tokenizer.py +++ b/tensor2tensor/data_generators/tokenizer.py @@ -51,7 +51,7 @@ # Dependency imports import six -from six.moves import xrange # pylint: disable=redefined-builtin +from six.moves import range # pylint: disable=redefined-builtin import tensorflow as tf # Conversion between Unicode and UTF-8, if required (on Python2) @@ -60,7 +60,7 @@ # This set contains all letter and number characters. _ALPHANUMERIC_CHAR_SET = set( - six.unichr(i) for i in xrange(sys.maxunicode) + six.unichr(i) for i in range(sys.maxunicode) if (unicodedata.category(six.unichr(i)).startswith("L") or unicodedata.category(six.unichr(i)).startswith("N"))) @@ -79,7 +79,7 @@ def encode(text): token_start = 0 # Classify each character in the input string is_alnum = [c in _ALPHANUMERIC_CHAR_SET for c in text] - for pos in xrange(1, len(text)): + for pos in range(1, len(text)): if is_alnum[pos] != is_alnum[pos - 1]: token = text[token_start:pos] if token != u" " or token_start == 0: diff --git a/tensor2tensor/data_generators/tokenizer_test.py b/tensor2tensor/data_generators/tokenizer_test.py index ac4cd0cca..e977d1126 100644 --- a/tensor2tensor/data_generators/tokenizer_test.py +++ b/tensor2tensor/data_generators/tokenizer_test.py @@ -26,7 +26,7 @@ # Dependency imports import six -from six.moves import xrange # pylint: disable=redefined-builtin +from six.moves import range # pylint: disable=redefined-builtin from tensor2tensor.data_generators import tokenizer import tensorflow as tf @@ -57,8 +57,8 @@ def test_decode(self): [u"Dude", u" - ", u"that", u"'", u"s", u"so", u"cool", u"."])) def test_invertibility_on_random_strings(self): - for _ in xrange(1000): - s = u"".join(six.unichr(random.randint(0, 65535)) for _ in xrange(10)) + for _ in range(1000): + s = u"".join(six.unichr(random.randint(0, 65535)) for _ in range(10)) self.assertEqual(s, tokenizer.decode(tokenizer.encode(s))) diff --git a/tensor2tensor/data_generators/video_utils.py b/tensor2tensor/data_generators/video_utils.py index 723c192f5..869fad721 100644 --- a/tensor2tensor/data_generators/video_utils.py +++ b/tensor2tensor/data_generators/video_utils.py @@ -19,8 +19,12 @@ from __future__ import division from __future__ import print_function +import os + # Dependency imports +import six + from tensor2tensor.data_generators import generator_utils from tensor2tensor.data_generators import image_utils from tensor2tensor.data_generators import problem @@ -43,6 +47,12 @@ def resize_video_frames(images, size): class VideoProblem(problem.Problem): """Base class for problems with videos.""" 
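+  # Subclasses may set debug_dump_frames_path (added below) to a directory;
+  # generate_encoded_samples_debug will then also write each encoded frame
+  # there as frame_<N>.png. Left empty, no debug frames are written.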
+ def __init__(self, *args, **kwargs): + super(VideoProblem, self).__init__(*args, **kwargs) + # Path to a directory to dump generated frames as png for debugging. + # If empty, no debug frames will be generated. + self.debug_dump_frames_path = "" + @property def num_channels(self): """Number of color channels in each frame.""" @@ -157,7 +167,7 @@ def features_from_batch(batched_prefeatures): Features dictionary with joint features per-frame. """ features = {} - for k, v in batched_prefeatures.items(): + for k, v in six.iteritems(batched_prefeatures): if k == "frame": # We rename past frames to inputs and targets. s1, s2 = split_on_batch(v) # Reshape just to make sure shapes are right and set. @@ -242,13 +252,27 @@ def generate_encoded_samples(self, data_dir, tmp_dir, dataset_split): if width != self.frame_width: raise ValueError("Generated frame has width %d while the class " "assumes width %d." % (width, self.frame_width)) - encoded_frame = image_utils.encode_images_as_png([unencoded_frame]).__next__() + encoded_frame = six.next( + image_utils.encode_images_as_png([unencoded_frame])) features["image/encoded"] = [encoded_frame] features["image/format"] = ["png"] features["image/height"] = [height] features["image/width"] = [width] yield features + def generate_encoded_samples_debug(self, data_dir, tmp_dir, dataset_split): + """Generate samples of the encoded frames and dump for debug if needed.""" + counter = 0 + for sample in self.generate_encoded_samples( + data_dir, tmp_dir, dataset_split): + if self.debug_dump_frames_path: + path = os.path.join(self.debug_dump_frames_path, + "frame_%d.png" % counter) + with tf.gfile.Open(path, "wb") as f: + f.write(sample["image/encoded"][0]) + counter += 1 + yield sample + def generate_data(self, data_dir, tmp_dir, task_id=-1): """The function generating the data.""" filepath_fns = { @@ -268,10 +292,11 @@ def generate_data(self, data_dir, tmp_dir, task_id=-1): if self.is_generate_per_split: for split, paths in split_paths: generator_utils.generate_files( - self.generate_encoded_samples(data_dir, tmp_dir, split), paths) + self.generate_encoded_samples_debug( + data_dir, tmp_dir, split), paths) else: generator_utils.generate_files( - self.generate_encoded_samples( + self.generate_encoded_samples_debug( data_dir, tmp_dir, problem.DatasetSplit.TRAIN), all_paths) diff --git a/tensor2tensor/insights/README.md b/tensor2tensor/insights/README.md index ebed255e1..014bfca81 100644 --- a/tensor2tensor/insights/README.md +++ b/tensor2tensor/insights/README.md @@ -41,7 +41,7 @@ Start guide, a sample configuration would be: "data_dir": "/tmp/t2t/data", "hparams": "", "hparams_set": "transformer_base_single_gpu", - "problems": "translate_ende_wmt32k" + "problem": "translate_ende_wmt32k" }, }] "language": [{ diff --git a/tensor2tensor/insights/transformer_model.py b/tensor2tensor/insights/transformer_model.py index d7ac83a0a..da8cf5fe3 100644 --- a/tensor2tensor/insights/transformer_model.py +++ b/tensor2tensor/insights/transformer_model.py @@ -115,7 +115,7 @@ def __init__(self, processor_configuration): transformer_config["hparams_set"], transformer_config["hparams"], data_dir=data_dir, - problem_name=transformer_config["problems"]) + problem_name=transformer_config["problem"]) decode_hp = decoding.decode_hparams() decode_hp.add_hparam("shards", 1) @@ -129,8 +129,8 @@ def __init__(self, processor_configuration): decode_hparams=decode_hp, use_tpu=False) # Fetch the vocabulary and other helpful variables for decoding. 
- self.source_vocab = self.hparams.problems[0].vocabulary["inputs"] - self.targets_vocab = self.hparams.problems[0].vocabulary["targets"] + self.source_vocab = self.hparams.problem_hparams.vocabulary["inputs"] + self.targets_vocab = self.hparams.problem_hparams.vocabulary["targets"] self.const_array_size = 10000 # Prepare the Transformer's debug data directory. @@ -166,7 +166,6 @@ def server_input_fn(): x += [0] * (self.const_array_size - len(x)) d = { "inputs": np.array(x).astype(np.int32), - "problem_choice": np.array(0).astype(np.int32) } yield d diff --git a/tensor2tensor/layers/common_attention.py b/tensor2tensor/layers/common_attention.py index a6b4f919d..46befcb8d 100644 --- a/tensor2tensor/layers/common_attention.py +++ b/tensor2tensor/layers/common_attention.py @@ -27,7 +27,7 @@ import numpy as np from six.moves import range # pylint: disable=redefined-builtin -from six.moves import xrange # pylint: disable=redefined-builtin +from six.moves import range # pylint: disable=redefined-builtin from six.moves import zip # pylint: disable=redefined-builtin from tensor2tensor.layers import common_layers @@ -540,7 +540,7 @@ def add_timing_signal_nd(x, min_timescale=1.0, max_timescale=1.0e4): (tf.to_float(num_timescales) - 1)) inv_timescales = min_timescale * tf.exp( tf.to_float(tf.range(num_timescales)) * -log_timescale_increment) - for dim in xrange(num_dims): + for dim in range(num_dims): length = common_layers.shape_list(x)[dim + 1] position = tf.to_float(tf.range(length)) scaled_time = tf.expand_dims(position, 1) * tf.expand_dims( @@ -549,9 +549,9 @@ def add_timing_signal_nd(x, min_timescale=1.0, max_timescale=1.0e4): prepad = dim * 2 * num_timescales postpad = channels - (dim + 1) * 2 * num_timescales signal = tf.pad(signal, [[0, 0], [prepad, postpad]]) - for _ in xrange(1 + dim): + for _ in range(1 + dim): signal = tf.expand_dims(signal, 0) - for _ in xrange(num_dims - 1 - dim): + for _ in range(num_dims - 1 - dim): signal = tf.expand_dims(signal, -2) x += signal return x @@ -579,7 +579,7 @@ def add_positional_embedding_nd(x, max_length, name): base_shape = [1] * (num_dims + 1) + [depth] base_start = [0] * (num_dims + 2) base_size = [-1] + [1] * num_dims + [depth] - for i in xrange(num_dims): + for i in range(num_dims): shape = base_shape[:] start = base_start[:] size = base_size[:] @@ -3710,7 +3710,7 @@ def forward_internal(x, wqkv, wo, attention_bias, norm_scale, norm_bias): wqkv_split = tf.unstack(wqkv, num=num_heads) wo_split = tf.unstack(wo, num=num_heads) y = 0 - for h in xrange(num_heads): + for h in range(num_heads): with tf.control_dependencies([y] if h > 0 else []): combined = tf.nn.conv1d(n, wqkv_split[h], 1, "SAME") q, k, v = tf.split(combined, 3, axis=2) @@ -3737,7 +3737,7 @@ def grad_fn(x, wqkv, wo, attention_bias, norm_scale, norm_bias, dy): dwqkvs = [] dwos = [] dn = 0 - for h in xrange(num_heads): + for h in range(num_heads): with tf.control_dependencies(deps): combined = tf.nn.conv1d(n, wqkv_split[h], 1, "SAME") q, k, v = tf.split(combined, 3, axis=2) diff --git a/tensor2tensor/layers/common_hparams.py b/tensor2tensor/layers/common_hparams.py index fffe674f7..230579888 100644 --- a/tensor2tensor/layers/common_hparams.py +++ b/tensor2tensor/layers/common_hparams.py @@ -91,7 +91,6 @@ def basic_params1(): learning_rate=0.1, sampling_method="argmax", # "argmax" or "random" sampling_temp=1.0, # temperature for sampling - problem_choice="adaptive", # "uniform", "adaptive", "distributed" # expand the logits a piece at a time - saves memory. 
factored_logits=False, multiply_embedding_mode="sqrt_depth", @@ -229,12 +228,19 @@ def basic_params1(): force_full_predict=False, # Set this for pure model parallelism. There is only one data shard. no_data_parallelism=False, - # Set this to the dtype used for activation. Variables will still be - # stored in float32. + # dtype used for activations. - "float32" or "bfloat16" + # activation_dtype="bfloat16" currently only works on TPU. + # It lowers activation-memory usage + # and does not appear to affect quality. + # You can train on TPU with activation_dtype="bfloat16" and evaluate + # on CPU/GPU with activation_dtype="float32" activation_dtype="float32", - # Experimental: set weight_dtype="bfloat16" to use bfloat16 for both - # weights and activations. Model quality may be worse. Model quality - # appears to be close to baseline with large batch sizes (>4k). + # dtype used for parameters: "float32" or "bfloat16" + # bfloat16 currently only works with optimizer="adafactor". + # The savings in memory allow for training larger models. + # Weights are encoded as (w*128)^8, using pseudostochastic + # roundoff. Initial experiments show that model quality is similar + # to baseline for about 3M training steps, but worse thereafter. weight_dtype="float32", ) diff --git a/tensor2tensor/layers/common_image_attention.py b/tensor2tensor/layers/common_image_attention.py index f80b04e49..f60fa3711 100644 --- a/tensor2tensor/layers/common_image_attention.py +++ b/tensor2tensor/layers/common_image_attention.py @@ -16,7 +16,7 @@ """Utils for attention mechanism for images.""" # Dependency imports -from six.moves import xrange # pylint: disable=redefined-builtin +from six.moves import range # pylint: disable=redefined-builtin from tensor2tensor.layers import common_attention from tensor2tensor.layers import common_layers @@ -26,6 +26,7 @@ class AttentionType(object): + """Types of attention type used in cia.""" LOCAL_1D = "local_1d" LOCAL_2D = "local_2d" GLOBAL = "global" @@ -33,6 +34,7 @@ class AttentionType(object): DILATED = "dilated" MOE_LOCAL_1D = "moe_local1d" LOCAL_BLOCK = "local_block" + NON_CAUSAL_1D = "local_1d_noncausal" @staticmethod def get_choices(): @@ -44,6 +46,7 @@ def get_choices(): AttentionType.LOCAL_2D, AttentionType.LOCAL_BLOCK, AttentionType.DILATED, + AttentionType.NON_CAUSAL_1D, ] @@ -288,7 +291,7 @@ def transformer_decoder_layers(inputs, x = tf.nn.dropout(x, 1.0 - hparams.layer_prepostprocess_dropout) if attention_type == AttentionType.DILATED: assert len(hparams.gap_sizes) == num_layers - for layer in xrange(num_layers): + for layer in range(num_layers): with tf.variable_scope("%s_layer_%d" % (name, layer)): # self-attention + skip connections if attention_type == AttentionType.LOCAL_2D: @@ -300,6 +303,11 @@ def transformer_decoder_layers(inputs, hparams, attention_type="local_mask_right", q_padding="LEFT", kv_padding="LEFT") + elif attention_type == AttentionType.NON_CAUSAL_1D: + y = local_attention_1d(common_layers.layer_preprocess(x, hparams), + hparams, + attention_type="local_unmasked", + q_padding="VALID", kv_padding="VALID") elif attention_type == AttentionType.LOCAL_BLOCK: y = local_within_block_attention( common_layers.layer_preprocess(x, hparams), @@ -345,7 +353,7 @@ def transformer_encoder_layers(inputs, x = inputs x = tf.nn.dropout(x, 1.0 - hparams.layer_prepostprocess_dropout) - for layer in xrange(num_layers): + for layer in range(num_layers): # attention layers + skip connections with tf.variable_scope("%s_layer_%d" % (name, layer)): if attention_type == 
AttentionType.LOCAL_2D: @@ -433,7 +441,7 @@ def transformer_layers_sharded(dp, expert_fn = expert_utils.ffn_expert_fn( hparams.hidden_size, moe_hidden_sizes, hparams.hidden_size) x = dp(tf.nn.dropout, x, 1.0 - hparams.layer_prepostprocess_dropout) - for layer in xrange(num_layers): + for layer in range(num_layers): with tf.variable_scope("%s_layer_%d" % (name, layer)): # self-attention if attention_type == AttentionType.LOCAL_2D: @@ -632,7 +640,7 @@ def get_channel_embeddings(io_depth, targets, hidden_size, name="channel"): rgb_embedding_var = tf.identity(rgb_embedding_var) rgb_embedding_var *= float(hidden_size)**0.5 channel_target_embs = [] - for i in xrange(io_depth): + for i in range(io_depth): # Adding the channel offsets to get the right embedding since the # embedding tensor has shape 256 * io_depth, hidden_size target_ids = tf.squeeze(targets_split[i], axis=3) + i * 256 diff --git a/tensor2tensor/layers/common_layers.py b/tensor2tensor/layers/common_layers.py index 3c1155643..ca5f3efc8 100644 --- a/tensor2tensor/layers/common_layers.py +++ b/tensor2tensor/layers/common_layers.py @@ -27,12 +27,11 @@ # Dependency imports import numpy as np -from six.moves import xrange # pylint: disable=redefined-builtin +from six.moves import range # pylint: disable=redefined-builtin from tensor2tensor.utils import expert_utils as eu import tensorflow as tf -from tensorflow.python.eager import context as tfe_context from tensorflow.python.framework import function from tensorflow.python.framework import ops @@ -42,7 +41,7 @@ def is_on_tpu(): - # Support TF versions 1.4+ + # Support TF versions 1.5+ try: from tensorflow.python.ops import control_flow_util # pylint: disable=g-import-not-at-top ctxt = tf.get_default_graph()._get_control_flow_context() # pylint: disable=protected-access @@ -51,54 +50,6 @@ def is_on_tpu(): return tf.contrib.framework.get_name_scope().startswith("TPUReplicate") -def bfloat16_var_getter(getter, *args, **kwargs): - """A custom getter function for bfloat16 variables. - - Variables maintain storage in float32. - - Args: - getter: custom getter - *args: arguments - **kwargs: keyword arguments - Returns: - variables with the correct dtype. - Raises: - KeyError: if "dtype" is not provided as a kwarg. - """ - requested_dtype = kwargs["dtype"] - if requested_dtype == tf.bfloat16: - kwargs["dtype"] = tf.float32 - var = getter(*args, **kwargs) - # This if statement is needed to guard the cast, because batch norm - # assigns directly to the return value of this custom getter. The cast - # makes the return value not a variable so it cannot be assigned. Batch - # norm variables are always in fp32 so this if statement is never - # triggered for them. - if var.dtype.base_dtype != requested_dtype: - var = tf.cast(var, requested_dtype) - return var - - -def bfloat16_weights_var_getter(getter, *args, **kwargs): - """A custom getter function for bfloat16 variables. - - Variables maintain storage in bfloat16. - - Args: - getter: A custom getter. - *args: Arguments. - **kwargs: Keyword arguments. - Returns: - Variables with the correct dtype. - Raises: - KeyError: if "dtype" is not provided as a kwarg. - """ - requested_dtype = kwargs["dtype"] - if requested_dtype in (tf.bfloat16, tf.float32): - kwargs["dtype"] = tf.bfloat16 - return getter(*args, **kwargs) - - def dropout_with_broadcast_dims(x, keep_prob, broadcast_dims=None, **kwargs): """Like tf.nn.dropout but takes broadcast_dims instead of noise_shape. 
@@ -120,8 +71,10 @@ def dropout_with_broadcast_dims(x, keep_prob, broadcast_dims=None, **kwargs): if broadcast_dims: shape = tf.shape(x) ndims = len(x.get_shape()) + # Allow dimensions like "-1" as well. + broadcast_dims = [dim + ndims if dim < 0 else dim for dim in broadcast_dims] kwargs["noise_shape"] = [ - 1 if i in broadcast_dims else shape[i] for i in xrange(ndims)] + 1 if i in broadcast_dims else shape[i] for i in range(ndims)] return tf.nn.dropout(x, keep_prob, **kwargs) @@ -311,7 +264,7 @@ def embedding(x, # On the backwards pass, we want to convert the gradient from # an indexed-slices to a regular tensor before sending it back to the # parameter server. This avoids excess computation on the parameter server. - if not tfe_context.in_eager_mode(): + if not tf.contrib.eager.in_eager_mode(): embedding_var = eu.convert_gradient_to_tensor(embedding_var) x = dropout_no_scaling(x, 1.0 - symbol_dropout_rate) emb_x = gather(embedding_var, x, dtype) @@ -377,7 +330,7 @@ def conv_stride2_multistep(x, nbr_steps, output_filters, name=None, reuse=None): out = conv(x, output_filters, (1, 1)) return out, [out] hidden_layers = [x] - for i in xrange(nbr_steps): + for i in range(nbr_steps): hidden_layers.append( conv( hidden_layers[-1], @@ -433,7 +386,7 @@ def deconv2d(cur, i): return tf.depth_to_space(thicker, 2) cur = x - for i in xrange(nbr_steps): + for i in range(nbr_steps): if cur.get_shape()[2] == 1: cur = deconv1d(cur, i) else: @@ -489,7 +442,7 @@ def conv2d_kernel(kernel_size_arg, name_suffix): return conv2d_kernel(kernel_size, "single") -def conv(inputs, filters, kernel_size, dilation_rate=1, **kwargs): +def conv(inputs, filters, kernel_size, dilation_rate=(1, 1), **kwargs): return conv_internal( tf.layers.conv2d, inputs, @@ -575,7 +528,7 @@ def tpu_conv1d(inputs, filters, kernel_size, padding="SAME", name="tpu_conv1d"): last_offset = first_offset + kernel_size - 1 results = [] padded = tf.pad(inputs, [[0, 0], [-first_offset, last_offset], [0, 0]]) - for i in xrange(kernel_size): + for i in range(kernel_size): shifted = tf.slice(padded, [0, i, 0], tf.shape(inputs)) if i else inputs shifted.set_shape(inputs.get_shape()) results.append(dense( @@ -1140,7 +1093,7 @@ def multiscale_conv_and_attention(x, padding, hparams, source=None): x, hparams.hidden_size, [((hparams.kernel_height**i, hparams.kernel_width**i), - (hparams.kernel_height, hparams.kernel_width)) for i in xrange(3)], + (hparams.kernel_height, hparams.kernel_width)) for i in range(3)], "AVG", padding=padding) # For residuals a rescale if necessary if channels differ. @@ -1955,7 +1908,7 @@ def next_state(cur_state, args_tup): cur_x_times_one_minus_f, cur_f = args_tup return cur_f * cur_state + cur_x_times_one_minus_f # Calculate SRU on each layer. - for i in xrange(num_layers): + for i in range(num_layers): # The parallel part of the SRU. x_orig = x x, f, r = tf.split(tf.layers.dense(x, 3 * x_shape[-1], @@ -2158,7 +2111,7 @@ def approximate_split(x, num_splits, axis=0): a list of num_splits Tensors. 
""" size = shape_list(x)[axis] - size_splits = [tf.div(size + i, num_splits) for i in xrange(num_splits)] + size_splits = [tf.div(size + i, num_splits) for i in range(num_splits)] return tf.split(x, size_splits, axis=axis) @@ -2225,7 +2178,7 @@ def smoothing_cross_entropy_factored_grad(op, dy): b_grad = None a_grad_parts = [] deps = [] - for part in xrange(num_splits): + for part in range(num_splits): with tf.control_dependencies(deps): logits = tf.matmul(a[part], b, transpose_b=True) output_part = smoothing_cross_entropy(logits, labels[part], vocab_size, @@ -2266,7 +2219,7 @@ def smoothing_cross_entropy_factored(a, b, labels, confidence): labels = approximate_split(labels, num_splits) a = approximate_split(a, num_splits) parts = [] - for part in xrange(num_splits): + for part in range(num_splits): with tf.control_dependencies(parts[-1:]): logits = tf.matmul(a[part], b, transpose_b=True) parts.append( @@ -2442,7 +2395,7 @@ def forward_internal(x, f1, f2, scale, bias): x_flat = tf.reshape(x, [-1, 1, shape_list(x)[2]]) xs = approximate_split(x_flat, num_splits) ys = [] - for i in xrange(num_splits): + for i in range(num_splits): with tf.control_dependencies(ys[-1:]): n = layer_norm_compute_python(xs[i], epsilon, scale, bias) y = tf.nn.conv1d(n, f1, 1, "SAME") @@ -2476,7 +2429,7 @@ def grad_fn(x, f1, f2, scale, bias, dy): dscale = 0 dbias = 0 deps = [] - for i in xrange(num_splits): + for i in range(num_splits): with tf.control_dependencies(deps): n = layer_norm_compute_python(xs[i], epsilon, scale, bias) y = tf.nn.conv1d(n, f1, 1, "SAME") @@ -2530,7 +2483,7 @@ def shape_list(x): shape = tf.shape(x) ret = [] - for i in xrange(len(static)): + for i in range(len(static)): dim = static[i] if dim is None: dim = shape[i] @@ -2587,7 +2540,7 @@ def ones_matrix_band_part(rows, cols, num_lower, num_upper, out_shape=None): def reshape_like_all_dims(a, b): """Reshapes a to match the shape of b.""" ret = tf.reshape(a, tf.shape(b)) - if not tfe_context.in_eager_mode(): + if not tf.contrib.eager.in_eager_mode(): ret.set_shape(b.get_shape()) return ret @@ -2637,7 +2590,7 @@ def expand_by_device(original_parallelism, device_parallelism, data): """ device_to_datum = { device_parallelism.devices[i]: data[i] - for i in xrange(device_parallelism.n)} + for i in range(device_parallelism.n)} return [device_to_datum[d] for d in original_parallelism.devices] @@ -2684,7 +2637,7 @@ def _step(source_replica, target_replica, x_split, op="plus_eq"): x_split: a list of lists of tensors op: a string """ - for shard in xrange(parallelism.n): + for shard in range(parallelism.n): source_device = (shard + source_replica) % parallelism.n target_device = (shard + target_replica) % parallelism.n source = x_split[source_device][shard] @@ -2702,10 +2655,10 @@ def _step(source_replica, target_replica, x_split, op="plus_eq"): # accumulate everything towards the center. for i in range(center, parallelism.n - 1)[::-1]: _step(i + 1, i, x_split, op="plus_eq") - for i in xrange(center): + for i in range(center): _step(i, i + 1, x_split, op="plus_eq") # copy everything away from the center. 
- for i in xrange(center, parallelism.n - 1): + for i in range(center, parallelism.n - 1): _step(i, i + 1, x_split, op="copy") for i in range(center)[::-1]: _step(i + 1, i, x_split, op="copy") @@ -2774,11 +2727,7 @@ def grad_fn(inputs, variables, outputs, output_grads): @fn_with_custom_grad(grad_fn) def fn_with_recompute(*args): cached_vs.append(tf.get_variable_scope()) - # TODO(rsepassi): Rm conditional in TF 1.5 - if hasattr(tf.contrib.framework, "current_arg_scope"): - cached_arg_scope.append(tf.contrib.framework.current_arg_scope()) - else: - cached_arg_scope.append({}) + cached_arg_scope.append(tf.contrib.framework.current_arg_scope()) return fn(*args) return fn_with_recompute(*args) diff --git a/tensor2tensor/layers/discretization.py b/tensor2tensor/layers/discretization.py index 78577db89..f0fc57391 100644 --- a/tensor2tensor/layers/discretization.py +++ b/tensor2tensor/layers/discretization.py @@ -67,9 +67,7 @@ def nearest_neighbor(x, block_v_size, random_top_k=1, soft_em=False, - soft_em_startup_steps=10000, - inv_temp=1.0, - ema_count=None): + num_samples=1): """Find the nearest element in means to elements in x. Args: @@ -79,11 +77,7 @@ def nearest_neighbor(x, block_v_size: Number of table entries per block. random_top_k: Noisy top-k if this is bigger than 1 (Default: 1). soft_em: If True then use soft EM rather than hard EM (Default: False). - soft_em_startup_steps: Number of steps before soft_em activates - (Default: 10000). - inv_temp: Inverse temperature for soft EM (Default: 1.) - ema_count: Table of counts for each embedding corresponding to how many - examples in a batch it was the closest to (Default: None). + num_samples: Number of samples to take in soft EM (Default: 1). Returns: Tensor with nearest element in mean encoded in one-hot notation. @@ -98,15 +92,15 @@ def nearest_neighbor(x, # computing cluster probabilities if soft_em: - ema_count = tf.expand_dims(ema_count, 0) - c_probs = ema_count / tf.reduce_sum(ema_count, 2, keepdims=True) - c_probs = tf.where( - tf.less(tf.to_int32(tf.train.get_global_step()), soft_em_startup_steps), - tf.ones_like(c_probs, dtype=tf.float32), c_probs) - mask = common_layers.inverse_lin_decay(2 * soft_em_startup_steps) - c_probs = mask * c_probs + (1 - mask) * tf.ones_like(c_probs) - nearest_hot = tf.exp(-inv_temp * dist) * c_probs - nearest_hot /= tf.reduce_sum(nearest_hot, 2, keepdims=True) + num_blocks = common_layers.shape_list(dist)[1] + nearest_idx = tf.stack( + [ + tf.multinomial(-dist[:, i, :], num_samples=num_samples) + for i in range(num_blocks) + ], + axis=1) + nearest_hot = tf.one_hot(nearest_idx, depth=block_v_size) + nearest_hot = tf.reduce_mean(nearest_hot, axis=-2) else: if random_top_k > 1: _, top_k_idx = tf.nn.top_k(-dist, k=random_top_k) @@ -127,9 +121,7 @@ def embedding_lookup(x, block_v_size, random_top_k=1, soft_em=False, - soft_em_startup_steps=10000, - inv_temp=1.0, - ema_count=None): + num_samples=1): """Compute nearest neighbors and loss for training the embeddings via DVQ. Args: @@ -140,11 +132,7 @@ def embedding_lookup(x, block_v_size: Number of table entries per block. random_top_k: Noisy top-k if this is bigger than 1 (Default: 1). soft_em: If True then use soft EM rather than hard EM (Default: False). - soft_em_startup_steps: Number of steps before soft_em activates - (Default: 10000). - inv_temp: Inverse temperature for soft EM (Default: 1.) - ema_count: Table of counts for each embedding corresponding to how many - examples in a batch it was the closest to (Default: None). 
+ num_samples: Number of samples to use for soft EM (Default: 1). Returns: The nearest neighbor in one hot form, the nearest neighbor itself, the @@ -156,14 +144,9 @@ def embedding_lookup(x, block_v_size, random_top_k, soft_em=soft_em, - soft_em_startup_steps=soft_em_startup_steps, - inv_temp=inv_temp, - ema_count=ema_count) + num_samples=num_samples) x_means_hot_flat = tf.reshape(x_means_hot, [-1, num_blocks, block_v_size]) - x_means_idx = tf.argmax(x_means_hot_flat, axis=-1) - x_means = tf.matmul( - tf.transpose(tf.one_hot(x_means_idx, block_v_size), perm=[1, 0, 2]), - means) + x_means = tf.matmul(tf.transpose(x_means_hot_flat, perm=[1, 0, 2]), means) x_means = tf.transpose(x_means, [1, 0, 2]) q_loss = tf.reduce_mean(tf.square((tf.stop_gradient(x) - x_means))) e_loss = tf.reduce_mean(tf.square(x - tf.stop_gradient(x_means))) @@ -428,8 +411,7 @@ def discrete_bottleneck(x, discrete_mix=0.5, random_top_k=1, soft_em=False, - soft_em_startup_steps=10000, - inv_temp=1.0, + num_samples=1, epsilon=1e-5, softmax_k=0, kl_warmup_steps=150000, @@ -467,9 +449,7 @@ def discrete_bottleneck(x, (Default: 0.5). random_top_k: Noisy top-k for DVQ (Default: 1). soft_em: If True then use soft EM rather than hard EM (Default: False). - soft_em_startup_steps: Number of steps before soft_em activates - (Default: 10000). - inv_temp: Inverse temperature for soft EM (Default: 1.) + num_samples: Number of samples for soft EM (Default: 1). epsilon: Epsilon parameter for DVQ (Default: 1e-5). softmax_k: If > 1 then do top-k softmax (Default: 0). kl_warmup_steps: Number of steps for kl warmup (Default: 150000). @@ -488,6 +468,7 @@ def discrete_bottleneck(x, ValueError: If projection_tensors is None for reshape_method project, or ema_count or ema_means is None if we are using ema, or unknown args. """ + tf.logging.info("Shape of x = {}".format(common_layers.shape_list(x))) block_v_size = None if bottleneck_kind == "dvq": # Define the dvq parameters @@ -577,7 +558,7 @@ def discrete_bottleneck(x, for i in range(num_residuals): x_means_hot_res, x_means_res, q_loss_res, e_loss_res = embedding_lookup( x_res, means[i], num_blocks, block_v_size, random_top_k, soft_em, - soft_em_startup_steps, inv_temp, ema_count[i]) + num_samples) # Update the ema variables if ema: diff --git a/tensor2tensor/layers/modalities.py b/tensor2tensor/layers/modalities.py index 8bac3bd30..5dba2c6f9 100644 --- a/tensor2tensor/layers/modalities.py +++ b/tensor2tensor/layers/modalities.py @@ -20,7 +20,7 @@ # Dependency imports -from six.moves import xrange # pylint: disable=redefined-builtin +from six.moves import range # pylint: disable=redefined-builtin from tensor2tensor.layers import common_layers from tensor2tensor.utils import expert_utils as eu @@ -29,8 +29,6 @@ import tensorflow as tf -from tensorflow.python.eager import context - @registry.register_symbol_modality("default") class SymbolModality(modality.Modality): @@ -84,7 +82,7 @@ def _get_weights(self, hidden_dim=None): hidden_dim = self._body_input_depth num_shards = self._model_hparams.symbol_modality_num_shards shards = [] - for i in xrange(num_shards): + for i in range(num_shards): shard_size = (self._vocab_size // num_shards) + ( 1 if i < self._vocab_size % num_shards else 0) var_name = "weights_%d" % i @@ -97,7 +95,7 @@ def _get_weights(self, hidden_dim=None): else: ret = tf.concat(shards, 0) # Convert ret to tensor. 
- if not context.in_eager_mode(): + if not tf.contrib.eager.in_eager_mode(): ret = eu.convert_gradient_to_tensor(ret) return ret @@ -211,13 +209,13 @@ class ImageModality(modality.Modality): def bottom(self, inputs): with tf.variable_scope(self.name): inputs = tf.to_float(inputs) - if not context.in_eager_mode(): + if not tf.contrib.eager.in_eager_mode(): tf.summary.image("inputs", inputs, max_outputs=2) return inputs def targets_bottom(self, inputs): with tf.variable_scope(self.name): - if not context.in_eager_mode(): + if not tf.contrib.eager.in_eager_mode(): tf.summary.image("targets_bottom", tf.cast(inputs, tf.uint8), max_outputs=1) inputs_shape = common_layers.shape_list(inputs) @@ -240,7 +238,7 @@ def targets_bottom(self, inputs): def top(self, body_output, _): # TODO(lukaszkaiser): is this a universal enough way to get channels? - num_channels = self._model_hparams.problem_instances[0].num_channels + num_channels = self._model_hparams.problem.num_channels with tf.variable_scope("rgb_softmax"): body_output_shape = common_layers.shape_list(body_output) reshape_shape = body_output_shape[:3] @@ -258,8 +256,7 @@ def loss(self, logits, targets): logits, targets, self._model_hparams.label_smoothing, - weights_fn=self.targets_weights_fn, - gaussian=True) + weights_fn=self.targets_weights_fn) @registry.register_image_modality("image_channel_compress") @@ -338,7 +335,7 @@ def get_channel_embeddings(self, io_depth, targets, hidden_size, rgb_embedding_var = tf.identity(rgb_embedding_var) rgb_embedding_var *= float(hidden_size)**0.5 channel_target_embs = [] - for i in xrange(io_depth): + for i in range(io_depth): # Adding the channel offsets to get the right embedding since the # embedding tensor has shape 256 * io_depth, hidden_size target_ids = tf.squeeze(targets_split[i], axis=3) + i * 256 @@ -405,7 +402,7 @@ def xnet_resblock(x, filters, res_relu, name): x = tf.to_float(inputs) / 255. x.set_shape([None, None, None, 1]) - for i in xrange(self._model_hparams.audio_compression): + for i in range(self._model_hparams.audio_compression): x = xnet_resblock(x, 2**(i + 1), True, "compress_block_%d" % i) return xnet_resblock(x, self._body_input_depth, False, "compress_block_final") @@ -449,7 +446,7 @@ def xnet_resblock(x, filters, res_relu, name): # Bitcast back from int32 x = tf.bitcast(inputs, tf.float32) x.set_shape([None, None, None, 1]) - for i in xrange(self._model_hparams.audio_compression): + for i in range(self._model_hparams.audio_compression): x = xnet_resblock(x, 2**(i + 1), True, "compress_block_%d" % i) return xnet_resblock(x, self._body_input_depth, False, "compress_block_final") @@ -465,8 +462,9 @@ def bottom(self, inputs): inputs_shape = common_layers.shape_list(inputs) if len(inputs_shape) != 5: raise ValueError("Assuming videos given as tensors in the format " - "[batch, time, height, width, channels].") - if not context.in_eager_mode(): + "[batch, time, height, width, channels] but got one " + "of shape: %s" % str(inputs_shape)) + if not tf.contrib.eager.in_eager_mode(): tf.summary.image("inputs", tf.cast(inputs[:, -1, :, :, :], tf.uint8), max_outputs=1) # Standardize frames. 
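For reference, an illustrative sketch of the 5-D layout that VideoModality.bottom and targets_bottom validate (sizes here are hypothetical):

    import tensorflow as tf

    # Videos enter the modality as [batch, time, height, width, channels];
    # any other rank now raises a ValueError that reports the actual shape.
    frames = tf.zeros([8, 4, 64, 64, 3], dtype=tf.int32)
    assert len(frames.shape.as_list()) == 5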
@@ -485,8 +483,9 @@ def targets_bottom(self, inputs): inputs_shape = common_layers.shape_list(inputs) if len(inputs_shape) != 5: raise ValueError("Assuming videos given as tensors in the format " - "[batch, time, height, width, channels].") - if not context.in_eager_mode(): + "[batch, time, height, width, channels] but got one " + "of shape: %s" % str(inputs_shape)) + if not tf.contrib.eager.in_eager_mode(): tf.summary.image( "targets_bottom", tf.cast(inputs[:, -1, :, :, :], tf.uint8), max_outputs=1) @@ -511,8 +510,8 @@ def targets_bottom(self, inputs): return merged def top(self, body_output, _): - num_channels = self._model_hparams.problem_instances[0].num_channels - num_frames = self._model_hparams.problem_instances[0].num_target_frames + num_channels = self._model_hparams.problem.num_channels + num_frames = self._model_hparams.problem.num_target_frames with tf.variable_scope("rgb_softmax"): body_output_shape = common_layers.shape_list(body_output) reshape_shape = body_output_shape[:3] @@ -535,8 +534,7 @@ def loss(self, logits, targets): logits, targets, self._model_hparams.label_smoothing, - weights_fn=self.targets_weights_fn, - gaussian=True) + weights_fn=self.targets_weights_fn) @registry.register_class_label_modality("default") @@ -684,7 +682,7 @@ def loss(self, top_out, targets): return loss_scale, loss_denom -@registry.register_class_label_modality("sigmoid_pooling") +@registry.register_class_label_modality("sigmoid_max_pooling") class SigmoidMaxPoolingClassLabelModality(ClassLabelModality): """Sigmoid cross-entropy applied on max-pooling over timesteps.""" diff --git a/tensor2tensor/layers/rev_block.py b/tensor2tensor/layers/rev_block.py index aaacf0c5d..a6e462f7b 100644 --- a/tensor2tensor/layers/rev_block.py +++ b/tensor2tensor/layers/rev_block.py @@ -27,7 +27,7 @@ # Dependency imports -from six.moves import xrange # pylint: disable=redefined-builtin +from six.moves import range # pylint: disable=redefined-builtin from tensor2tensor.layers import common_layers import tensorflow as tf @@ -117,7 +117,7 @@ def _rev_block_forward(x1, gate_outputs=False): """Forward for a series of reversible layers.""" out = (x1, x2) - for i in xrange(num_layers): + for i in range(num_layers): out = _rev_layer_forward( out, f[i], g[i], f_side_input, g_side_input, gate_outputs=gate_outputs) @@ -216,7 +216,7 @@ def _efficient_grad_fn(self, inputs, variables, ys, grad_ys): f.reverse() g.reverse() - for i in xrange(self.num_layers): + for i in range(self.num_layers): ys, grad_ys, f_ret, g_ret = _rev_layer_backward( ys, grad_ys, f[i], g[i], f_vars[i], self.f_side_input, g_vars[i], self.g_side_input) @@ -286,7 +286,7 @@ def backward(self, y1, y2): f.reverse() g.reverse() - for i in xrange(self.num_layers): + for i in range(self.num_layers): gy1 = g[i](y1, self.g_side_input) if self.g_side_input else g[i](y1) x2 = y2 - gy1 fx2 = f[i](x2, self.f_side_input) if self.f_side_input else f[i](x2) diff --git a/tensor2tensor/models/basic.py b/tensor2tensor/models/basic.py index ec65e68b2..d6fdc6101 100644 --- a/tensor2tensor/models/basic.py +++ b/tensor2tensor/models/basic.py @@ -37,7 +37,7 @@ def body(self, features): x = features["inputs"] shape = common_layers.shape_list(x) x = tf.reshape(x, [-1, shape[1] * shape[2] * shape[3]]) - for i in xrange(hparams.num_hidden_layers): + for i in range(hparams.num_hidden_layers): x = tf.layers.dense(x, hparams.hidden_size, name="layer_%d" % i) x = tf.nn.dropout(x, keep_prob=1.0 - hparams.dropout) x = tf.nn.relu(x) @@ -74,7 +74,7 @@ def encoder(self, x): hparams = 
self._hparams kernel, strides = self._get_kernel_and_strides() # Down-convolutions. - for i in xrange(hparams.num_hidden_layers): + for i in range(hparams.num_hidden_layers): x = tf.layers.conv2d( x, hparams.hidden_size * 2**(i + 1), kernel, strides=strides, padding="SAME", activation=common_layers.belu, name="conv_%d" % i) @@ -86,7 +86,7 @@ def decoder(self, x): hparams = self._hparams kernel, strides = self._get_kernel_and_strides() # Up-convolutions. - for i in xrange(hparams.num_hidden_layers): + for i in range(hparams.num_hidden_layers): j = hparams.num_hidden_layers - i - 1 x = tf.layers.conv2d_transpose( x, hparams.hidden_size * 2**j, kernel, strides=strides, @@ -159,7 +159,7 @@ def infer(self, features=None, decode_length=50, beam_size=1, top_beams=1, # Sample and decode. # TODO(lukaszkaiser): is this a universal enough way to get channels? try: - num_channels = self._hparams.problem_instances[0].num_channels + num_channels = self._hparams.problem.num_channels except AttributeError: num_channels = 1 features["targets"] = tf.zeros( @@ -206,7 +206,7 @@ def basic_autoencoder(): hparams.learning_rate_constant = 0.0002 hparams.learning_rate_warmup_steps = 500 hparams.learning_rate_schedule = "constant * linear_warmup" - hparams.label_smoothing = 0.05 + hparams.label_smoothing = 0.0 hparams.batch_size = 128 hparams.hidden_size = 64 hparams.num_hidden_layers = 5 diff --git a/tensor2tensor/models/basic_test.py b/tensor2tensor/models/basic_test.py new file mode 100644 index 000000000..5a07a5502 --- /dev/null +++ b/tensor2tensor/models/basic_test.py @@ -0,0 +1,70 @@ +# coding=utf-8 +# Copyright 2018 The Tensor2Tensor Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Basic nets tests.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +# Dependency imports + +import numpy as np + +from tensor2tensor.data_generators import mnist # pylint: disable=unused-import +from tensor2tensor.models import basic +from tensor2tensor.utils import trainer_lib + +import tensorflow as tf + + +class BasicTest(tf.test.TestCase): + + def testBasicFcRelu(self): + x = np.random.random_integers(0, high=255, size=(1, 28, 28, 1)) + y = np.random.random_integers(0, high=9, size=(1, 1)) + hparams = trainer_lib.create_hparams( + "basic_fc_small", problem_name="image_mnist", data_dir=".") + with self.test_session() as session: + features = { + "inputs": tf.constant(x, dtype=tf.int32), + "targets": tf.constant(y, dtype=tf.int32), + } + model = basic.BasicFcRelu(hparams, tf.estimator.ModeKeys.TRAIN) + logits, _ = model(features) + session.run(tf.global_variables_initializer()) + res = session.run(logits) + self.assertEqual(res.shape, (1, 1, 1, 1, 10)) + + def testBasicAutoencoder(self): + x = np.random.random_integers(0, high=255, size=(1, 28, 28, 1)) + y = np.random.random_integers(0, high=9, size=(1, 1)) + hparams = trainer_lib.create_hparams( + "basic_autoencoder", problem_name="image_mnist_rev", data_dir=".") + with self.test_session() as session: + features = { + "targets": tf.constant(x, dtype=tf.int32), + "inputs": tf.constant(y, dtype=tf.int32), + } + tf.train.create_global_step() + model = basic.BasicAutoencoder(hparams, tf.estimator.ModeKeys.TRAIN) + logits, _ = model(features) + session.run(tf.global_variables_initializer()) + res = session.run(logits) + self.assertEqual(res.shape, (1, 28, 28, 1, 256)) + + +if __name__ == "__main__": + tf.test.main() diff --git a/tensor2tensor/models/bytenet.py b/tensor2tensor/models/bytenet.py index cf576a0b3..74f46c27c 100644 --- a/tensor2tensor/models/bytenet.py +++ b/tensor2tensor/models/bytenet.py @@ -21,7 +21,7 @@ # Dependency imports -from six.moves import xrange # pylint: disable=redefined-builtin +from six.moves import range # pylint: disable=redefined-builtin from tensor2tensor.layers import common_hparams from tensor2tensor.layers import common_layers @@ -36,8 +36,8 @@ def residual_dilated_conv(x, repeat, padding, name, hparams): with tf.variable_scope(name): k = (hparams.kernel_height, hparams.kernel_width) dilations_and_kernels = [((2**i, 1), k) - for i in xrange(hparams.num_hidden_layers)] - for i in xrange(repeat): + for i in range(hparams.num_hidden_layers)] + for i in range(repeat): with tf.variable_scope("repeat_%d" % i): y = common_layers.conv_block( common_layers.layer_norm(x, hparams.hidden_size, name="lnorm"), diff --git a/tensor2tensor/models/neural_gpu.py b/tensor2tensor/models/neural_gpu.py index 1d1875743..7d6433b92 100644 --- a/tensor2tensor/models/neural_gpu.py +++ b/tensor2tensor/models/neural_gpu.py @@ -21,7 +21,7 @@ # Dependency imports -from six.moves import xrange # pylint: disable=redefined-builtin +from six.moves import range # pylint: disable=redefined-builtin from tensor2tensor.layers import common_hparams from tensor2tensor.layers import common_layers @@ -37,7 +37,7 @@ def neural_gpu_body(inputs, hparams, name=None): def step(state, inp): # pylint: disable=missing-docstring x = tf.nn.dropout(state, 1.0 - hparams.dropout) - for layer in xrange(hparams.num_hidden_layers): + for layer in range(hparams.num_hidden_layers): x = common_layers.conv_gru( x, (hparams.kernel_height, hparams.kernel_width), hparams.hidden_size, @@ -70,7 +70,7 
@@ def step(state_tup, inp): """Single step of the improved Neural GPU.""" state, _ = state_tup x = state - for layer in xrange(hparams.num_hidden_layers): + for layer in range(hparams.num_hidden_layers): x, new_loss = common_layers.diagonal_conv_gru( x, (hparams.kernel_height, hparams.kernel_width), hparams.hidden_size, diff --git a/tensor2tensor/models/research/adafactor_experiments.py b/tensor2tensor/models/research/adafactor_experiments.py index c06c3f0cc..d7d3d4e2c 100644 --- a/tensor2tensor/models/research/adafactor_experiments.py +++ b/tensor2tensor/models/research/adafactor_experiments.py @@ -218,3 +218,12 @@ def afx_small_p8(): hparams = afx_small() hparams.add_hparam("simulated_parameter_quantize_bits", 8) return hparams + + +@registry.register_hparams +def afx_small_bfloat16(): + """Small transformer model with small batch size for fast step times.""" + hparams = afx_small() + hparams.weight_dtype = "bfloat16" + hparams.activation_dtype = "bfloat16" + return hparams diff --git a/tensor2tensor/models/research/attention_lm.py b/tensor2tensor/models/research/attention_lm.py index cbc45c4e7..bf7315f07 100644 --- a/tensor2tensor/models/research/attention_lm.py +++ b/tensor2tensor/models/research/attention_lm.py @@ -27,7 +27,7 @@ # Dependency imports -from six.moves import xrange # pylint: disable=redefined-builtin +from six.moves import range # pylint: disable=redefined-builtin from tensor2tensor.layers import common_attention from tensor2tensor.layers import common_hparams @@ -104,7 +104,7 @@ def attention_lm_decoder(decoder_input, """ x = decoder_input with tf.variable_scope(name): - for layer in xrange(hparams.num_hidden_layers): + for layer in range(hparams.num_hidden_layers): with tf.variable_scope("layer_%d" % layer): with tf.variable_scope("self_attention"): y = common_attention.multihead_attention( diff --git a/tensor2tensor/models/research/attention_lm_moe.py b/tensor2tensor/models/research/attention_lm_moe.py index 49ca3d20f..14b633495 100644 --- a/tensor2tensor/models/research/attention_lm_moe.py +++ b/tensor2tensor/models/research/attention_lm_moe.py @@ -29,7 +29,7 @@ # Dependency imports -from six.moves import xrange # pylint: disable=redefined-builtin +from six.moves import range # pylint: disable=redefined-builtin from tensor2tensor.layers import common_attention from tensor2tensor.layers import common_hparams @@ -182,7 +182,7 @@ def print_shape(x, suffix, debug=False): num_hidden_layers = ( len(hparams.attention_layers) or hparams.num_hidden_layers) - for layer in xrange(num_hidden_layers): + for layer in range(num_hidden_layers): with tf.variable_scope("layer_%d" % layer): # Use the layer type defined in attention_layers diff --git a/tensor2tensor/models/research/autoencoders.py b/tensor2tensor/models/research/autoencoders.py index a7c696499..d9c852742 100644 --- a/tensor2tensor/models/research/autoencoders.py +++ b/tensor2tensor/models/research/autoencoders.py @@ -30,7 +30,53 @@ @registry.register_model -class ResidualAutoencoder(basic.BasicAutoencoder): +class AutoencoderAutoregressive(basic.BasicAutoencoder): + """Autoencoder with an autoregressive part.""" + + def body(self, features): + hparams = self._hparams + shape = common_layers.shape_list(features["targets"]) + # Run the basic autoencoder part first. + basic_result, losses = super(AutoencoderAutoregressive, self).body(features) + # Prepare inputs for autoregressive modes. 
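+    # The target path is dropped out jointly across channels (the dropout
+    # mask is broadcast over the last dim) and shifted right, so the
+    # prediction at step t never sees target t itself, only earlier targets.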
+ targets_keep_prob = 1.0 - hparams.autoregressive_dropout + targets_dropout = common_layers.dropout_with_broadcast_dims( + features["targets"], targets_keep_prob, broadcast_dims=[-1]) + targets1d = tf.reshape(targets_dropout, [shape[0], -1, shape[3]]) + targets_shifted = common_layers.shift_right_3d(targets1d) + basic1d = tf.reshape(basic_result, [shape[0], -1, shape[3]]) + concat1d = tf.concat([basic1d, targets_shifted], axis=-1) + # The forget_base hparam sets purely-autoregressive mode, no autoencoder. + if hparams.autoregressive_forget_base: + concat1d = tf.reshape(features["targets"], [shape[0], -1, shape[3]]) + concat1d = common_layers.shift_right_3d(concat1d) + # The autoregressive part depends on the mode. + if hparams.autoregressive_mode == "none": + assert not hparams.autoregressive_forget_base + return basic_result, losses + if hparams.autoregressive_mode == "conv3": + res = common_layers.conv1d(concat1d, shape[3], 3, padding="LEFT", + activation=common_layers.belu, + name="autoregressive_conv3") + return tf.reshape(res, shape), losses + if hparams.autoregressive_mode == "conv5": + res = common_layers.conv1d(concat1d, shape[3], 5, padding="LEFT", + activation=common_layers.belu, + name="autoregressive_conv5") + return tf.reshape(res, shape), losses + if hparams.autoregressive_mode == "sru": + res = common_layers.conv1d(concat1d, shape[3], 3, padding="LEFT", + activation=common_layers.belu, + name="autoregressive_sru_conv3") + res = common_layers.sru(res) + return tf.reshape(res, shape), losses + + raise ValueError("Unsupported autoregressive mode: %s" + % hparams.autoregressive_mode) + + +@registry.register_model +class AutoencoderResidual(AutoencoderAutoregressive): """Residual autoencoder.""" def encoder(self, x): @@ -45,7 +91,7 @@ def encoder(self, x): if hparams.residual_use_separable_conv: residual_conv = tf.layers.separable_conv2d # Down-convolutions. - for i in xrange(hparams.num_hidden_layers): + for i in range(hparams.num_hidden_layers): with tf.variable_scope("layer_%d" % i): x = tf.nn.dropout(x, 1.0 - hparams.dropout) filters = hparams.hidden_size * 2**(i + 1) @@ -54,7 +100,7 @@ def encoder(self, x): x, filters, kernel, strides=strides, padding="SAME", activation=common_layers.belu, name="strided") y = x - for r in xrange(hparams.num_residual_layers): + for r in range(hparams.num_residual_layers): residual_filters = filters if r < hparams.num_residual_layers - 1: residual_filters = int( @@ -79,7 +125,7 @@ def decoder(self, x): if hparams.residual_use_separable_conv: residual_conv = tf.layers.separable_conv2d # Up-convolutions. 
- for i in xrange(hparams.num_hidden_layers): + for i in range(hparams.num_hidden_layers): x = tf.nn.dropout(x, 1.0 - hparams.dropout) j = hparams.num_hidden_layers - i - 1 filters = hparams.hidden_size * 2**j @@ -91,7 +137,7 @@ def decoder(self, x): x, filters, kernel, strides=strides, padding="SAME", activation=common_layers.belu, name="strided") y = x - for r in xrange(hparams.num_residual_layers): + for r in range(hparams.num_residual_layers): residual_filters = filters if r < hparams.num_residual_layers - 1: residual_filters = int( @@ -106,7 +152,7 @@ def decoder(self, x): @registry.register_model -class BasicDiscreteAutoencoder(basic.BasicAutoencoder): +class AutoencoderBasicDiscrete(AutoencoderAutoregressive): """Discrete autoencoder.""" def bottleneck(self, x): @@ -132,7 +178,7 @@ def sample(self): @registry.register_model -class ResidualDiscreteAutoencoder(ResidualAutoencoder): +class AutoencoderResidualDiscrete(AutoencoderResidual): """Discrete residual autoencoder.""" def bottleneck(self, x, bottleneck_size=None): @@ -160,13 +206,15 @@ def sample(self): size = [hp.batch_size, hp.sample_height // div_x, hp.sample_width // div_y, hp.bottleneck_size] rand = tf.random_uniform(size) - res1 = 2.0 * tf.to_float(tf.less(0.5, rand)) - 1.0 - res2 = tf.zeros_like(rand) - 1.0 - return tf.concat([res2[:, :, :, :2], res1[:, :, :, 2:]], axis=-1) + res = 2.0 * tf.to_float(tf.less(0.5, rand)) - 1.0 + # If you want to set some first bits to a fixed value, do this: + # fixed = tf.zeros_like(rand) - 1.0 + # res = tf.concat([fixed[:, :, :, :2], res[:, :, :, 2:]], axis=-1) + return res @registry.register_model -class OrderedDiscreteAutoencoder(ResidualDiscreteAutoencoder): +class AutoencoderOrderedDiscrete(AutoencoderResidualDiscrete): """Ordered discrete autoencoder.""" def bottleneck(self, x): @@ -195,7 +243,7 @@ def bottleneck(self, x): @registry.register_model -class StackedAutoencoder(ResidualDiscreteAutoencoder): +class AutoencoderStacked(AutoencoderResidualDiscrete): """A stacked autoencoder.""" def stack(self, b, size, bottleneck_size, name): @@ -290,9 +338,19 @@ def body(self, features): @registry.register_hparams -def residual_autoencoder(): - """Residual autoencoder model.""" +def autoencoder_autoregressive(): + """Autoregressive autoencoder model.""" hparams = basic.basic_autoencoder() + hparams.add_hparam("autoregressive_forget_base", False) + hparams.add_hparam("autoregressive_mode", "conv3") + hparams.add_hparam("autoregressive_dropout", 0.4) + return hparams + + +@registry.register_hparams +def autoencoder_residual(): + """Residual autoencoder model.""" + hparams = autoencoder_autoregressive() hparams.optimizer = "Adam" hparams.learning_rate_constant = 0.0001 hparams.learning_rate_warmup_steps = 500 @@ -311,9 +369,9 @@ def residual_autoencoder(): @registry.register_hparams -def basic_discrete_autoencoder(): +def autoencoder_basic_discrete(): """Basic autoencoder model.""" - hparams = basic.basic_autoencoder() + hparams = autoencoder_autoregressive() hparams.num_hidden_layers = 5 hparams.hidden_size = 64 hparams.bottleneck_size = 4096 @@ -324,9 +382,9 @@ def basic_discrete_autoencoder(): @registry.register_hparams -def residual_discrete_autoencoder(): +def autoencoder_residual_discrete(): """Residual discrete autoencoder model.""" - hparams = residual_autoencoder() + hparams = autoencoder_residual() hparams.bottleneck_size = 4096 hparams.bottleneck_noise = 0.1 hparams.bottleneck_warmup_steps = 3000 @@ -339,9 +397,9 @@ def residual_discrete_autoencoder(): @registry.register_hparams -def 
residual_discrete_autoencoder_big(): +def autoencoder_residual_discrete_big(): """Residual discrete autoencoder model, big version.""" - hparams = residual_discrete_autoencoder() + hparams = autoencoder_residual_discrete() hparams.hidden_size = 128 hparams.max_hidden_size = 4096 hparams.bottleneck_noise = 0.1 @@ -351,15 +409,15 @@ def residual_discrete_autoencoder_big(): @registry.register_hparams -def ordered_discrete_autoencoder(): +def autoencoder_ordered_discrete(): """Basic autoencoder model.""" - hparams = residual_discrete_autoencoder() + hparams = autoencoder_residual_discrete() return hparams @registry.register_hparams -def stacked_autoencoder(): +def autoencoder_stacked(): """Stacked autoencoder model.""" - hparams = residual_discrete_autoencoder() + hparams = autoencoder_residual_discrete() hparams.bottleneck_size = 128 return hparams diff --git a/tensor2tensor/models/research/autoencoders_test.py b/tensor2tensor/models/research/autoencoders_test.py new file mode 100644 index 000000000..9cdcd139a --- /dev/null +++ b/tensor2tensor/models/research/autoencoders_test.py @@ -0,0 +1,84 @@ +# coding=utf-8 +# Copyright 2018 The Tensor2Tensor Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Autoencoders tests.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +# Dependency imports + +import numpy as np + +from tensor2tensor.data_generators import mnist # pylint: disable=unused-import +from tensor2tensor.models.research import autoencoders # pylint: disable=unused-import +from tensor2tensor.utils import registry +from tensor2tensor.utils import trainer_lib + +import tensorflow as tf + + +class AutoencoderTest(tf.test.TestCase): + + def getMnistRandomOutput(self, model_name, hparams_set=None, + mode=tf.estimator.ModeKeys.TRAIN): + hparams_set = hparams_set or model_name + x = np.random.random_integers(0, high=255, size=(1, 28, 28, 1)) + y = np.random.random_integers(0, high=9, size=(1, 1)) + hparams = trainer_lib.create_hparams( + hparams_set, problem_name="image_mnist_rev", data_dir=".") + with self.test_session() as session: + features = { + "targets": tf.constant(x, dtype=tf.int32), + "inputs": tf.constant(y, dtype=tf.int32), + } + tf.train.create_global_step() + model = registry.model(model_name)(hparams, mode) + logits, _ = model(features) + session.run(tf.global_variables_initializer()) + res = session.run(logits) + return res + + @property + def mnistOutputShape(self): + return (1, 28, 28, 1, 256) + + def testAutoencoderAutoregressive(self): + res = self.getMnistRandomOutput("autoencoder_autoregressive") + self.assertEqual(res.shape, self.mnistOutputShape) + + def testAutoencoderResidual(self): + res = self.getMnistRandomOutput("autoencoder_residual") + self.assertEqual(res.shape, self.mnistOutputShape) + + def testAutoencoderBasicDiscrete(self): + res = self.getMnistRandomOutput("autoencoder_basic_discrete") + self.assertEqual(res.shape, self.mnistOutputShape) + + def 
testAutoencoderResidualDiscrete(self): + res = self.getMnistRandomOutput("autoencoder_residual_discrete") + self.assertEqual(res.shape, self.mnistOutputShape) + + def testAutoencoderOrderedDiscrete(self): + res = self.getMnistRandomOutput("autoencoder_ordered_discrete") + self.assertEqual(res.shape, self.mnistOutputShape) + + def testAutoencoderStacked(self): + res = self.getMnistRandomOutput("autoencoder_stacked") + self.assertEqual(res.shape, self.mnistOutputShape) + +if __name__ == "__main__": + tf.test.main() diff --git a/tensor2tensor/models/research/basic_conv_gen.py b/tensor2tensor/models/research/basic_conv_gen.py index cd98cde77..f35509237 100644 --- a/tensor2tensor/models/research/basic_conv_gen.py +++ b/tensor2tensor/models/research/basic_conv_gen.py @@ -21,6 +21,8 @@ # Dependency imports +import six + from tensor2tensor.layers import common_hparams from tensor2tensor.layers import common_layers from tensor2tensor.utils import registry @@ -31,6 +33,7 @@ @registry.register_model class BasicConvGen(t2t_model.T2TModel): + """Basic convolutional next-frame model.""" def body(self, features): hparams = self.hparams @@ -46,11 +49,12 @@ def body(self, features): x, x, final_length_divisible_by=2**hparams.num_compress_steps, axis=2) # Down-stride. - for _ in range(hparams.num_compress_steps): - x = tf.layers.conv2d(x, filters, kernel2, activation=common_layers.belu, - strides=(2, 2), padding="SAME") - x = common_layers.layer_norm(x) - filters *= 2 + for i in range(hparams.num_compress_steps): + with tf.variable_scope("downstride%d" % i): + x = tf.layers.conv2d(x, filters, kernel2, activation=common_layers.belu, + strides=(2, 2), padding="SAME") + x = common_layers.layer_norm(x) + filters *= 2 # Add embedded action. action = tf.reshape(features["input_action"][:, 1, :], @@ -71,29 +75,62 @@ def body(self, features): x = common_layers.layer_norm(x + y) # Up-convolve. - for _ in range(hparams.num_compress_steps): - filters //= 2 - x = tf.layers.conv2d_transpose( - x, filters, kernel2, activation=common_layers.belu, - strides=(2, 2), padding="SAME") - x = common_layers.layer_norm(x) - x = tf.nn.dropout(x, 1.0 - hparams.dropout) + for i in range(hparams.num_compress_steps): + with tf.variable_scope("upstride%d" % i): + filters //= 2 + x = tf.layers.conv2d_transpose( + x, filters, kernel2, activation=common_layers.belu, + strides=(2, 2), padding="SAME") + x = common_layers.layer_norm(x) + x = tf.nn.dropout(x, 1.0 - hparams.dropout) # Cut down to original size. x = x[:, :inputs_shape[1], :inputs_shape[2], :] # Reward prediction. - reward_pred_h1 = tf.reduce_mean(x, axis=[1, 2], keep_dims=True) - # Rewards are {-1, 0, 1} so we predict 3. - reward_pred = tf.layers.dense(reward_pred_h1, 3, name="reward") - reward_gold = tf.expand_dims(tf.to_int32( - features["input_reward_raw"][:, 1, :]), axis=1) - reward_loss = tf.nn.sparse_softmax_cross_entropy_with_logits( - labels=reward_gold, logits=reward_pred, name="reward_loss") - reward_loss = tf.reduce_mean(reward_loss) - return {"targets": x, "target_reward": reward_pred_h1} - # return x, {"reward": reward_loss} - # return x + reward_pred = tf.reduce_mean(x, axis=[1, 2], keep_dims=True) + return {"targets": x, "target_reward": reward_pred} + + def infer(self, features=None, decode_length=50, beam_size=1, top_beams=1, + alpha=0.0): + """Produce predictions from the model by running it.""" + # Inputs and features preparation needed to handle edge cases. 
+ if not features: + features = {} + inputs_old = None + if "inputs" in features and len(features["inputs"].shape) < 4: + inputs_old = features["inputs"] + features["inputs"] = tf.expand_dims(features["inputs"], 2) + + # Get predictions. + try: + num_channels = self._hparams.problem.num_channels + except AttributeError: + num_channels = 1 + features["targets"] = tf.zeros( + [self._hparams.batch_size, 1, 1, 1, num_channels], dtype=tf.int32) + features["target_reward"] = tf.zeros( + [self._hparams.batch_size, 1, 1], dtype=tf.int32) + logits, _ = self(features) # pylint: disable=not-callable + if isinstance(logits, dict): + results = {} + for k, v in six.iteritems(logits): + # Argmax in TF doesn't handle more than 5 dimensions yet. + v_shape = common_layers.shape_list(v) + argmax = tf.argmax(tf.reshape(v, [-1, v_shape[-1]]), axis=-1) + results[k] = tf.reshape(argmax, v_shape[:-1]) + else: + # Argmax in TF doesn't handle more than 5 dimensions yet. + logits_shape = common_layers.shape_list(logits) + argmax = tf.argmax(tf.reshape(logits, [-1, logits_shape[-1]]), axis=-1) + results = tf.reshape(argmax, logits_shape[:-1]) + + # Restore inputs to not confuse Estimator in edge cases. + if inputs_old is not None: + features["inputs"] = inputs_old + + # Return results. + return results @registry.register_hparams @@ -149,11 +186,11 @@ def deconv2d(cur, i, kernel_size, output_filters, activation=tf.nn.relu): name="deconv2d" + str(i)) return tf.depth_to_space(thicker, 2) - # cur_frame = common_layers.standardize_images(features["inputs_0"]) - # prev_frame = common_layers.standardize_images(features["inputs_1"]) - # frames = tf.concat([cur_frame, prev_frame], axis=3) - # frames = tf.reshape(frames, [-1, 210, 160, 6]) - frames = common_layers.standardize_images(features["inputs"]) + cur_frame = common_layers.standardize_images(features["inputs_0"]) + prev_frame = common_layers.standardize_images(features["inputs_1"]) + + frames = tf.concat([cur_frame, prev_frame], axis=3) + frames = tf.reshape(frames, [-1, 210, 160, 6]) h1 = tf.layers.conv2d(frames, filters=64, strides=2, kernel_size=(8, 8), padding="SAME", activation=tf.nn.relu) diff --git a/tensor2tensor/models/research/gene_expression.py b/tensor2tensor/models/research/gene_expression.py index 0235774ab..abe0a4834 100644 --- a/tensor2tensor/models/research/gene_expression.py +++ b/tensor2tensor/models/research/gene_expression.py @@ -20,7 +20,7 @@ # Dependency imports -from six.moves import xrange # pylint: disable=redefined-builtin +from six.moves import range # pylint: disable=redefined-builtin from tensor2tensor.layers import common_hparams from tensor2tensor.layers import common_layers @@ -59,7 +59,7 @@ def body(self, features): # Conv layers assert hp.num_conv_layers == len(hp.pooling_windows) - for i in xrange(hp.num_conv_layers): + for i in range(hp.num_conv_layers): out = conv_layer( out, hp.hidden_size, @@ -71,7 +71,7 @@ def body(self, features): name="conv_%d" % (i + 1)) # Dense dilated conv layers - for i in xrange(hp.num_dconv_layers): + for i in range(hp.num_dconv_layers): dilation_rate = 2**(i + 1) dconv_out = conv_layer( out, diff --git a/tensor2tensor/models/research/gene_expression_test.py b/tensor2tensor/models/research/gene_expression_test.py index 899defadb..70403935c 100644 --- a/tensor2tensor/models/research/gene_expression_test.py +++ b/tensor2tensor/models/research/gene_expression_test.py @@ -54,7 +54,7 @@ def _testModel(self, hparams, model_cls): "inputs": tf.constant(inputs, dtype=tf.int32), "targets": tf.constant(targets, 
dtype=tf.float32), } - p_hparams, = hparams.problems + p_hparams = hparams.problem_hparams logits, _ = model_cls( hparams, tf.estimator.ModeKeys.TRAIN, p_hparams)(features) @@ -70,7 +70,7 @@ def testGeneExpressionModels(self): for model_cls, hparams in models_hparams: hparams.add_hparam("data_dir", None) p_hparams = gene_data.GenomicsExpressionCage10().get_hparams(hparams) - hparams.problems = [p_hparams] + hparams.problem_hparams = p_hparams self._testModel(hparams, model_cls) diff --git a/tensor2tensor/models/research/lm_experiments.py b/tensor2tensor/models/research/lm_experiments.py index a8d68583d..4e34673c2 100644 --- a/tensor2tensor/models/research/lm_experiments.py +++ b/tensor2tensor/models/research/lm_experiments.py @@ -77,3 +77,25 @@ def lmx_h2k_f8k(): hparams.filter_size = 8192 return hparams + +@registry.register_hparams +def lmx_h3k_f12k(): + """HParams for training languagemodel_lm1b32k_packed. 880M Params.""" + hparams = lmx_base() + hparams.hidden_size = 3072 + hparams.filter_size = 12288 + hparams.batch_size = 2048 + hparams.weight_dtype = "bfloat16" + return hparams + + +@registry.register_hparams +def lmx_h4k_f16k(): + """HParams for training languagemodel_lm1b32k_packed. 1470M Params.""" + hparams = lmx_base() + hparams.hidden_size = 4096 + hparams.filter_size = 16384 + hparams.batch_size = 1024 + hparams.weight_dtype = "bfloat16" + return hparams + diff --git a/tensor2tensor/models/research/multimodel.py b/tensor2tensor/models/research/multimodel.py index 88ab3950b..4b3d93445 100644 --- a/tensor2tensor/models/research/multimodel.py +++ b/tensor2tensor/models/research/multimodel.py @@ -20,7 +20,7 @@ # Dependency imports -from six.moves import xrange # pylint: disable=redefined-builtin +from six.moves import range # pylint: disable=redefined-builtin from tensor2tensor.layers import common_attention from tensor2tensor.layers import common_hparams @@ -138,7 +138,7 @@ def flatten(inputs): inputs_mask = dp(lambda x: 1.0 - x, inputs_pad) inputs_encoded = dp(common_layers.add_timing_signal, inputs) expert_loss = 0.0 - for i in xrange(hparams.num_hidden_layers): + for i in range(hparams.num_hidden_layers): with tf.variable_scope("enc_layer_%d" % i): inputs_encoded, moe_loss = conv_experts(inputs_encoded, hparams, dp, self._ps_devices, "SAME", @@ -168,7 +168,7 @@ def flatten(inputs): expert_fn = expert_utils.ffn_expert_fn( hparams.hidden_size, moe_hidden_sizes, hparams.hidden_size) x = dp(tf.nn.dropout, decoder_input, 1.0 - hparams.dropout) - for layer in xrange(hparams.num_hidden_layers): + for layer in range(hparams.num_hidden_layers): with tf.variable_scope("dec_layer_%d" % layer): with tf.variable_scope("attention"): y = dp( diff --git a/tensor2tensor/models/research/multimodel_test.py b/tensor2tensor/models/research/multimodel_test.py index ef8e30138..c480d23e1 100644 --- a/tensor2tensor/models/research/multimodel_test.py +++ b/tensor2tensor/models/research/multimodel_test.py @@ -39,7 +39,7 @@ def testMultiModel(self): hparams.add_hparam("data_dir", "") problem = registry.problem("image_cifar10") p_hparams = problem.get_hparams(hparams) - hparams.problems = [p_hparams] + hparams.problem_hparams = p_hparams with self.test_session() as session: features = { "inputs": tf.constant(x, dtype=tf.int32), diff --git a/tensor2tensor/models/research/super_lm.py b/tensor2tensor/models/research/super_lm.py index caaef23e6..40bfb7f64 100644 --- a/tensor2tensor/models/research/super_lm.py +++ b/tensor2tensor/models/research/super_lm.py @@ -30,7 +30,7 @@ # Dependency imports -from 
six.moves import xrange # pylint: disable=redefined-builtin +from six.moves import range # pylint: disable=redefined-builtin from tensor2tensor.layers import common_attention from tensor2tensor.layers import common_hparams @@ -56,7 +56,7 @@ def body(self, features): assert hparams.num_model_shards % len(ps_devices) == 0 shards_per_device = hparams.num_model_shards // len(ps_devices) model_devices = [ps_devices[i // shards_per_device] - for i in xrange(hparams.num_model_shards)] + for i in range(hparams.num_model_shards)] print("model_devices = %s" % model_devices) mp = expert_utils.Parallelism(model_devices, reuse=False) vocab_size = self._problem_hparams.vocabulary["targets"].vocab_size diff --git a/tensor2tensor/models/research/transformer_revnet_test.py b/tensor2tensor/models/research/transformer_revnet_test.py index b3d4c0812..89e075c12 100644 --- a/tensor2tensor/models/research/transformer_revnet_test.py +++ b/tensor2tensor/models/research/transformer_revnet_test.py @@ -47,7 +47,7 @@ def testTransformer(self): vocab_size = 9 hparams = transformer_revnet_test() p_hparams = problem_hparams.test_problem_hparams(vocab_size, vocab_size) - hparams.problems = [p_hparams] + hparams.problem_hparams = p_hparams inputs = -1 + np.random.random_integers( vocab_size, size=(batch_size, input_length, 1, 1)) targets = -1 + np.random.random_integers( diff --git a/tensor2tensor/models/research/transformer_symshard.py b/tensor2tensor/models/research/transformer_symshard.py index 64b9fed97..e3c541a07 100644 --- a/tensor2tensor/models/research/transformer_symshard.py +++ b/tensor2tensor/models/research/transformer_symshard.py @@ -46,7 +46,7 @@ # Dependency imports -from six.moves import xrange # pylint: disable=redefined-builtin +from six.moves import range # pylint: disable=redefined-builtin from tensor2tensor.layers import common_attention from tensor2tensor.layers import common_hparams @@ -69,7 +69,7 @@ def body(self, features): assert hparams.num_model_shards % len(ps_devices) == 0 shards_per_device = hparams.num_model_shards // len(ps_devices) model_devices = [ps_devices[i // shards_per_device] - for i in xrange(hparams.num_model_shards)] + for i in range(hparams.num_model_shards)] print("model_devices = %s" % model_devices) mp = expert_utils.Parallelism(model_devices, reuse=False) targets_vocab_size = self._problem_hparams.vocabulary["targets"].vocab_size diff --git a/tensor2tensor/models/research/transformer_vae.py b/tensor2tensor/models/research/transformer_vae.py index ec2966b6a..6d7b35b3e 100644 --- a/tensor2tensor/models/research/transformer_vae.py +++ b/tensor2tensor/models/research/transformer_vae.py @@ -25,7 +25,7 @@ # Dependency imports -from six.moves import xrange # pylint: disable=redefined-builtin +from six.moves import range # pylint: disable=redefined-builtin from tensor2tensor.layers import common_attention from tensor2tensor.layers import common_image_attention as cia @@ -46,8 +46,8 @@ def residual_conv(x, repeat, k, hparams, name, reuse=None): """A stack of convolution blocks with residual connections.""" with tf.variable_scope(name, reuse=reuse): - dilations_and_kernels = [((1, 1), k) for _ in xrange(3)] - for i in xrange(repeat): + dilations_and_kernels = [((1, 1), k) for _ in range(3)] + for i in range(repeat): with tf.variable_scope("repeat_%d" % i): y = common_layers.conv_block( common_layers.layer_norm(x, hparams.hidden_size, name="lnorm"), @@ -122,7 +122,7 @@ def compress(x, c, is_2d, hparams, name): cur = residual_conv(cur, hparams.num_compress_steps, k1, hparams, "rc") if c 
is not None and hparams.do_attend_compress: cur = attend(cur, c, hparams, "compress_attend") - for i in xrange(hparams.num_compress_steps): + for i in range(hparams.num_compress_steps): if hparams.do_residual_compress: cur = residual_conv(cur, hparams.num_compress_steps, k1, hparams, "rc_%d" % i) @@ -242,12 +242,12 @@ def ae_latent_softmax(latents_pred, latents_discrete, hparams): latents_logits = [ tf.layers.dense( latents_pred, block_vocab_size, name="extra_logits_%d" % i) - for i in xrange(hparams.num_decode_blocks) + for i in range(hparams.num_decode_blocks) ] loss = None if latents_discrete is not None: losses = [] - for i in xrange(hparams.num_decode_blocks): + for i in range(hparams.num_decode_blocks): d = tf.floormod(tf.floordiv(latents_discrete, block_vocab_size**i), block_vocab_size) losses.append(tf.nn.sparse_softmax_cross_entropy_with_logits( @@ -309,7 +309,7 @@ def next_bit(latents_discrete, i): return tf.concat([latents_discrete_prev[:, :(i+1), :], latents_discrete[:, (i+1):, :]], axis=1) - for i in xrange(iters): + for i in range(iters): latents_discrete = next_bit(latents_discrete, i) return latents_discrete @@ -450,7 +450,7 @@ def bn_inputs(): mask = tf.less(masking, tf.random_uniform( common_layers.shape_list(targets)[:-1])) mask = tf.expand_dims(tf.to_float(mask), 3) - for i in xrange(hparams.num_compress_steps): + for i in range(hparams.num_compress_steps): j = hparams.num_compress_steps - i - 1 d = residual_conv(d, 1, (3, 1), hparams, "decompress_rc_%d" % j) if hparams.do_attend_decompress: @@ -514,8 +514,7 @@ def __init__(self, *args, **kwargs): discrete_mix=self._hparams.d_mix, random_top_k=self._hparams.random_top_k, soft_em=self.hparams.soft_em, - soft_em_startup_steps=self.hparams.soft_em_startup_steps, - inv_temp=self.hparams.inv_temp, + num_samples=self.hparams.num_samples, epsilon=self._hparams.epsilon, softmax_k=self._hparams.softmax_k, kl_warmup_steps=self._hparams.kl_warmup_steps, @@ -562,7 +561,7 @@ def __init__(self, *args, **kwargs): ema_means = None if self._hparams.ema: ema_count = [] - for i in xrange(self._hparams.num_residuals): + for i in range(self._hparams.num_residuals): ema_count_i = tf.get_variable( "ema_count_{}".format(i), [self._hparams.num_blocks, block_v_size], @@ -571,7 +570,7 @@ def __init__(self, *args, **kwargs): ema_count.append(ema_count_i) with tf.colocate_with(means): ema_means = [] - for i in xrange(self._hparams.num_residuals): + for i in range(self._hparams.num_residuals): ema_means_i = tf.get_variable( "ema_means_{}".format(i), initializer=means.initialized_value()[i], @@ -649,7 +648,7 @@ def infer(self, features=None, decode_length=50, beam_size=1, top_beams=1, # More steps. self.predict_mask = 0.0 # Use the provided targets this time. how_many_more_steps = 0 # Set to 1 or more for Gibbs-like sampling. 
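# Each extra step re-runs the model with variable reuse, feeding the
# previous samples back in as "targets" while predict_mask stays at 0.0,
# so the discrete latents are re-estimated from the last sample, Gibbs-style.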
- for _ in xrange(how_many_more_steps): + for _ in range(how_many_more_steps): with tf.variable_scope(tf.get_variable_scope(), reuse=True): features["targets"] = samples logits, _ = self(features) # pylint: disable=not-callable @@ -716,7 +715,7 @@ def transformer_ae_small(): hparams.add_hparam("ema", True) hparams.add_hparam("random_top_k", 1) hparams.add_hparam("soft_em", False) - hparams.add_hparam("soft_em_startup_steps", 10000) + hparams.add_hparam("num_samples", 10) hparams.add_hparam("inv_temp", 1.0) hparams.kl_warmup_steps = 150000 hparams.force_full_predict = True diff --git a/tensor2tensor/models/research/transformer_vae_test.py b/tensor2tensor/models/research/transformer_vae_test.py index 3c73a4da6..ae08f6dc3 100644 --- a/tensor2tensor/models/research/transformer_vae_test.py +++ b/tensor2tensor/models/research/transformer_vae_test.py @@ -34,7 +34,7 @@ def testTransformerAEOnDVQ(self): hparams.bottleneck_kind = "dvq" hparams.dp_strength = 0 p_hparams = problem_hparams.test_problem_hparams(vocab_size, vocab_size) - hparams.problems = [p_hparams] + hparams.problem_hparams = p_hparams inputs = -1 + np.random.random_integers( vocab_size, size=(batch_size, input_length, 1, 1)) targets = -1 + np.random.random_integers( diff --git a/tensor2tensor/models/resnet.py b/tensor2tensor/models/resnet.py index d889ba328..e953ba6a0 100644 --- a/tensor2tensor/models/resnet.py +++ b/tensor2tensor/models/resnet.py @@ -420,6 +420,9 @@ def body(self, features): data_format, is_training=hp.mode == tf.estimator.ModeKeys.TRAIN) + if hp.use_nchw: + out = tf.transpose(out, [0, 2, 3, 1]) + return out @@ -473,6 +476,23 @@ def resnet_18(): return hp +@registry.register_hparams +def resnet_imagenet_34(): + """Set of hyperparameters.""" + hp = resnet_base() + hp.block_fn = "residual" + hp.layer_sizes = [2, 4, 8, 2] + + return hp + + +@registry.register_hparams +def resnet_imagenet_102(): + hp = resnet_imagenet_34() + hp.layer_sizes = [3, 8, 36, 3] + return hp + + @registry.register_hparams def resnet_cifar_15(): """Set of hyperparameters.""" diff --git a/tensor2tensor/models/slicenet.py b/tensor2tensor/models/slicenet.py index 0410ff7d1..be6c51e4d 100644 --- a/tensor2tensor/models/slicenet.py +++ b/tensor2tensor/models/slicenet.py @@ -20,7 +20,7 @@ # Dependency imports -from six.moves import xrange # pylint: disable=redefined-builtin +from six.moves import range # pylint: disable=redefined-builtin from six.moves import zip # pylint: disable=redefined-builtin from tensor2tensor.layers import common_attention @@ -117,7 +117,7 @@ def norm_fn(x, name): return common_layers.apply_norm( x, hparams.norm_type, hparams.hidden_size, hparams.norm_epsilon) - for layer in xrange(layers): + for layer in range(layers): with tf.variable_scope("layer_%d" % layer): y = common_layers.subseparable_conv_block( x, @@ -187,19 +187,6 @@ def norm_fn(x, name): target_space_emb = tf.tile(target_space_emb, [tf.shape(targets_flat)[0], 1, 1, 1]) - # Calculate similarity loss (but don't run if not needed). 
- if len(hparams.problems) > 1 and hparams.sim_loss_mult > 0.00001: - targets_timed = common_layers.add_timing_signal(targets_flat) - extra_layers = int(hparams.num_hidden_layers * 1.5) - with tf.variable_scope(tf.get_variable_scope(), reuse=True): - targets_encoded = multi_conv_res(targets_timed, "SAME", "encoder", - extra_layers, hparams) - with tf.variable_scope("similarity_loss"): - similarity_loss = similarity_cost(inputs_encoded, targets_encoded) - similarity_loss *= hparams.sim_loss_mult - else: - similarity_loss = 0.0 - # Use attention from each target to look at input and retrieve. targets_shifted = common_layers.shift_right( targets_flat, pad_value=target_space_emb) @@ -224,7 +211,7 @@ def norm_fn(x, name): separability=4, name="targets_merge") - return targets_merged, similarity_loss + return targets_merged, 0.0 def embed_target_space(target_space_id, hidden_size): diff --git a/tensor2tensor/models/slicenet_test.py b/tensor2tensor/models/slicenet_test.py index 2aceb4d5e..299944b6b 100644 --- a/tensor2tensor/models/slicenet_test.py +++ b/tensor2tensor/models/slicenet_test.py @@ -40,7 +40,7 @@ def testSliceNet(self): hparams.add_hparam("data_dir", "") problem = registry.problem("image_cifar10") p_hparams = problem.get_hparams(hparams) - hparams.problems = [p_hparams] + hparams.problem_hparams = p_hparams with self.test_session() as session: features = { "inputs": tf.constant(x, dtype=tf.int32), diff --git a/tensor2tensor/models/transformer.py b/tensor2tensor/models/transformer.py index 88be60dfd..4fb89db61 100644 --- a/tensor2tensor/models/transformer.py +++ b/tensor2tensor/models/transformer.py @@ -29,7 +29,7 @@ # Dependency imports -from six.moves import xrange # pylint: disable=redefined-builtin +from six.moves import range # pylint: disable=redefined-builtin from tensor2tensor.data_generators import librispeech from tensor2tensor.layers import common_attention @@ -310,6 +310,7 @@ def _fast_decode(self, partial_targets = features.get("inputs") if partial_targets is None: partial_targets = features["targets"] + assert partial_targets is not None partial_targets = common_layers.expand_squeeze_to_nd(partial_targets, 2) partial_targets = tf.to_int64(partial_targets) partial_targets_shape = common_layers.shape_list(partial_targets) @@ -664,8 +665,8 @@ def transformer_prepare_encoder(inputs, target_space, hparams, features=None): 32, ishape_static[-1], name="target_space_embedding", - dtype=tf.bfloat16 if hparams.activation_dtype == "bfloat16" or - hparams.weight_dtype == "bfloat16" else tf.float32) + dtype=tf.bfloat16 if hparams.activation_dtype == "bfloat16" + else tf.float32) emb_target_space = tf.reshape(emb_target_space, [1, 1, -1]) encoder_input += emb_target_space if hparams.pos == "timing": @@ -674,8 +675,7 @@ def transformer_prepare_encoder(inputs, target_space, hparams, features=None): encoder_input, inputs_position) else: encoder_input = common_attention.add_timing_signal_1d(encoder_input) - if (hparams.activation_dtype == "bfloat16" or - hparams.weight_dtype == "bfloat16"): + if hparams.activation_dtype == "bfloat16": encoder_self_attention_bias = tf.cast(encoder_self_attention_bias, tf.bfloat16) encoder_decoder_attention_bias = tf.cast(encoder_decoder_attention_bias, @@ -724,8 +724,7 @@ def transformer_prepare_decoder(targets, hparams, features=None): decoder_input, targets_position) else: decoder_input = common_attention.add_timing_signal_1d(decoder_input) - if (hparams.activation_dtype == "bfloat16" or - hparams.weight_dtype == "bfloat16"): + if 
hparams.activation_dtype == "bfloat16": decoder_self_attention_bias = tf.cast(decoder_self_attention_bias, tf.bfloat16) return (decoder_input, decoder_self_attention_bias) @@ -774,8 +773,8 @@ def transformer_encoder(encoder_input, pad_remover = None if hparams.use_pad_remover and not common_layers.is_on_tpu(): pad_remover = expert_utils.PadRemover(padding) - for layer in xrange(hparams.num_encoder_layers or - hparams.num_hidden_layers): + for layer in range(hparams.num_encoder_layers or + hparams.num_hidden_layers): with tf.variable_scope("layer_%d" % layer): with tf.variable_scope("self_attention"): y = common_attention.multihead_attention( @@ -845,8 +844,8 @@ def transformer_decoder(decoder_input, common_layers.comma_separated_string_to_integer_list( getattr(hparams, "attention_dropout_broadcast_dims", ""))) with tf.variable_scope(name): - for layer in xrange(hparams.num_decoder_layers or - hparams.num_hidden_layers): + for layer in range(hparams.num_decoder_layers or + hparams.num_hidden_layers): layer_name = "layer_%d" % layer layer_cache = cache[layer_name] if cache is not None else None with tf.variable_scope(layer_name): @@ -1592,10 +1591,15 @@ def transformer_supervised_attention(): @registry.register_hparams def transformer_tpu_1b(): - """Hparams for training with 1B parameters.""" + """Hparams for machine translation with ~1.1B parameters.""" hparams = transformer_tpu() hparams.hidden_size = 2048 hparams.filter_size = 8192 hparams.num_hidden_layers = 8 + # smaller batch size to avoid OOM hparams.batch_size = 1024 + hparams.activation_dtype = "bfloat16" + hparams.weight_dtype = "bfloat16" + # maximize number of parameters relative to computation by not sharing. + hparams.shared_embedding_and_softmax_weights = False return hparams diff --git a/tensor2tensor/models/transformer_test.py b/tensor2tensor/models/transformer_test.py index 9b5d6fe4d..7eaf0e285 100644 --- a/tensor2tensor/models/transformer_test.py +++ b/tensor2tensor/models/transformer_test.py @@ -35,36 +35,38 @@ VOCAB_SIZE = 10 -class TransformerTest(tf.test.TestCase): +def get_model(hparams=None, mode=tf.estimator.ModeKeys.TRAIN, + has_input=True, model_cls=transformer.Transformer): + if hparams is None: + hparams = transformer.transformer_tiny() + hparams.hidden_size = 8 + hparams.filter_size = 32 + hparams.num_heads = 1 + hparams.layer_prepostprocess_dropout = 0.0 + + p_hparams = problem_hparams.test_problem_hparams(VOCAB_SIZE, VOCAB_SIZE) + if not has_input: + p_hparams.input_modality = {} + hparams.problem_hparams = p_hparams + + inputs = -1 + np.random.random_integers( + VOCAB_SIZE, size=(BATCH_SIZE, INPUT_LENGTH, 1, 1)) + targets = -1 + np.random.random_integers( + VOCAB_SIZE, size=(BATCH_SIZE, TARGET_LENGTH, 1, 1)) + features = { + "targets": tf.constant(targets, dtype=tf.int32, name="targets"), + "target_space_id": tf.constant(1, dtype=tf.int32) + } + if has_input: + features["inputs"] = tf.constant(inputs, dtype=tf.int32, name="inputs") + + return model_cls(hparams, mode, p_hparams), features - def getModel(self, hparams=None, mode=tf.estimator.ModeKeys.TRAIN, - has_input=True, model_cls=transformer.Transformer): - if hparams is None: - hparams = transformer.transformer_tiny() - hparams.hidden_size = 8 - hparams.filter_size = 32 - hparams.num_heads = 1 - hparams.layer_prepostprocess_dropout = 0.0 - - p_hparams = problem_hparams.test_problem_hparams(VOCAB_SIZE, VOCAB_SIZE) - if not has_input: - p_hparams.input_modality = {} - hparams.problems = [p_hparams] - - inputs = -1 + np.random.random_integers( - VOCAB_SIZE, 
size=(BATCH_SIZE, INPUT_LENGTH, 1, 1)) - targets = -1 + np.random.random_integers( - VOCAB_SIZE, size=(BATCH_SIZE, TARGET_LENGTH, 1, 1)) - features = { - "inputs": tf.constant(inputs, dtype=tf.int32, name="inputs"), - "targets": tf.constant(targets, dtype=tf.int32, name="targets"), - "target_space_id": tf.constant(1, dtype=tf.int32) - } - return model_cls(hparams, mode, p_hparams), features +class TransformerTest(tf.test.TestCase): def testTransformer(self): - model, features = self.getModel(transformer.transformer_small()) + model, features = get_model(transformer.transformer_small()) logits, _ = model(features) with self.test_session() as session: session.run(tf.global_variables_initializer()) @@ -72,17 +74,17 @@ def testTransformer(self): self.assertEqual(res.shape, (BATCH_SIZE, TARGET_LENGTH, 1, 1, VOCAB_SIZE)) def testTransformerRelative(self): - model, features = self.getModel(transformer.transformer_relative_tiny()) + model, features = get_model(transformer.transformer_relative_tiny()) logits, _ = model(features) with self.test_session() as session: session.run(tf.global_variables_initializer()) res = session.run(logits) self.assertEqual(res.shape, (BATCH_SIZE, TARGET_LENGTH, 1, 1, VOCAB_SIZE)) - def testGreedyVsFast(self): - model, features = self.getModel(transformer.transformer_small()) + def testSlowVsFast(self): + model, features = get_model(transformer.transformer_small()) - decode_length = 2 + decode_length = 3 out_logits, _ = model(features) out_logits = tf.squeeze(out_logits, axis=[2, 3]) @@ -114,10 +116,10 @@ def testGreedyVsFast(self): self.assertAllClose(greedy_res, fast_res) def testSlowVsFastNoInput(self): - model, features = self.getModel( + model, features = get_model( transformer.transformer_small(), has_input=False) - decode_length = 2 + decode_length = 3 out_logits, _ = model(features) out_logits = tf.squeeze(out_logits, axis=[2, 3]) @@ -145,12 +147,12 @@ def testSlowVsFastNoInput(self): slow_res = slow_result.eval() fast_res = fast_result.eval() - self.assertEqual(fast_res.shape, (BATCH_SIZE, decode_length)) + self.assertEqual(slow_res.shape, (BATCH_SIZE, decode_length)) self.assertAllClose(slow_res, fast_res) def testBeamDecodeWithRelativeAttention(self): decode_length = 2 - model, features = self.getModel(transformer.transformer_relative_tiny()) + model, features = get_model(transformer.transformer_relative_tiny()) model(features) model.set_mode(tf.estimator.ModeKeys.PREDICT) @@ -166,7 +168,7 @@ def testBeamDecodeWithRelativeAttention(self): self.assertEqual(beam_res.shape, (BATCH_SIZE, INPUT_LENGTH + decode_length)) def testBeamVsFast(self): - model, features = self.getModel(transformer.transformer_small()) + model, features = get_model(transformer.transformer_small()) decode_length = 2 @@ -204,9 +206,8 @@ def testBeamVsFast(self): beam_res = beam_result.eval() fast_res = fast_result.eval() - # TODO(rsepassi): Fix decode length. Broken by cl/190537320. 
- # self.assertEqual(fast_res.shape, - # (BATCH_SIZE, INPUT_LENGTH + decode_length)) + self.assertEqual(fast_res.shape, + (BATCH_SIZE, INPUT_LENGTH + decode_length)) self.assertAllClose(beam_res, fast_res) def testTransformerWithoutProblem(self): @@ -230,7 +231,7 @@ def testTransformerWithoutProblem(self): [BATCH_SIZE, TARGET_LENGTH, 1, hparams.hidden_size]) def testTransformerWithEncoderDecoderAttentionLoss(self): - model, features = self.getModel( + model, features = get_model( transformer.transformer_supervised_attention()) expected_attention_weights = np.random.random_sample( size=(BATCH_SIZE, TARGET_LENGTH, INPUT_LENGTH)) @@ -243,10 +244,10 @@ def testTransformerWithEncoderDecoderAttentionLoss(self): self.assertEqual(res.shape, ()) -class TransformerScorerTest(TransformerTest): +class TransformerScorerTest(tf.test.TestCase): def testReturnsScores(self): - model, features = self.getModel( + model, features = get_model( mode=tf.estimator.ModeKeys.PREDICT, model_cls=transformer.TransformerScorer) infer_out = model.infer(features) @@ -261,21 +262,21 @@ def testReturnsScores(self): def testVarNames(self): with tf.Graph().as_default(): - model, features = self.getModel( + model, features = get_model( mode=tf.estimator.ModeKeys.PREDICT, model_cls=transformer.TransformerScorer) _ = model.infer(features) scorer_vars = [v.name for v in tf.global_variables()] with tf.Graph().as_default(): - model, features = self.getModel( + model, features = get_model( mode=tf.estimator.ModeKeys.EVAL, model_cls=transformer.TransformerScorer) _ = model(features) scorer_eval_vars = [v.name for v in tf.global_variables()] with tf.Graph().as_default(): - model, features = self.getModel( + model, features = get_model( mode=tf.estimator.ModeKeys.EVAL, model_cls=transformer.Transformer) _ = model(features) diff --git a/tensor2tensor/models/xception.py b/tensor2tensor/models/xception.py index d56321049..bec758687 100644 --- a/tensor2tensor/models/xception.py +++ b/tensor2tensor/models/xception.py @@ -23,7 +23,7 @@ # Dependency imports -from six.moves import xrange # pylint: disable=redefined-builtin +from six.moves import range # pylint: disable=redefined-builtin from tensor2tensor.layers import common_hparams from tensor2tensor.layers import common_layers @@ -36,7 +36,7 @@ def residual_block(x, hparams): """A stack of convolution blocks with residual connection.""" k = (hparams.kernel_height, hparams.kernel_width) - dilations_and_kernels = [((1, 1), k) for _ in xrange(3)] + dilations_and_kernels = [((1, 1), k) for _ in range(3)] y = common_layers.subseparable_conv_block( x, hparams.hidden_size, @@ -66,7 +66,7 @@ def xception_internal(inputs, hparams): force2d=True, name="small_image_conv") - for i in xrange(hparams.num_hidden_layers): + for i in range(hparams.num_hidden_layers): with tf.variable_scope("layer_%d" % i): cur = residual_block(cur, hparams) diff --git a/tensor2tensor/notebooks/hello_t2t.ipynb b/tensor2tensor/notebooks/hello_t2t.ipynb index 7d56dbede..820744500 100644 --- a/tensor2tensor/notebooks/hello_t2t.ipynb +++ b/tensor2tensor/notebooks/hello_t2t.ipynb @@ -783,7 +783,7 @@ "\n", "def to_tokens(ids):\n", " ids = np.squeeze(ids)\n", - " subtokenizer = hparams.problems[0].vocabulary['targets']\n", + " subtokenizer = hparams.problem_hparams.vocabulary['targets']\n", " tokens = []\n", " for _id in ids:\n", " if _id == 0:\n", diff --git a/tensor2tensor/rl/README.md b/tensor2tensor/rl/README.md index b163a16a5..7c723126f 100644 --- a/tensor2tensor/rl/README.md +++ b/tensor2tensor/rl/README.md @@ -13,7 +13,7 
@@ Currently the only supported algorithm is Proximal Policy Optimization - PPO. ``` python rl/t2t_rl_trainer.py \ - --problems=Pendulum-v0 \ + --problem=Pendulum-v0 \ --hparams_set continuous_action_base \ [--output_dir dir_location] ``` @@ -45,7 +45,7 @@ python tensor2tensor/bin/t2t-trainer \ --generate_data \ --data_dir=~/t2t_data \ --output_dir=~/t2t_data/output \ - --problems=gym_pong_random5k \ + --problem=gym_pong_random5k \ --model=basic_conv_gen \ --hparams_set=basic_conv_small \ --train_steps=1000 \ diff --git a/tensor2tensor/rl/envs/simulated_batch_env.py b/tensor2tensor/rl/envs/simulated_batch_env.py index 69dfcff94..5d4a7e066 100644 --- a/tensor2tensor/rl/envs/simulated_batch_env.py +++ b/tensor2tensor/rl/envs/simulated_batch_env.py @@ -22,12 +22,12 @@ from __future__ import division from __future__ import print_function +import os + # Dependency imports import gym -import pkg_resources - from tensor2tensor.rl.envs.in_graph_batch_env import InGraphBatchEnv from tensor2tensor.utils import registry from tensor2tensor.utils import trainer_lib @@ -39,6 +39,9 @@ FLAGS = flags.FLAGS +flags.DEFINE_string("frames_path", "", "Path to the first frames.") + + class SimulatedBatchEnv(InGraphBatchEnv): """Batch of environments inside the TensorFlow graph. @@ -52,7 +55,7 @@ def __init__(self, length, observ_shape, observ_dtype, action_shape, """Batch of environments inside the TensorFlow graph.""" self.length = length hparams = trainer_lib.create_hparams( - FLAGS.hparams_set, problem_name=FLAGS.problems, data_dir="UNUSED") + FLAGS.hparams_set, problem_name=FLAGS.problem, data_dir="UNUSED") hparams.force_full_predict = True self._model = registry.model(FLAGS.model)( hparams, tf.estimator.ModeKeys.PREDICT) @@ -60,12 +63,10 @@ def __init__(self, length, observ_shape, observ_dtype, action_shape, self.action_shape = action_shape self.action_dtype = action_dtype - with open(pkg_resources.resource_filename( - "tensor2tensor.rl.envs", "frame1.png"), "rb") as f: + with open(os.path.join(FLAGS.frames_path, "frame1.png"), "rb") as f: png_frame_1_raw = f.read() - with open(pkg_resources.resource_filename( - "tensor2tensor.rl.envs", "frame2.png"), "rb") as f: + with open(os.path.join(FLAGS.frames_path, "frame2.png"), "rb") as f: png_frame_2_raw = f.read() self.frame_1 = tf.expand_dims(tf.cast(tf.image.decode_png(png_frame_1_raw), @@ -81,13 +82,6 @@ def __init__(self, length, observ_shape, observ_dtype, action_shape, trainable=False) observ_dtype = tf.int64 - self._observ_not_sure_why_we_need_this = tf.Variable( - tf.zeros((self.length,) + observ_shape, observ_dtype), - name="observ_new", trainable=False) - - self._reward_not_sure_why_we_need_this = tf.Variable( - tf.zeros((self.length, 1), observ_dtype), - name="reward_new", trainable=False) @property def action_space(self): @@ -99,15 +93,19 @@ def __len__(self): def simulate(self, action): with tf.name_scope("environment/simulate"): - inputs = {"inputs_0": self._prev_observ.read_value(), - "inputs_1": self._observ.read_value(), - "action": action, - "targets": self._observ_not_sure_why_we_need_this, - "reward": self._reward_not_sure_why_we_need_this} - model_output = self._model(inputs) - observ_expaned = model_output[0]["targets"] - observ = tf.cast(tf.argmax(observ_expaned, axis=-1), tf.float32) - reward = tf.constant(0, tf.float32, shape=(self.length,)) + input0 = self._prev_observ.read_value() + input1 = self._observ.read_value() + # Note: the merging below must be consistent with video_utils format. 
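+ # Concretely: the two frames are stacked along a leading (time) axis, the
+ # action is duplicated so each frame has a matching time step, and a batch
+ # axis is added to both before calling infer(). The model returns argmax
+ # frames and a reward class index; the "- 1" below shifts that index so
+ # the predicted reward lands in the raw {-1, 0, 1} range.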
+ inputs_merged = tf.concat([input0, input1], axis=0) + action = tf.expand_dims(action, axis=0) # Action needs time too. + action = tf.concat([action, action], axis=0) + inputs = {"inputs": tf.expand_dims(inputs_merged, axis=0), # Add batch. + "input_action": tf.expand_dims(action, axis=0)} + model_output = self._model.infer(inputs) + observ = model_output["targets"] + observ = tf.cast(observ[:, 0, :, :, :], tf.float32) + reward = model_output["target_reward"][:, 0, 0, 0] - 1 + reward = tf.cast(reward, tf.float32) done = tf.constant(False, tf.bool, shape=(self.length,)) with tf.control_dependencies([observ]): diff --git a/tensor2tensor/rl/envs/tf_atari_wrappers.py b/tensor2tensor/rl/envs/tf_atari_wrappers.py index 61bff7ab2..cfe25caa2 100644 --- a/tensor2tensor/rl/envs/tf_atari_wrappers.py +++ b/tensor2tensor/rl/envs/tf_atari_wrappers.py @@ -53,7 +53,7 @@ class TransformWrapper(WrapperBase): def __init__(self, batch_env, transform_observation=None, transform_reward=tf.identity, transform_done=tf.identity): - super().__init__(batch_env) + super(TransformWrapper, self).__init__(batch_env) if transform_observation is not None: _, observ_shape, observ_dtype = transform_observation # pylint: disable=unpacking-non-sequence self._observ = tf.Variable( @@ -88,7 +88,7 @@ def __init__(self, batch_env): nature_transform = lambda o: tf.image.rgb_to_grayscale( # pylint: disable=g-long-lambda tf.image.resize_images(o, dims)) - super().__init__(batch_env, transform_observation=( + super(WarpFrameWrapper, self).__init__(batch_env, transform_observation=( nature_transform, dims, tf.float32)) @@ -97,14 +97,15 @@ class ShiftRewardWrapper(TransformWrapper): def __init__(self, batch_env, add_value): shift_reward = lambda r: tf.add(r, add_value) - super().__init__(batch_env, transform_reward=shift_reward) + super(ShiftRewardWrapper, self).__init__( + batch_env, transform_reward=shift_reward) class MaxAndSkipWrapper(WrapperBase): """Max and skip wrapper.""" def __init__(self, batch_env, skip=4): - super().__init__(batch_env) + super(MaxAndSkipWrapper, self).__init__(batch_env) self.skip = skip self._observ = None observs_shape = batch_env.observ.shape @@ -141,7 +142,7 @@ class TimeLimitWrapper(WrapperBase): # TODO(lukaszkaiser): Check if TimeLimitWrapper does what it's supposed to do. 
def __init__(self, batch_env, timelimit=100): - super().__init__(batch_env) + super(TimeLimitWrapper, self).__init__(batch_env) self.timelimit = timelimit self._time_elapsed = tf.Variable(tf.zeros((len(self),), tf.int32), trainable=False) @@ -167,7 +168,7 @@ class MemoryWrapper(WrapperBase): """Memory wrapper.""" def __init__(self, batch_env): - super().__init__(batch_env) + super(MemoryWrapper, self).__init__(batch_env) MemoryWrapper.singleton = self assert self._length == 1, "We support only one environment" infinity = 10000000 diff --git a/tensor2tensor/rl/model_rl_experiment.py b/tensor2tensor/rl/model_rl_experiment.py index 8b691a203..139cf6f46 100644 --- a/tensor2tensor/rl/model_rl_experiment.py +++ b/tensor2tensor/rl/model_rl_experiment.py @@ -22,11 +22,11 @@ # Dependency imports -from tensor2tensor import problems from tensor2tensor.bin import t2t_trainer from tensor2tensor.rl import rl_trainer_lib from tensor2tensor.rl.envs.tf_atari_wrappers import ShiftRewardWrapper from tensor2tensor.rl.envs.tf_atari_wrappers import TimeLimitWrapper +from tensor2tensor.utils import registry from tensor2tensor.utils import trainer_lib import tensorflow as tf @@ -52,11 +52,10 @@ def train(hparams, output_dir): time_delta = time.time() - start_time print(line+"Step {}.1. - generate data from policy. " "Time: {}".format(iloop, str(datetime.timedelta(seconds=time_delta)))) - # FLAGS.problems = "gym_discrete_problem_with_agent" - FLAGS.problems = "gym_discrete_problem_with_agent2" + FLAGS.problem = "gym_discrete_problem_with_agent" FLAGS.agent_policy_path = last_model - gym_problem = problems.problem(FLAGS.problems) - # gym_problem.num_steps = hparams.true_env_generator_num_steps + gym_problem = registry.problem(FLAGS.problem) + gym_problem.settable_num_steps = hparams.true_env_generator_num_steps iter_data_dir = os.path.join(data_dir, str(iloop)) tf.gfile.MakeDirs(iter_data_dir) gym_problem.generate_data(iter_data_dir, tmp_dir) @@ -67,23 +66,25 @@ def train(hparams, output_dir): # 2. generate env model FLAGS.data_dir = iter_data_dir FLAGS.output_dir = output_dir - # FLAGS.model = hparams.generative_model - FLAGS.model = "basic_conv_gen" - # FLAGS.model = "michigan_basic_conv_gen" + FLAGS.model = hparams.generative_model FLAGS.hparams_set = hparams.generative_model_params - # FLAGS.train_steps = hparams.model_train_steps + FLAGS.train_steps = hparams.model_train_steps FLAGS.train_steps = 1 FLAGS.eval_steps = 1 t2t_trainer.main([]) + # Dump frames from env model. time_delta = time.time() - start_time print(line+"Step {}.3. - evaluate env model. " "Time: {}".format(iloop, str(datetime.timedelta(seconds=time_delta)))) - gym_simulated_problem = problems.problem("gym_simulated_discrete_problem_with_agent") - gym_simulated_problem.num_steps = hparams.simulated_env_generator_num_steps + gym_simulated_problem = registry.problem( + "gym_simulated_discrete_problem_with_agent") + sim_steps = hparams.simulated_env_generator_num_steps + gym_simulated_problem.settable_num_steps = sim_steps gym_simulated_problem.generate_data(iter_data_dir, tmp_dir) - # time_delta = time.time() - start_time + # PPO. + time_delta = time.time() - start_time print(line + "Step {}.4. - train PPO in model env." 
" Time: {}".format(iloop, str(datetime.timedelta(seconds=time_delta)))) @@ -100,7 +101,8 @@ def train(hparams, output_dir): (ShiftRewardWrapper, {"add_value": -2})] in_graph_wrappers += gym_problem.in_graph_wrappers ppo_hparams.add_hparam("in_graph_wrappers", in_graph_wrappers) - rl_trainer_lib.train(ppo_hparams, "PongNoFrameskip-v4", ppo_dir) + ppo_hparams.num_agents = 1 + rl_trainer_lib.train(ppo_hparams, "PongDeterministic-v4", ppo_dir) last_model = ppo_dir + "/model{}.ckpt".format(ppo_epochs_num) @@ -109,14 +111,14 @@ def main(_): hparams = tf.contrib.training.HParams( epochs=100, true_env_generator_num_steps=100, - generative_model="static_basic_conv_gen", - generative_model_params="basic_conv_small", - model_train_steps=80, + generative_model="basic_conv_gen", + generative_model_params="basic_conv", + model_train_steps=5000, simulated_env_generator_num_steps=300, ppo_epochs_num=2, ppo_epoch_length=300, ) - train(hparams, tempfile.mkdtemp()) + train(hparams, FLAGS.output_dir) if __name__ == "__main__": diff --git a/tensor2tensor/rl/t2t_rl_trainer.py b/tensor2tensor/rl/t2t_rl_trainer.py index bd3780a9b..188433789 100644 --- a/tensor2tensor/rl/t2t_rl_trainer.py +++ b/tensor2tensor/rl/t2t_rl_trainer.py @@ -36,7 +36,7 @@ def main(_): hparams = trainer_lib.create_hparams(FLAGS.hparams_set, FLAGS.hparams) - rl_trainer_lib.train(hparams, FLAGS.problems, FLAGS.output_dir) + rl_trainer_lib.train(hparams, FLAGS.problem, FLAGS.output_dir) if __name__ == "__main__": diff --git a/tensor2tensor/serving/README.md b/tensor2tensor/serving/README.md index 633479132..8bb35da27 100644 --- a/tensor2tensor/serving/README.md +++ b/tensor2tensor/serving/README.md @@ -3,8 +3,6 @@ Tensor2Tensor and the TensorFlow ecosystem make it easy to serve a model once trained. -**Note**: Requires TF 1.5+. - ## 1. 
Export for Serving First, export it for serving: @@ -13,7 +11,7 @@ First, export it for serving: t2t-exporter \ --model=transformer \ --hparams_set=transformer_tiny \ - --problems=translate_ende_wmt8k \ + --problem=translate_ende_wmt8k \ --data_dir=~/t2t/data \ --output_dir=/tmp/t2t_train ``` diff --git a/tensor2tensor/serving/export.py b/tensor2tensor/serving/export.py index 5fffdbff6..5b5dccf5b 100644 --- a/tensor2tensor/serving/export.py +++ b/tensor2tensor/serving/export.py @@ -45,7 +45,7 @@ def create_hparams(): FLAGS.hparams_set, FLAGS.hparams, data_dir=os.path.expanduser(FLAGS.data_dir), - problem_name=FLAGS.problems) + problem_name=FLAGS.problem) def main(_): @@ -61,7 +61,7 @@ def main(_): estimator = create_estimator(run_config, hparams) - problem = hparams.problem_instances[0] + problem = hparams.problem strategy = trainer_lib.create_export_strategy(problem, hparams) export_dir = os.path.join(ckpt_dir, "export", strategy.name) diff --git a/tensor2tensor/serving/query.py b/tensor2tensor/serving/query.py index 9d8eed092..1af0e9f2d 100644 --- a/tensor2tensor/serving/query.py +++ b/tensor2tensor/serving/query.py @@ -87,14 +87,16 @@ def main(_): while True: inputs = FLAGS.inputs_once if FLAGS.inputs_once else input(">> ") outputs = serving_utils.predict([inputs], problem, request_fn) + outputs, = outputs + output, score = outputs print_str = """ Input: {inputs} -Output: -{outputs} +Output (Score {score:.3f}): +{output} """ - print(print_str.format(inputs=inputs, outputs=outputs[0])) + print(print_str.format(inputs=inputs, output=output, score=score)) if FLAGS.inputs_once: break diff --git a/tensor2tensor/serving/serving_utils.py b/tensor2tensor/serving/serving_utils.py index e22ddfb2c..5bb2fe724 100644 --- a/tensor2tensor/serving/serving_utils.py +++ b/tensor2tensor/serving/serving_utils.py @@ -119,7 +119,8 @@ def predict(inputs_list, problem, request_fn): predictions = request_fn(examples) output_decoder = problem.feature_info["targets"].encoder outputs = [ - _decode(prediction["outputs"], output_decoder) + (_decode(prediction["outputs"], output_decoder), + prediction["scores"]) for prediction in predictions ] return outputs diff --git a/tensor2tensor/test_data/transformer_test_ckpt/flags.txt b/tensor2tensor/test_data/transformer_test_ckpt/flags.txt index 2587e3e2d..2ecee8328 100644 --- a/tensor2tensor/test_data/transformer_test_ckpt/flags.txt +++ b/tensor2tensor/test_data/transformer_test_ckpt/flags.txt @@ -27,7 +27,7 @@ --ps_job=/job:ps --tmp_dir=/tmp/t2t_datagen --schedule=continuous_train_and_eval ---problems=translate_ende_wmt8k +--problem=translate_ende_wmt8k --hparams= --use_tpu=False --eval_early_stopping_metric_delta=0.1 diff --git a/tensor2tensor/test_data/transformer_test_ckpt/hparams.json b/tensor2tensor/test_data/transformer_test_ckpt/hparams.json index b07ac9486..df9a654c0 100644 --- a/tensor2tensor/test_data/transformer_test_ckpt/hparams.json +++ b/tensor2tensor/test_data/transformer_test_ckpt/hparams.json @@ -1 +1 @@ -{"daisy_chain_variables": true, "optimizer_adam_beta1": 0.9, "scheduled_sampling_prob": 0.0, "num_hidden_layers": 2, "moe_loss_coef": 0.01, "max_target_seq_length": 0, "clip_grad_norm": 0.0, "pos": "timing", "scheduled_sampling_gold_mixin_prob": 0.5, "initializer": "uniform_unit_scaling", "grad_noise_scale": 0.0, "optimizer_momentum_momentum": 0.9, "nbr_decoder_problems": 1, "attention_key_channels": 0, "eval_drop_long_sequences": false, "learning_rate_cosine_cycle_steps": 250000, "prepend_mode": "none", "weight_decay": 0.0, "symbol_modality_skip_top": 
false, "weight_noise": 0.0, "target_modality": "default", "attention_dropout": 0.1, "parameter_attention_value_channels": 0, "factored_logits": false, "relu_dropout": 0.1, "no_data_parallelism": false, "layer_preprocess_sequence": "n", "sampling_method": "argmax", "learning_rate": 0.2, "num_heads": 2, "max_length": 256, "summarize_grads": false, "attention_value_channels": 0, "num_encoder_layers": 0, "label_smoothing": 0.1, "use_fixed_batch_size": false, "optimizer": "Adam", "moe_k": 2, "self_attention_type": "dot_product", "learning_rate_decay_scheme": "noam", "sampling_temp": 1.0, "kernel_height": 3, "use_pad_remover": true, "batch_size": 4096, "problem_choice": "adaptive", "max_relative_position": 0, "force_full_predict": false, "min_length_bucket": 8, "layer_prepostprocess_dropout": 0.1, "eval_run_autoregressive": false, "shared_embedding_and_softmax_weights": true, "symbol_modality_num_shards": 16, "dropout": 0.2, "compress_steps": 0, "parameter_attention_key_channels": 0, "length_bucket_step": 1.1, "kernel_width": 1, "hidden_size": 16, "num_decoder_layers": 0, "input_modalities": "default", "filter_size": 8, "optimizer_adam_beta2": 0.98, "scheduled_sampling_warmup_steps": 50000, "norm_type": "layer", "min_length": 0, "moe_num_experts": 64, "multiply_embedding_mode": "sqrt_depth", "max_input_seq_length": 0, "learning_rate_warmup_steps": 8000, "proximity_bias": false, "ffn_layer": "dense_relu_dense", "initializer_gain": 1.0, "layer_postprocess_sequence": "da", "moe_hidden_sizes": "2048", "optimizer_adam_epsilon": 1e-09, "norm_epsilon": 1e-06} +{"daisy_chain_variables": true, "optimizer_adam_beta1": 0.9, "scheduled_sampling_prob": 0.0, "num_hidden_layers": 2, "moe_loss_coef": 0.01, "max_target_seq_length": 0, "clip_grad_norm": 0.0, "pos": "timing", "scheduled_sampling_gold_mixin_prob": 0.5, "initializer": "uniform_unit_scaling", "grad_noise_scale": 0.0, "optimizer_momentum_momentum": 0.9, "nbr_decoder_problems": 1, "attention_key_channels": 0, "eval_drop_long_sequences": false, "learning_rate_cosine_cycle_steps": 250000, "prepend_mode": "none", "weight_decay": 0.0, "symbol_modality_skip_top": false, "weight_noise": 0.0, "target_modality": "default", "attention_dropout": 0.1, "parameter_attention_value_channels": 0, "factored_logits": false, "relu_dropout": 0.1, "no_data_parallelism": false, "layer_preprocess_sequence": "n", "sampling_method": "argmax", "learning_rate": 0.2, "num_heads": 2, "max_length": 256, "summarize_grads": false, "attention_value_channels": 0, "num_encoder_layers": 0, "label_smoothing": 0.1, "use_fixed_batch_size": false, "optimizer": "Adam", "moe_k": 2, "self_attention_type": "dot_product", "learning_rate_decay_scheme": "noam", "sampling_temp": 1.0, "kernel_height": 3, "use_pad_remover": true, "batch_size": 4096, "max_relative_position": 0, "force_full_predict": false, "min_length_bucket": 8, "layer_prepostprocess_dropout": 0.1, "eval_run_autoregressive": false, "shared_embedding_and_softmax_weights": true, "symbol_modality_num_shards": 16, "dropout": 0.2, "compress_steps": 0, "parameter_attention_key_channels": 0, "length_bucket_step": 1.1, "kernel_width": 1, "hidden_size": 16, "num_decoder_layers": 0, "input_modalities": "default", "filter_size": 8, "optimizer_adam_beta2": 0.98, "scheduled_sampling_warmup_steps": 50000, "norm_type": "layer", "min_length": 0, "moe_num_experts": 64, "multiply_embedding_mode": "sqrt_depth", "max_input_seq_length": 0, "learning_rate_warmup_steps": 8000, "proximity_bias": false, "ffn_layer": "dense_relu_dense", "initializer_gain": 
1.0, "layer_postprocess_sequence": "da", "moe_hidden_sizes": "2048", "optimizer_adam_epsilon": 1e-09, "norm_epsilon": 1e-06} diff --git a/tensor2tensor/utils/adafactor.py b/tensor2tensor/utils/adafactor.py index 11daaf890..69a240733 100644 --- a/tensor2tensor/utils/adafactor.py +++ b/tensor2tensor/utils/adafactor.py @@ -19,6 +19,7 @@ from __future__ import print_function # Dependency imports +from tensor2tensor.utils import quantization import tensorflow as tf @@ -28,7 +29,6 @@ class AdafactorOptimizer(tf.train.Optimizer): Adafactor is described in https://arxiv.org/abs/1804.04235. - Adafactor is most similar to Adam (Kingma and Ba), the major differences are: 1. For a two-dimensional AxB weight matrix, Adafactor uses only A+B auxiliary @@ -108,6 +108,7 @@ def __init__(self, clipping_threshold=1.0, factored=True, simulated_quantize_bits=None, + parameter_encoding=None, use_locking=False, name="Adafactor"): """Construct a new Adafactor optimizer. @@ -124,6 +125,8 @@ def __init__(self, for 2d variables simulated_quantize_bits: train with simulated quantized parameters (experimental) + parameter_encoding: a ParameterEncoding object to use in the case of + bfloat16 variables. use_locking: If True use locks for update operations. name: Optional name for the operations created when applying gradients. Defaults to "AdafactorOptimizer". @@ -144,8 +147,8 @@ def __init__(self, self._clipping_threshold = clipping_threshold self._factored = factored self._simulated_quantize_bits = simulated_quantize_bits - if self._simulated_quantize_bits: - self._quantization_noise = _quantization_noise_from_step_num() + self._parameter_encoding = parameter_encoding + self._quantization_noise = quantization.noise_from_step_num() def _should_use_factored_second_moment_estimate(self, shape): """Should we use a factored second moment estimator. @@ -201,8 +204,11 @@ def _resource_apply_dense(self, grad, var): grad_squared_mean = tf.reduce_mean(grad_squared) decay_rate = self._decay_rate update_scale = self._learning_rate + old_val = var + if var.dtype.base_dtype == tf.bfloat16: + old_val = tf.to_float(self._parameter_encoding.decode(old_val)) if self._multiply_by_parameter_scale: - update_scale *= tf.to_float(self._parameter_scale(var)) + update_scale *= tf.to_float(self._parameter_scale(old_val)) # HACK: Make things dependent on grad. # This confounds the XLA rewriter and keeps it from fusing computations # across different variables. 
This fusion is bad for HBM usage, since @@ -243,11 +249,12 @@ def _resource_apply_dense(self, grad, var): subtrahend = new_m new_m = tf.cast(new_m, var.dtype) updates.append(tf.assign(m, new_m, use_locking=self._use_locking)) - new_val = tf.to_float(var) - subtrahend - if var.dtype == tf.bfloat16: - new_val = _to_bfloat16_unbiased(new_val) + new_val = tf.to_float(old_val) - subtrahend + if var.dtype.base_dtype == tf.bfloat16: + new_val = self._parameter_encoding.encode( + new_val, self._quantization_noise) if self._simulated_quantize_bits: - new_val = _simulated_quantize( + new_val = quantization.simulated_quantize( var - subtrahend, self._simulated_quantize_bits, self._quantization_noise) var_update = tf.assign(var, new_val, use_locking=self._use_locking) @@ -312,6 +319,10 @@ def adafactor_optimizer_from_hparams(hparams, lr): hparams.optimizer_adafactor_memory_exponent) else: raise ValueError("unknown optimizer_adafactor_decay_type") + if hparams.weight_dtype == "bfloat16": + parameter_encoding = quantization.EighthPowerEncoding() + else: + parameter_encoding = None return AdafactorOptimizer( multiply_by_parameter_scale=( hparams.optimizer_adafactor_multiply_by_parameter_scale), @@ -322,145 +333,10 @@ def adafactor_optimizer_from_hparams(hparams, lr): hparams.optimizer_adafactor_factored, simulated_quantize_bits=getattr( hparams, "simulated_parameter_quantize_bits", 0), + parameter_encoding=parameter_encoding, use_locking=False, name="Adafactor") def reduce_rms(x): return tf.sqrt(tf.reduce_mean(tf.square(x))) - - -def _simulated_quantize(x, num_bits, quantization_noise): - """Simulate quantization to num_bits bits, with externally-stored scale. - - num_bits is the number of bits used to store each value. - quantization_noise is a float32 Tensor containing values in [0, 1). - Each value in quantization_noise should take different values across - different steps, approximating a uniform distribution over [0, 1). - In the case of replicated TPU training, quantization_noise should be identical - across replicas in order to keep the parameters identical across replicas. - - The natural choice for quantization_noise would be tf.random_uniform(), - but this is not possible for TPU, since there is currently no way to seed - the different cores to produce identical values across replicas. Instead we - use _quantization_noise_from_step_num() (see below). - - The quantization scheme is as follows: - - Compute the maximum absolute value by row (call this max_abs). - Store this either in an auxiliary variable or in an extra column. - - Divide the parameters by (max_abs / (2^(num_bits-1)-1)). This gives a - float32 value in the range [-2^(num_bits-1)-1, 2^(num_bits-1)-1] - - Unbiased randomized roundoff by adding quantization_noise and rounding down. - - This produces a signed integer with num_bits bits which can then be stored. - - Args: - x: a float32 Tensor - num_bits: an integer between 1 and 22 - quantization_noise: a float Tensor broadcastable to the shape of x. - - Returns: - a float32 Tensor - """ - shape = x.get_shape().as_list() - if not (len(shape) >= 2 and shape[-1] > 1): - return x - max_abs = tf.reduce_max(tf.abs(x), -1, keep_dims=True) + 1e-9 - max_int = 2 ** (num_bits - 1) - 1 - scale = max_abs / max_int - x /= scale - x = tf.floor(x + quantization_noise) - # dequantize before storing (since this is a simulation) - x *= scale - return x - - -def _quantization_noise_from_step_num(): - """A quantization noise equal to (phi * (step_num + 1)) mod 1.0. - - See _simulated_quantize. 
- - Returns: - a float32 scalar - """ - step = tf.to_int32(tf.train.get_or_create_global_step()) + 1 - phi = ((5 ** 0.5) - 1) / 2 - # Naive computation tf.mod(phi * step, 1.0) in float32 would be disastrous - # due to loss of precision when the step number gets large. - # Computation in doubles does not work on TPU, so we use this complicated - # alternative computation which does not suffer from these roundoff errors. - ret = 0.0 - for i in xrange(30): - ret += (((phi * (2 ** i)) % 1.0) # double-precision computation in python - * tf.to_float(tf.mod(step // (2 ** i), 2))) - return tf.mod(ret, 1.0) - - -def _randomized_roundoff_to_bfloat16(x, quantization_noise, cand1, cand2): - """Round-off x to cand1 or to cand2 in an unbiased way. - - Cand1 and cand2 are the same shape as x. - For every element of x, the corresponding elements of cand1 and cand2 should - be the two closest bfloat16 values to x. Order does not matter. - cand1 and cand2 must differ from each other. - - Args: - x: A float32 Tensor. - quantization_noise: A Tensor broadcastable to the shape of x containing - random uniform values in [0.0, 1.0]. - cand1: A bfloat16 Tensor the same shape as x. - cand2: A bfloat16 Tensor the same shape as x. - - Returns: - A bfloat16 Tensor. - """ - cand1_f = tf.to_float(cand1) - cand2_f = tf.to_float(cand2) - step_size = cand2_f - cand1_f - fpart = (x - cand1_f) / step_size - ret = tf.where(tf.greater(fpart, quantization_noise), cand2, cand1) - return ret - - -def _to_bfloat16_unbiased(x): - """Convert a float32 to a bfloat16 using randomized roundoff. - - Note: If this ever produces worse results than using float32 all the way - through, we should try to diagnose and fix it. There are several things - to try: - - 1. Encode parameter x for storage purposes as - _to_bfloat16_unbiased(tf.pow(x, 5)) . This gives 5x the - resolution while incurring overflow and underflow at 10^9 and 10^-9 - instead of 10^37 and 10^-37. Comes at a cost of extracting fifth roots - to decode parameters. Or use some other such scheme. - - 2. In this function, use actual random numbers, different for each parameter - as opposed to the same for every parameter in the graph. - - 3. Look for bugs in this function. - - Args: - x: A float32 Tensor. - Returns: - A float32 Tensor. - """ - # Not using random_uniform here due to a problem on TPU in that random seeds - # are not respected, which may cause the parameters on different replicas - # to go out-of-sync. - quantization_noise = _quantization_noise_from_step_num() - x_sign = tf.sign(x) - # Make sure x is positive. If it is zero, the two candidates are identical. - x = x * x_sign + 1e-30 - cand1 = tf.to_bfloat16(x) - cand1_f = tf.to_float(cand1) - # This relies on the fact that for a positive bfloat16 b, - # b * 1.005 gives you the next higher bfloat16 and b*0.995 gives you the - # next lower one. Both 1.005 and 0.995 are ballpark estimation. 
- cand2 = tf.to_bfloat16( - tf.where(tf.greater(x, cand1_f), cand1_f * 1.005, cand1_f * 0.995)) - ret = _randomized_roundoff_to_bfloat16(x, quantization_noise, cand1, cand2) - return ret * tf.to_bfloat16(x_sign) diff --git a/tensor2tensor/utils/beam_search.py b/tensor2tensor/utils/beam_search.py index 11d455c17..e42fd9621 100644 --- a/tensor2tensor/utils/beam_search.py +++ b/tensor2tensor/utils/beam_search.py @@ -356,11 +356,8 @@ def grow_topk(i, alive_seq, alive_log_probs, states): lambda t: _unmerge_beam_dim(t, batch_size, beam_size), flat_states) else: flat_logits = symbols_to_logits_fn(flat_ids) - if len(flat_logits.shape)>=3: - logits = tf.reshape(flat_logits, [batch_size, beam_size, -1]) - elif len(flat_logits.shape)<3: - logits = tf.reshape(flat_logits, [beam_size, batch_size, -1]) - logits = tf.transpose(logits, perm=[1, 0, 2]) + + logits = tf.reshape(flat_logits, [batch_size, beam_size, -1]) # Convert logits to normalized log probs candidate_log_probs = log_prob_from_logits(logits) diff --git a/tensor2tensor/utils/bleu_hook.py b/tensor2tensor/utils/bleu_hook.py index fa3d25213..e18b72c0b 100644 --- a/tensor2tensor/utils/bleu_hook.py +++ b/tensor2tensor/utils/bleu_hook.py @@ -25,18 +25,18 @@ import sys import time import unicodedata -#To fix issue:#706 -import io # Dependency imports import numpy as np import six # pylint: disable=redefined-builtin -from six.moves import xrange +from six.moves import range from six.moves import zip # pylint: enable=redefined-builtin +from tensor2tensor.data_generators import text_encoder + import tensorflow as tf @@ -53,8 +53,8 @@ def _get_ngrams(segment, max_order): with a count of how many times each n-gram occurred. """ ngram_counts = collections.Counter() - for order in xrange(1, max_order + 1): - for i in xrange(0, len(segment) - order + 1): + for order in range(1, max_order + 1): + for i in range(0, len(segment) - order + 1): ngram = tuple(segment[i:i + order]) ngram_counts[ngram] += 1 return ngram_counts @@ -102,7 +102,7 @@ def compute_bleu(reference_corpus, possible_matches_by_order[len(ngram)-1] += translation_ngram_counts[ngram] precisions = [0] * max_order smooth = 1.0 - for i in xrange(0, max_order): + for i in range(0, max_order): if possible_matches_by_order[i] > 0: precisions[i] = matches_by_order[i] / possible_matches_by_order[i] if matches_by_order[i] > 0: @@ -196,9 +196,10 @@ def bleu_tokenize(string): def bleu_wrapper(ref_filename, hyp_filename, case_sensitive=False): """Compute BLEU for two files (reference and hypothesis translation).""" - #To fix the issue #706 - ref_lines = io.open(ref_filename, 'rt', encoding='utf-8').read().splitlines() - hyp_lines = io.open(hyp_filename, 'rt', encoding='utf-8').read().splitlines() + ref_lines = text_encoder.native_to_unicode( + tf.gfile.Open(ref_filename, "r").read()).splitlines() + hyp_lines = text_encoder.native_to_unicode( + tf.gfile.Open(hyp_filename, "r").read()).splitlines() assert len(ref_lines) == len(hyp_lines) if not case_sensitive: ref_lines = [x.lower() for x in ref_lines] diff --git a/tensor2tensor/utils/cloud_mlengine.py b/tensor2tensor/utils/cloud_mlengine.py index 703f1fa0d..36e3bfcb5 100755 --- a/tensor2tensor/utils/cloud_mlengine.py +++ b/tensor2tensor/utils/cloud_mlengine.py @@ -23,6 +23,8 @@ from googleapiclient import discovery from oauth2client.client import GoogleCredentials + +from tensor2tensor.data_generators import text_encoder from tensor2tensor.layers import common_hparams from tensor2tensor.utils import cloud_tpu as cloud from tensor2tensor.utils import 
registry @@ -31,7 +33,7 @@ FLAGS = tf.flags.FLAGS -CONSOLE_URL = 'https://console.cloud.google.com/mlengine/jobs/' +CONSOLE_URL = "https://console.cloud.google.com/mlengine/jobs/" # TODO(rsepassi): # * Enable multi-machine sync/async training @@ -44,8 +46,8 @@ def get_setup_file(name, packages=None): from setuptools import find_packages from setuptools import setup setup( - name='{name}', - version='0.1', + name="{name}", + version="0.1", packages=find_packages(), install_requires={pypi_packages} ) @@ -54,53 +56,53 @@ def get_setup_file(name, packages=None): def job_dir(): # The flag --job-dir is parsed differently before and after switching to absl - return getattr(FLAGS, 'job-dir', '') or getattr(FLAGS, 'job_dir', '') + return getattr(FLAGS, "job-dir", "") or getattr(FLAGS, "job_dir", "") def get_requirements(usr_dir): - requirements_file = os.path.join(usr_dir, 'requirements.txt') + requirements_file = os.path.join(usr_dir, "requirements.txt") if not tf.gfile.Exists(requirements_file): return [] with tf.gfile.Open(requirements_file) as f: pkg_list = f.readlines() - return [pkg.strip() for pkg in pkg_list if 'tensor2tensor' not in pkg] + return [pkg.strip() for pkg in pkg_list if "tensor2tensor" not in pkg] def flags_as_args(): """Convert FLAGS to list of args suitable for passing on cmd line.""" - if hasattr(FLAGS, 'flag_values_dict'): + if hasattr(FLAGS, "flag_values_dict"): args_dict = FLAGS.flag_values_dict() else: - args_dict = dict(FLAGS.__dict__['__flags']) - del args_dict['cloud_mlengine'] + args_dict = dict(FLAGS.__dict__["__flags"]) + del args_dict["cloud_mlengine"] # Configured later - del args_dict['t2t_usr_dir'] - args_dict.pop('h', None) - args_dict.pop('helpfull', None) - args_dict.pop('helpshort', None) - args_dict.pop('help', None) + del args_dict["t2t_usr_dir"] + args_dict.pop("h", None) + args_dict.pop("helpfull", None) + args_dict.pop("helpshort", None) + args_dict.pop("help", None) args = [] for name, val in args_dict.items(): if val is None: continue - if name.startswith('autotune'): + if name.startswith("autotune"): continue - args.extend(['--%s' % name, str(val)]) + args.extend(["--%s" % name, str(val)]) return args def get_default_master_type(num_gpus=1, use_tpu=False): """Returns master_type for trainingInput.""" if use_tpu: - return 'standard_tpu' + return "standard_tpu" elif num_gpus <= 0: - return 'standard' + return "standard" elif num_gpus == 1: - return 'standard_p100' + return "standard_p100" elif num_gpus == 4: - return 'complex_model_m_p100' + return "complex_model_m_p100" elif num_gpus == 8: - return 'complex_model_l_gpu' + return "complex_model_l_gpu" assert False @@ -109,20 +111,20 @@ def configure_job(): # See documentation: # https://cloud.google.com/ml-engine/reference/rest/v1/projects.jobs#traininginput training_input = { - 'pythonModule': 'tensor2tensor.bin.t2t_trainer', - 'args': flags_as_args(), - 'region': cloud.default_region().decode('utf-8'), - 'runtimeVersion': '1.5', - 'pythonVersion': '3.5' if sys.version_info.major == 3 else '2.7', - 'jobDir': FLAGS.output_dir, - 'scaleTier': 'CUSTOM', - 'masterType': FLAGS.cloud_mlengine_master_type or get_default_master_type( + "pythonModule": "tensor2tensor.bin.t2t_trainer", + "args": flags_as_args(), + "region": text_encoder.native_to_unicode(cloud.default_region()), + "runtimeVersion": "1.5", + "pythonVersion": "3.5" if sys.version_info.major == 3 else "2.7", + "jobDir": FLAGS.output_dir, + "scaleTier": "CUSTOM", + "masterType": FLAGS.cloud_mlengine_master_type or get_default_master_type( 
num_gpus=FLAGS.worker_gpu, use_tpu=FLAGS.use_tpu) } if FLAGS.hparams_range: - tf.logging.info('Configuring hyperparameter tuning.') - training_input['hyperparameters'] = configure_autotune( + tf.logging.info("Configuring hyperparameter tuning.") + training_input["hyperparameters"] = configure_autotune( FLAGS.hparams_range, FLAGS.autotune_objective, FLAGS.autotune_maximize, @@ -130,17 +132,18 @@ def configure_job(): FLAGS.autotune_parallel_trials, ) - timestamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S') - job_name = '%s_%s_t2t_%s' % (FLAGS.model, FLAGS.problems, timestamp) - job_spec = {'jobId': job_name, 'trainingInput': training_input} + timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") + job_name = "%s_%s_t2t_%s" % (FLAGS.model, FLAGS.problem, timestamp) + job_spec = {"jobId": job_name, "trainingInput": training_input} return job_spec def launch_job(job_spec): """Launch job on ML Engine.""" - project_id = 'projects/{}'.format(cloud.default_project().decode('utf-8')) + project_id = "projects/{}".format( + text_encoder.native_to_unicode(cloud.default_project())) credentials = GoogleCredentials.get_application_default() - cloudml = discovery.build('ml', 'v1', credentials=credentials, + cloudml = discovery.build("ml", "v1", credentials=credentials, cache_discovery=False) request = cloudml.projects().jobs().create(body=job_spec, parent=project_id) request.execute() @@ -148,19 +151,19 @@ def launch_job(job_spec): def _tar_and_copy(src_dir, target_dir): """Tar and gzip src_dir and copy to GCS target_dir.""" - src_dir = src_dir.rstrip('/') - target_dir = target_dir.rstrip('/') - tmp_dir = tempfile.gettempdir().rstrip('/') + src_dir = src_dir.rstrip("/") + target_dir = target_dir.rstrip("/") + tmp_dir = tempfile.gettempdir().rstrip("/") src_base = os.path.basename(src_dir) cloud.shell_run( - 'tar -zcf {tmp_dir}/{src_base}.tar.gz -C {src_dir} .', + "tar -zcf {tmp_dir}/{src_base}.tar.gz -C {src_dir} .", src_dir=src_dir, src_base=src_base, tmp_dir=tmp_dir) - final_destination = '%s/%s.tar.gz' % (target_dir, src_base) + final_destination = "%s/%s.tar.gz" % (target_dir, src_base) cloud.shell_run( - ('gsutil cp {tmp_dir}/{src_base}.tar.gz ' - '{final_destination}'), + ("gsutil cp {tmp_dir}/{src_base}.tar.gz " + "{final_destination}"), tmp_dir=tmp_dir, src_base=src_base, final_destination=final_destination) @@ -169,38 +172,39 @@ def _tar_and_copy(src_dir, target_dir): def tar_and_copy_t2t(train_dir): """Tar Tensor2Tensor and cp to train_dir.""" - tf.logging.info('Tarring and pushing local Tensor2Tensor package.') + tf.logging.info("Tarring and pushing local Tensor2Tensor package.") - output = cloud.shell_output('pip show tensor2tensor').decode('utf-8').split('\n') - assert output[1].startswith('Version') - assert output[7].startswith('Location') - t2t_version = output[1].split(':')[1].strip() - t2t_dir = output[7].split(':')[1].strip() + output = text_encoder.native_to_unicode(cloud.shell_output( + "pip show tensor2tensor")).split("\n") + assert output[1].startswith("Version") + assert output[7].startswith("Location") + t2t_version = output[1].split(":")[1].strip() + t2t_dir = output[7].split(":")[1].strip() # A local installation cloned from GitHub will have a setup.py file and a docs # folder is_local_t2t = all([ tf.gfile.Exists(os.path.join(t2t_dir, fname)) - for fname in ['setup.py', 'docs/cloud_mlengine.md'] + for fname in ["setup.py", "docs/cloud_mlengine.md"] ]) if is_local_t2t: - tf.logging.info('Found local T2T installation. 
Tarring directory %s', + tf.logging.info("Found local T2T installation. Tarring directory %s", t2t_dir) else: # PyPI installation # Create a folder with just a setup.py file pointing to the right version - tf.logging.info('Found PyPI T2T installation. Launching tensor2tensor==%s', + tf.logging.info("Found PyPI T2T installation. Launching tensor2tensor==%s", t2t_version) - t2t_dir = os.path.join(tempfile.gettempdir(), 'tensor2tensor_tmp') + t2t_dir = os.path.join(tempfile.gettempdir(), "tensor2tensor_tmp") shutil.rmtree(t2t_dir, ignore_errors=True) os.mkdir(t2t_dir) - setup_fname = os.path.join(t2t_dir, 'setup.py') + setup_fname = os.path.join(t2t_dir, "setup.py") setup_file_str = get_setup_file( - name='DummyT2TPackage', - packages=['tensor2tensor==%s' % t2t_version] + name="DummyT2TPackage", + packages=["tensor2tensor==%s" % t2t_version] ) - with tf.gfile.Open(setup_fname, 'w') as f: + with tf.gfile.Open(setup_fname, "w") as f: f.write(setup_file_str) t2t_tar = _tar_and_copy(t2t_dir, train_dir) return t2t_tar @@ -208,20 +212,20 @@ def tar_and_copy_t2t(train_dir): def tar_and_copy_usr_dir(usr_dir, train_dir): """Package, tar, and copy usr_dir to GCS train_dir.""" - tf.logging.info('Tarring and pushing t2t_usr_dir.') + tf.logging.info("Tarring and pushing t2t_usr_dir.") usr_dir = os.path.abspath(os.path.expanduser(usr_dir)) # Copy usr dir to a temp location - top_dir = os.path.join(tempfile.gettempdir(), 't2t_usr_container') + top_dir = os.path.join(tempfile.gettempdir(), "t2t_usr_container") tmp_usr_dir = os.path.join(top_dir, usr_dir_lib.INTERNAL_USR_DIR_PACKAGE) shutil.rmtree(top_dir, ignore_errors=True) shutil.copytree(usr_dir, tmp_usr_dir) # Insert setup.py if one does not exist - top_setup_fname = os.path.join(top_dir, 'setup.py') + top_setup_fname = os.path.join(top_dir, "setup.py") setup_file_str = get_setup_file( - name='DummyUsrDirPackage', + name="DummyUsrDirPackage", packages=get_requirements(usr_dir) ) - with tf.gfile.Open(top_setup_fname, 'w') as f: + with tf.gfile.Open(top_setup_fname, "w") as f: f.write(setup_file_str) usr_tar = _tar_and_copy(top_dir, train_dir) return usr_tar @@ -230,7 +234,7 @@ def tar_and_copy_usr_dir(usr_dir, train_dir): def autotune_paramspecs(hparams_range): rhp = common_hparams.RangedHParams() registry.ranged_hparams(hparams_range)(rhp) - return rhp.to_parameter_specs(name_prefix='hp_') + return rhp.to_parameter_specs(name_prefix="hp_") def configure_autotune(hparams_range, @@ -239,32 +243,32 @@ def configure_autotune(hparams_range, max_trials=10, parallel_trials=1): return { - 'goal': 'MAXIMIZE' if maximize else 'MINIMIZE', - 'params': autotune_paramspecs(hparams_range), - 'maxTrials': max_trials, - 'maxParallelTrials': parallel_trials, - 'hyperparameterMetricTag': objective, + "goal": "MAXIMIZE" if maximize else "MINIMIZE", + "params": autotune_paramspecs(hparams_range), + "maxTrials": max_trials, + "maxParallelTrials": parallel_trials, + "hyperparameterMetricTag": objective, } def configure_trainer_package(job_spec, t2t_tar): - assert t2t_tar.startswith('gs://') - job_spec['trainingInput']['packageUris'] = [t2t_tar] + assert t2t_tar.startswith("gs://") + job_spec["trainingInput"]["packageUris"] = [t2t_tar] def configure_usr_dir(job_spec, usr_tar): - assert usr_tar.startswith('gs://') - job_spec['trainingInput']['packageUris'].append(usr_tar) - usr_args = ['--t2t_usr_dir', usr_dir_lib.INTERNAL_USR_DIR_PACKAGE] - job_spec['trainingInput']['args'].extend(usr_args) + assert usr_tar.startswith("gs://") + 
job_spec["trainingInput"]["packageUris"].append(usr_tar) + usr_args = ["--t2t_usr_dir", usr_dir_lib.INTERNAL_USR_DIR_PACKAGE] + job_spec["trainingInput"]["args"].extend(usr_args) def validate_flags(): """Validates flags are set to acceptable values for CloudML Engine runs.""" assert not FLAGS.cloud_tpu assert not job_dir() - assert FLAGS.output_dir.startswith('gs://') - assert FLAGS.data_dir.startswith('gs://') + assert FLAGS.output_dir.startswith("gs://") + assert FLAGS.data_dir.startswith("gs://") assert FLAGS.worker_replicas <= 1 assert FLAGS.ps_replicas <= 0 if FLAGS.hparams_range: @@ -273,29 +277,29 @@ def validate_flags(): assert FLAGS.worker_gpu in [1, 4, 8] if FLAGS.cloud_mlengine_master_type: if FLAGS.use_tpu: - assert FLAGS.cloud_mlengine_master_type == 'standard_tpu' + assert FLAGS.cloud_mlengine_master_type == "standard_tpu" elif FLAGS.worker_gpu: if FLAGS.worker_gpu == 1: - assert FLAGS.cloud_mlengine_master_type in ['standard_gpu', - 'standard_p100'] + assert FLAGS.cloud_mlengine_master_type in ["standard_gpu", + "standard_p100"] elif FLAGS.worker_gpu == 4: - assert FLAGS.cloud_mlengine_master_type in ['complex_model_m_gpu', - 'complex_model_m_p100'] + assert FLAGS.cloud_mlengine_master_type in ["complex_model_m_gpu", + "complex_model_m_p100"] else: - assert FLAGS.cloud_mlengine_master_type == 'complex_model_l_gpu' + assert FLAGS.cloud_mlengine_master_type == "complex_model_l_gpu" else: - assert FLAGS.cloud_mlengine_master_type in ['standard', 'large_model', - 'complex_model_s', - 'complex_model_m', - 'complex_model_l'] + assert FLAGS.cloud_mlengine_master_type in ["standard", "large_model", + "complex_model_s", + "complex_model_m", + "complex_model_l"] def launch(): """Launch t2t_trainer on Cloud ML Engine.""" validate_flags() job_spec = configure_job() - job_name = job_spec['jobId'] - tf.logging.info('Launching job %s with ML Engine spec:\n%s', job_name, + job_name = job_spec["jobId"] + tf.logging.info("Launching job %s with ML Engine spec:\n%s", job_name, job_spec) assert cloud.confirm() train_dir = FLAGS.output_dir @@ -305,5 +309,5 @@ def launch(): usr_tar = tar_and_copy_usr_dir(FLAGS.t2t_usr_dir, train_dir) configure_usr_dir(job_spec, usr_tar) launch_job(job_spec) - tf.logging.info('Launched %s. See console to track: %s.', job_name, + tf.logging.info("Launched %s. 
See console to track: %s.", job_name, CONSOLE_URL) diff --git a/tensor2tensor/utils/cloud_tpu.py b/tensor2tensor/utils/cloud_tpu.py index 1518e69ae..d1ea417be 100644 --- a/tensor2tensor/utils/cloud_tpu.py +++ b/tensor2tensor/utils/cloud_tpu.py @@ -29,6 +29,7 @@ import time from six.moves import input # pylint: disable=redefined-builtin +from tensor2tensor.data_generators import text_encoder import tensorflow as tf TPU_IP = "10.240.%d.2" @@ -216,7 +217,7 @@ def shell_background(cmd_, **kwargs): def shell_output(cmd_, **kwargs): - return sp.check_output(format_cmd(cmd_, **kwargs)) + return text_encoder.to_unicode(sp.check_output(format_cmd(cmd_, **kwargs))) def shell_run(cmd_, **kwargs): diff --git a/tensor2tensor/utils/data_reader.py b/tensor2tensor/utils/data_reader.py index d7be24e7e..9ad3a712f 100644 --- a/tensor2tensor/utils/data_reader.py +++ b/tensor2tensor/utils/data_reader.py @@ -23,7 +23,7 @@ import numpy as np import six -from six.moves import xrange # pylint: disable=redefined-builtin +from six.moves import range # pylint: disable=redefined-builtin import tensorflow as tf @@ -191,7 +191,7 @@ def _batching_scheme(batch_size, ] window_size = max( [i for i in highly_composite_numbers if i <= 3 * max_batch_size]) - divisors = [i for i in xrange(1, window_size + 1) if window_size % i == 0] + divisors = [i for i in range(1, window_size + 1) if window_size % i == 0] batch_sizes = [max([d for d in divisors if d <= bs]) for bs in batch_sizes] window_size *= shard_multiplier batch_sizes = [bs * shard_multiplier for bs in batch_sizes] diff --git a/tensor2tensor/utils/data_reader_test.py b/tensor2tensor/utils/data_reader_test.py index db039b799..ec5897092 100644 --- a/tensor2tensor/utils/data_reader_test.py +++ b/tensor2tensor/utils/data_reader_test.py @@ -25,7 +25,7 @@ # Dependency imports import numpy as np -from six.moves import xrange # pylint: disable=redefined-builtin +from six.moves import range # pylint: disable=redefined-builtin from tensor2tensor.data_generators import generator_utils from tensor2tensor.data_generators import problem as problem_mod @@ -39,7 +39,7 @@ class TestProblem(problem_mod.Problem): def generator(self, data_dir, tmp_dir, is_training): - for i in xrange(30): + for i in range(30): yield {"inputs": [i] * (i + 1), "targets": [i], "floats": [i + 0.5]} def generate_data(self, data_dir, tmp_dir, task_id=-1): @@ -98,7 +98,7 @@ def testBasicExampleReading(self): with tf.train.MonitoredSession() as sess: # Check that there are multiple examples that have the right fields of the # right type (lists of int/float). 
- for _ in xrange(10): + for _ in range(10): ex_val = sess.run(examples) inputs, targets, floats = (ex_val["inputs"], ex_val["targets"], ex_val["floats"]) @@ -130,7 +130,7 @@ def testLengthFilter(self): examples = dataset.make_one_shot_iterator().get_next() with tf.train.MonitoredSession() as sess: ex_lens = [] - for _ in xrange(max_len): + for _ in range(max_len): ex_lens.append(len(sess.run(examples)["inputs"])) self.assertAllEqual(list(range(1, max_len + 1)), sorted(ex_lens)) diff --git a/tensor2tensor/utils/decoding.py b/tensor2tensor/utils/decoding.py index 65616191c..7daa12b21 100644 --- a/tensor2tensor/utils/decoding.py +++ b/tensor2tensor/utils/decoding.py @@ -28,8 +28,8 @@ from six.moves import input # pylint: disable=redefined-builtin +from tensor2tensor.data_generators import problem as problem_lib from tensor2tensor.data_generators import text_encoder -from tensor2tensor.data_generators.problem import problem_hparams_to_features import tensorflow as tf FLAGS = tf.flags.FLAGS @@ -43,7 +43,6 @@ def decode_hparams(overrides=""): hp = tf.contrib.training.HParams( save_images=False, log_targets=True, - problem_idx=0, extra_length=100, batch_size=0, beam_size=4, @@ -102,14 +101,14 @@ def log_decode_results(inputs, def decode_from_dataset(estimator, - problem_names, + problem_name, hparams, decode_hp, decode_to_file=None, dataset_split=None): """Perform decoding from dataset.""" tf.logging.info("Performing local inference from dataset for %s.", - str(problem_names)) + str(problem_name)) # We assume that worker_id corresponds to shard number. shard = decode_hp.shard_id if decode_hp.shards > 1 else None @@ -123,76 +122,59 @@ def decode_from_dataset(estimator, "dataset_split": dataset_split, } - for problem_idx, problem_name in enumerate(problem_names): - # Build the inference input function - problem = hparams.problem_instances[problem_idx] - infer_input_fn = problem.make_estimator_input_fn( - tf.estimator.ModeKeys.PREDICT, hparams, dataset_kwargs=dataset_kwargs) + # Build the inference input function + problem = hparams.problem + infer_input_fn = problem.make_estimator_input_fn( + tf.estimator.ModeKeys.PREDICT, hparams, dataset_kwargs=dataset_kwargs) - # Get the predictions as an iterable - predictions = estimator.predict(infer_input_fn) + # Get the predictions as an iterable + predictions = estimator.predict(infer_input_fn) - # Prepare output file writers if decode_to_file passed - if decode_to_file: - if decode_hp.shards > 1: - decode_filename = decode_to_file + ("%.2d" % decode_hp.shard_id) - else: - decode_filename = decode_to_file - output_filepath = _decode_filename(decode_filename, problem_name, - decode_hp) - parts = output_filepath.split(".") - parts[-1] = "targets" - target_filepath = ".".join(parts) - parts[-1] = "inputs" - input_filepath = ".".join(parts) - - output_file = tf.gfile.Open(output_filepath, "w") - target_file = tf.gfile.Open(target_filepath, "w") - input_file = tf.gfile.Open(input_filepath, "w") - - problem_hparams = hparams.problems[problem_idx] - # Inputs vocabulary is set to targets if there are no inputs in the problem, - # e.g., for language models where the inputs are just a prefix of targets. 
- has_input = "inputs" in problem_hparams.vocabulary - inputs_vocab_key = "inputs" if has_input else "targets" - inputs_vocab = problem_hparams.vocabulary[inputs_vocab_key] - targets_vocab = problem_hparams.vocabulary["targets"] - for num_predictions, prediction in enumerate(predictions): - num_predictions += 1 - inputs = prediction["inputs"] - targets = prediction["targets"] - outputs = prediction["outputs"] - - # Log predictions - decoded_outputs = [] - decoded_scores = [] - if decode_hp.return_beams: - output_beams = np.split(outputs, decode_hp.beam_size, axis=0) - scores = None - if "scores" in prediction: - scores = np.split(prediction["scores"], decode_hp.beam_size, axis=0) - for i, beam in enumerate(output_beams): - tf.logging.info("BEAM %d:" % i) - score = scores and scores[i] - decoded = log_decode_results( - inputs, - beam, - problem_name, - num_predictions, - inputs_vocab, - targets_vocab, - save_images=decode_hp.save_images, - model_dir=estimator.model_dir, - identity_output=decode_hp.identity_output, - targets=targets, - log_targets=decode_hp.log_targets) - decoded_outputs.append(decoded) - if decode_hp.write_beam_scores: - decoded_scores.append(score) - else: + # Prepare output file writers if decode_to_file passed + if decode_to_file: + if decode_hp.shards > 1: + decode_filename = decode_to_file + ("%.2d" % decode_hp.shard_id) + else: + decode_filename = decode_to_file + output_filepath = _decode_filename(decode_filename, problem_name, + decode_hp) + parts = output_filepath.split(".") + parts[-1] = "targets" + target_filepath = ".".join(parts) + parts[-1] = "inputs" + input_filepath = ".".join(parts) + + output_file = tf.gfile.Open(output_filepath, "w") + target_file = tf.gfile.Open(target_filepath, "w") + input_file = tf.gfile.Open(input_filepath, "w") + + problem_hparams = hparams.problem_hparams + # Inputs vocabulary is set to targets if there are no inputs in the problem, + # e.g., for language models where the inputs are just a prefix of targets. 
+ has_input = "inputs" in problem_hparams.vocabulary + inputs_vocab_key = "inputs" if has_input else "targets" + inputs_vocab = problem_hparams.vocabulary[inputs_vocab_key] + targets_vocab = problem_hparams.vocabulary["targets"] + for num_predictions, prediction in enumerate(predictions): + num_predictions += 1 + inputs = prediction["inputs"] + targets = prediction["targets"] + outputs = prediction["outputs"] + + # Log predictions + decoded_outputs = [] + decoded_scores = [] + if decode_hp.return_beams: + output_beams = np.split(outputs, decode_hp.beam_size, axis=0) + scores = None + if "scores" in prediction: + scores = np.split(prediction["scores"], decode_hp.beam_size, axis=0) + for i, beam in enumerate(output_beams): + tf.logging.info("BEAM %d:" % i) + score = scores and scores[i] decoded = log_decode_results( inputs, - outputs, + beam, problem_name, num_predictions, inputs_vocab, @@ -203,28 +185,44 @@ def decode_from_dataset(estimator, targets=targets, log_targets=decode_hp.log_targets) decoded_outputs.append(decoded) + if decode_hp.write_beam_scores: + decoded_scores.append(score) + else: + decoded = log_decode_results( + inputs, + outputs, + problem_name, + num_predictions, + inputs_vocab, + targets_vocab, + save_images=decode_hp.save_images, + model_dir=estimator.model_dir, + identity_output=decode_hp.identity_output, + targets=targets, + log_targets=decode_hp.log_targets) + decoded_outputs.append(decoded) + + # Write out predictions if decode_to_file passed + if decode_to_file: + for i, (d_input, d_output, d_target) in enumerate(decoded_outputs): + beam_score_str = "" + if decode_hp.write_beam_scores: + beam_score_str = "\t%.2f" % decoded_scores[i] + output_file.write( + str(d_output) + beam_score_str + decode_hp.delimiter) + target_file.write(str(d_target) + decode_hp.delimiter) + input_file.write(str(d_input) + decode_hp.delimiter) - # Write out predictions if decode_to_file passed - if decode_to_file: - for i, (d_input, d_output, d_target) in enumerate(decoded_outputs): - beam_score_str = "" - if decode_hp.write_beam_scores: - beam_score_str = "\t%.2f" % decoded_scores[i] - output_file.write( - str(d_output) + beam_score_str + decode_hp.delimiter) - target_file.write(str(d_target) + decode_hp.delimiter) - input_file.write(str(d_input) + decode_hp.delimiter) - - if (decode_hp.num_samples >= 0 and - num_predictions >= decode_hp.num_samples): - break + if (decode_hp.num_samples >= 0 and + num_predictions >= decode_hp.num_samples): + break - if decode_to_file: - output_file.close() - target_file.close() - input_file.close() + if decode_to_file: + output_file.close() + target_file.close() + input_file.close() - tf.logging.info("Completed inference on %d samples." % num_predictions) # pylint: disable=undefined-loop-variable + tf.logging.info("Completed inference on %d samples." % num_predictions) # pylint: disable=undefined-loop-variable def decode_from_file(estimator, @@ -239,14 +237,14 @@ def decode_from_file(estimator, tf.logging.info( "decode_hp.batch_size not specified; default=%d" % decode_hp.batch_size) - problem_id = decode_hp.problem_idx # Inputs vocabulary is set to targets if there are no inputs in the problem, # e.g., for language models where the inputs are just a prefix of targets. 
- has_input = "inputs" in hparams.problems[problem_id].vocabulary + p_hp = hparams.problem_hparams + has_input = "inputs" in p_hp.vocabulary inputs_vocab_key = "inputs" if has_input else "targets" - inputs_vocab = hparams.problems[problem_id].vocabulary[inputs_vocab_key] - targets_vocab = hparams.problems[problem_id].vocabulary["targets"] - problem_name = FLAGS.problems.split("-")[problem_id] + inputs_vocab = p_hp.vocabulary[inputs_vocab_key] + targets_vocab = p_hp.vocabulary["targets"] + problem_name = FLAGS.problem tf.logging.info("Performing decoding from a file.") sorted_inputs, sorted_keys = _get_sorted_inputs(filename, decode_hp.shards, decode_hp.delimiter) @@ -254,7 +252,7 @@ def decode_from_file(estimator, def input_fn(): input_gen = _decode_batch_input_fn( - problem_id, num_decode_batches, sorted_inputs, inputs_vocab, + num_decode_batches, sorted_inputs, inputs_vocab, decode_hp.batch_size, decode_hp.max_input_size) gen_fn = make_input_fn_from_generator(input_gen) example = gen_fn() @@ -355,9 +353,8 @@ def input_fn(): result_iter = estimator.predict(input_fn, checkpoint_path=checkpoint_path) for result in result_iter: - problem_idx = result["problem_choice"] is_image = False # TODO(lukaszkaiser): find out from problem id / class. - targets_vocab = hparams.problems[problem_idx].vocabulary["targets"] + targets_vocab = hparams.problem_hparams.vocabulary["targets"] if decode_hp.return_beams: beams = np.split(result["outputs"], decode_hp.beam_size, axis=0) @@ -379,7 +376,7 @@ def input_fn(): targets_vocab.decode(_save_until_eos(result["outputs"], is_image))) -def _decode_batch_input_fn(problem_id, num_decode_batches, sorted_inputs, +def _decode_batch_input_fn(num_decode_batches, sorted_inputs, vocabulary, batch_size, max_input_size): tf.logging.info(" batch %d" % num_decode_batches) # First reverse all the input sentences so that if you're going to get OOMs, @@ -406,7 +403,6 @@ def _decode_batch_input_fn(problem_id, num_decode_batches, sorted_inputs, yield { "inputs": np.array(final_batch_inputs).astype(np.int32), - "problem_choice": np.array(problem_id).astype(np.int32), } @@ -432,8 +428,7 @@ def _interactive_input_fn(hparams, decode_hp): num_samples = decode_hp.num_samples if decode_hp.num_samples > 0 else 1 decode_length = decode_hp.extra_length input_type = "text" - problem_id = 0 - p_hparams = hparams.problems[problem_id] + p_hparams = hparams.problem_hparams has_input = "inputs" in p_hparams.input_modality vocabulary = p_hparams.vocabulary["inputs" if has_input else "targets"] # This should be longer than the longest input. 
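Note on the single-problem migration in the decoding.py hunks above: every hparams.problems[problem_id] lookup becomes a direct hparams.problem_hparams attribute access, and the inputs vocabulary still falls back to the targets vocabulary for problems without inputs (language models). A minimal runnable sketch of that selection logic, using a hypothetical namedtuple stand-in rather than the real HParams/Problem classes:

import collections

# Hypothetical stand-in for the real problem hparams; only the two fields
# read by the decoding code are modeled here.
ProblemHParams = collections.namedtuple(
    "ProblemHParams", ["input_modality", "vocabulary"])

# A language-model-style problem: no "inputs" modality, targets only.
p_hparams = ProblemHParams(
    input_modality={},
    vocabulary={"targets": "targets_vocab_object"})

has_input = "inputs" in p_hparams.input_modality
# Falls back to the targets vocabulary when the problem has no inputs.
vocabulary = p_hparams.vocabulary["inputs" if has_input else "targets"]
print(vocabulary)  # -> targets_vocab_object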
@@ -447,9 +442,6 @@ def _interactive_input_fn(hparams, decode_hp): prompt = ("INTERACTIVE MODE num_samples=%d decode_length=%d \n" " it= ('text' or 'image' or 'label', default: " "text)\n" - " pr= (set the problem number, default: 0)\n" - " in= (set the input problem number)\n" - " ou= (set the output problem number)\n" " ns= (changes number of samples, default: 1)\n" " dl= (changes decode length, default: 100)\n" " <%s> (decode)\n" @@ -459,19 +451,6 @@ def _interactive_input_fn(hparams, decode_hp): input_string = input(prompt) if input_string == "q": return - elif input_string[:3] == "pr=": - problem_id = int(input_string[3:]) - p_hparams = hparams.problems[problem_id] - has_input = "inputs" in p_hparams.input_modality - vocabulary = p_hparams.vocabulary["inputs" if has_input else "targets"] - elif input_string[:3] == "in=": - problem = int(input_string[3:]) - p_hparams.input_modality = hparams.problems[problem].input_modality - p_hparams.input_space_id = hparams.problems[problem].input_space_id - elif input_string[:3] == "ou=": - problem = int(input_string[3:]) - p_hparams.target_modality = hparams.problems[problem].target_modality - p_hparams.target_space_id = hparams.problems[problem].target_space_id elif input_string[:3] == "ns=": num_samples = int(input_string[3:]) elif input_string[:3] == "dl=": @@ -503,7 +482,8 @@ def _interactive_input_fn(hparams, decode_hp): } else: raise Exception("Unsupported input type.") - for k, v in six.iteritems(problem_hparams_to_features(p_hparams)): + for k, v in six.iteritems( + problem_lib.problem_hparams_to_features(p_hparams)): features[k] = np.array(v).astype(np.int32) yield features @@ -574,8 +554,7 @@ def _interactive_input_tensor_to_features_dict(feature_map, hparams): """Convert the interactive input format (see above) to a dictionary. Args: - feature_map: a dictionary with keys `problem_choice` and `input` containing - Tensors. + feature_map: dict with inputs. hparams: model hyperparameters Returns: @@ -584,31 +563,26 @@ def _interactive_input_tensor_to_features_dict(feature_map, hparams): inputs = tf.convert_to_tensor(feature_map["inputs"]) input_is_image = False if len(inputs.get_shape()) < 3 else True - def input_fn(problem_choice, x=inputs): # pylint: disable=missing-docstring - if input_is_image: - x = tf.image.resize_images(x, [299, 299]) - x = tf.reshape(x, [1, 299, 299, -1]) - x = tf.to_int32(x) - else: - # Remove the batch dimension. - num_samples = x[0] - length = x[2] - x = tf.slice(x, [3], tf.to_int32([length])) - x = tf.reshape(x, [1, -1, 1, 1]) - # Transform into a batch of size num_samples to get that many random - # decodes. - x = tf.tile(x, tf.to_int32([num_samples, 1, 1, 1])) - - p_hparams = hparams.problems[problem_choice] - return (tf.constant(p_hparams.input_space_id), tf.constant( - p_hparams.target_space_id), x) - - input_space_id, target_space_id, x = cond_on_index( - input_fn, feature_map["problem_choice"], len(hparams.problems) - 1) + x = inputs + if input_is_image: + x = tf.image.resize_images(x, [299, 299]) + x = tf.reshape(x, [1, 299, 299, -1]) + x = tf.to_int32(x) + else: + # Remove the batch dimension. + num_samples = x[0] + length = x[2] + x = tf.slice(x, [3], tf.to_int32([length])) + x = tf.reshape(x, [1, -1, 1, 1]) + # Transform into a batch of size num_samples to get that many random + # decodes. 
+ x = tf.tile(x, tf.to_int32([num_samples, 1, 1, 1])) + + p_hparams = hparams.problem_hparams + input_space_id = tf.constant(p_hparams.input_space_id) + target_space_id = tf.constant(p_hparams.target_space_id) features = {} - features["problem_choice"] = tf.convert_to_tensor( - feature_map["problem_choice"]) features["input_space_id"] = input_space_id features["target_space_id"] = target_space_id features["decode_length"] = ( @@ -621,8 +595,7 @@ def _decode_input_tensor_to_features_dict(feature_map, hparams): """Convert the interactive input format (see above) to a dictionary. Args: - feature_map: a dictionary with keys `problem_choice` and `input` containing - Tensors. + feature_map: dict with inputs. hparams: model hyperparameters Returns: @@ -631,34 +604,18 @@ def _decode_input_tensor_to_features_dict(feature_map, hparams): inputs = tf.convert_to_tensor(feature_map["inputs"]) input_is_image = False - def input_fn(problem_choice, x=inputs): # pylint: disable=missing-docstring - p_hparams = hparams.problems[problem_choice] - # Add a third empty dimension - x = tf.expand_dims(x, axis=[2]) - x = tf.to_int32(x) - return (tf.constant(p_hparams.input_space_id), tf.constant( - p_hparams.target_space_id), x) - - input_space_id, target_space_id, x = cond_on_index( - input_fn, feature_map["problem_choice"], len(hparams.problems) - 1) + x = inputs + p_hparams = hparams.problem_hparams + # Add a third empty dimension + x = tf.expand_dims(x, axis=[2]) + x = tf.to_int32(x) + input_space_id = tf.constant(p_hparams.input_space_id) + target_space_id = tf.constant(p_hparams.target_space_id) features = {} - features["problem_choice"] = feature_map["problem_choice"] features["input_space_id"] = input_space_id features["target_space_id"] = target_space_id features["decode_length"] = ( IMAGE_DECODE_LENGTH if input_is_image else tf.shape(x)[1] + 50) features["inputs"] = x return features - - -def cond_on_index(fn, index_tensor, max_idx, cur_idx=0): - """Call fn(index_tensor) using tf.cond in [cur_id, max_idx].""" - if cur_idx == max_idx: - return fn(cur_idx) - - return tf.cond( - tf.equal(index_tensor, cur_idx), - lambda: fn(cur_idx), - lambda: cond_on_index(fn, index_tensor, max_idx, cur_idx + 1) - ) diff --git a/tensor2tensor/utils/expert_utils.py b/tensor2tensor/utils/expert_utils.py index 2bfd35f01..a83616500 100644 --- a/tensor2tensor/utils/expert_utils.py +++ b/tensor2tensor/utils/expert_utils.py @@ -29,11 +29,10 @@ # Dependency imports import six -from six.moves import xrange # pylint: disable=redefined-builtin +from six.moves import range # pylint: disable=redefined-builtin from six.moves import zip # pylint: disable=redefined-builtin import tensorflow as tf -from tensorflow.python.eager import context from tensorflow.python.framework import function DEFAULT_DEV_STRING = "existing_device" @@ -117,7 +116,7 @@ class Parallelism(object): e = [] f = [] - for i in xrange(len(devices)): + for i in range(len(devices)): with tf.device(devices[i]): e_, f_ = func(a[i], b[i], c) e.append(e_) @@ -177,11 +176,11 @@ def __call__(self, fn, *args, **kwargs): my_args = transpose_list_of_lists( [self._maybe_repeat(arg) for arg in args]) else: - my_args = [[] for _ in xrange(self.n)] - my_kwargs = [{} for _ in xrange(self.n)] + my_args = [[] for _ in range(self.n)] + my_kwargs = [{} for _ in range(self.n)] for k, v in six.iteritems(kwargs): vals = self._maybe_repeat(v) - for i in xrange(self.n): + for i in range(self.n): my_kwargs[i][k] = vals[i] # Construct lists of functions. 
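For context on the cond_on_index helper deleted from decoding.py a few hunks above: it dispatched on a runtime problem-index tensor by chaining tf.cond ops, which single-problem mode makes unnecessary. A pure-Python model of its control flow (a sketch of the semantics only; the deleted code compared a scalar tensor with tf.equal and recursed in the false branch):

def cond_on_index(fn, index, max_idx, cur_idx=0):
  # Mirrors the deleted recursion: try cur_idx, else move to the next index.
  if cur_idx == max_idx or index == cur_idx:
    return fn(cur_idx)
  return cond_on_index(fn, index, max_idx, cur_idx + 1)

print(cond_on_index(lambda i: "problem_%d" % i, 2, 4))  # -> problem_2

With exactly one problem per run the chain always resolves to fn(0), which is why the callers now inline the single call.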
@@ -191,7 +190,7 @@ def __call__(self, fn, *args, **kwargs): outputs = [] cache = {} tensor_to_var = {} - for i in xrange(self.n): + for i in range(self.n): def daisy_chain_getter(getter, name, *args, **kwargs): """Get a variable and cache in a daisy chain.""" @@ -427,7 +426,7 @@ def _my_top_k(x, k): values = [] indices = [] depth = tf.shape(x)[1] - for i in xrange(k): + for i in range(k): values.append(tf.reduce_max(x, 1)) argmax = tf.argmax(x, 1) indices.append(argmax) @@ -560,7 +559,7 @@ def remove(self, x): x, indices=self.nonpad_ids, ) - if not context.in_eager_mode(): + if not tf.contrib.eager.in_eager_mode(): # This is a hack but for some reason, gather_nd return a tensor of # undefined shape, so the shape is set up manually x.set_shape([None] + x_shape[1:]) @@ -895,7 +894,7 @@ def ffn_expert_fn(input_size, """ def my_fn(x): layer_sizes = [input_size] + hidden_sizes + [output_size] - for i in xrange(1 + len(hidden_sizes)): + for i in range(1 + len(hidden_sizes)): w = tf.get_variable("w_%d" % i, layer_sizes[i:i+2], tf.float32) x = tf.matmul(x, w) if i < len(hidden_sizes): @@ -909,7 +908,7 @@ def my_fn(x): def reshape_like(a, b): """Reshapes a to match the shape of b in all but the last dimension.""" ret = tf.reshape(a, tf.concat([tf.shape(b)[:-1], tf.shape(a)[-1:]], 0)) - if not context.in_eager_mode(): + if not tf.contrib.eager.in_eager_mode(): ret.set_shape(b.get_shape().as_list()[:-1] + a.get_shape().as_list()[-1:]) return ret @@ -917,7 +916,7 @@ def reshape_like(a, b): def flatten_all_but_last(a): """Flatten all dimensions of a except the last.""" ret = tf.reshape(a, [-1, tf.shape(a)[-1]]) - if not context.in_eager_mode(): + if not tf.contrib.eager.in_eager_mode(): ret.set_shape([None] + a.get_shape().as_list()[-1:]) return ret @@ -962,7 +961,7 @@ def distributed_moe(data_parallelism, # We use the default of reuse=False. Otherwise, the experts would all # use the same variables. ep = Parallelism( - [expert_devices[i % len(expert_devices)] for i in xrange(num_experts)], + [expert_devices[i % len(expert_devices)] for i in range(num_experts)], reuse=None) # Experts expect 2d input tensors, so flatten the batch dimension and all # spatial dimensions together. diff --git a/tensor2tensor/utils/flags.py b/tensor2tensor/utils/flags.py index 08b40efdf..20827e69a 100644 --- a/tensor2tensor/utils/flags.py +++ b/tensor2tensor/utils/flags.py @@ -43,8 +43,7 @@ hyperparameters or when using Vizier. If a hyperparameter setting is specified by this flag then it must be a valid hyperparameter name for the model.""") -flags.DEFINE_string("problems", None, "Dash separated list of problems to " - "solve.") +flags.DEFINE_string("problem", None, "Problem name.") # data_dir is a common flag name - catch conflicts and define it once. try: diff --git a/tensor2tensor/utils/metrics.py b/tensor2tensor/utils/metrics.py index dc3b71607..864ff49d5 100644 --- a/tensor2tensor/utils/metrics.py +++ b/tensor2tensor/utils/metrics.py @@ -362,7 +362,7 @@ def create_evaluation_metrics(problems, model_hparams): Returns: dict. The metric functions have signature (Tensor predictions, features) -> (metric Tensor, update op), where features - is a dict with keys {targets, problem_choice}. + is a dict with keys {targets}. Raises: ValueError: if the metrics specified by a problem are not recognized (i.e. 
@@ -379,13 +379,11 @@ def reduce_dimensions(predictions, labels): labels, [-1] + common_layers.shape_list(labels)[-3:]) return predictions, labels - def make_problem_specific_metric_fn(metric_fn, problem_idx, weights_fn): - """Create a metric fn conditioned on problem_idx.""" + def make_problem_specific_metric_fn(metric_fn, weights_fn): + """Create a metric fn.""" def problem_metric_fn(predictions, features, labels): """Metric fn.""" - problem_choice = features.get("problem_choice", 0) - # Send along the entire features dict if the metric fn has the kwarg # "features". kwargs = {} @@ -395,19 +393,14 @@ def problem_metric_fn(predictions, features, labels): predictions, labels = reduce_dimensions(predictions, labels) - def wrapped_metric_fn(): - return metric_fn(predictions, labels, weights_fn=weights_fn, **kwargs) - - (scores, weights) = tf.cond( - tf.equal(problem_idx, problem_choice), wrapped_metric_fn, - lambda: (tf.constant(0.0), tf.constant(0.0))) - # The tf.metrics.mean function assures correct aggregation. + scores, weights = metric_fn(predictions, labels, + weights_fn=weights_fn, **kwargs) return tf.metrics.mean(scores, weights) return problem_metric_fn eval_metrics = dict() - for problem_idx, problem_instance in enumerate(problems): + for problem_instance in problems: problem_name = problem_instance.name metrics = problem_instance.eval_metrics() if not all([m in METRICS_FNS for m in metrics]): @@ -440,7 +433,7 @@ def image_wrapped_metric_fn(predictions, eval_metrics[metric_name] = image_wrapped_metric_fn else: problem_metric_fn = make_problem_specific_metric_fn( - metric_fn, problem_idx, weights_fn) + metric_fn, weights_fn) eval_metrics[metric_name] = problem_metric_fn else: if isinstance(tm, tuple): @@ -454,7 +447,7 @@ def image_wrapped_metric_fn(predictions, eval_metrics[metric_name] = image_wrapped_metric_fn else: problem_metric_fn = make_problem_specific_metric_fn( - metric_fn, problem_idx, weights_fn) + metric_fn, weights_fn) eval_metrics[metric_name] = problem_metric_fn return eval_metrics diff --git a/tensor2tensor/utils/quantization.py b/tensor2tensor/utils/quantization.py new file mode 100644 index 000000000..339bcf9f5 --- /dev/null +++ b/tensor2tensor/utils/quantization.py @@ -0,0 +1,279 @@ +# coding=utf-8 +# Copyright 2018 The Tensor2Tensor Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Utilities related to using bfloat16 activations and/or parameters.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +# Dependency imports + +import tensorflow as tf + +from tensorflow.python.framework import function + + +def bfloat16_activations_var_getter(getter, *args, **kwargs): + """A custom getter function for float32 parameters and bfloat16 activations. + + Args: + getter: custom getter + *args: arguments + **kwargs: keyword arguments + Returns: + variables with the correct dtype. + Raises: + KeyError: if "dtype" is not provided as a kwarg. 
+ """ + requested_dtype = kwargs["dtype"] + if requested_dtype == tf.bfloat16: + kwargs["dtype"] = tf.float32 + var = getter(*args, **kwargs) + # This if statement is needed to guard the cast, because batch norm + # assigns directly to the return value of this custom getter. The cast + # makes the return value not a variable so it cannot be assigned. Batch + # norm variables are always in fp32 so this if statement is never + # triggered for them. + if var.dtype.base_dtype != requested_dtype: + var = tf.cast(var, requested_dtype) + return var + + +def simulated_quantize(x, num_bits, noise): + """Simulate quantization to num_bits bits, with externally-stored scale. + + num_bits is the number of bits used to store each value. + noise is a float32 Tensor containing values in [0, 1). + Each value in noise should take different values across + different steps, approximating a uniform distribution over [0, 1). + In the case of replicated TPU training, noise should be identical + across replicas in order to keep the parameters identical across replicas. + + The natural choice for noise would be tf.random_uniform(), + but this is not possible for TPU, since there is currently no way to seed + the different cores to produce identical values across replicas. Instead we + use noise_from_step_num() (see below). + + The quantization scheme is as follows: + + Compute the maximum absolute value by row (call this max_abs). + Store this either in an auxiliary variable or in an extra column. + + Divide the parameters by (max_abs / (2^(num_bits-1)-1)). This gives a + float32 value in the range [-2^(num_bits-1)-1, 2^(num_bits-1)-1] + + Unbiased randomized roundoff by adding noise and rounding down. + + This produces a signed integer with num_bits bits which can then be stored. + + Args: + x: a float32 Tensor + num_bits: an integer between 1 and 22 + noise: a float Tensor broadcastable to the shape of x. + + Returns: + a float32 Tensor + """ + shape = x.get_shape().as_list() + if not (len(shape) >= 2 and shape[-1] > 1): + return x + max_abs = tf.reduce_max(tf.abs(x), -1, keep_dims=True) + 1e-9 + max_int = 2 ** (num_bits - 1) - 1 + scale = max_abs / max_int + x /= scale + x = tf.floor(x + noise) + # dequantize before storing (since this is a simulation) + x *= scale + return x + + +def noise_from_step_num(): + """Quantization noise equal to (phi * (step_num + 1)) mod 1.0. + + Not using random_uniform here due to a problem on TPU in that random seeds + are not respected, which may cause the parameters on different replicas + to go out-of-sync. + + Returns: + a float32 scalar + """ + step = tf.to_int32(tf.train.get_or_create_global_step()) + 1 + phi = ((5 ** 0.5) - 1) / 2 + # Naive computation tf.mod(phi * step, 1.0) in float32 would be disastrous + # due to loss of precision when the step number gets large. + # Computation in doubles does not work on TPU, so we use this complicated + # alternative computation which does not suffer from these roundoff errors. + ret = 0.0 + for i in range(30): + ret += (((phi * (2 ** i)) % 1.0) # double-precision computation in python + * tf.to_float(tf.mod(step // (2 ** i), 2))) + return tf.mod(ret, 1.0) + + +def _randomized_roundoff_to_bfloat16(x, noise, cand1, cand2): + """Round-off x to cand1 or to cand2 in an unbiased way. + + Cand1 and cand2 are the same shape as x. + For every element of x, the corresponding elements of cand1 and cand2 should + be the two closest bfloat16 values to x. Order does not matter. + cand1 and cand2 must differ from each other. 
+
+  Args:
+    x: A float32 Tensor.
+    noise: A Tensor broadcastable to the shape of x containing
+     random uniform values in [0.0, 1.0].
+    cand1: A bfloat16 Tensor the same shape as x.
+    cand2: A bfloat16 Tensor the same shape as x.
+
+  Returns:
+    A bfloat16 Tensor.
+  """
+  cand1_f = tf.to_float(cand1)
+  cand2_f = tf.to_float(cand2)
+  step_size = cand2_f - cand1_f
+  fpart = (x - cand1_f) / step_size
+  ret = tf.where(tf.greater(fpart, noise), cand2, cand1)
+  return ret
+
+
+def _to_bfloat16_unbiased(x, noise):
+  """Convert a float32 to a bfloat16 using randomized roundoff.
+
+  Args:
+    x: A float32 Tensor.
+    noise: a float32 Tensor with values in [0, 1), broadcastable to tf.shape(x)
+  Returns:
+    A bfloat16 Tensor.
+  """
+  x_sign = tf.sign(x)
+  # Make sure x is positive. If it is zero, the two candidates are identical.
+  x = x * x_sign + 1e-30
+  cand1 = tf.to_bfloat16(x)
+  cand1_f = tf.to_float(cand1)
+  # This relies on the fact that for a positive bfloat16 b,
+  # b * 1.005 gives you the next higher bfloat16 and b*0.995 gives you the
+  # next lower one. Both 1.005 and 0.995 are ballpark estimates.
+  cand2 = tf.to_bfloat16(
+      tf.where(tf.greater(x, cand1_f), cand1_f * 1.005, cand1_f * 0.995))
+  ret = _randomized_roundoff_to_bfloat16(x, noise, cand1, cand2)
+  return ret * tf.to_bfloat16(x_sign)
+
+
+class ParameterEncoding(object):
+  """Helper class for encoding weights as bfloat16.
+
+  For now, the parameters are always stored (encoded) as bfloat16 and decoded
+  to float32. Confusingly, the custom getter then converts the float32 back
+  to a bfloat16 to use as an activation, assuming that we use bfloat16 for
+  activations.
+
+  TODO(noam): Add options for activation dtype=float32, and for different
+  storage dtypes.
+  """
+
+  def encode(self, x, noise):
+    """Encode float32 to bfloat16.
+
+    Args:
+      x: a float32 Tensor
+      noise: a float32 Tensor with values in [0, 1), broadcastable to shape(x)
+
+    Returns:
+      a bfloat16 Tensor
+    """
+    raise NotImplementedError("encode not implemented")
+
+  def decode(self, x):
+    """Decode bfloat16 to float32."""
+    raise NotImplementedError("decode not implemented")
+
+  def _decode_with_identity_gradient(self, x):
+    # identity backprop through the decoder.
+    # This means that the optimizer must call encode when updating weights.
+    @function.Defun(python_grad_func=lambda op, dy: dy,
+                    shape_func=lambda op: [op.inputs[0].get_shape()])
+    def my_fn(x):
+      return self.decode(x)
+    return my_fn(x)
+
+  def custom_getter(self, activation_dtype=tf.bfloat16):
+    """A custom getter that uses the encoding for bfloat16 and float32 vars.
+
+    When a bfloat16 or float32 variable is requested, an encoded bfloat16
+    variable is created, which is then decoded and cast to a bfloat16
+    activation.
+
+    Args:
+      activation_dtype: a dtype to which to convert the decoded value.
+
+    Returns:
+      a function.
+    """
+    def getter_fn(getter, *args, **kwargs):
+      requested_dtype = kwargs["dtype"]
+      if requested_dtype in (tf.bfloat16, tf.float32):
+        kwargs["dtype"] = tf.bfloat16
+        kwargs["initializer"] = _EncodingInitializer(
+            kwargs["initializer"], self)
+        ret = self._decode_with_identity_gradient(getter(*args, **kwargs))
+        return tf.cast(ret, activation_dtype)
+      return getter(*args, **kwargs)
+    return getter_fn
+
+
+class _EncodingInitializer(object):
+  """Helper class for ParameterEncoding.
+
+  Initializes variables by calling base initializer, then encoding.
+ """ + + def __init__(self, base_initializer, parameter_encoding): + self._base_initializer = base_initializer + self._parameter_encoding = parameter_encoding + + def __call__(self, shape, dtype, partition_info=None): + if self._base_initializer is None: + # mimic default initialization in tf.get_variable() + if dtype.is_floating: + ret = tf.glorot_uniform_initializer()(shape, dtype) + else: + ret = tf.zeros(shape, dtype) + else: + ret = self._base_initializer(shape, dtype, partition_info=partition_info) + noise = 0.0 # no random noise in the initializer. + return tf.cast(self._parameter_encoding.encode(ret, noise), dtype) + + +class EighthPowerEncoding(ParameterEncoding): + """enc(x) = sign(x) * (abs(x)*128)^8. + + This provides less range and more resolution. + The range of representable positive values is approximately [2^-23, 2^9] + Resolution is 8x better than bfloat16. + """ + + def encode(self, x, noise): + x = tf.to_float(x) + # we can't use tf.pow(..., 8.0) because of a high-error approximation + # on TPU. Instead we square three times. + x = tf.sign(x) * tf.square(tf.square(tf.square(tf.abs(x) * 128.0))) + x = _to_bfloat16_unbiased(x, noise) + return x + + def decode(self, x): + x = tf.to_float(x) + # we can't use tf.pow(..., 0.125) because of a high-error approximation + # on TPU. Instead we sqrt three times. + return tf.sign(x) * (tf.sqrt(tf.sqrt(tf.sqrt(tf.abs(x)))) / 128.0) diff --git a/tensor2tensor/utils/registry.py b/tensor2tensor/utils/registry.py index 16a6c7437..ef0a6cfc0 100644 --- a/tensor2tensor/utils/registry.py +++ b/tensor2tensor/utils/registry.py @@ -245,8 +245,7 @@ def parse_problem_name(problem_name): """Determines if problem_name specifies a copy and/or reversal. Args: - problem_name: A string containing a single problem name from - FLAGS.problems. + problem_name: str, problem name, possibly with suffixes. Returns: base_name: A string with the base problem name. diff --git a/tensor2tensor/utils/t2t_model.py b/tensor2tensor/utils/t2t_model.py index 577b47af0..50d036f33 100644 --- a/tensor2tensor/utils/t2t_model.py +++ b/tensor2tensor/utils/t2t_model.py @@ -21,6 +21,7 @@ import collections import contextlib import copy +import functools import math import time @@ -31,17 +32,18 @@ from tensor2tensor.data_generators import text_encoder from tensor2tensor.data_generators.problem import problem_hparams_to_features from tensor2tensor.layers import common_layers +from tensor2tensor.layers import modalities # pylint: disable=unused-import from tensor2tensor.utils import beam_search from tensor2tensor.utils import decoding from tensor2tensor.utils import expert_utils as eu from tensor2tensor.utils import learning_rate from tensor2tensor.utils import metrics from tensor2tensor.utils import optimize +from tensor2tensor.utils import quantization from tensor2tensor.utils import registry import tensorflow as tf -from tensorflow.python.eager import context from tensorflow.python.layers import base from tensorflow.python.ops import variable_scope @@ -73,7 +75,7 @@ def __init__(self, hparams: tf.contrib.training.HParams, model hyperparameters. mode: tf.estimator.ModeKeys, the execution mode. problem_hparams: tf.contrib.training.HParams, hyperparameters for the - Problem. If provided here or in hparams.problems, the model will + Problem. If provided here or in hparams.problem_hparams, the model will automatically determine bottom, top, and loss methods. If not provided, calling the model will only invoke body. 
diff --git a/tensor2tensor/utils/t2t_model.py b/tensor2tensor/utils/t2t_model.py
index 577b47af0..50d036f33 100644
--- a/tensor2tensor/utils/t2t_model.py
+++ b/tensor2tensor/utils/t2t_model.py
@@ -21,6 +21,7 @@
 import collections
 import contextlib
 import copy
+import functools
 import math
 import time
 
@@ -31,17 +32,18 @@
 from tensor2tensor.data_generators import text_encoder
 from tensor2tensor.data_generators.problem import problem_hparams_to_features
 from tensor2tensor.layers import common_layers
+from tensor2tensor.layers import modalities  # pylint: disable=unused-import
 from tensor2tensor.utils import beam_search
 from tensor2tensor.utils import decoding
 from tensor2tensor.utils import expert_utils as eu
 from tensor2tensor.utils import learning_rate
 from tensor2tensor.utils import metrics
 from tensor2tensor.utils import optimize
+from tensor2tensor.utils import quantization
 from tensor2tensor.utils import registry
 
 import tensorflow as tf
 
-from tensorflow.python.eager import context
 from tensorflow.python.layers import base
 from tensorflow.python.ops import variable_scope
 
@@ -73,7 +75,7 @@ def __init__(self,
       hparams: tf.contrib.training.HParams, model hyperparameters.
       mode: tf.estimator.ModeKeys, the execution mode.
       problem_hparams: tf.contrib.training.HParams, hyperparameters for the
-        Problem. If provided here or in hparams.problems, the model will
+        Problem. If provided here or in hparams.problem_hparams, the model will
         automatically determine bottom, top, and loss methods. If not
         provided, calling the model will only invoke body.
       data_parallelism: a expert_utils.Parallelism object,
@@ -90,8 +92,8 @@ def __init__(self,
     super(T2TModel, self).__init__(
         trainable=mode == tf.estimator.ModeKeys.TRAIN, name=name)
-    if not problem_hparams and hasattr(hparams, "problems"):
-      problem_hparams = hparams.problems[0]
+    if not problem_hparams and hasattr(hparams, "problem_hparams"):
+      problem_hparams = hparams.problem_hparams
     self._problem_hparams = problem_hparams
 
     # Setup hparams
@@ -129,13 +131,22 @@ def has_input(self):
     else:
       return True
 
-  def call(self, features):
-    custom_getter = None
-    if self.hparams.activation_dtype == "bfloat16":
-      custom_getter = common_layers.bfloat16_var_getter
+  @property
+  def _custom_getter(self):
     if self.hparams.weight_dtype == "bfloat16":
-      custom_getter = common_layers.bfloat16_weights_var_getter
-      tf.get_variable_scope().set_custom_getter(custom_getter)
+      if self.hparams.optimizer != "Adafactor":
+        raise NotImplementedError(
+            "weight_dtype=bfloat16 only implemented with Adafactor optimizer")
+      return quantization.EighthPowerEncoding().custom_getter(
+          activation_dtype=tf.bfloat16
+          if self.hparams.activation_dtype == "bfloat16" else tf.float32)
+    elif self.hparams.activation_dtype == "bfloat16":
+      return quantization.bfloat16_activations_var_getter
+    else:
+      return None
+
+  def call(self, features):
+    set_custom_getter_compose(self._custom_getter)
     tf.get_variable_scope().set_initializer(
         optimize.get_variable_initializer(self.hparams))
     with self._eager_var_store.as_default():
@@ -222,8 +233,7 @@ def model_fn_sharded(self, sharded_features):
 
   def model_fn(self, features):
     transformed_features = self.bottom(features)
-    if (self.hparams.activation_dtype == "bfloat16" or
-        self.hparams.weight_dtype == "bfloat16"):
+    if self.hparams.activation_dtype == "bfloat16":
       for k, v in six.iteritems(transformed_features):
         if v.dtype == tf.float32:
           transformed_features[k] = tf.cast(v, tf.bfloat16)
@@ -239,7 +249,9 @@ def model_fn(self, features):
       logits = output
     else:
       logits = self.top(output, features)
-    losses["training"] = self.loss(logits, features)
+    losses["training"] = 0.0
+    if self._hparams.mode != tf.estimator.ModeKeys.PREDICT:
+      losses["training"] = self.loss(logits, features)
     return logits, losses
 
@@ -338,9 +350,9 @@ def top(self, body_output, features):
       target_modality = self._problem_hparams.target_modality
     else:
       target_modality = {k: None for k in body_output.keys()}
-    # assert set(body_output.keys()) == set(target_modality.keys()), (
-    #     "The keys of model_body's returned logits dict must match the keys "
-    #     "of problem_hparams.target_modality's dict.")
+    assert set(body_output.keys()) == set(target_modality.keys()), (
+        "The keys of model_body's returned logits dict must match the keys "
+        "of problem_hparams.target_modality's dict.")
     logits = {}
     for k, v in six.iteritems(body_output):
       with tf.variable_scope(k):  # TODO(aidangomez): share variables here?
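In the activation-only branch, `quantization.bfloat16_activations_var_getter` keeps master weights in float32 while handing bfloat16 reads to the graph. A simplified sketch of that getter pattern under TF 1.x variable scopes (an approximation of the idea, not the exact implementation):

```python
import tensorflow as tf

def bfloat16_activations_getter_sketch(getter, *args, **kwargs):
  # Store the variable in float32 so optimizer updates stay precise...
  requested_dtype = kwargs["dtype"]
  if requested_dtype == tf.bfloat16:
    kwargs["dtype"] = tf.float32
  var = getter(*args, **kwargs)
  # ...but return a bfloat16 view for use in activations.
  if var.dtype.base_dtype != requested_dtype:
    var = tf.cast(var, requested_dtype)
  return var

with tf.variable_scope(
    "body", custom_getter=bfloat16_activations_getter_sketch):
  w = tf.get_variable("w", [16, 16], dtype=tf.bfloat16)  # float32 storage
```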
@@ -351,9 +363,9 @@ def top(self, body_output, features):
       target_modality = self._problem_hparams.target_modality
     else:
       target_modality = None
-    # assert not isinstance(target_modality, dict), (
-    #     "model_body must return a dictionary of logits when "
-    #     "problem_hparams.target_modality is a dict.")
+    assert not isinstance(target_modality, dict), (
+        "model_body must return a dictionary of logits when "
+        "problem_hparams.target_modality is a dict.")
     return self._top_single(body_output, target_modality, features)
 
   def _loss_single(self, logits, target_modality, feature):
@@ -517,6 +529,7 @@ def infer(self,
           "losses": a dictionary: {loss-name (string): floating point `Scalar`
       }
     """
+    set_custom_getter_compose(self._custom_getter)
     with self._eager_var_store.as_default():
       # TODO(rsepassi): Make decoding work with real-valued model outputs
       # (i.e. if the target modality is RealModality).
@@ -693,7 +706,13 @@ def _slow_greedy_infer(self, features, decode_length):
       inputs_old = features["inputs"]
       features["inputs"] = tf.expand_dims(features["inputs"], 2)
     if not self.has_input:
-      features["partial_targets"] = tf.to_int64(features["inputs"])
+      # Prepare partial targets: taken from either features["inputs"] or
+      # features["targets"], the decoder output is forced to begin with
+      # these sequences.
+      partial_targets = features.get("inputs")
+      if partial_targets is None:
+        partial_targets = features["targets"]
+      features["partial_targets"] = tf.to_int64(partial_targets)
     # Save the targets in a var and reassign it after the tf.while loop to avoid
     # having targets being in a 'while' frame. This ensures targets when used
     # in metric functions stays in the same frame as other vars.
@@ -703,7 +722,7 @@ def _slow_greedy_infer(self, features, decode_length):
 
     def infer_step(recent_output, recent_logits, unused_loss):
       """Inference step."""
-      if not context.in_eager_mode():
+      if not tf.contrib.eager.in_eager_mode():
         recent_output.set_shape([None, None, None, 1])
       padded = tf.pad(recent_output, [[0, 0], [0, 1], [0, 0], [0, 0]])
       features["targets"] = padded
@@ -719,7 +738,7 @@ def infer_step(recent_output, recent_logits, unused_loss):
           common_layers.shape_list(recent_output)[1], :, :]
       cur_sample = tf.to_int64(tf.expand_dims(cur_sample, axis=1))
       samples = tf.concat([recent_output, cur_sample], axis=1)
-      if not context.in_eager_mode():
+      if not tf.contrib.eager.in_eager_mode():
        samples.set_shape([None, None, None, 1])
 
       # Assuming we have one shard for logits.
@@ -745,13 +764,19 @@ def infer_step(recent_output, recent_logits, unused_loss):
     if target_modality.is_class_modality:
       decode_length = 1
     else:
-      decode_length = common_layers.shape_list(
-          features["inputs"])[1] + decode_length
+      if "partial_targets" in features:
+        prefix_length = common_layers.shape_list(
+            features["partial_targets"])[1]
+      else:
+        prefix_length = common_layers.shape_list(
+            features["inputs"])[1]
+      decode_length = prefix_length + decode_length
+
     # Initial values of result, logits and loss.
     result = initial_output
     # tensor of shape [batch_size, time, 1, 1, vocab_size]
     logits = tf.zeros((batch_size, 0, 1, 1,
                        target_modality.top_dimensionality))
-    if not context.in_eager_mode():
+    if not tf.contrib.eager.in_eager_mode():
       logits.set_shape([None, None, None, None, None])
     loss = 0.0
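Because the forced prefix must be reproduced before any new tokens are emitted, the decoding budget becomes the prefix length plus the requested `decode_length`. A toy check of that arithmetic (illustrative only):

```python
def total_decode_steps(prefix_length, decode_length, is_class_modality=False):
  # Class modalities emit a single label; everything else decodes
  # prefix + decode_length positions, mirroring the logic above.
  return 1 if is_class_modality else prefix_length + decode_length

assert total_decode_steps(prefix_length=7, decode_length=50) == 57
assert total_decode_steps(0, 50, is_class_modality=True) == 1
```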
@@ -1002,10 +1027,10 @@ def estimator_spec_eval(self, features, logits, labels, loss,
                           losses_dict):
     """Construct EstimatorSpec for EVAL mode."""
     hparams = self.hparams
 
-    if not hasattr(hparams, "problem_instances"):
+    if not hasattr(hparams, "problem"):
       raise NotImplementedError(_no_problem_err("estimator_spec_eval"))
 
-    problem = hparams.problem_instances[0]
+    problem = hparams.problem
     if common_layers.is_on_tpu():
       _remove_summaries()
       if isinstance(logits, dict):
@@ -1066,15 +1091,11 @@ def estimator_spec_predict(self, features):
     if inputs is None:
       inputs = features["targets"]
 
-    batched_problem_choice = (
-        features["problem_choice"] * tf.ones(
-            (common_layers.shape_list(inputs)[0],), dtype=tf.int32))
     predictions = {
         "outputs": outputs,
         "scores": scores,
         "inputs": inputs,
         "targets": features.get("infer_targets"),
-        "problem_choice": batched_problem_choice,
         "batch_prediction_key": features.get("batch_prediction_key"),
     }
     _del_dict_nones(predictions)
@@ -1294,7 +1315,7 @@ def as_default(self):
 
 
 def create_eager_var_store():
-  if context.in_eager_mode():
+  if tf.contrib.eager.in_eager_mode():
     return variable_scope.EagerVariableStore()
   else:
     return DummyVariableStore()
@@ -1395,7 +1416,7 @@ def summarize_features(features, num_shards=1):
 
 
 def _eager_log(level, *args):
-  if context.in_eager_mode() and args in _already_logged:
+  if tf.contrib.eager.in_eager_mode() and args in _already_logged:
     return
   _already_logged.add(args)
   getattr(tf.logging, level)(*args)
@@ -1407,3 +1428,42 @@ def log_info(*args):
 
 def log_warn(*args):
   _eager_log("warn", *args)
+
+
+def _compose_custom_getters(getter_a, getter_b):
+  """Compose two custom getters.
+
+  Example use:
+    tf.get_variable_scope().set_custom_getter(_compose_custom_getters(
+        tf.get_variable_scope().custom_getter, new_getter))
+
+  This composes getters in the same way as creating a new variable scope with
+  the new_getter, but it does not actually create a new variable scope.
+
+  Args:
+    getter_a: a custom getter, generally from the existing variable scope.
+    getter_b: a custom getter.
+
+  Returns:
+    a custom getter.
+  """
+  if not getter_a:
+    return getter_b
+  if not getter_b:
+    return getter_a
+  def getter_fn(getter, *args, **kwargs):
+    return getter_b(functools.partial(getter_a, getter), *args, **kwargs)
+  return getter_fn
+
+
+def set_custom_getter_compose(custom_getter):
+  """Set a custom getter in the current variable scope.
+
+  Does not overwrite the existing custom getter; composes with it instead.
+
+  Args:
+    custom_getter: a custom getter.
+  """
+  tf.get_variable_scope().set_custom_getter(
+      _compose_custom_getters(
+          tf.get_variable_scope().custom_getter, custom_getter))
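Composition matters because `call()` and `infer()` may run inside a scope that already has a getter installed; overwriting it would silently drop that behavior. A self-contained demonstration of the same wrapping trick with plain functions (toy getters; `base` stands in for TensorFlow's real variable creator):

```python
import functools

def compose(getter_a, getter_b):
  # Wrap getter_b around getter_a, mimicking nested variable scopes.
  if not getter_a:
    return getter_b
  if not getter_b:
    return getter_a
  def composed(getter, *args, **kwargs):
    return getter_b(functools.partial(getter_a, getter), *args, **kwargs)
  return composed

# Two toy getters that tag the variable name.
def tag_a(getter, name, **kwargs):
  return getter(name + ".a", **kwargs)

def tag_b(getter, name, **kwargs):
  return getter(name + ".b", **kwargs)

base = lambda name, **kwargs: name
print(compose(tag_a, tag_b)(base, "w"))  # -> "w.b.a"
```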
+ """ + tf.get_variable_scope().set_custom_getter( + _compose_custom_getters( + tf.get_variable_scope().custom_getter, custom_getter)) diff --git a/tensor2tensor/utils/trainer_lib.py b/tensor2tensor/utils/trainer_lib.py index 4a6e5f2b6..9cfd1264a 100644 --- a/tensor2tensor/utils/trainer_lib.py +++ b/tensor2tensor/utils/trainer_lib.py @@ -185,7 +185,7 @@ def create_estimator(model_name, model_name, hparams, decode_hparams=decode_hparams, use_tpu=use_tpu) if use_tpu: - problem = hparams.problem_instances[0] + problem = hparams.problem batch_size = (problem.tpu_batch_size_per_shard(hparams) * run_config.tpu_config.num_shards) return tf.contrib.tpu.TPUEstimator( @@ -217,12 +217,7 @@ def create_hooks(use_tfdbg=False, use_dbgprofile=False, dbgprofile_kwargs=None, tf.logging.info("Using ProfilerHook") defaults = dict(save_steps=10, show_dataflow=True, show_memory=True) defaults.update(dbgprofile_kwargs) - # To handle different versions of TF - if hasattr(tf.train, "ProfilerHook"): - hook_mod = tf.train - else: - hook_mod = tf.contrib.hooks - train_monitors.append(hook_mod.ProfilerHook(**defaults)) + train_monitors.append(tf.train.ProfilerHook(**defaults)) if use_validation_monitor: tf.logging.info("Using ValidationMonitor") @@ -276,7 +271,7 @@ def create_experiment(run_config, use_tpu=use_tpu) # Input fns from Problem - problem = hparams.problem_instances[0] + problem = hparams.problem train_input_fn = problem.make_estimator_input_fn( tf.estimator.ModeKeys.TRAIN, hparams) eval_input_fn = problem.make_estimator_input_fn( @@ -358,8 +353,8 @@ def add_problem_hparams(hparams, problem_name): problem = registry.problem(problem_name) p_hparams = problem.get_hparams(hparams) - hparams.problem_instances = [problem] - hparams.problems = [p_hparams] + hparams.problem = problem + hparams.problem_hparams = p_hparams def set_random_seed(seed): diff --git a/tensor2tensor/utils/trainer_lib_test.py b/tensor2tensor/utils/trainer_lib_test.py index a9a6e692c..6ae599721 100644 --- a/tensor2tensor/utils/trainer_lib_test.py +++ b/tensor2tensor/utils/trainer_lib_test.py @@ -80,7 +80,7 @@ def testModel(self): "transformer_tiny", data_dir=self.data_dir, problem_name="tiny_algo") # Dataset - problem = hparams.problem_instances[0] + problem = hparams.problem dataset = problem.dataset(tf.estimator.ModeKeys.TRAIN, self.data_dir) dataset = dataset.repeat(None).padded_batch(10, dataset.output_shapes) features = dataset.make_one_shot_iterator().get_next() @@ -105,15 +105,15 @@ def testMultipleTargetModalities(self): # HParams hparams = trainer_lib.create_hparams( "transformer_tiny", data_dir=self.data_dir, problem_name="tiny_algo") - tm = hparams.problem_instances[0].get_hparams().target_modality - hparams.problem_instances[0].get_hparams().target_modality = { + tm = hparams.problem.get_hparams().target_modality + hparams.problem.get_hparams().target_modality = { "targets": tm, "A": tm, "B": tm } # Dataset - problem = hparams.problem_instances[0] + problem = hparams.problem dataset = problem.dataset(tf.estimator.ModeKeys.TRAIN, self.data_dir) dataset = dataset.repeat(None).padded_batch(10, dataset.output_shapes) features = dataset.make_one_shot_iterator().get_next()