Merge pull request #552 from rsepassi/push
v1.4.3
lukaszkaiser authored Feb 2, 2018
2 parents 23f906d + 1cd3f25 commit 103d057
Showing 74 changed files with 38,673 additions and 3,356 deletions.
20 changes: 15 additions & 5 deletions .travis.yml
@@ -6,23 +6,33 @@ before_install:
- sudo apt-get update -qq
- sudo apt-get install -qq libhdf5-dev
install:
- pip install -q tensorflow
- pip install -q .[tensorflow]
- pip install -q .[tests]
# Make sure we have the latest version of numpy - avoid problems we were
# seeing with Python 3
- pip install -q -U numpy
env:
global:
- T2T_PROBLEM=algorithmic_reverse_binary40_test
- T2T_DATA_DIR=/tmp/t2t-data
- T2T_TRAIN_DIR=/tmp/t2t-train
script:
# Check import
- python -c "from tensor2tensor.models import transformer; print(transformer.Transformer.__name__)"

# Run tests
- pytest --ignore=tensor2tensor/utils/registry_test.py --ignore=tensor2tensor/problems_test.py --ignore=tensor2tensor/utils/trainer_lib_test.py --ignore=tensor2tensor/data_generators/algorithmic_math_test.py
- pytest tensor2tensor/utils/registry_test.py
- pytest tensor2tensor/utils/trainer_lib_test.py

# Run installed scripts
- t2t-datagen 2>&1 | grep translate && echo passed
- t2t-trainer --registry_help --t2t_usr_dir=./tensor2tensor/test_data/example_usr_dir 2>&1 | grep my_very_own_hparams && echo passed
- python -c "from tensor2tensor.models import transformer; print(transformer.Transformer.__name__)"
- t2t-trainer --registry_help
- mkdir $T2T_DATA_DIR
- mkdir $T2T_TRAIN_DIR

# Test --t2t_usr_dir
- t2t-trainer --registry_help --t2t_usr_dir=./tensor2tensor/test_data/example_usr_dir 2>&1 | grep my_very_own_hparams && echo passed

# Run data generation, training, and decoding on a dummy problem
- t2t-datagen --problem=$T2T_PROBLEM --data_dir=$T2T_DATA_DIR
- t2t-trainer --problems=$T2T_PROBLEM --data_dir=$T2T_DATA_DIR --model=transformer --hparams_set=transformer_tiny --train_steps=5 --eval_steps=5 --output_dir=$T2T_TRAIN_DIR
- t2t-decoder --problems=$T2T_PROBLEM --data_dir=$T2T_DATA_DIR --model=transformer --hparams_set=transformer_tiny --output_dir=$T2T_TRAIN_DIR --decode_hparams='num_samples=10'
7 changes: 5 additions & 2 deletions README.md
@@ -148,8 +148,11 @@ t2t-decoder \
--decode_from_file=$DECODE_FILE \
--decode_to_file=translation.en
# Eval BLEU
# (Always report proper BLEU in papers, not the internal approx_bleu.)
# See the translations
cat translation.en
# Evaluate the BLEU score
# Note: Report this BLEU score in papers, not the internal approx_bleu metric.
t2t-bleu --translation=translation.en --reference=ref-translation.de
```

91 changes: 30 additions & 61 deletions docs/cloud_tpu.md
@@ -10,76 +10,39 @@ Models and hparams that are known to work on TPU:
* `resnet50` with `resnet_base`
* `revnet104` with `revnet_base`

To run on TPUs, you need to be part of the alpha program; if you're not, these
commands won't work for you currently, but access will expand soon, so get
excited for your future ML supercomputers in the cloud.
TPU access is currently limited, but access will expand soon, so get excited for
your future ML supercomputers in the cloud.

## Tutorial: Transformer En-De translation on TPU

Update `gcloud`: `gcloud components update`
**Note**: You'll need TensorFlow 1.5+.

Set your default zone to a TPU-enabled zone. TPU machines are only available in
certain zones for now.
Configure the `gcloud` CLI:
```
gcloud components update
gcloud auth application-default login
# Set your default zone to a TPU-enabled zone.
gcloud config set compute/zone us-central1-f
```

Launch a GCE instance; this will run the Python trainer.
```
gcloud compute instances create $USER-vm \
--machine-type=n1-standard-8 \
--image-family=tf-nightly \
--image-project=ml-images \
--scopes=https://www.googleapis.com/auth/cloud-platform
```

Launch the TPU instance; the Python program will connect to this to train on the
TPU device.
```
gcloud alpha compute tpus list
# Make an IP with structure 10.240.X.2 that’s unique in the list
TPU_IP=10.240.0.2
gcloud alpha compute tpus create \
$USER-tpu \
--range=${TPU_IP/%2/0}/29 \
--version=nightly
```

SSH in with port forwarding for TensorBoard
```
gcloud compute ssh $USER-vm -- -L 6006:localhost:6006
```

Now that you're on the cloud instance, install T2T:
```
pip install tensor2tensor --user
# Add the python bin dir to your path
export PATH=$HOME/.local/bin:$PATH
```

Generate data to GCS
Generate data to GCS.
If you already have the data, use `gsutil cp` to copy to GCS.
```
GCS_BUCKET=gs://my-bucket
DATA_DIR=$GCS_BUCKET/t2t/data/
t2t-datagen --problem=translate_ende_wmt8k --data_dir=$DATA_DIR
```

Setup some vars used below. `TPU_IP` and `DATA_DIR` should be the same as what
was used above. Note that the `DATA_DIR` and `OUT_DIR` must be GCS buckets.
Specify an output directory and launch TensorBoard to monitor training:
```
TPU_IP=10.240.0.2
DATA_DIR=$GCS_BUCKET/t2t/data/
OUT_DIR=$GCS_BUCKET/t2t/training/transformer_ende_1
TPU_MASTER=grpc://$TPU_IP:8470
OUT_DIR=$GCS_BUCKET/t2t/training/transformer_v1
tensorboard --logdir=$OUT_DIR
```

Launch TensorBoard in the background so you can monitor training:
```
tensorboard --logdir=$OUT_DIR > /tmp/tensorboard_logs.txt 2>&1 &
```
Note that both the data and output directories must be Google Cloud Storage
buckets (i.e. start with `gs://`).

Train and evaluate.
Launch! It's as simple as adding the `--cloud_tpu` flag.
```
t2t-trainer \
--model=transformer \
@@ -88,21 +88,51 @@ t2t-trainer \
--train_steps=10 \
--eval_steps=10 \
--local_eval_frequency=10 \
--iterations_per_loop=10 \
--master=$TPU_MASTER \
--use_tpu=True \
--data_dir=$DATA_DIR \
--output_dir=$OUT_DIR
--output_dir=$OUT_DIR \
--cloud_tpu \
--cloud_delete_on_done
```

The above command will train for 10 steps, then evaluate for 10 steps. You can
(and should) increase the number of total training steps with the
`--train_steps` flag. Evaluation will happen every `--local_eval_frequency`
steps, each time for `--eval_steps`. When you increase the number of training
steps, also increase `--iterations_per_loop`, which controls how frequently the
TPU machine returns control to the host machine (1000 seems like a fine number).

Back on your local machine, open your browser and navigate to `localhost:6006`
for TensorBoard.
steps, each time for `--eval_steps`. The `--cloud_delete_on_done` flag has the
trainer delete the VMs on completion.
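
For example, a longer run might look like the sketch below. The step counts are illustrative placeholders rather than values from this commit, and the `--problems`/`--hparams_set` flags elided from the hunk above are assumed to carry over unchanged.
```
# Illustrative sketch only; add back the --problems and --hparams_set flags
# shown in the full command above.
t2t-trainer \
  --model=transformer \
  --train_steps=250000 \
  --eval_steps=10 \
  --local_eval_frequency=1000 \
  --data_dir=$DATA_DIR \
  --output_dir=$OUT_DIR \
  --cloud_tpu \
  --cloud_delete_on_done
```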

Voila. Enjoy your new supercomputer.

Note that checkpoints are compatible between CPU, GPU, and TPU models so you can
switch between hardware at will.
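
As a minimal sketch of what that enables, you could decode locally from the TPU-trained checkpoints. `$PROBLEM` and `$HPARAMS` below are placeholders for whatever problem and hparams set were used in training, and the local machine is assumed to have read access to the GCS bucket.
```
# Sketch only: local interactive decoding from the TPU-trained checkpoints.
t2t-decoder \
  --problems=$PROBLEM \
  --model=transformer \
  --hparams_set=$HPARAMS \
  --data_dir=$DATA_DIR \
  --output_dir=$OUT_DIR \
  --decode_interactive
```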

## Additional flags

* `--cloud_vm_name`: The name of the VM to use or create. This can be reused
across multiple concurrent runs.
* `--cloud_tpu_name`: The name of the TPU instance to use or create. If you want
to launch multiple jobs on TPU, provide different names here for each one.
Each TPU instance can only be training one model at a time.
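
As a hedged sketch of how these two flags combine, two jobs could share a single VM while each using its own TPU instance; the names, output directories, and the omitted problem/hparams flags below are purely illustrative.
```
# Illustrative only: two concurrent runs sharing one VM, each on its own TPU.
# Fill in the --problems and --hparams_set flags used in the tutorial above.
COMMON="--model=transformer --data_dir=$DATA_DIR --cloud_tpu --cloud_vm_name=$USER-vm"
t2t-trainer $COMMON --cloud_tpu_name=$USER-tpu-a \
  --output_dir=$GCS_BUCKET/t2t/training/run_a &
t2t-trainer $COMMON --cloud_tpu_name=$USER-tpu-b \
  --output_dir=$GCS_BUCKET/t2t/training/run_b &
wait
```
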
11 changes: 9 additions & 2 deletions docs/walkthrough.md
@@ -133,6 +133,7 @@ t2t-trainer \
DECODE_FILE=$DATA_DIR/decode_this.txt
echo "Hello world" >> $DECODE_FILE
echo "Goodbye world" >> $DECODE_FILE
echo -e 'Hallo Welt\nAuf Wiedersehen Welt' > ref-translation.de
BEAM_SIZE=4
ALPHA=0.6
@@ -144,9 +145,15 @@ t2t-decoder \
--hparams_set=$HPARAMS \
--output_dir=$TRAIN_DIR \
--decode_hparams="beam_size=$BEAM_SIZE,alpha=$ALPHA" \
--decode_from_file=$DECODE_FILE
--decode_from_file=$DECODE_FILE \
--decode_to_file=translation.en
cat $DECODE_FILE.$MODEL.$HPARAMS.beam$BEAM_SIZE.alpha$ALPHA.decodes
# See the translations
cat translation.en
# Evaluate the BLEU score
# Note: Report this BLEU score in papers, not the internal approx_bleu metric.
t2t-bleu --translation=translation.en --reference=ref-translation.de
```

---
7 changes: 3 additions & 4 deletions setup.py
@@ -5,7 +5,7 @@

setup(
name='tensor2tensor',
version='1.4.2',
version='1.4.3',
description='Tensor2Tensor',
author='Google Inc.',
author_email='[email protected]',
@@ -36,16 +36,15 @@
'future',
'gevent',
'gunicorn',
'gym',
'numpy',
'requests',
'scipy',
'sympy',
'six',
],
extras_require={
'tensorflow': ['tensorflow>=1.4.1'],
'tensorflow_gpu': ['tensorflow-gpu>=1.4.1'],
'tensorflow': ['tensorflow>=1.5.0'],
'tensorflow_gpu': ['tensorflow-gpu>=1.5.0'],
'tests': ['pytest', 'h5py', 'mock'],
},
classifiers=[
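
For reference, the extras shown above are installed the same way the updated `.travis.yml` does it; the PyPI line below assumes the 1.4.3 release is published there.
```
# From a source checkout (mirrors the updated .travis.yml):
pip install -q .[tensorflow]
pip install -q .[tests]
# From PyPI with GPU TensorFlow (assumes the 1.4.3 release is published):
pip install "tensor2tensor[tensorflow_gpu]>=1.4.3"
```
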
2 changes: 2 additions & 0 deletions tensor2tensor/bin/t2t_datagen.py
@@ -168,6 +168,8 @@ def main(_):
tf.logging.warning("It is strongly recommended to specify --data_dir. "
"Data will be written to default data_dir=%s.",
FLAGS.data_dir)
FLAGS.data_dir = os.path.expanduser(FLAGS.data_dir)
tf.gfile.MakeDirs(FLAGS.data_dir)

tf.logging.info("Generating problems:\n%s"
% registry.display_list_by_prefix(problems,
7 changes: 4 additions & 3 deletions tensor2tensor/bin/t2t_decoder.py
@@ -53,8 +53,9 @@
"Path to the source file for decoding")
flags.DEFINE_string("decode_to_file", None,
"Path to the decoded (output) file")
flags.DEFINE_bool("keep_timestamp", True,
"Set the mtime of the decoded file to the checkpoint_path+'.index' mtime.")
flags.DEFINE_bool("keep_timestamp", False,
"Set the mtime of the decoded file to the "
"checkpoint_path+'.index' mtime.")
flags.DEFINE_bool("decode_interactive", False,
"Interactive local inference mode.")
flags.DEFINE_integer("decode_shards", 1, "Number of decoding replicas.")
@@ -83,7 +84,7 @@ def decode(estimator, hparams, decode_hp):
decode_hp, FLAGS.decode_to_file,
checkpoint_path=FLAGS.checkpoint_path)
if FLAGS.checkpoint_path and FLAGS.keep_timestamp:
ckpt_time = os.path.getmtime(FLAGS.checkpoint_path + '.index')
ckpt_time = os.path.getmtime(FLAGS.checkpoint_path + ".index")
os.utime(FLAGS.decode_to_file, (ckpt_time, ckpt_time))
else:
decoding.decode_from_dataset(
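
Since `keep_timestamp` now defaults to `False`, callers who relied on the old behavior have to opt back in explicitly. A hedged sketch follows; the checkpoint path is a placeholder and the usual problem/model/hparams/data flags are omitted.
```
# Sketch only: restore the old timestamp behavior for a file-based decode.
# Add the usual --problems/--model/--hparams_set/--data_dir/--output_dir flags;
# the checkpoint path here is a placeholder.
t2t-decoder \
  --checkpoint_path=$TRAIN_DIR/model.ckpt-1000 \
  --keep_timestamp=True \
  --decode_from_file=$DECODE_FILE \
  --decode_to_file=translation.en
```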