Merge pull request #411 from rsepassi/push
v1.2.8
Showing 45 changed files with 2,373 additions and 909 deletions.

# Running on Cloud TPUs

Tensor2Tensor supports running on Google Cloud Platform's TPUs, chips
specialized for ML training.

Not all models are supported yet, but so far we've tested Transformer (a
sequence model) as well as Xception (an image model).

To run on TPUs, you need to be part of the alpha program; if you're not, these
commands won't work for you currently, but access will expand soon, so get
excited for your future ML supercomputers in the cloud.

## Tutorial: Transformer En-De translation on TPU

Set your default zone to a TPU-enabled zone. TPU machines are only available in
certain zones for now.
```
gcloud config set compute/zone us-central1-f
```

Launch a GCE instance; this will run the Python trainer.
```
gcloud compute instances create $USER-vm \
  --machine-type=n1-standard-8 \
  --image-family=tf-nightly \
  --image-project=ml-images \
  --scopes=https://www.googleapis.com/auth/cloud-platform
```

Launch the TPU instance; the Python program will connect to this to train on
the TPU device.
```
TPU_IP=10.240.0.2
gcloud alpha compute tpus create \
  $USER-tpu \
  --range=${TPU_IP/%2/0}/29 \
  --version=nightly
```
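
For reference, `${TPU_IP/%2/0}` is ordinary bash suffix substitution: it
replaces a trailing `2` with `0`, so `--range` becomes the /29 block containing
the TPU's address. A quick check in any bash shell:
```
# ${TPU_IP/%2/0} rewrites "10.240.0.2" to "10.240.0.0" (suffix match only).
TPU_IP=10.240.0.2
echo ${TPU_IP/%2/0}/29   # prints 10.240.0.0/29
```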

To see all running TPU instances: `gcloud alpha compute tpus list`. Each
`TPU_IP` should be unique within the list and follow the format `10.240.i.2`.
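
For example, if `10.240.0.2` is already taken, a second TPU could use the next
`i` (the name `$USER-tpu-2` here is just illustrative):
```
# Pick the next unused 10.240.i.2 address for the new TPU.
TPU_IP=10.240.1.2
gcloud alpha compute tpus create \
  $USER-tpu-2 \
  --range=${TPU_IP/%2/0}/29 \
  --version=nightly
```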

Generate data and write it to GCS. If you already have the data locally, use
`gsutil cp` to copy it to GCS instead, as sketched below.
```
DATA_DIR=gs://my-bucket/t2t/data/
t2t-datagen --problem=translate_ende_wmt8k --data_dir=$DATA_DIR
```
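
A minimal sketch of the copy route, assuming your data lives in a hypothetical
local directory such as `/tmp/t2t_data`:
```
# Recursively copy locally generated files into the GCS bucket.
gsutil cp -r /tmp/t2t_data/* $DATA_DIR
```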

SSH in with port forwarding for TensorBoard:
```
gcloud compute ssh $USER-vm -L 6006:localhost:6006
```

Now that you're on the cloud instance, install T2T:
```
pip install tensor2tensor
```

Set up some variables used below. `TPU_IP` and `DATA_DIR` should be the same as
what was used above. Note that `DATA_DIR` and `OUT_DIR` must be GCS buckets.
```
TPU_IP=<IP of TPU machine>
DATA_DIR=gs://my-bucket/t2t/data/
OUT_DIR=gs://my-bucket/t2t/training/
TPU_MASTER=grpc://$TPU_IP:8470
```

Launch TensorBoard in the background so you can monitor training:
```
tensorboard --logdir=$OUT_DIR > /tmp/tensorboard_logs.txt 2>&1 &
```
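
To confirm it started (an optional check, not part of the original
walkthrough), peek at the log file; TensorBoard prints the address it is
serving on:
```
# Should end with TensorBoard's startup message pointing at localhost:6006.
tail /tmp/tensorboard_logs.txt
```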

Train and evaluate:
```
t2t-tpu-trainer \
  --master=$TPU_MASTER \
  --data_dir=$DATA_DIR \
  --output_dir=$OUT_DIR \
  --problems=translate_ende_wmt8k \
  --model=transformer \
  --hparams_set=transformer_tiny_tpu \
  --train_steps=10 \
  --eval_steps=10 \
  --local_eval_frequency=10 \
  --iterations_per_loop=10
```

The above command will train for 10 steps, then evaluate for 10 steps. You can
(and should) increase the number of total training steps with the
`--train_steps` flag. Evaluation will happen every `--local_eval_frequency`
steps, each time for `--eval_steps` steps. When you increase the number of
training steps, also increase `--iterations_per_loop`, which controls how
frequently the TPU machine returns control to the Python code (1000 seems like
a fine number).
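
Following that advice, a longer run might look like this (the step counts here
are illustrative, not taken from the original doc):
```
# Same flags as above, with more training steps and a larger TPU loop size.
t2t-tpu-trainer \
  --master=$TPU_MASTER \
  --data_dir=$DATA_DIR \
  --output_dir=$OUT_DIR \
  --problems=translate_ende_wmt8k \
  --model=transformer \
  --hparams_set=transformer_tiny_tpu \
  --train_steps=100000 \
  --eval_steps=10 \
  --local_eval_frequency=1000 \
  --iterations_per_loop=1000
```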

Back on your local machine, open your browser and navigate to `localhost:6006`
for TensorBoard.

Voila. Enjoy your new supercomputer.

The release itself is a one-line version bump in `setup.py`:
```
@@ -5,7 +5,7 @@
 setup(
     name='tensor2tensor',
-    version='1.2.7',
+    version='1.2.8',
     description='Tensor2Tensor',
     author='Google Inc.',
     author_email='[email protected]',
```
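
Once the release is published to PyPI, users can pick it up with a standard pip
upgrade (shown for completeness; not part of the diff):
```
pip install --upgrade tensor2tensor
```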