From 3c2cf3b02a29f44e540648bcecd5ff663ad6f2b5 Mon Sep 17 00:00:00 2001
From: Myle Ott
Date: Wed, 21 Aug 2019 17:41:23 -0700
Subject: [PATCH] Misc changes

Summary: Pull Request resolved: https://github.com/fairinternal/fairseq-py/pull/840

Differential Revision: D16947645

Pulled By: myleott

fbshipit-source-id: e869789bc22bbf5cb08d9adfa44f9fc09b3805af
---
 examples/language_model/README.md         |   8 +-
 examples/roberta/README.md                |   7 +
 examples/roberta/README.pretraining.md    |   2 +-
 fairseq/tasks/tagged_language_modeling.py | 164 ----------------------
 4 files changed, 15 insertions(+), 166 deletions(-)
 delete mode 100644 fairseq/tasks/tagged_language_modeling.py

diff --git a/examples/language_model/README.md b/examples/language_model/README.md
index a103755228..6199e69ece 100644
--- a/examples/language_model/README.md
+++ b/examples/language_model/README.md
@@ -12,7 +12,7 @@ Model | Description | Dataset | Download
 
 ## Example usage
 
-Sampling from a language model using PyTorch Hub:
+To sample from a language model using PyTorch Hub:
 
 ```python
 import torch
@@ -25,6 +25,12 @@ en_lm = torch.hub.load('pytorch/fairseq', 'transformer_lm.wmt19.en', tokenizer='
 # Sample from the language model
 en_lm.sample('Barack Obama', beam=1, sampling=True, sampling_topk=10, temperature=0.8)
 # "Barack Obama is coming to Sydney and New Zealand (...)"
+
+# The same interface can be used with custom models as well
+from fairseq.models.transformer_lm import TransformerLanguageModel
+custom_lm = TransformerLanguageModel.from_pretrained('/path/to/model/dir', 'checkpoint100.pt', tokenizer='moses', bpe='fastbpe')
+custom_lm.sample('Barack Obama', beam=5)
+# "Barack Obama (...)"
 ```
 
 ## Training a transformer language model with the CLI tools
diff --git a/examples/roberta/README.md b/examples/roberta/README.md
index e4d9e4fee1..9006e4f193 100644
--- a/examples/roberta/README.md
+++ b/examples/roberta/README.md
@@ -76,6 +76,13 @@ Model | Accuracy
 ---|---
 `roberta.large` | 78.1
 
+**[XNLI (Conneau et al., 2018)](https://arxiv.org/abs/1809.05053)**
+_(TRANSLATE-TEST)_
+
+Model | en | fr | es | de | el | bg | ru | tr | ar | vi | th | zh | hi | sw | ur
+---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
+`roberta.large.mnli` | 91.3 | 82.91 | 84.27 | 81.24 | 81.74 | 83.13 | 78.28 | 76.79 | 76.64 | 74.17 | 74.05 | 77.5 | 70.9 | 66.65 | 66.81
+
 ## Example usage
 
 ##### Load RoBERTa from torch.hub (PyTorch >= 1.1):
diff --git a/examples/roberta/README.pretraining.md b/examples/roberta/README.pretraining.md
index 527d4a2e57..43bdf17676 100644
--- a/examples/roberta/README.pretraining.md
+++ b/examples/roberta/README.pretraining.md
@@ -54,7 +54,7 @@ PEAK_LR=0.0005          # Peak learning rate, adjust as needed
 TOKENS_PER_SAMPLE=512   # Max sequence length
 MAX_POSITIONS=512       # Num. positional embeddings (usually same as above)
 MAX_SENTENCES=16        # Number of sequences per batch (batch size)
-UPDATE_FREQ=16          # Increase the batch size 16x 
+UPDATE_FREQ=16          # Increase the batch size 16x
 
 DATA_DIR=data-bin/wikitext-103
 
diff --git a/fairseq/tasks/tagged_language_modeling.py b/fairseq/tasks/tagged_language_modeling.py
deleted file mode 100644
index 3c49ef7664..0000000000
--- a/fairseq/tasks/tagged_language_modeling.py
+++ /dev/null
@@ -1,164 +0,0 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
-
-import numpy as np
-
-import os
-
-from fairseq.data import (
-    ConcatDataset,
-    data_utils,
-    MonolingualDataset,
-    PrependDataset,
-    ReplaceDataset,
-    ShardedDataset,
-    SubsampleDataset,
-    TokenBlockDataset,
-)
-from fairseq.tasks import register_task
-
-from fairseq.tasks.language_modeling import LanguageModelingTask
-
-
-@register_task("tagged_language_modeling")
-class TaggedLanguageModelingTask(LanguageModelingTask):
-    """
-    Like the language modeling task, but prepends tags to each sample
-    """
-
-    @staticmethod
-    def add_args(parser):
-        """Add task-specific arguments to the parser."""
-        LanguageModelingTask.add_args(parser)
-        parser.add_argument(
-            "--multiple-datasets",
-            action="store_true",
-            help="if set, treats paths in data as separate datasets to be combined, "
-            "rather than as splits of a single dataset",
-        )
-        parser.add_argument(
-            "--prepend-ds-name",
-            action="store_true",
-            help="if set and multiple-datasets is also set, prepends the name of the ds instead of "
-            "bos/eos token",
-        )
-        parser.add_argument(
-            "--generic-ds-name-chance",
-            type=float,
-            metavar="P",
-            default=0,
-            help='if multiple datasets is used, sets the prepended ds name to "generic" '
-            "this percentage of time",
-        )
-        parser.add_argument(
-            "--subsample-splits",
-            type=str,
-            metavar="SPLITS",
-            default="valid",
-            help="if multiple datasets is used, subsamples specified split(colon separated) to "
-            "the size of the smallest split",
-        )
-
-    def __init__(self, args, dictionary, output_dictionary=None, targets=None):
-        super().__init__(args, dictionary, output_dictionary, targets)
-        self.subsample_splits = (
-            set()
-            if args.subsample_splits is None
-            else set(args.subsample_splits.split(":"))
-        )
-
-    def make_prepended_ds(self, dataset):
-        def ds_name(dataset, index):
-            if (
-                self.args.generic_ds_name_chance > 0
-                and np.random.rand() <= self.args.generic_ds_name_chance
-            ):
-                ds_name = "generic"
-            else:
-                ds_name = dataset.attr("name", index)
-                assert ds_name is not None
-            return self.dictionary.indices[ds_name]
-
-        dataset = PrependDataset(
-            dataset, prepend_getter=ds_name, ensure_first_token_is=self.dictionary.eos()
-        )
-        return dataset
-
-    def load_dataset(self, split, epoch=0, combine=False, **kwargs):
-        """Load a given dataset split.
-
-        Args:
-            split (str): name of the split (e.g., train, valid, test)
-        """
-        paths = self.args.data.split(":")
-        assert len(paths) > 0
-
-        if self.args.multiple_datasets:
-            if len(paths) == 1:
-                paths = [os.path.join(paths[0], p) for p in next(os.walk(paths[0]))[1]]
-            datasets = [
-                ShardedDataset(
-                    self.dictionary,
-                    self.args.dataset_impl,
-                    path,
-                    split,
-                    epoch,
-                    combine=combine,
-                )
-                for path in paths
-            ]
-
-            if split in self.subsample_splits:
-                sizes = [sum(d.sizes) for d in datasets]
-                min_sz = min(sizes)
-                ratios = [min_sz / sz for sz in sizes]
-                datasets = [
-                    SubsampleDataset(d, r) if r < 1 else d
-                    for d, r in zip(datasets, ratios)
-                ]
-
-            dataset = ConcatDataset(datasets)
-        else:
-            data_path = paths[epoch % len(paths)]
-            split_path = os.path.join(data_path, split)
-
-            dataset = data_utils.load_indexed_dataset(
-                split_path, self.dictionary, self.args.dataset_impl, combine=combine
-            )
-            if dataset is None:
-                raise FileNotFoundError(
-                    "Dataset not found: {} ({})".format(split, split_path)
-                )
-
-        dataset = TokenBlockDataset(
-            dataset,
-            dataset.sizes,
-            self.args.tokens_per_sample,
-            pad=self.dictionary.pad(),
-            eos=self.dictionary.eos(),
-            break_mode=self.args.sample_break_mode,
-            include_targets=True,
-        )
-
-        if self.args.prepend_ds_name:
-            dataset = self.make_prepended_ds(dataset)
-
-        dataset = ReplaceDataset(dataset, { self.dictionary.eos(): self.dictionary.indices['\\n'] }, offset=1)
-
-        add_eos_for_other_targets = (
-            self.args.sample_break_mode is not None
-            and self.args.sample_break_mode != "none"
-        )
-
-        self.datasets[split] = MonolingualDataset(
-            dataset,
-            dataset.sizes,
-            self.dictionary,
-            self.output_dictionary,
-            add_eos_for_other_targets=add_eos_for_other_targets,
-            shuffle=True,
-            targets=self.targets,
-            add_bos_token=self.args.add_bos_token,
-        )