From 3c2cf3b02a29f44e540648bcecd5ff663ad6f2b5 Mon Sep 17 00:00:00 2001
From: Myle Ott
Date: Wed, 21 Aug 2019 17:41:23 -0700
Subject: [PATCH] Misc changes

Summary: Pull Request resolved: https://github.com/fairinternal/fairseq-py/pull/840

Differential Revision: D16947645

Pulled By: myleott

fbshipit-source-id: e869789bc22bbf5cb08d9adfa44f9fc09b3805af
---
 examples/language_model/README.md         |   8 +-
 examples/roberta/README.md                |   7 +
 examples/roberta/README.pretraining.md    |   2 +-
 fairseq/tasks/tagged_language_modeling.py | 164 ----------------------
 4 files changed, 15 insertions(+), 166 deletions(-)
 delete mode 100644 fairseq/tasks/tagged_language_modeling.py

diff --git a/examples/language_model/README.md b/examples/language_model/README.md
index a103755228..6199e69ece 100644
--- a/examples/language_model/README.md
+++ b/examples/language_model/README.md
@@ -12,7 +12,7 @@ Model | Description | Dataset | Download
 
 ## Example usage
 
-Sampling from a language model using PyTorch Hub:
+To sample from a language model using PyTorch Hub:
 
 ```python
 import torch
@@ -25,6 +25,12 @@ en_lm = torch.hub.load('pytorch/fairseq', 'transformer_lm.wmt19.en', tokenizer='
 # Sample from the language model
 en_lm.sample('Barack Obama', beam=1, sampling=True, sampling_topk=10, temperature=0.8)
 # "Barack Obama is coming to Sydney and New Zealand (...)"
+
+# The same interface can be used with custom models as well
+from fairseq.models.transformer_lm import TransformerLanguageModel
+custom_lm = TransformerLanguageModel.from_pretrained('/path/to/model/dir', 'checkpoint100.pt', tokenizer='moses', bpe='fastbpe')
+custom_lm.sample('Barack Obama', beam=5)
+# "Barack Obama (...)"
 ```
 
 ## Training a transformer language model with the CLI tools
diff --git a/examples/roberta/README.md b/examples/roberta/README.md
index e4d9e4fee1..9006e4f193 100644
--- a/examples/roberta/README.md
+++ b/examples/roberta/README.md
@@ -76,6 +76,13 @@ Model | Accuracy
 ---|---
 `roberta.large` | 78.1
 
+**[XNLI (Conneau et al., 2018)](https://arxiv.org/abs/1809.05053)**
+_(TRANSLATE-TEST)_
+
+Model | en | fr | es | de | el | bg | ru | tr | ar | vi | th | zh | hi | sw | ur
+---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
+`roberta.large.mnli` | 91.3 | 82.91 | 84.27 | 81.24 | 81.74 | 83.13 | 78.28 | 76.79 | 76.64 | 74.17 | 74.05 | 77.5 | 70.9 | 66.65 | 66.81
+
 ## Example usage
 
 ##### Load RoBERTa from torch.hub (PyTorch >= 1.1):
diff --git a/examples/roberta/README.pretraining.md b/examples/roberta/README.pretraining.md
index 527d4a2e57..43bdf17676 100644
--- a/examples/roberta/README.pretraining.md
+++ b/examples/roberta/README.pretraining.md
@@ -54,7 +54,7 @@ PEAK_LR=0.0005          # Peak learning rate, adjust as needed
 TOKENS_PER_SAMPLE=512   # Max sequence length
 MAX_POSITIONS=512       # Num. positional embeddings (usually same as above)
 MAX_SENTENCES=16        # Number of sequences per batch (batch size)
-UPDATE_FREQ=16          # Increase the batch size 16x 
+UPDATE_FREQ=16          # Increase the batch size 16x
 
 DATA_DIR=data-bin/wikitext-103
 
diff --git a/fairseq/tasks/tagged_language_modeling.py b/fairseq/tasks/tagged_language_modeling.py
deleted file mode 100644
index 3c49ef7664..0000000000
--- a/fairseq/tasks/tagged_language_modeling.py
+++ /dev/null
@@ -1,164 +0,0 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
-
-import numpy as np
-
-import os
-
-from fairseq.data import (
-    ConcatDataset,
-    data_utils,
-    MonolingualDataset,
-    PrependDataset,
-    ReplaceDataset,
-    ShardedDataset,
-    SubsampleDataset,
-    TokenBlockDataset,
-)
-from fairseq.tasks import register_task
-
-from fairseq.tasks.language_modeling import LanguageModelingTask
-
-
-@register_task("tagged_language_modeling")
-class TaggedLanguageModelingTask(LanguageModelingTask):
-    """
-    Like the language modeling task, but prepends tags to each sample
-    """
-
-    @staticmethod
-    def add_args(parser):
-        """Add task-specific arguments to the parser."""
-        LanguageModelingTask.add_args(parser)
-        parser.add_argument(
-            "--multiple-datasets",
-            action="store_true",
-            help="if set, treats paths in data as separate datasets to be combined, "
-            "rather than as splits of a single dataset",
-        )
-        parser.add_argument(
-            "--prepend-ds-name",
-            action="store_true",
-            help="if set and multiple-datasets is also set, prepends the name of the ds instead of "
-            "bos/eos token",
-        )
-        parser.add_argument(
-            "--generic-ds-name-chance",
-            type=float,
-            metavar="P",
-            default=0,
-            help='if multiple datasets is used, sets the prepended ds name to "generic" '
-            "this percentage of time",
-        )
-        parser.add_argument(
-            "--subsample-splits",
-            type=str,
-            metavar="SPLITS",
-            default="valid",
-            help="if multiple datasets is used, subsamples specified split(colon separated) to "
-            "the size of the smallest split",
-        )
-
-    def __init__(self, args, dictionary, output_dictionary=None, targets=None):
-        super().__init__(args, dictionary, output_dictionary, targets)
-        self.subsample_splits = (
-            set()
-            if args.subsample_splits is None
-            else set(args.subsample_splits.split(":"))
-        )
-
-    def make_prepended_ds(self, dataset):
-        def ds_name(dataset, index):
-            if (
-                self.args.generic_ds_name_chance > 0
-                and np.random.rand() <= self.args.generic_ds_name_chance
-            ):
-                ds_name = "generic"
-            else:
-                ds_name = dataset.attr("name", index)
-                assert ds_name is not None
-            return self.dictionary.indices[ds_name]
-
-        dataset = PrependDataset(
-            dataset, prepend_getter=ds_name, ensure_first_token_is=self.dictionary.eos()
-        )
-        return dataset
-
-    def load_dataset(self, split, epoch=0, combine=False, **kwargs):
-        """Load a given dataset split.
-
-        Args:
-            split (str): name of the split (e.g., train, valid, test)
-        """
-        paths = self.args.data.split(":")
-        assert len(paths) > 0
-
-        if self.args.multiple_datasets:
-            if len(paths) == 1:
-                paths = [os.path.join(paths[0], p) for p in next(os.walk(paths[0]))[1]]
-            datasets = [
-                ShardedDataset(
-                    self.dictionary,
-                    self.args.dataset_impl,
-                    path,
-                    split,
-                    epoch,
-                    combine=combine,
-                )
-                for path in paths
-            ]
-
-            if split in self.subsample_splits:
-                sizes = [sum(d.sizes) for d in datasets]
-                min_sz = min(sizes)
-                ratios = [min_sz / sz for sz in sizes]
-                datasets = [
-                    SubsampleDataset(d, r) if r < 1 else d
-                    for d, r in zip(datasets, ratios)
-                ]
-
-            dataset = ConcatDataset(datasets)
-        else:
-            data_path = paths[epoch % len(paths)]
-            split_path = os.path.join(data_path, split)
-
-            dataset = data_utils.load_indexed_dataset(
-                split_path, self.dictionary, self.args.dataset_impl, combine=combine
-            )
-            if dataset is None:
-                raise FileNotFoundError(
-                    "Dataset not found: {} ({})".format(split, split_path)
-                )
-
-        dataset = TokenBlockDataset(
-            dataset,
-            dataset.sizes,
-            self.args.tokens_per_sample,
-            pad=self.dictionary.pad(),
-            eos=self.dictionary.eos(),
-            break_mode=self.args.sample_break_mode,
-            include_targets=True,
-        )
-
-        if self.args.prepend_ds_name:
-            dataset = self.make_prepended_ds(dataset)
-
-        dataset = ReplaceDataset(dataset, { self.dictionary.eos(): self.dictionary.indices['\\n'] }, offset=1)
-
-        add_eos_for_other_targets = (
-            self.args.sample_break_mode is not None
-            and self.args.sample_break_mode != "none"
-        )
-
-        self.datasets[split] = MonolingualDataset(
-            dataset,
-            dataset.sizes,
-            self.dictionary,
-            self.output_dictionary,
-            add_eos_for_other_targets=add_eos_for_other_targets,
-            shuffle=True,
-            targets=self.targets,
-            add_bos_token=self.args.add_bos_token,
-        )