From cce6dcb1cca85955a82879ea5064fe8202e8f412 Mon Sep 17 00:00:00 2001
From: Simone Francia
Date: Fri, 24 Jan 2020 09:58:19 -0800
Subject: [PATCH] Add support for Italian UmBERTo (umberto.commoncrawl and
 umberto.wikipedia) from Musixmatch (#1008)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Summary:
Pull Request resolved: https://github.com/fairinternal/fairseq-py/pull/1008

# Before submitting

- [ ] Was this discussed/approved via a Github issue? (no need for typos, doc improvements)
- [ ] Did you read the [contributor guideline](https://github.com/pytorch/fairseq/blob/master/CONTRIBUTING.md)?
- [ ] Did you make sure to update the docs?
- [ ] Did you write any new necessary tests?

## What does this PR do?
Fixes # (issue).

## PR review
Anyone in the community is free to review the PR once the tests have passed.
If we didn't discuss your PR in Github issues, there's a high chance it will
not be merged.

## Did you have fun?
Make sure you had fun coding 🙃

Pull Request resolved: https://github.com/pytorch/fairseq/pull/1639

Differential Revision: D19555377

Pulled By: myleott

fbshipit-source-id: 8ef2b6635a2c609f6ed7dd8ba403eba0787590d8
---
 examples/roberta/README.md                |  5 ++-
 fairseq/models/roberta/model.py           | 48 ------------------------
 fairseq/models/roberta/model_camembert.py | 36 +++++++++++++++++++
 fairseq/models/roberta/model_xlmr.py      | 37 ++++++++++++++++++++
 4 files changed, 76 insertions(+), 50 deletions(-)
 create mode 100644 fairseq/models/roberta/model_camembert.py
 create mode 100644 fairseq/models/roberta/model_xlmr.py

diff --git a/examples/roberta/README.md b/examples/roberta/README.md
index 0c603f2002..f9250f2ea4 100644
--- a/examples/roberta/README.md
+++ b/examples/roberta/README.md
@@ -8,8 +8,9 @@ RoBERTa iterates on BERT's pretraining procedure, including training the model l
 
 ### What's New:
 
-- November 2019: French model (CamemBERT) is available [CamemBERT](https://github.com/pytorch/fairseq/tree/master/examples/camembert).
-- November 2019: Multilingual encoder (XLM-RoBERTa) is available [XLM-R](https://github.com/pytorch/fairseq/tree/master/examples/xlmr).
+- January 2020: Italian model (UmBERTo) is available from [Musixmatch Research](https://github.com/musixmatchresearch): [UmBERTo](https://github.com/musixmatchresearch/umberto).
+- November 2019: French model (CamemBERT) is available: [CamemBERT](https://github.com/pytorch/fairseq/tree/master/examples/camembert).
+- November 2019: Multilingual encoder (XLM-RoBERTa) is available: [XLM-R](https://github.com/pytorch/fairseq/tree/master/examples/xlmr).
 - September 2019: TensorFlow and TPU support via the [transformers library](https://github.com/huggingface/transformers).
 - August 2019: RoBERTa is now supported in the [pytorch-transformers library](https://github.com/huggingface/pytorch-transformers).
 - August 2019: Added [tutorial for finetuning on WinoGrande](https://github.com/pytorch/fairseq/tree/master/examples/roberta/wsc#roberta-training-on-winogrande-dataset).
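
For context, a minimal usage sketch (editor's illustration, not part of the
patch): the names announced above ('xlmr.base', 'xlmr.large', 'camembert.v0')
are exposed through torch.hub just like the existing RoBERTa checkpoints. The
input sentence is illustrative, and the sentencepiece package must be
installed since these models use sentencepiece BPE.

    import torch

    # 'xlmr.large' matches the hub_models() key registered further down in
    # this patch; torch.hub downloads and caches the archive on first use.
    xlmr = torch.hub.load('pytorch/fairseq', 'xlmr.large')
    xlmr.eval()  # disable dropout so feature extraction is deterministic

    tokens = xlmr.encode('Hello world!')      # LongTensor of sentencepiece ids
    features = xlmr.extract_features(tokens)  # (1, seq_len, 1024) for xlmr.large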
diff --git a/fairseq/models/roberta/model.py b/fairseq/models/roberta/model.py
index 925298c204..95a5ff33de 100644
--- a/fairseq/models/roberta/model.py
+++ b/fairseq/models/roberta/model.py
@@ -199,53 +199,6 @@ def upgrade_state_dict_named(self, state_dict, name):
                 state_dict[prefix + 'classification_heads.' + k] = v
 
 
-@register_model('xlmr')
-class XLMRModel(RobertaModel):
-    @classmethod
-    def hub_models(cls):
-        return {
-            'xlmr.base': 'http://dl.fbaipublicfiles.com/fairseq/models/xlmr.base.tar.gz',
-            'xlmr.large': 'http://dl.fbaipublicfiles.com/fairseq/models/xlmr.large.tar.gz',
-        }
-
-    @classmethod
-    def from_pretrained(cls, model_name_or_path, checkpoint_file='model.pt', data_name_or_path='.', bpe='sentencepiece', **kwargs):
-        from fairseq import hub_utils
-        x = hub_utils.from_pretrained(
-            model_name_or_path,
-            checkpoint_file,
-            data_name_or_path,
-            archive_map=cls.hub_models(),
-            bpe=bpe,
-            load_checkpoint_heads=True,
-            **kwargs,
-        )
-        return RobertaHubInterface(x['args'], x['task'], x['models'][0])
-
-
-@register_model('camembert')
-class CamembertModel(RobertaModel):
-    @classmethod
-    def hub_models(cls):
-        return {
-            'camembert.v0': 'http://dl.fbaipublicfiles.com/fairseq/models/camembert.v0.tar.gz',
-        }
-
-    @classmethod
-    def from_pretrained(cls, model_name_or_path, checkpoint_file='model.pt', data_name_or_path='.', bpe='sentencepiece', **kwargs):
-        from fairseq import hub_utils
-        x = hub_utils.from_pretrained(
-            model_name_or_path,
-            checkpoint_file,
-            data_name_or_path,
-            archive_map=cls.hub_models(),
-            bpe=bpe,
-            load_checkpoint_heads=True,
-            **kwargs,
-        )
-        return RobertaHubInterface(x['args'], x['task'], x['models'][0])
-
-
 class RobertaLMHead(nn.Module):
     """Head for masked language modeling."""
 
@@ -413,5 +366,4 @@ def xlm_architecture(args):
     args.encoder_embed_dim = getattr(args, 'encoder_embed_dim', 1280)
     args.encoder_ffn_embed_dim = getattr(args, 'encoder_ffn_embed_dim', 1280*4)
     args.encoder_attention_heads = getattr(args, 'encoder_attention_heads', 16)
-
     base_architecture(args)

diff --git a/fairseq/models/roberta/model_camembert.py b/fairseq/models/roberta/model_camembert.py
new file mode 100644
index 0000000000..b62e3e3197
--- /dev/null
+++ b/fairseq/models/roberta/model_camembert.py
@@ -0,0 +1,36 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+"""
+CamemBERT: a Tasty French Language Model
+"""
+
+from fairseq.models import register_model
+
+from .hub_interface import RobertaHubInterface
+from .model import RobertaModel
+
+
+@register_model('camembert')
+class CamembertModel(RobertaModel):
+
+    @classmethod
+    def hub_models(cls):
+        return {
+            'camembert.v0': 'http://dl.fbaipublicfiles.com/fairseq/models/camembert.v0.tar.gz',
+        }
+
+    @classmethod
+    def from_pretrained(cls, model_name_or_path, checkpoint_file='model.pt', data_name_or_path='.', bpe='sentencepiece', **kwargs):
+        from fairseq import hub_utils
+        x = hub_utils.from_pretrained(
+            model_name_or_path,
+            checkpoint_file,
+            data_name_or_path,
+            archive_map=cls.hub_models(),
+            bpe=bpe,
+            load_checkpoint_heads=True,
+            **kwargs,
+        )
+        return RobertaHubInterface(x['args'], x['task'], x['models'][0])
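
A minimal sketch of exercising the class above directly (assuming the
camembert.v0 archive has been downloaded and extracted; the local path is a
placeholder). Note that from_pretrained returns a RobertaHubInterface rather
than the model class itself, so the usual helpers (encode, fill_mask,
extract_features) are available on the result.

    # Import straight from the new module added by this patch.
    from fairseq.models.roberta.model_camembert import CamembertModel

    camembert = CamembertModel.from_pretrained('/path/to/camembert.v0')
    camembert.eval()  # disable dropout

    # Query the top-3 completions for the masked token.
    results = camembert.fill_mask('Le camembert est <mask> :)', topk=3)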
+""" +Unsupervised Cross-lingual Representation Learning at Scale +""" + +from fairseq.models import register_model + +from .hub_interface import RobertaHubInterface +from .model import RobertaModel + + +@register_model('xlmr') +class XLMRModel(RobertaModel): + + @classmethod + def hub_models(cls): + return { + 'xlmr.base': 'http://dl.fbaipublicfiles.com/fairseq/models/xlmr.base.tar.gz', + 'xlmr.large': 'http://dl.fbaipublicfiles.com/fairseq/models/xlmr.large.tar.gz', + } + + @classmethod + def from_pretrained(cls, model_name_or_path, checkpoint_file='model.pt', data_name_or_path='.', bpe='sentencepiece', **kwargs): + from fairseq import hub_utils + x = hub_utils.from_pretrained( + model_name_or_path, + checkpoint_file, + data_name_or_path, + archive_map=cls.hub_models(), + bpe=bpe, + load_checkpoint_heads=True, + **kwargs, + ) + return RobertaHubInterface(x['args'], x['task'], x['models'][0])