forked from facebookresearch/fairseq
Add code to realign RoBERTa features to word-level tokenizers
Summary: Pull Request resolved: fairinternal/fairseq-py#805
Differential Revision: D16670825
Pulled By: myleott
fbshipit-source-id: 872a1a0274681a34d54bda00bfcfcda2e94144c6
1 parent e40e4b2, commit 2b7843d
Showing 4 changed files with 170 additions and 2 deletions.
@@ -0,0 +1,115 @@
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

from collections import Counter
from typing import List

import torch


def align_bpe_to_words(roberta, bpe_tokens: torch.LongTensor, other_tokens: List[str]):
    """
    Helper to align GPT-2 BPE to other tokenization formats (e.g., spaCy).

    Args:
        roberta (RobertaHubInterface): RoBERTa instance
        bpe_tokens (torch.LongTensor): GPT-2 BPE tokens of shape `(T_bpe)`
        other_tokens (List[str]): other tokens of shape `(T_words)`

    Returns:
        List[List[int]]: mapping from *other_tokens* to the corresponding
            *bpe_tokens* indices.
    """
    assert bpe_tokens.dim() == 1

    def clean(text):
        return text.strip()

    # remove whitespaces to simplify alignment
    bpe_tokens = [roberta.task.source_dictionary.string([x]) for x in bpe_tokens]
    bpe_tokens = [clean(roberta.bpe.decode(x) if x not in {'<s>', ''} else x) for x in bpe_tokens]
    other_tokens = [clean(str(o)) for o in other_tokens]

    # strip leading <s>
    assert bpe_tokens[0] == '<s>'
    bpe_tokens = bpe_tokens[1:]
    assert ''.join(bpe_tokens) == ''.join(other_tokens)

    # create alignment from every word to a list of BPE tokens
    alignment = []
    bpe_toks = filter(lambda item: item[1] != '', enumerate(bpe_tokens, start=1))
    j, bpe_tok = next(bpe_toks)
    for other_tok in other_tokens:
        bpe_indices = []
        while True:
            if other_tok.startswith(bpe_tok):
                bpe_indices.append(j)
                other_tok = other_tok[len(bpe_tok):]
                try:
                    j, bpe_tok = next(bpe_toks)
                except StopIteration:
                    j, bpe_tok = None, None
            elif bpe_tok.startswith(other_tok):
                # other_tok spans multiple BPE tokens
                bpe_indices.append(j)
                bpe_tok = bpe_tok[len(other_tok):]
                other_tok = ''
            else:
                raise Exception('Cannot align "{}" and "{}"'.format(other_tok, bpe_tok))
            if other_tok == '':
                break
        assert len(bpe_indices) > 0
        alignment.append(bpe_indices)
    assert len(alignment) == len(other_tokens)

    return alignment
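As a rough usage sketch (not part of the diff above; it assumes the public `roberta.base` torch.hub entry point), the alignment can be computed from the BPE tokens returned by `encode()` and a hand-written word-level tokenization of the same sentence:

# Illustrative sketch; assumes the 'roberta.base' torch.hub model is available.
import torch

roberta = torch.hub.load('pytorch/fairseq', 'roberta.base')
roberta.eval()

bpe_tokens = roberta.encode('I said, "hello world!"')        # LongTensor of shape (T_bpe,), with <s> and </s>
words = ['I', 'said', ',', '"', 'hello', 'world', '!', '"']  # word-level tokens covering the same characters
alignment = align_bpe_to_words(roberta, bpe_tokens, words)
# alignment[i] is the list of BPE positions covering words[i]; a word split
# into several BPE pieces maps to several indices.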
def align_features_to_words(roberta, features, alignment):
    """
    Align given features to words.

    Args:
        roberta (RobertaHubInterface): RoBERTa instance
        features (torch.Tensor): features to align of shape `(T_bpe x C)`
        alignment: alignment between BPE tokens and words returned by
            func:`align_bpe_to_words`.
    """
    assert features.dim() == 2

    bpe_counts = Counter(j for bpe_indices in alignment for j in bpe_indices)
    assert bpe_counts[0] == 0  # <s> shouldn't be aligned
    denom = features.new([bpe_counts.get(j, 1) for j in range(len(features))])
    weighted_features = features / denom.unsqueeze(-1)

    output = [weighted_features[0]]
    largest_j = -1
    for bpe_indices in alignment:
        output.append(weighted_features[bpe_indices].sum(dim=0))
        largest_j = max(largest_j, *bpe_indices)
    for j in range(largest_j + 1, len(features)):
        output.append(weighted_features[j])
    output = torch.stack(output)
    assert torch.all(torch.abs(output.sum(dim=0) - features.sum(dim=0)) < 1e-4)
    return output
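Because this function only reads `features` and `alignment` (the `roberta` argument is unused in the body above), its pooling behaviour can be sketched with made-up inputs; the toy tensors below are purely illustrative:

# Toy illustration with made-up inputs; `roberta` is not needed here.
import torch

toy_features = torch.arange(12, dtype=torch.float).view(6, 2)  # (T_bpe=6, C=2): <s>, 4 BPE tokens, </s>
toy_alignment = [[1], [2, 3], [4]]  # word 0 -> BPE 1, word 1 -> BPE 2+3, word 2 -> BPE 4
pooled = align_features_to_words(None, toy_features, toy_alignment)
print(pooled.shape)  # torch.Size([5, 2]): <s>, three word vectors, trailing </s>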
def spacy_nlp():
    if getattr(spacy_nlp, '_nlp', None) is None:
        try:
            from spacy.lang.en import English
            spacy_nlp._nlp = English()
        except ImportError:
            raise ImportError('Please install spacy with: pip install spacy')
    return spacy_nlp._nlp


def spacy_tokenizer():
    if getattr(spacy_tokenizer, '_tokenizer', None) is None:
        try:
            nlp = spacy_nlp()
            spacy_tokenizer._tokenizer = nlp.Defaults.create_tokenizer(nlp)
        except ImportError:
            raise ImportError('Please install spacy with: pip install spacy')
    return spacy_tokenizer._tokenizer
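Putting the helpers together, a minimal end-to-end sketch (again illustrative only; it assumes the `roberta.base` hub model and an installed spaCy) tokenizes a sentence with `spacy_tokenizer()`, extracts per-BPE-token features, and pools them into one vector per word:

# End-to-end illustration; assumes 'roberta.base' from torch.hub and spaCy installed.
import torch

roberta = torch.hub.load('pytorch/fairseq', 'roberta.base')
roberta.eval()

sentence = 'RoBERTa features can be realigned to word-level tokens.'
words = [str(tok) for tok in spacy_tokenizer()(sentence)]    # word-level tokens
bpe_tokens = roberta.encode(sentence)                        # (T_bpe,) LongTensor

alignment = align_bpe_to_words(roberta, bpe_tokens, words)
features = roberta.extract_features(bpe_tokens).squeeze(0)   # (T_bpe, C)
word_features = align_features_to_words(roberta, features, alignment)
# word_features has one row for <s>, one row per spaCy token, and rows for
# trailing special tokens such as </s>.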