[WIP] Safe bpe dropout for LM + joiner disjoin dropout #2009
base: master
@@ -1,6 +1,8 @@
 from onmt.utils.logging import logger
 from onmt.transforms import register_transform
 from .transform import Transform
+import random
+from onmt.constants import ModelTask, SubwordMarker


 @register_transform(name='filtertoolong')
@@ -41,6 +43,84 @@ def _repr_args(self):
         )


@register_transform(name="joiner_dropout") | ||
class JoinerDropoutTransform(Transform): | ||
"""Disjoin joiner with probability dropout""" | ||
|
||
def __init__(self, opts): | ||
super().__init__(opts) | ||
|
||
@classmethod | ||
def add_options(cls, parser): | ||
"""Avalilable options relate to this Transform.""" | ||
group = parser.add_argument_group("Transform/JoinerDropout") | ||
group.add( | ||
"--src_joiner_dropout", | ||
"-src_joiner_dropout", | ||
type=float, | ||
default=0.0, | ||
help="Source dropout probability.", | ||
) | ||
group.add( | ||
"--tgt_joiner_dropout", | ||
"-tgt_joiner_dropout", | ||
type=float, | ||
default=0.0, | ||
help="Target dropout probability.", | ||
) | ||
|
||
def _parse_opts(self): | ||
self.src_joiner_dropout = self.opts.src_joiner_dropout | ||
self.tgt_joiner_dropout = self.opts.tgt_joiner_dropout | ||
|
||
def dropout_separate_joiner(self, seq, side="src"): | ||
out_seq = [] | ||
dropout = ( | ||
self.src_joiner_dropout | ||
if side == "src" | ||
else self.tgt_joiner_dropout | ||
) | ||
for elem in seq: | ||
if elem == SubwordMarker.JOINER: | ||
continue | ||
if elem.startswith(SubwordMarker.JOINER): | ||
if random.random() < dropout: | ||
out_seq.append(SubwordMarker.JOINER) | ||
elem = elem[1:] | ||
if len(elem) > 1 and elem.endswith(SubwordMarker.JOINER): | ||
if random.random() < dropout: | ||
out_seq.append(elem[:-1]) | ||
out_seq.append(SubwordMarker.JOINER) | ||
else: | ||
out_seq.append(elem) | ||
else: | ||
out_seq.append(elem) | ||
return out_seq | ||
|
||
def apply(self, example, is_train=False, stats=None, **kwargs): | ||
"""Return None if too long else return as is.""" | ||
if not is_train: | ||
return example | ||
else: | ||
src_out = self.dropout_separate_joiner(example["src"], "src") | ||
example["src"] = src_out | ||
if self.opts.model_task == ModelTask.LANGUAGE_MODEL: | ||
Review comment: Currently this won't work on build_vocab, model_task is a model parameter and not a dynamic corpus parameter.
+                example["tgt"] = src_out
+            else:
+                tgt_out = self.dropout_separate_joiner(example["tgt"], "tgt")
+                example["tgt"] = tgt_out
+            return example
+
+    def _repr_args(self):
+        """Return str represent key arguments for class."""
+        return "{}={}, {}={}".format(
+            "src_joiner_dropout",
+            self.src_joiner_dropout,
+            "tgt_joiner_dropout",
+            self.tgt_joiner_dropout,
+        )
+
+
 @register_transform(name='prefix')
 class PrefixTransform(Transform):
     """Add Prefix to src (& tgt) sentence."""
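As a side note for readers of this diff: the sketch below is not part of the PR. It reproduces the core of `dropout_separate_joiner` as a standalone function, assuming `SubwordMarker.JOINER` is the "￭" joiner character that pyonmttok's joiner annotation uses. With probability `dropout`, a joiner attached to a subword is split off into a standalone token, which regularizes how the joiner is seen by the model.

```python
import random

JOINER = "￭"  # assumed value of SubwordMarker.JOINER (pyonmttok's default joiner)


def disjoin_dropout(seq, dropout=0.5):
    """Standalone sketch of dropout_separate_joiner: with probability
    `dropout`, detach a leading or trailing joiner into its own token."""
    out = []
    for tok in seq:
        if tok == JOINER:
            continue  # bare joiner tokens are skipped, as in the transform
        if tok.startswith(JOINER) and random.random() < dropout:
            out.append(JOINER)  # emit the leading joiner on its own
            tok = tok[1:]
        if len(tok) > 1 and tok.endswith(JOINER) and random.random() < dropout:
            out.extend([tok[:-1], JOINER])  # emit the trailing joiner on its own
        else:
            out.append(tok)
    return out


random.seed(1)
print(disjoin_dropout(["un", "￭believ", "￭able", "￭."]))
# ['un', '￭', 'believ', '￭able', '￭.']  (the first joiner was detached)
```

A standalone joiner is what pyonmttok's joiner_new mode produces, so detokenization should still glue the pieces back together, although, as the closing review comment notes, it is then less obvious which side the joiner was originally attached to.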
@@ -2,6 +2,7 @@
 from onmt.utils.logging import logger
 from onmt.transforms import register_transform
 from .transform import Transform
+from onmt.constants import ModelTask


 class TokenizerTransform(Transform):
@@ -90,6 +91,7 @@ def _parse_opts(self):
         self.tgt_subword_vocab = self.opts.tgt_subword_vocab
         self.src_vocab_threshold = self.opts.src_vocab_threshold
         self.tgt_vocab_threshold = self.opts.tgt_vocab_threshold
+        self.model_task = self.opts.model_task

     def _repr_args(self):
         """Return str represent key arguments for TokenizerTransform."""
@@ -169,7 +171,10 @@ def _tokenize(self, tokens, side='src', is_train=False):
     def apply(self, example, is_train=False, stats=None, **kwargs):
         """Apply sentencepiece subword encode to src & tgt."""
         src_out = self._tokenize(example['src'], 'src', is_train)
-        tgt_out = self._tokenize(example['tgt'], 'tgt', is_train)
+        if self.model_task == ModelTask.LANGUAGE_MODEL:
+            tgt_out = src_out
+        else:
+            tgt_out = self._tokenize(example['tgt'], 'tgt', is_train)
         if stats is not None:
             n_words = len(example['src']) + len(example['tgt'])
             n_subwords = len(src_out) + len(tgt_out)
@@ -243,7 +248,10 @@ def _tokenize(self, tokens, side='src', is_train=False):
     def apply(self, example, is_train=False, stats=None, **kwargs):
         """Apply bpe subword encode to src & tgt."""
         src_out = self._tokenize(example['src'], 'src', is_train)
-        tgt_out = self._tokenize(example['tgt'], 'tgt', is_train)
+        if self.model_task == ModelTask.LANGUAGE_MODEL:
+            tgt_out = src_out
+        else:
+            tgt_out = self._tokenize(example['tgt'], 'tgt', is_train)
         if stats is not None:
             n_words = len(example['src']) + len(example['tgt'])
             n_subwords = len(src_out) + len(tgt_out)
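Both hunks above make the target reuse the source tokenization when training a language model. The reason, not spelled out in the diff, is that subword regularization is stochastic: tokenizing the same sentence twice with SentencePiece sampling or BPE dropout can yield two different segmentations, while an LM example needs tgt to be the exact same token sequence as src. The toy sketch below uses a made-up random segmenter standing in for the real tokenizer to show the failure mode that `tgt_out = src_out` avoids.

```python
import random


def toy_stochastic_segment(word, p=0.5):
    """Toy stand-in for a sampling tokenizer (BPE dropout / SentencePiece
    sampling): randomly splits a word in two to mimic varying segmentations."""
    if len(word) > 1 and random.random() < p:
        cut = len(word) // 2
        return [word[:cut], "￭" + word[cut:]]
    return [word]


sent = ["tokenization", "varies"]
src_out = [sub for w in sent for sub in toy_stochastic_segment(w)]
tgt_out = [sub for w in sent for sub in toy_stochastic_segment(w)]
# src_out and tgt_out can disagree, e.g. ['tokeni', '￭zation', 'varies'] vs
# ['tokenization', 'var', '￭ies'], so the example would no longer have
# tgt == src; reusing src_out for tgt_out keeps both sides identical.
```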
@@ -327,7 +335,7 @@ def get_specials(cls, opts):
             tgt_specials.update(_case_specials)
         return (set(), set())

-    def _get_subword_kwargs(self, side='src'):
+    def _get_subword_kwargs(self, side='src', is_train=False):
         """Return a dict containing kwargs relate to `side` subwords."""
         subword_type = self.tgt_subword_type if side == 'tgt' \
             else self.src_subword_type
@@ -340,7 +348,7 @@ def _get_subword_kwargs(self, side='src'):
         kwopts = dict()
         if subword_type == 'bpe':
             kwopts['bpe_model_path'] = subword_model
-            kwopts['bpe_dropout'] = subword_alpha
+            kwopts['bpe_dropout'] = subword_alpha if is_train else 0
         elif subword_type == 'sentencepiece':
             kwopts['sp_model_path'] = subword_model
             kwopts['sp_nbest_size'] = subword_nbest
Review comment: You can directly reassign on
@@ -360,42 +368,65 @@ def warm_up(self, vocabs=None):
         """Initialize Tokenizer models."""
         super().warm_up(None)
         import pyonmttok
-        src_subword_kwargs = self._get_subword_kwargs(side='src')
+        src_subword_kwargs = self._get_subword_kwargs(
+            side="src", is_train=True
+        )
+        valid_src_subword_kwargs = self._get_subword_kwargs(
+            side="src", is_train=False
+        )
         src_tokenizer = pyonmttok.Tokenizer(
             **src_subword_kwargs, **self.src_other_kwargs
         )
-        tgt_subword_kwargs = self._get_subword_kwargs(side='tgt')
-        _diff_vocab = (
-            src_subword_kwargs.get('vocabulary_path', '') !=
-            tgt_subword_kwargs.get('vocabulary_path', '') or
-            src_subword_kwargs.get('vocabulary_threshold', 0) !=
-            tgt_subword_kwargs.get('vocabulary_threshold', 0))
Review comment on lines -368 to -372: I personally prefer the current one, which seems more readable.
+        valid_src_tokenizer = pyonmttok.Tokenizer(
+            **valid_src_subword_kwargs, **self.src_other_kwargs
+        )
+        tgt_subword_kwargs = self._get_subword_kwargs(
+            side="tgt", is_train=True
+        )
+        _diff_vocab = src_subword_kwargs.get(
+            "vocabulary_path", ""
+        ) != tgt_subword_kwargs.get(
+            "vocabulary_path", ""
+        ) or src_subword_kwargs.get(
+            "vocabulary_threshold", 0
+        ) != tgt_subword_kwargs.get(
+            "vocabulary_threshold", 0
+        )
         if self.share_vocab and not _diff_vocab:
             self.load_models = {
-                'src': src_tokenizer,
-                'tgt': src_tokenizer
+                "src": {"train": src_tokenizer, "valid": valid_src_tokenizer},
+                "tgt": {"train": src_tokenizer, "valid": valid_src_tokenizer},
             }
         else:
-            tgt_subword_kwargs = self._get_subword_kwargs(side='tgt')
             tgt_tokenizer = pyonmttok.Tokenizer(
                 **tgt_subword_kwargs, **self.tgt_other_kwargs
             )
+            valid_tgt_subword_kwargs = self._get_subword_kwargs(
+                side="tgt", is_train=False
+            )
+            valid_tgt_tokenizer = pyonmttok.Tokenizer(
+                **valid_tgt_subword_kwargs, **self.tgt_other_kwargs
+            )
             self.load_models = {
-                'src': src_tokenizer,
-                'tgt': tgt_tokenizer
+                "src": {"train": src_tokenizer, "valid": valid_src_tokenizer},
+                "tgt": {"train": tgt_tokenizer, "valid": valid_tgt_tokenizer},
             }

     def _tokenize(self, tokens, side='src', is_train=False):
         """Do OpenNMT Tokenizer's tokenize."""
-        tokenizer = self.load_models[side]
+        tokenizer = self.load_models[side]['train' if is_train else 'valid']
         sentence = ' '.join(tokens)
         segmented, _ = tokenizer.tokenize(sentence)
         return segmented

     def apply(self, example, is_train=False, stats=None, **kwargs):
         """Apply OpenNMT Tokenizer to src & tgt."""
-        src_out = self._tokenize(example['src'], 'src')
-        tgt_out = self._tokenize(example['tgt'], 'tgt')
+        src_out = self._tokenize(example['src'], 'src', is_train)
+        if self.model_task == ModelTask.LANGUAGE_MODEL:
+            tgt_out = src_out
+        else:
+            tgt_out = self._tokenize(example['tgt'], 'tgt', is_train)
         if stats is not None:
             n_words = len(example['src']) + len(example['tgt'])
             n_subwords = len(src_out) + len(tgt_out)
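The warm_up/_tokenize changes above are what makes the BPE dropout "safe": each side now carries a pair of tokenizers, and the dropout-enabled one is only used when is_train is True, so validation and inference always get a deterministic segmentation. Below is a minimal standalone sketch of the same idea; the model path and tokenization mode are placeholders, not values taken from the PR.

```python
import pyonmttok

bpe_model = "subwords.bpe"  # hypothetical BPE model path

# The training tokenizer samples segmentations; the validation one is deterministic.
train_tok = pyonmttok.Tokenizer(
    "conservative", bpe_model_path=bpe_model, bpe_dropout=0.1, joiner_annotate=True
)
valid_tok = pyonmttok.Tokenizer(
    "conservative", bpe_model_path=bpe_model, bpe_dropout=0, joiner_annotate=True
)

load_models = {"src": {"train": train_tok, "valid": valid_tok}}


def tokenize(tokens, side="src", is_train=False):
    """Pick the train or valid tokenizer, as _tokenize does above."""
    tokenizer = load_models[side]["train" if is_train else "valid"]
    segmented, _ = tokenizer.tokenize(" ".join(tokens))
    return segmented
```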
Review comment: Not sure about the necessity of doing both right and left token sides. It might create a difficulty to retrieve the initial token when detokenizing. I can use a special token to distinguish left and right, or remove the left-side disjoin, which doesn't occur much (mainly in punctuation).