From 7e34c1ac4c0a5b2107e7a5f098d716577b21006c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fran=C3=A7ois=20Hernandez?=
Date: Thu, 19 Nov 2020 14:43:32 +0100
Subject: [PATCH 1/2] add build_vocab.py at repo root

---
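Note: this mirrors the existing train.py and translate.py wrappers at the
repo root. A minimal usage sketch, with flags taken from the test script
updated in PATCH 2/2 (data/data.yaml is assumed to exist):

    python build_vocab.py \
        -config data/data.yaml -save_data data/data \
        -src_vocab data/data.vocab.src -tgt_vocab data/data.vocab.tgt \
        -overwrite true
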
 build_vocab.py | 6 ++++++
 1 file changed, 6 insertions(+)
 create mode 100644 build_vocab.py

diff --git a/build_vocab.py b/build_vocab.py
new file mode 100644
index 0000000000..577c2c1c9c
--- /dev/null
+++ b/build_vocab.py
@@ -0,0 +1,6 @@
+#!/usr/bin/env python
+from onmt.bin.build_vocab import main
+
+
+if __name__ == "__main__":
+    main()

From 6d7a5464a341488ed520a7422214ff9a9c974e53 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fran=C3=A7ois=20Hernandez?=
Date: Thu, 19 Nov 2020 15:25:11 +0100
Subject: [PATCH 2/2] update rebuild_test_models.sh for v2, fix an assert

---
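Note: the updated script reads data/data.yaml, which is not included in
this patch. A minimal sketch of what it is assumed to contain, modeled on
the data/morph_data.yaml added below and on the corpus paths from the
removed preprocess.py calls:

    # data/data.yaml (assumed, not part of this patch)
    data:
        corpus_1:
            path_src: data/src-train.txt
            path_tgt: data/tgt-train.txt
        valid:
            path_src: data/src-val.txt
            path_tgt: data/tgt-val.txt
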
 data/morph_data.yaml              |  8 +++
 onmt/model_builder.py             |  2 +-
 onmt/tests/rebuild_test_models.sh | 96 +++++++++++++++++++++----------
 3 files changed, 74 insertions(+), 32 deletions(-)
 create mode 100644 data/morph_data.yaml

diff --git a/data/morph_data.yaml b/data/morph_data.yaml
new file mode 100644
index 0000000000..b7c931d74d
--- /dev/null
+++ b/data/morph_data.yaml
@@ -0,0 +1,8 @@
+# Corpus opts:
+data:
+    corpus_1:
+        path_src: data/morph/src.train
+        path_tgt: data/morph/tgt.train
+    valid:
+        path_src: data/morph/src.valid
+        path_tgt: data/morph/tgt.valid
diff --git a/onmt/model_builder.py b/onmt/model_builder.py
index d5d0b12f6f..a1f4eb1289 100644
--- a/onmt/model_builder.py
+++ b/onmt/model_builder.py
@@ -144,7 +144,7 @@ def build_base_model(model_opt, fields, gpu, checkpoint=None, gpu_id=None):
     if model_opt.share_embeddings:
         # src/tgt vocab should be the same if `-share_vocab` is specified.
         assert src_field.base_field.vocab == tgt_field.base_field.vocab, \
-            "preprocess with -share_vocab if you use share_embeddings"
+            "-share_vocab is required if you use -share_embeddings"
 
         tgt_emb.word_lut.weight = src_emb.word_lut.weight
 
diff --git a/onmt/tests/rebuild_test_models.sh b/onmt/tests/rebuild_test_models.sh
index 8b40562c93..efdaaa07a3 100755
--- a/onmt/tests/rebuild_test_models.sh
+++ b/onmt/tests/rebuild_test_models.sh
@@ -3,60 +3,94 @@ my_python=python
 
 ############### TEST regular RNN choose either -rnn_type LSTM / GRU / SRU and set input_feed 0 for SRU
-if true; then
-rm data/*.pt
-$my_python preprocess.py -train_src data/src-train.txt -train_tgt data/tgt-train.txt -valid_src data/src-val.txt -valid_tgt data/tgt-val.txt -save_data data/data -src_vocab_size 1000 -tgt_vocab_size 1000
-
-$my_python train.py -data data/data -save_model tmp -world_size 1 -gpu_ranks 0 -rnn_size 256 -word_vec_size 256 -layers 1 -train_steps 10000 -optim adam -learning_rate 0.001 -rnn_type LSTM -input_feed 0
-#-truncated_decoder 5
-#-label_smoothing 0.1
+if false; then
+$my_python build_vocab.py \
+    -config data/data.yaml -save_data data/data \
+    -src_vocab data/data.vocab.src -tgt_vocab data/data.vocab.tgt \
+    -overwrite true
+$my_python train.py \
+    -config data/data.yaml -src_vocab data/data.vocab.src -tgt_vocab data/data.vocab.tgt \
+    -src_vocab_size 1000 -tgt_vocab_size 1000 \
+    -save_model tmp -world_size 1 -gpu_ranks 0 \
+    -rnn_type LSTM -input_feed 0 \
+    -rnn_size 256 -word_vec_size 256 \
+    -layers 1 -train_steps 10000 \
+    -optim adam -learning_rate 0.001
+    # -truncated_decoder 5
+    # -label_smoothing 0.1
 
 mv tmp*10000.pt onmt/tests/test_model.pt
 rm tmp*.pt
 fi
-#
-#
-############### TEST CNN
-if false; then
-rm data/*.pt
-$my_python preprocess.py -train_src data/src-train.txt -train_tgt data/tgt-train.txt -valid_src data/src-val.txt -valid_tgt data/tgt-val.txt -save_data data/data -src_vocab_size 1000 -tgt_vocab_size 1000
-$my_python train.py -data data/data -save_model /tmp/tmp -world_size 1 -gpu_ranks 0 -rnn_size 256 -word_vec_size 256 -layers 2 -train_steps 10000 -optim adam -learning_rate 0.001 -encoder_type cnn -decoder_type cnn
+############### TEST CNN
+if false; then
+$my_python build_vocab.py \
+    -config data/data.yaml -save_data data/data \
+    -src_vocab data/data.vocab.src -tgt_vocab data/data.vocab.tgt \
+    -overwrite true
+$my_python train.py \
+    -config data/data.yaml -src_vocab data/data.vocab.src -tgt_vocab data/data.vocab.tgt \
+    -src_vocab_size 1000 -tgt_vocab_size 1000 \
+    -save_model /tmp/tmp -world_size 1 -gpu_ranks 0 \
+    -encoder_type cnn -decoder_type cnn \
+    -rnn_size 256 -word_vec_size 256 \
+    -layers 2 -train_steps 10000 \
+    -optim adam -learning_rate 0.001
 
 mv /tmp/tmp*10000.pt onmt/tests/test_model.pt
-
 rm /tmp/tmp*.pt
 fi
-#
-################# MORPH DATA
-if true; then
-rm data/morph/*.pt
-$my_python preprocess.py -train_src data/morph/src.train -train_tgt data/morph/tgt.train -valid_src data/morph/src.valid -valid_tgt data/morph/tgt.valid -save_data data/morph/data
-$my_python train.py -data data/morph/data -save_model tmp -world_size 1 -gpu_ranks 0 -rnn_size 400 -word_vec_size 100 -layers 1 -train_steps 8000 -optim adam -learning_rate 0.001
+
+################# MORPH DATA
+if false; then
+$my_python build_vocab.py \
+    -config data/morph_data.yaml -save_data data/morph_data \
+    -src_vocab data/morph_data.vocab.src -tgt_vocab data/morph_data.vocab.tgt \
+    -overwrite true
+$my_python train.py \
+    -config data/morph_data.yaml -src_vocab data/morph_data.vocab.src -tgt_vocab data/morph_data.vocab.tgt \
+    -save_model tmp -world_size 1 -gpu_ranks 0 \
+    -rnn_size 400 -word_vec_size 100 \
+    -layers 1 -train_steps 8000 \
+    -optim adam -learning_rate 0.001
 
 mv tmp*8000.pt onmt/tests/test_model2.pt
 rm tmp*.pt
 fi
+
+
 ############### TEST TRANSFORMER
 if false; then
-rm data/*.pt
-$my_python preprocess.py -train_src data/src-train.txt -train_tgt data/tgt-train.txt -valid_src data/src-val.txt -valid_tgt data/tgt-val.txt -save_data data/data -src_vocab_size 1000 -tgt_vocab_size 1000 -share_vocab
-
+$my_python build_vocab.py \
+    -config data/data.yaml -save_data data/data \
+    -src_vocab data/data.vocab.src -tgt_vocab data/data.vocab.tgt \
+    -overwrite true -share_vocab
+
+$my_python train.py \
+    -config data/data.yaml -src_vocab data/data.vocab.src -tgt_vocab data/data.vocab.tgt \
+    -save_model /tmp/tmp \
+    -batch_type tokens -batch_size 1024 -accum_count 4 \
+    -layers 4 -rnn_size 256 -word_vec_size 256 \
+    -encoder_type transformer -decoder_type transformer \
+    -share_embeddings -share_vocab \
+    -train_steps 10000 -world_size 1 -gpu_ranks 0 \
+    -max_generator_batches 4 -dropout 0.1 \
+    -normalization tokens \
+    -max_grad_norm 0 -optim adam -decay_method noam \
+    -learning_rate 2 -label_smoothing 0.1 \
+    -position_encoding -param_init 0 \
+    -warmup_steps 100 -param_init_glorot -adam_beta2 0.998
-$my_python train.py -data data/data -save_model /tmp/tmp -batch_type tokens -batch_size 1024 -accum_count 4 \
-    -layers 4 -rnn_size 256 -word_vec_size 256 -encoder_type transformer -decoder_type transformer -share_embedding \
-    -train_steps 10000 -world_size 1 -gpu_ranks 0 -max_generator_batches 4 -dropout 0.1 -normalization tokens \
-    -max_grad_norm 0 -optim adam -decay_method noam -learning_rate 2 -label_smoothing 0.1 \
-    -position_encoding -param_init 0 -warmup_steps 100 -param_init_glorot -adam_beta2 0.998
-#
 mv /tmp/tmp*10000.pt onmt/tests/test_model.pt
 rm /tmp/tmp*.pt
 fi
-#
+
+
 if false; then
 $my_python translate.py -gpu 0 -model onmt/tests/test_model.pt \
   -src data/src-val.txt -output onmt/tests/output_hyp.txt -beam 5 -batch_size 16
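
Note on the onmt/model_builder.py hunk above: the assertion fires when
-share_embeddings is set but the src and tgt vocabularies differ; the
message now points at the -share_vocab flag instead of the removed
preprocess.py step. A hypothetical sketch of the failure mode (vocab
built without -share_vocab, then training with shared embeddings):

    python build_vocab.py -config data/data.yaml -save_data data/data \
        -src_vocab data/data.vocab.src -tgt_vocab data/data.vocab.tgt \
        -overwrite true
    python train.py -config data/data.yaml \
        -src_vocab data/data.vocab.src -tgt_vocab data/data.vocab.tgt \
        -share_embeddings
    # expected: AssertionError: -share_vocab is required if you use -share_embeddings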