diff --git a/examples/roberta/README.finetune_custom_classification.md b/examples/roberta/README.finetune_custom_classification.md
new file mode 100644
index 0000000000..de3a4cc37a
--- /dev/null
+++ b/examples/roberta/README.finetune_custom_classification.md
@@ -0,0 +1,120 @@
+# Fine-tuning RoBERTa on a custom classification task (example: IMDB)
+
+## 1) Get the data
+```
+wget http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
+tar zxvf aclImdb_v1.tar.gz
+```
+
+## 2) Format data
+The `IMDB` dataset stores one sample per file; the Python snippet below merges them into a single `.input0` (text) file and a single `.label` file for each of the train and valid splits, for easier processing.
+```
+import argparse
+import os
+import random
+from glob import glob
+
+random.seed(0)
+
+def main(args):
+    for split in ['train', 'test']:
+        samples = []
+        for class_label in ['pos', 'neg']:
+            fnames = glob(os.path.join(args.datadir, split, class_label) + '/*.txt')
+            for fname in fnames:
+                with open(fname) as fin:
+                    line = fin.readline()
+                    samples.append((line, 1 if class_label == 'pos' else 0))
+        random.shuffle(samples)
+        out_fname = 'train' if split == 'train' else 'dev'
+        f1 = open(os.path.join(args.datadir, out_fname + '.input0'), 'w')
+        f2 = open(os.path.join(args.datadir, out_fname + '.label'), 'w')
+        for sample in samples:
+            f1.write(sample[0] + '\n')
+            f2.write(str(sample[1]) + '\n')
+        f1.close()
+        f2.close()
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--datadir', default='aclImdb')
+    args = parser.parse_args()
+    main(args)
+```
+
+## 3) BPE encode
+Run `multiprocessing_bpe_encoder`; you could also BPE-encode each sample in the previous step, but that might be slower.
+```
+# Download encoder.json and vocab.bpe
+wget -N 'https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/encoder.json'
+wget -N 'https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/vocab.bpe'
+
+for SPLIT in train dev; do
+    python -m examples.roberta.multiprocessing_bpe_encoder \
+        --encoder-json encoder.json \
+        --vocab-bpe vocab.bpe \
+        --inputs "aclImdb/$SPLIT.input0" \
+        --outputs "aclImdb/$SPLIT.input0.bpe" \
+        --workers 60 \
+        --keep-empty;
+done
+```
+
+## 4) Preprocess data
+
+```
+# Download fairseq dictionary.
+wget -N 'https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/dict.txt'
+
+fairseq-preprocess \
+    --only-source \
+    --trainpref "aclImdb/train.input0.bpe" \
+    --validpref "aclImdb/dev.input0.bpe" \
+    --destdir "IMDB-bin/input0" \
+    --workers 60 \
+    --srcdict dict.txt;
+
+fairseq-preprocess \
+    --only-source \
+    --trainpref "aclImdb/train.label" \
+    --validpref "aclImdb/dev.label" \
+    --destdir "IMDB-bin/label" \
+    --workers 60;
+```
+
+## 5) Run training
+
+```
+TOTAL_NUM_UPDATES=7812  # 10 epochs through IMDB for bsz 32
+WARMUP_UPDATES=469      # 6 percent of the number of updates
+LR=1e-05                # Peak LR for polynomial LR scheduler.
+NUM_CLASSES=2
+MAX_SENTENCES=8         # Batch size.
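+# Rough sanity check on the schedule above (assuming the standard IMDB split of
+# 25,000 labelled training reviews): with an effective batch size of 32
+# (MAX_SENTENCES=8 x --update-freq 4), one epoch is 25000 / 32 ~= 781 updates,
+# so 10 epochs ~= 7812 updates, and a 6% warmup is ~469 updates.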
+ROBERTA_PATH=/path/to/roberta.large/model.pt  # path to the downloaded pre-trained roberta.large checkpoint
+
+CUDA_VISIBLE_DEVICES=0 python train.py IMDB-bin/ \
+    --restore-file $ROBERTA_PATH \
+    --max-positions 512 \
+    --max-sentences $MAX_SENTENCES \
+    --max-tokens 4400 \
+    --task sentence_prediction \
+    --reset-optimizer --reset-dataloader --reset-meters \
+    --required-batch-size-multiple 1 \
+    --init-token 0 --separator-token 2 \
+    --arch roberta_large \
+    --criterion sentence_prediction \
+    --num-classes $NUM_CLASSES \
+    --dropout 0.1 --attention-dropout 0.1 \
+    --weight-decay 0.1 --optimizer adam --adam-betas "(0.9, 0.98)" --adam-eps 1e-06 \
+    --clip-norm 0.0 \
+    --lr-scheduler polynomial_decay --lr $LR --total-num-update $TOTAL_NUM_UPDATES --warmup-updates $WARMUP_UPDATES \
+    --fp16 --fp16-init-scale 4 --threshold-loss-scale 1 --fp16-scale-window 128 \
+    --max-epoch 10 \
+    --best-checkpoint-metric accuracy --maximize-best-checkpoint-metric \
+    --truncate-sequence \
+    --update-freq 4;
+```
+The above command trains with an effective batch size of `32` (`MAX_SENTENCES=8` x `--update-freq 4`) and was tested on one `Nvidia V100 32gb` GPU.
+Expected `best-validation-accuracy` after `10` epochs is `~96.5%`.
diff --git a/examples/roberta/README.md b/examples/roberta/README.md
index f8c3974bd5..989c9d750e 100644
--- a/examples/roberta/README.md
+++ b/examples/roberta/README.md
@@ -208,6 +208,9 @@ b) Above cmd-args and hyperparams are tested on one Nvidia `V100` GPU with `32gb
 
 c) All the settings in above table are suggested settings based on our hyperparam search within a fixed search space (for careful comparison across models). You might be able to find better metrics with wider hyperparam search.
 
+## Fine-tuning on custom classification tasks
+[Example of fine-tuning RoBERTa on a simple custom classification task](README.finetune_custom_classification.md)
+
 ## Pretraining using your own data
 
 You can use the [`masked_lm` task](/fairseq/tasks/masked_lm.py) to pretrain RoBERTa from scratch, or to continue pretraining RoBERTa starting from one of the released checkpoints.
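After training, the fine-tuned checkpoint can be loaded through fairseq's hub interface for inference. Below is a minimal sketch, assuming the default `--save-dir` of `checkpoints/`, the binarized data directory `IMDB-bin` from step 4, and the default head name `sentence_classification_head` used by the `sentence_prediction` criterion.
```
from fairseq.models.roberta import RobertaModel

# Load the fine-tuned model; data_name_or_path points at the binarized data
# directory so the label dictionary is available.
roberta = RobertaModel.from_pretrained(
    'checkpoints',
    checkpoint_file='checkpoint_best.pt',
    data_name_or_path='IMDB-bin'
)
roberta.eval()  # disable dropout

# Score a single (hypothetical) review; the returned argmax is an index into
# the label dictionary built from the .label files in step 4. Very long
# reviews may need truncation to the model's 512-token limit.
tokens = roberta.encode('What a waste of two hours, the plot made no sense at all.')
prediction = roberta.predict('sentence_classification_head', tokens).argmax().item()
print(prediction)
```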