Here are some resources about Training Objectives / Loss Functions for LLMs
tag: Multi-token Prediction
| Meta
paper link: here
citation:
@misc{gloeckle2024better,
title={Better & Faster Large Language Models via Multi-token Prediction},
author={Fabian Gloeckle and Badr Youbi Idrissi and Baptiste Rozière and David Lopez-Paz and Gabriel Synnaeve},
year={2024},
eprint={2404.19737},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2404.19737},
}
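note: a minimal sketch of the multi-token objective, assuming a shared trunk with one output head per future offset (the paper shares the unembedding matrix across heads); the module and tensor names here are illustrative, not from the paper's code:
```python
import torch.nn.functional as F

def multi_token_loss(trunk_hidden, heads, unembed, tokens):
    """Sum of cross-entropies over n independent heads; head i predicts
    the token i steps ahead from the shared trunk representation."""
    T, loss = tokens.size(1), 0.0
    for i, head in enumerate(heads, start=1):
        logits = unembed(head(trunk_hidden[:, : T - i]))  # contexts 0..T-i-1
        loss = loss + F.cross_entropy(
            logits.reshape(-1, logits.size(-1)),
            tokens[:, i:].reshape(-1),                    # targets shifted by i
        )
    return loss
```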
tag: SpacTor-T5
| SC
| RTD
| Google
paper link: here
citation:
@misc{ye2024spactort5pretrainingt5models,
title={SpacTor-T5: Pre-training T5 Models with Span Corruption and Replaced Token Detection},
author={Ke Ye and Heinrich Jiang and Afshin Rostamizadeh and Ayan Chakrabarti and Giulia DeSalvo and Jean-François Kagy and Lazaros Karydas and Gui Citovsky and Sanjiv Kumar},
year={2024},
eprint={2401.13160},
archivePrefix={arXiv},
primaryClass={cs.LG},
url={https://arxiv.org/abs/2401.13160},
}
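note: SpacTor trains with a hybrid of span corruption (SC) and replaced token detection (RTD) for an initial number of steps, then falls back to plain SC. A hedged sketch of that two-stage schedule; `tau` and `lam` are hyperparameters whose values below are placeholders, not the paper's settings:
```python
def spactor_loss(step, sc_loss, rtd_loss, tau=100_000, lam=1.0):
    """Two-stage curriculum: weighted SC + RTD early, standard SC after
    the switch point tau. Both component losses are assumed precomputed."""
    return sc_loss + lam * rtd_loss if step < tau else sc_loss
```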
tag: FIM
| Fill-in-the-Middle
| Infilling
| OpenAI
paper link: here
code link: here
citation:
@misc{bavarian2022efficienttraininglanguagemodels,
title={Efficient Training of Language Models to Fill in the Middle},
author={Mohammad Bavarian and Heewoo Jun and Nikolas Tezak and John Schulman and Christine McLeavey and Jerry Tworek and Mark Chen},
year={2022},
eprint={2207.14255},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2207.14255},
}
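note: the core of FIM is a data transformation rather than a new loss: split each document into (prefix, middle, suffix) and emit the pieces in prefix-suffix-middle (PSM) order, so an ordinary causal LM learns to infill. A minimal sketch assuming a non-empty token list; the sentinel strings stand in for the paper's dedicated special tokens:
```python
import random

def fim_psm(tokens, pre="<PRE>", suf="<SUF>", mid="<MID>"):
    """Cut two split points uniformly at random, then rearrange to
    prefix-suffix-middle so the middle is predicted last."""
    i, j = sorted(random.sample(range(len(tokens) + 1), 2))
    prefix, middle, suffix = tokens[:i], tokens[i:j], tokens[j:]
    return [pre, *prefix, suf, *suffix, mid, *middle]
```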
tag: UL2
| MoD
| Mixture-of-Denoisers
| Google Brain
paper link: here
code link: here
citation:
@misc{tay2023ul2,
title={UL2: Unifying Language Learning Paradigms},
author={Yi Tay and Mostafa Dehghani and Vinh Q. Tran and Xavier Garcia and Jason Wei and Xuezhi Wang and Hyung Won Chung and Siamak Shakeri and Dara Bahri and Tal Schuster and Huaixiu Steven Zheng and Denny Zhou and Neil Houlsby and Donald Metzler},
year={2023},
eprint={2205.05131},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2205.05131},
}
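note: UL2 mixes several denoising objectives, each announced to the model with a mode token ([R] regular span corruption, [X] extreme corruption, [S] sequential / prefix-LM denoising). A sketch of sampling a denoiser per example; the span lengths, corruption rates, and uniform mixing below are illustrative, not the paper's exact mixture:
```python
import random

DENOISERS = {
    "[R]": dict(mean_span=3,  corrupt_rate=0.15),  # regular span corruption
    "[X]": dict(mean_span=32, corrupt_rate=0.50),  # extreme corruption
    "[S]": dict(prefix_lm=True),                   # sequential denoising
}

def sample_denoiser():
    mode = random.choice(list(DENOISERS))
    return mode, DENOISERS[mode]  # the mode token is prepended to the input
```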
tag: ELECTRA
| RTD
| Replaced Token Detection
| Google
paper link: here
code link: here
citation:
@misc{clark2020electrapretrainingtextencoders,
title={ELECTRA: Pre-training Text Encoders as Discriminators Rather Than Generators},
author={Kevin Clark and Minh-Thang Luong and Quoc V. Le and Christopher D. Manning},
year={2020},
eprint={2003.10555},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2003.10555},
}
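note: replaced token detection trains a small generator with MLM and a discriminator that classifies every token of the corrupted input as original or replaced. A minimal sketch of the combined loss, assuming the masked-position logits and replacement labels are already computed; lam=50 is the discriminator weight reported in the paper:
```python
import torch.nn.functional as F

def electra_loss(gen_logits, mlm_labels, disc_logits, is_replaced, lam=50.0):
    """Generator MLM loss at masked positions plus per-token binary
    replaced/original loss over the whole corrupted input."""
    mlm = F.cross_entropy(gen_logits, mlm_labels)
    rtd = F.binary_cross_entropy_with_logits(disc_logits, is_replaced.float())
    return mlm + lam * rtd
```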
tag: T5
| SC
| Span Corruption
| Seq2Seq
| Google
paper link: here
code link: here
citation:
@misc{raffel2023exploringlimitstransferlearning,
title={Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer},
author={Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu},
year={2023},
eprint={1910.10683},
archivePrefix={arXiv},
primaryClass={cs.LG},
url={https://arxiv.org/abs/1910.10683},
}
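note: span corruption drops contiguous spans from the input, replaces each with a fresh sentinel token, and trains the decoder to emit the sentinels followed by the dropped text. A sketch of the corruption step, assuming sorted, non-overlapping spans; the span sampling itself (15% of tokens, mean span length 3 in the paper) is omitted:
```python
def span_corrupt(tokens, spans, sentinels):
    """Build (input, target) for span corruption: each (start, end) span is
    replaced by a sentinel in the input and reproduced after that sentinel
    in the target."""
    inp, tgt, last = [], [], 0
    for sent, (a, b) in zip(sentinels, spans):
        inp += tokens[last:a] + [sent]
        tgt += [sent] + tokens[a:b]
        last = b
    inp += tokens[last:]
    return inp, tgt
```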
tag: UniLM
| Microsoft
paper link: here
code link: here
citation:
@misc{dong2019unifiedlanguagemodelpretraining,
title={Unified Language Model Pre-training for Natural Language Understanding and Generation},
author={Li Dong and Nan Yang and Wenhui Wang and Furu Wei and Xiaodong Liu and Yu Wang and Jianfeng Gao and Ming Zhou and Hsiao-Wuen Hon},
year={2019},
eprint={1905.03197},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/1905.03197},
}
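note: UniLM trains one transformer under several LM objectives by switching only the self-attention mask: bidirectional for MLM, causal for left-to-right LM, and a seq2seq mask where the source is bidirectional and the target is causal. A sketch of that seq2seq mask (True = attention allowed):
```python
import torch

def seq2seq_mask(src_len, tgt_len):
    """Source tokens attend bidirectionally within the source; target tokens
    attend to the whole source and causally to earlier target tokens."""
    n = src_len + tgt_len
    mask = torch.zeros(n, n, dtype=torch.bool)
    mask[:, :src_len] = True                       # everyone sees the source
    mask[src_len:, src_len:] = torch.tril(
        torch.ones(tgt_len, tgt_len, dtype=torch.bool)
    )                                              # causal within the target
    return mask
```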
tag: GPT
| CLM
| Causal Language Modeling
| OpenAI
paper link: here
code link: here
citation:
@article{radford2018improving,
title={Improving Language Understanding by Generative Pre-Training},
author={Radford, Alec and Narasimhan, Karthik and Salimans, Tim and Sutskever, Ilya},
journal={OpenAI},
year={2018},
url={https://cdn.openai.com/research-covers/language-unsupervised/language_understanding_paper.pdf}
}
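note: the causal LM objective is next-token cross-entropy: at each position the model predicts the following token. A minimal sketch of the shift-and-compare:
```python
import torch.nn.functional as F

def clm_loss(logits, tokens):
    """logits: (batch, seq, vocab); tokens: (batch, seq).
    Positions 0..T-2 predict tokens 1..T-1."""
    return F.cross_entropy(
        logits[:, :-1].reshape(-1, logits.size(-1)),
        tokens[:, 1:].reshape(-1),
    )
```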
tag: BERT
| MLM
| Masked Language Modeling
| Google
paper link: here
code link: here
citation:
@misc{devlin2019bertpretrainingdeepbidirectional,
title={BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding},
author={Jacob Devlin and Ming-Wei Chang and Kenton Lee and Kristina Toutanova},
year={2019},
eprint={1810.04805},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/1810.04805},
}
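note: BERT's masked LM selects ~15% of positions and, among those, replaces 80% with [MASK], 10% with a random token, and leaves 10% unchanged; the loss is cross-entropy at the selected positions only. A sketch of the corruption recipe (the -100 ignore label follows the common PyTorch convention, not the paper):
```python
import torch

def mlm_corrupt(tokens, vocab_size, mask_id, p=0.15):
    """Return (corrupted tokens, labels); labels are -100 except at the
    positions selected for prediction."""
    tokens = tokens.clone()
    labels = torch.full_like(tokens, -100)
    sel = torch.rand_like(tokens, dtype=torch.float) < p
    labels[sel] = tokens[sel]
    r = torch.rand_like(tokens, dtype=torch.float)
    tokens[sel & (r < 0.8)] = mask_id                        # 80% -> [MASK]
    rnd = sel & (r >= 0.8) & (r < 0.9)                       # 10% -> random token
    tokens[rnd] = torch.randint_like(tokens, vocab_size)[rnd]
    return tokens, labels
```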
tag: FaceNet
| Triplet Loss
| Google
paper link: here
citation:
@inproceedings{Schroff_2015,
title={FaceNet: A unified embedding for face recognition and clustering},
url={http://dx.doi.org/10.1109/CVPR.2015.7298682},
DOI={10.1109/cvpr.2015.7298682},
booktitle={2015 IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
publisher={IEEE},
author={Schroff, Florian and Kalenichenko, Dmitry and Philbin, James},
year={2015},
month=jun
}
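note: the triplet loss pulls an anchor's positive (same identity) closer than its negative by at least a margin in embedding space. A minimal sketch on L2-normalized embeddings; margin=0.2 matches the paper, but the semi-hard triplet mining that makes it work in practice is omitted:
```python
import torch.nn.functional as F

def triplet_loss(anchor, positive, negative, margin=0.2):
    """max(0, ||a-p||^2 - ||a-n||^2 + margin), averaged over the batch."""
    d_pos = (anchor - positive).pow(2).sum(-1)
    d_neg = (anchor - negative).pow(2).sum(-1)
    return F.relu(d_pos - d_neg + margin).mean()
```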
tag: Skip-gram
| Negative Sampling
| Google
paper link: here
citation:
@article{mikolov2013distributed,
title={Distributed representations of words and phrases and their compositionality},
author={Mikolov, Tomas and Sutskever, Ilya and Chen, Kai and Corrado, Greg S and Dean, Jeff},
journal={Advances in Neural Information Processing Systems},
volume={26},
year={2013}
}
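note: negative sampling replaces the full softmax over the vocabulary with a handful of binary discriminations: score the observed (center, context) pair high and k sampled noise words low. A sketch for a single center word; the vectors come from the separate input/output embedding tables word2vec keeps:
```python
import torch.nn.functional as F

def sgns_loss(center, context, negatives):
    """center: (d,), context: (d,), negatives: (k, d).
    Loss = -log sigma(u_o . v_c) - sum_k log sigma(-u_k . v_c)."""
    pos = F.logsigmoid(context @ center)
    neg = F.logsigmoid(-(negatives @ center)).sum()
    return -(pos + neg)
```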
tag: NCE
| Contrastive Learning
paper link: here
citation:
@inproceedings{gutmann2010noise,
title={Noise-contrastive estimation: A new estimation principle for unnormalized statistical models},
author={Gutmann, Michael and Hyv{\"a}rinen, Aapo},
booktitle={Proceedings of the Thirteenth International Conference on Artificial Intelligence and Statistics},
pages={297--304},
year={2010},
organization={JMLR Workshop and Conference Proceedings}
}
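note: NCE turns density estimation into logistic regression: distinguish data samples from k noise samples, using the logit log p_model(x) - log(k * p_noise(x)). A sketch assuming both log-densities are precomputed per sample:
```python
import math
import torch.nn.functional as F

def nce_loss(log_p_model, log_p_noise, is_data, k):
    """is_data = 1 for samples drawn from the data, 0 for the k noise draws;
    binary cross-entropy on the NCE posterior logit."""
    logits = log_p_model - log_p_noise - math.log(k)
    return F.binary_cross_entropy_with_logits(logits, is_data.float())
```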