Here are some resources on general empirical studies of LLMs, especially scaling laws, emergent abilities, etc.
Characterization of Large Language Model Development in the Datacenter
tag: LLM Development | Characterization Study | Fault-Tolerant | NSDI24 | Shanghai AI Laboratory
paper link: here
github link: here
dataset link: here
citation:
@misc{hu2024characterizationlargelanguagemodel,
title={Characterization of Large Language Model Development in the Datacenter},
author={Qinghao Hu and Zhisheng Ye and Zerui Wang and Guoteng Wang and Meng Zhang and Qiaoling Chen and Peng Sun and Dahua Lin and Xiaolin Wang and Yingwei Luo and Yonggang Wen and Tianwei Zhang},
year={2024},
eprint={2403.07648},
archivePrefix={arXiv},
primaryClass={cs.DC},
url={https://arxiv.org/abs/2403.07648},
}
Fractal Patterns May Illuminate the Success of Next-Token Prediction
tag: Fractal Patterns | NIPS24 | Google DeepMind
paper link: here
github link: here
citation:
@misc{alabdulmohsin2024fractalpatternsilluminatesuccess,
title={Fractal Patterns May Illuminate the Success of Next-Token Prediction},
author={Ibrahim Alabdulmohsin and Vinh Q. Tran and Mostafa Dehghani},
year={2024},
eprint={2402.01825},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2402.01825},
}
Advancing Transformer Architecture in Long-Context Large Language Models: A Comprehensive Survey
tag: Long-Context | Survey
paper link: here
github link: here
citation:
@misc{huang2024advancing,
title={Advancing Transformer Architecture in Long-Context Large Language Models: A Comprehensive Survey},
author={Yunpeng Huang and Jingwei Xu and Junyu Lai and Zixu Jiang and Taolue Chen and Zenan Li and Yuan Yao and Xiaoxing Ma and Lijuan Yang and Hao Chen and Shupeng Li and Penghao Zhao},
year={2024},
eprint={2311.12351},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2311.12351},
}
Dissecting the Runtime Performance of the Training, Fine-tuning, and Inference of Large Language Models
tag: Runtime Performance Dissection | HKU
paper link: here
citation:
@misc{zhang2023dissectingruntimeperformancetraining,
title={Dissecting the Runtime Performance of the Training, Fine-tuning, and Inference of Large Language Models},
author={Longteng Zhang and Xiang Liu and Zeyu Li and Xinglin Pan and Peijie Dong and Ruibo Fan and Rui Guo and Xin Wang and Qiong Luo and Shaohuai Shi and Xiaowen Chu},
year={2023},
eprint={2311.03687},
archivePrefix={arXiv},
primaryClass={cs.PF},
url={https://arxiv.org/abs/2311.03687},
}
Challenges and Applications of Large Language Models
tag: LLM Challenges | LLM Applications
paper link: here
citation:
@article{kaddour2023challenges,
title={Challenges and applications of large language models},
author={Kaddour, Jean and Harris, Joshua and Mozes, Maximilian and Bradley, Herbie and Raileanu, Roberta and McHardy, Robert},
journal={arXiv preprint arXiv:2307.10169},
year={2023}
}
Scaling Laws for Reward Model Overoptimization
tag: Scaling Laws | RM | ICML23 | OpenAI
paper link: here
citation:
@inproceedings{gao2023scaling,
title={Scaling laws for reward model overoptimization},
author={Gao, Leo and Schulman, John and Hilton, Jacob},
booktitle={International Conference on Machine Learning},
pages={10835--10866},
year={2023},
organization={PMLR}
}
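The reward-model overoptimization study above fits simple functional forms for how the gold reward changes as the policy drifts from its initialization, measured by d, the square root of the policy-initialization KL divergence: roughly d(α − βd) for best-of-n sampling and d(α − β log d) for RL. The sketch below only illustrates the shape of those curves; the coefficient values are invented placeholders, not the paper's fitted numbers.

```python
import numpy as np

# Illustrative sketch of the functional forms fitted in "Scaling Laws for
# Reward Model Overoptimization": gold reward as a function of
# d = sqrt(KL(pi_optimized || pi_init)). The alpha/beta values below are
# invented placeholders, not the paper's fitted coefficients.

def gold_reward_best_of_n(d, alpha=1.0, beta=0.05):
    # Best-of-n form: R(d) = d * (alpha - beta * d)
    return d * (alpha - beta * d)

def gold_reward_rl(d, alpha=1.0, beta=0.3):
    # RL form: R(d) = d * (alpha - beta * log d)
    return d * (alpha - beta * np.log(d))

d = np.linspace(0.1, 20.0, 400)
print("best-of-n gold reward peaks near d =", round(d[np.argmax(gold_reward_best_of_n(d))], 2))
print("RL gold reward peaks near d =", round(d[np.argmax(gold_reward_rl(d))], 2))
```

Both curves rise and then fall, which is the overoptimization effect the paper measures: pushing further against the proxy reward model eventually lowers the gold reward.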
Loss Spike in Training Neural Networks
tag: Loss Spike | Shanghai Jiao Tong University
paper link: here
citation:
@misc{zhang2023loss,
title={Loss Spike in Training Neural Networks},
author={Zhongwang Zhang and Zhi-Qin John Xu},
year={2023},
eprint={2305.12133},
archivePrefix={arXiv},
primaryClass={cs.LG},
url={https://arxiv.org/abs/2305.12133},
}
Are Emergent Abilities of Large Language Models a Mirage?
tag: Emergent Abilities | NIPS23 | Stanford University
paper link: here
citation:
@article{schaeffer2023emergent,
title={Are emergent abilities of Large Language Models a mirage?},
author={Schaeffer, Rylan and Miranda, Brando and Koyejo, Sanmi},
journal={arXiv preprint arXiv:2304.15004},
year={2023}
}
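Schaeffer et al. argue that many apparent emergent jumps come from evaluating with metrics that are sharply nonlinear in per-token performance, not from a discontinuity in the model itself. A minimal synthetic sketch of that argument (all numbers made up): per-token accuracy improves smoothly with scale, yet exact match over a 32-token answer looks like a sudden transition.

```python
import numpy as np

# Toy illustration of the core argument in "Are Emergent Abilities of Large
# Language Models a Mirage?": a metric that is nonlinear in per-token
# performance (e.g. exact match over many tokens) can look like a sharp,
# "emergent" jump even when per-token accuracy improves smoothly.
# All numbers are synthetic.

scale = np.logspace(0, 4, 9)                 # stand-in for model scale
per_token_acc = 1 - 0.9 * scale ** -0.5      # smooth power-law improvement
exact_match_32 = per_token_acc ** 32         # require all 32 answer tokens correct

for s, p, e in zip(scale, per_token_acc, exact_match_32):
    print(f"scale {s:8.0f}   per-token acc {p:.3f}   32-token exact match {e:.3f}")
```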
Eliciting Latent Predictions from Transformers with the Tuned Lens
tag: Tuned Lens | Logits Lens | UCB
paper link: here
github link: here
citation:
@misc{belrose2023elicitinglatentpredictionstransformers,
title={Eliciting Latent Predictions from Transformers with the Tuned Lens},
author={Nora Belrose and Zach Furman and Logan Smith and Danny Halawi and Igor Ostrovsky and Lev McKinney and Stella Biderman and Jacob Steinhardt},
year={2023},
eprint={2303.08112},
archivePrefix={arXiv},
primaryClass={cs.LG},
url={https://arxiv.org/abs/2303.08112},
}
Which GPU for Deep Learning: My Experience and Advice for Using GPUs in Deep Learning
tag: GPU(s) for DL
blog link: here
citation:
@misc{Dettmers2023WhichGPU,
author = {Tim Dettmers},
title = {Which GPU for Deep Learning: My Experience and Advice for Using GPUs in Deep Learning},
year = {2023},
month = {Jan},
howpublished = {\url{https://timdettmers.com/2023/01/30/which-gpu-for-deep-learning/}},
}
Emergent Abilities of Large Language Models
tag: Emergent Abilities | TMLR22 | Google | Stanford University
paper link: here
citation:
@article{wei2022emergent,
title={Emergent abilities of large language models},
author={Wei, Jason and Tay, Yi and Bommasani, Rishi and Raffel, Colin and Zoph, Barret and Borgeaud, Sebastian and Yogatama, Dani and Bosma, Maarten and Zhou, Denny and Metzler, Donald and others},
journal={arXiv preprint arXiv:2206.07682},
year={2022}
}
Mind the Gap: Understanding the Modality Gap in Multi-modal Contrastive Representation Learning
tag: Modality Gap | NIPS22 | Stanford University
paper link: here
github link: here
citation:
@misc{liang2022mindgapunderstandingmodality,
title={Mind the Gap: Understanding the Modality Gap in Multi-modal Contrastive Representation Learning},
author={Weixin Liang and Yuhui Zhang and Yongchan Kwon and Serena Yeung and James Zou},
year={2022},
eprint={2203.02053},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2203.02053},
}
ConvNets vs. Transformers: Whose Visual Representations are More Transferable?
tag: ConvNets | Transformers | Transferability | HKU
paper link: here
citation:
@misc{zhou2021convnetsvstransformersvisual,
title={ConvNets vs. Transformers: Whose Visual Representations are More Transferable?},
author={Hong-Yu Zhou and Chixiang Lu and Sibei Yang and Yizhou Yu},
year={2021},
eprint={2108.05305},
archivePrefix={arXiv},
primaryClass={cs.CV},
url={https://arxiv.org/abs/2108.05305},
}
Geometry of the Loss Landscape in Overparameterized Neural Networks: Symmetries and Invariances
tag: Loss Landscape Geometry | ICML21
paper link: here
citation:
@misc{şimşek2021geometrylosslandscapeoverparameterized,
title={Geometry of the Loss Landscape in Overparameterized Neural Networks: Symmetries and Invariances},
author={Berfin Şimşek and François Ged and Arthur Jacot and Francesco Spadaro and Clément Hongler and Wulfram Gerstner and Johanni Brea},
year={2021},
eprint={2105.12221},
archivePrefix={arXiv},
primaryClass={cs.LG},
url={https://arxiv.org/abs/2105.12221},
}
Gradient Descent on Neural Networks Typically Occurs at the Edge of Stability
tag: EoS | Edge of Stability | ICLR21 | CMU
paper link: here
citation:
@misc{cohen2022gradientdescentneuralnetworks,
title={Gradient Descent on Neural Networks Typically Occurs at the Edge of Stability},
author={Jeremy M. Cohen and Simran Kaur and Yuanzhi Li and J. Zico Kolter and Ameet Talwalkar},
year={2022},
eprint={2103.00065},
archivePrefix={arXiv},
primaryClass={cs.LG},
url={https://arxiv.org/abs/2103.00065},
}
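For background on the Edge-of-Stability entry above: gradient descent with step size η on a quadratic of curvature λ is stable exactly when λ < 2/η, and Cohen et al. observe that full-batch training tends to drive the loss sharpness up to about 2/η and then hover there. The sketch below demonstrates only the classical threshold on a toy quadratic, not the paper's neural-network experiments; the constants are arbitrary.

```python
# Minimal demo of the classical 2/eta stability threshold that "Edge of
# Stability" refers to (a toy quadratic, not the paper's experiments).
# For f(x) = 0.5 * lam * x^2, the GD update is x <- (1 - eta * lam) * x,
# which contracts iff lam < 2 / eta and diverges once lam > 2 / eta.

def run_gd(lam, eta=0.1, steps=100, x0=1.0):
    x = x0
    for _ in range(steps):
        x -= eta * lam * x
    return abs(x)

eta = 0.1  # stability threshold at 2 / eta = 20
for lam in (5.0, 19.0, 21.0):
    print(f"curvature {lam:5.1f}: |x| after 100 steps = {run_gd(lam, eta):.3e}")
```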
Scaling Laws for Neural Language Models
tag: Scaling Laws | OpenAI
paper link: here
citation:
@misc{kaplan2020scalinglawsneurallanguage,
title={Scaling Laws for Neural Language Models},
author={Jared Kaplan and Sam McCandlish and Tom Henighan and Tom B. Brown and Benjamin Chess and Rewon Child and Scott Gray and Alec Radford and Jeffrey Wu and Dario Amodei},
year={2020},
eprint={2001.08361},
archivePrefix={arXiv},
primaryClass={cs.LG},
url={https://arxiv.org/abs/2001.08361},
}
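The Kaplan et al. entry above popularized the power-law form of language-model loss, e.g. L(N) ≈ (N_c / N)^α_N in the number of non-embedding parameters N, with analogous laws for data and compute. The sketch below generates synthetic losses from that form and recovers the exponent with a log-log fit; the constants are quoted from memory and only meant to be in the right ballpark, and the "measurements" are simulated.

```python
import numpy as np

# Toy illustration of the power-law form in "Scaling Laws for Neural Language
# Models": loss(N) ~ (N_c / N) ** alpha_N for non-embedding parameter count N.
# N_c and alpha_N below are rough values recalled from the paper and only
# meant to be in the right ballpark; the "measurements" are simulated.

def loss_power_law(n_params, n_c=8.8e13, alpha_n=0.076):
    return (n_c / n_params) ** alpha_n

rng = np.random.default_rng(0)
n = np.logspace(6, 10, 20)                             # 1M to 10B parameters
loss = loss_power_law(n) * np.exp(rng.normal(0, 0.01, n.size))

slope, _ = np.polyfit(np.log(n), np.log(loss), 1)      # log-log linear fit
print(f"recovered exponent alpha_N ≈ {-slope:.3f}")    # expect roughly 0.076
```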
How SGD Selects the Global Minima in Over-parameterized Learning: A Dynamical Stability Perspective
tag: SGD | NIPS18 | Peking University | Princeton University
paper link: here
citation:
@inproceedings{wu2018sgd,
author = {Wu, Lei and Ma, Chao and E, Weinan},
booktitle = {Advances in Neural Information Processing Systems},
editor = {S. Bengio and H. Wallach and H. Larochelle and K. Grauman and N. Cesa-Bianchi and R. Garnett},
pages = {},
publisher = {Curran Associates, Inc.},
title = {How SGD Selects the Global Minima in Over-parameterized Learning: A Dynamical Stability Perspective},
url = {https://proceedings.neurips.cc/paper_files/paper/2018/file/6651526b6fb8f29a00507de6a49ce30f-Paper.pdf},
volume = {31},
year = {2018}
}
On Large-Batch Training for Deep Learning: Generalization Gap and Sharp Minima
tag: Generalization Gap | Sharp Minima | ICLR17 | Intel
paper link: here
citation:
@misc{keskar2017largebatchtrainingdeeplearning,
title={On Large-Batch Training for Deep Learning: Generalization Gap and Sharp Minima},
author={Nitish Shirish Keskar and Dheevatsa Mudigere and Jorge Nocedal and Mikhail Smelyanskiy and Ping Tak Peter Tang},
year={2017},
eprint={1609.04836},
archivePrefix={arXiv},
primaryClass={cs.LG},
url={https://arxiv.org/abs/1609.04836},
}