Here are some resources about Integrated Parallelism for LLM Training
MegaScale: Scaling Large Language Model Training to More Than 10,000 GPUs
tag: MegaScale
| veScale
| ByteDance
| Peking University
paper link: here
github link: here
citation:
@misc{jiang2024megascalescalinglargelanguage,
title={MegaScale: Scaling Large Language Model Training to More Than 10,000 GPUs},
author={Ziheng Jiang and Haibin Lin and Yinmin Zhong and Qi Huang and Yangrui Chen and Zhi Zhang and Yanghua Peng and Xiang Li and Cong Xie and Shibiao Nong and Yulu Jia and Sun He and Hongmin Chen and Zhihao Bai and Qi Hou and Shipeng Yan and Ding Zhou and Yiyao Sheng and Zhuo Jiang and Haohan Xu and Haoran Wei and Zhang Zhang and Pengfei Nie and Leqi Zou and Sida Zhao and Liang Xiang and Zherui Liu and Zhe Li and Xiaoying Jia and Jianxi Ye and Xin Jin and Xin Liu},
year={2024},
eprint={2402.15627},
archivePrefix={arXiv},
primaryClass={cs.LG},
url={https://arxiv.org/abs/2402.15627},
}
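MegaScale's headline efficiency metric is Model FLOPs Utilization (MFU). As a quick companion to the paper's numbers, here is a minimal MFU calculator, a sketch assuming the common 6 × parameter-count estimate of training FLOPs per token (forward plus backward); the throughput, GPU count, and per-GPU peak plugged in at the bottom are illustrative assumptions, not values from the paper or its codebase.

```python
# Minimal MFU (Model FLOPs Utilization) calculator, assuming the common
# 6 * parameter-count estimate of training FLOPs per token (forward + backward).
# All concrete numbers below are illustrative assumptions, not values
# reported in the MegaScale paper.
def mfu(params, tokens_per_second, num_gpus, peak_flops_per_gpu):
    achieved = 6 * params * tokens_per_second      # model FLOPs consumed per second
    available = num_gpus * peak_flops_per_gpu      # aggregate hardware peak per second
    return achieved / available

# Hypothetical run: 175B parameters, 12,288 GPUs, 312 TFLOPS peak per GPU.
example = mfu(params=175e9, tokens_per_second=2.0e6,
              num_gpus=12288, peak_flops_per_gpu=312e12)
print(f"MFU = {example:.1%}")
```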
Fold3D: Rethinking and Parallelizing Computational and Communicational Tasks in the Training of Large DNN Models
tag: Fold3D
| TPDS23
| HKU
| Shanghai AILab
paper link: here
citation:
@article{10050126,
author={Li, Fanxin and Zhao, Shixiong and Qing, Yuhao and Chen, Xusheng and Guan, Xiuxian and Wang, Sen and Zhang, Gong and Cui, Heming},
journal={IEEE Transactions on Parallel and Distributed Systems},
title={Fold3D: Rethinking and Parallelizing Computational and Communicational Tasks in the Training of Large DNN Models},
year={2023},
volume={34},
number={5},
pages={1432-1449},
keywords={Task analysis;Training;Three-dimensional displays;Processor scheduling;Graphics processing units;Pipeline processing;Computational modeling;3D parallelism;DNN;deep learning;distributed training;GPU;machine learning;pipeline parallelism},
doi={10.1109/TPDS.2023.3247883}
}
Hy-Fi: Hybrid Five-Dimensional Parallel DNN Training on High-Performance GPU Clusters
tag: Hy-Fi
| ISC HP22
paper link: here
slides link: here
github link: here
citation:
@inproceedings{jain2022hyfi,
author = {Jain, Arpan and Shafi, Aamir and Anthony, Quentin and Kousha, Pouya and Subramoni, Hari and Panda, Dhableswar K.},
title = {Hy-Fi: Hybrid Five-Dimensional Parallel DNN Training on High-Performance GPU Clusters},
year = {2022},
isbn = {978-3-031-07311-3},
publisher = {Springer-Verlag},
address = {Berlin, Heidelberg},
url = {https://doi.org/10.1007/978-3-031-07312-0_6},
doi = {10.1007/978-3-031-07312-0_6},
booktitle = {High Performance Computing: 37th International Conference, ISC High Performance 2022, Hamburg, Germany, May 29 – June 2, 2022, Proceedings},
pages = {109–130},
numpages = {22},
keywords = {DNN, Model-parallelism, Distributed training, Hybrid parallelism, MPI, GPU},
location = {Hamburg, Germany}
}
Using DeepSpeed and Megatron to Train Megatron-Turing NLG 530B, A Large-Scale Generative Language Model
tag: Megatron-Turing
| Megatron
| DeepSpeed
paper link: here
citation:
@misc{smith2022using,
title={Using DeepSpeed and Megatron to Train Megatron-Turing NLG 530B, A Large-Scale Generative Language Model},
author={Shaden Smith and Mostofa Patwary and Brandon Norick and Patrick LeGresley and Samyam Rajbhandari and Jared Casper and Zhun Liu and Shrimai Prabhumoye and George Zerveas and Vijay Korthikanti and Elton Zhang and Rewon Child and Reza Yazdani Aminabadi and Julie Bernauer and Xia Song and Mohammad Shoeybi and Yuxiong He and Michael Houston and Saurabh Tiwary and Bryan Catanzaro},
year={2022},
eprint={2201.11990},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
OneFlow: Redesign the Distributed Deep Learning Framework from Scratch
tag: OneFlow
| SBP
| OneFlow Research
paper link: here
github link: here
citation:
@misc{yuan2022oneflowredesigndistributeddeep,
title={OneFlow: Redesign the Distributed Deep Learning Framework from Scratch},
author={Jinhui Yuan and Xinqi Li and Cheng Cheng and Juncheng Liu and Ran Guo and Shenghang Cai and Chi Yao and Fei Yang and Xiaodong Yi and Chuan Wu and Haoran Zhang and Jie Zhao},
year={2022},
eprint={2110.15032},
archivePrefix={arXiv},
primaryClass={cs.DC},
url={https://arxiv.org/abs/2110.15032},
}
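OneFlow's central abstraction is the SBP signature (Split, Broadcast, Partial-sum) attached to global tensors. The plain-NumPy sketch below simulates two SBP placements of a single matmul on two "devices" instead of using OneFlow's actual API; the shapes and the 2-device setup are illustrative assumptions.

```python
# Plain-NumPy simulation of OneFlow's SBP placements (Split / Broadcast /
# Partial-sum) for one matmul; this is a conceptual sketch, not OneFlow code.
import numpy as np

rng = np.random.default_rng(0)
devices = 2
X = rng.normal(size=(4, 6))                       # activation
W = rng.normal(size=(6, 8))                       # weight

# Case 1: X is B (broadcast), W is S(1) (split along columns)
#         -> each device holds a column slice of the output, i.e. S(1).
W_s1 = np.split(W, devices, axis=1)
Y_s1 = [X @ W_s1[d] for d in range(devices)]
assert np.allclose(np.concatenate(Y_s1, axis=1), X @ W)

# Case 2: X is S(1) (split along columns), W is S(0) (split along rows)
#         -> each device holds a partial sum of the output, i.e. P,
#            and one all-reduce recovers the full result.
X_s1 = np.split(X, devices, axis=1)
W_s0 = np.split(W, devices, axis=0)
Y_p = [X_s1[d] @ W_s0[d] for d in range(devices)]
assert np.allclose(sum(Y_p), X @ W)               # the all-reduce
```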
Colossal-AI: A Unified Deep Learning System for Large-Scale Parallel Training
tag: Colossal-AI
| ICPP23
| NUS
paper link: here
github link: here
citation:
@inproceedings{li2023colossal,
title={Colossal-AI: A unified deep learning system for large-scale parallel training},
author={Li, Shenggui and Liu, Hongxin and Bian, Zhengda and Fang, Jiarui and Huang, Haichen and Liu, Yuliang and Wang, Boxiang and You, Yang},
booktitle={Proceedings of the 52nd International Conference on Parallel Processing},
pages={766--775},
year={2023}
}
GSPMD: General and Scalable Parallelization for ML Computation Graphs
tag: GSPMD
| TPU-v3
| Google
paper link: here
citation:
@misc{xu2021gspmd,
title={GSPMD: General and Scalable Parallelization for ML Computation Graphs},
author={Yuanzhong Xu and HyoukJoong Lee and Dehao Chen and Blake Hechtman and Yanping Huang and Rahul Joshi and Maxim Krikun and Dmitry Lepikhin and Andy Ly and Marcello Maggioni and Ruoming Pang and Noam Shazeer and Shibo Wang and Tao Wang and Yonghui Wu and Zhifeng Chen},
year={2021},
eprint={2105.04663},
archivePrefix={arXiv},
primaryClass={cs.DC}
}
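GSPMD drives partitioning from per-tensor sharding annotations that are propagated through the computation graph; frontends built on it (e.g., JAX's pjit) express a sharding by mapping each tensor dimension to an axis of a logical device mesh or leaving it replicated. The sketch below implements only the per-device shard-shape bookkeeping such an annotation implies; `mesh`, `shard_shape`, and the shapes are illustrative assumptions, not GSPMD/XLA API.

```python
# Per-device shard shapes implied by a mesh-axis sharding annotation.
# `mesh` and `shard_shape` are illustrative helpers, not GSPMD/XLA API.
mesh = {"data": 4, "model": 8}                    # 32 devices as a 4x8 logical mesh

def shard_shape(global_shape, spec, mesh):
    """spec[i] is a mesh-axis name, or None if dimension i is replicated."""
    local = []
    for dim, axis in zip(global_shape, spec):
        if axis is None:
            local.append(dim)                     # replicated: full size on every device
        else:
            assert dim % mesh[axis] == 0, "dimension must divide evenly"
            local.append(dim // mesh[axis])       # evenly tiled across that mesh axis
    return tuple(local)

# Activations [batch, hidden] sharded over the data axis,
# a weight [hidden, ffn] sharded over the model axis.
print(shard_shape((1024, 4096), ("data", None), mesh))    # -> (256, 4096)
print(shard_shape((4096, 16384), (None, "model"), mesh))  # -> (4096, 2048)
```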
Efficient Large-Scale Language Model Training on GPU Clusters Using Megatron-LM
tag: PTD-P
| Megatron-LM
| Nvidia
| Microsoft
| Stanford University
paper link: here
github link: here
citation:
@misc{narayanan2021efficient,
title={Efficient Large-Scale Language Model Training on GPU Clusters Using Megatron-LM},
author={Deepak Narayanan and Mohammad Shoeybi and Jared Casper and Patrick LeGresley and Mostofa Patwary and Vijay Anand Korthikanti and Dmitri Vainbrand and Prethvi Kashinkunti and Julie Bernauer and Bryan Catanzaro and Amar Phanishayee and Matei Zaharia},
year={2021},
eprint={2104.04473},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
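The PTD-P paper composes pipeline (P), tensor (T), and data (D) parallelism, whose degrees must multiply to the GPU count, and analyzes the pipeline bubble, which shrinks as the number of microbatches grows and shrinks further under the interleaved schedule. The sketch below only encodes that bookkeeping; the concrete degrees and microbatch counts are illustrative assumptions.

```python
# Bookkeeping for combined data/tensor/pipeline parallelism, following the
# bubble analysis in the PTD-P paper; concrete numbers are illustrative.
def check_world(data, tensor, pipeline, gpus):
    # The three parallel degrees must multiply to the total GPU count.
    assert data * tensor * pipeline == gpus
    return data, tensor, pipeline

def bubble_fraction(pipeline_stages, microbatches, interleave=1):
    # Idle fraction of the 1F1B schedule: (p - 1) / (v * m), where v is the
    # number of interleaved model chunks per device (v = 1 when not interleaved).
    return (pipeline_stages - 1) / (interleave * microbatches)

check_world(data=64, tensor=8, pipeline=6, gpus=3072)
print(bubble_fraction(pipeline_stages=6, microbatches=48))                # ~0.104
print(bubble_fraction(pipeline_stages=6, microbatches=48, interleave=2))  # ~0.052
```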
DeepSpeed: System Optimizations Enable Training Deep Learning Models with Over 100 Billion Parameters
tag: DeepSpeed
| KDD20
| Microsoft
paper link: here
github link: here
blog link: here
docs link: here
citation:
@inproceedings{rasley2020deepspeed,
title={DeepSpeed: System optimizations enable training deep learning models with over 100 billion parameters},
author={Rasley, Jeff and Rajbhandari, Samyam and Ruwase, Olatunji and He, Yuxiong},
booktitle={Proceedings of the 26th ACM SIGKDD International Conference on Knowledge Discovery \& Data Mining},
pages={3505--3506},
year={2020}
}
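Much of DeepSpeed's memory saving comes from ZeRO, which partitions optimizer state (and, in later stages, gradients and parameters) across the data-parallel ranks. The NumPy sketch below simulates just ZeRO stage 1 with plain SGD-plus-momentum standing in for the optimizer; the rank count, parameter layout, and optimizer choice are illustrative assumptions, not DeepSpeed's API.

```python
# Toy simulation of ZeRO stage 1: each data-parallel rank owns the optimizer
# state for one contiguous shard of the flattened parameters, updates only
# that shard, and the updated shards are all-gathered. SGD with momentum
# stands in for the real optimizer; all sizes are illustrative assumptions.
import numpy as np

ranks, lr, mu = 4, 0.1, 0.9
params = np.arange(16, dtype=np.float64)          # flattened model parameters
grads = np.ones_like(params)                      # identical everywhere after the gradient all-reduce

shards = np.split(np.arange(params.size), ranks)  # index range owned by each rank
momentum = [np.zeros(idx.size) for idx in shards] # optimizer state lives only on its owner rank

def zero1_step(params, grads):
    updated = []
    for r, idx in enumerate(shards):              # conceptually, this loop body runs on rank r
        momentum[r] = mu * momentum[r] + grads[idx]
        updated.append(params[idx] - lr * momentum[r])
    return np.concatenate(updated)                # the all-gather of updated parameter shards

params = zero1_step(params, grads)
print(params)
```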
Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism
tag: Megatron-LM
| Nvidia
paper link: here
github link: here
follow-up work: here
citation:
@misc{shoeybi2020megatronlm,
title={Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism},
author={Mohammad Shoeybi and Mostofa Patwary and Raul Puri and Patrick LeGresley and Jared Casper and Bryan Catanzaro},
year={2020},
eprint={1909.08053},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
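Megatron-LM's tensor model parallelism splits the transformer MLP so that the first weight matrix is partitioned column-wise and the second row-wise, which leaves a single all-reduce on the block output. The NumPy sketch below checks that algebra by simulating the ranks in a loop; the 2-way split and shapes are illustrative assumptions, and no real communication is involved.

```python
# NumPy check of Megatron-LM's tensor-parallel MLP split: A is partitioned
# column-wise, B row-wise, and summing the per-rank partial outputs (the
# all-reduce) reproduces the serial result. Shapes and the 2-way split are
# illustrative assumptions.
import numpy as np

def gelu(x):
    # tanh approximation of GeLU; elementwise, so it commutes with the column split
    return 0.5 * x * (1.0 + np.tanh(np.sqrt(2.0 / np.pi) * (x + 0.044715 * x**3)))

rng = np.random.default_rng(0)
batch, d_model, d_ff, tp = 4, 8, 32, 2            # tp = tensor-parallel degree

X = rng.normal(size=(batch, d_model))
A = rng.normal(size=(d_model, d_ff))              # first MLP weight
B = rng.normal(size=(d_ff, d_model))              # second MLP weight

Z_ref = gelu(X @ A) @ B                           # serial reference

A_shards = np.split(A, tp, axis=1)                # column-parallel first GEMM
B_shards = np.split(B, tp, axis=0)                # row-parallel second GEMM
partials = [gelu(X @ A_shards[r]) @ B_shards[r] for r in range(tp)]
Z_tp = np.sum(partials, axis=0)                   # the single all-reduce

assert np.allclose(Z_ref, Z_tp)
print("tensor-parallel MLP matches the serial MLP")
```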