
Integrated Parallelism for LLM Training

Here are some resources about integrated parallelism for LLM training.
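
A note on what "integrated" means here: the systems below compose data, pipeline, and tensor (model) parallelism on one GPU cluster (the "3D parallelism" that appears in several of the papers' keywords), so the world size factors as dp * pp * tp. The Python sketch below is purely illustrative; the axis ordering and the rank_to_3d_coords helper are my own, not taken from any of the frameworks cited here.

# Minimal sketch (not tied to any specific framework): how integrated (3D)
# parallelism composes data-, pipeline-, and tensor-parallel groups.
def rank_to_3d_coords(rank: int, dp: int, pp: int, tp: int):
    """Map a flat global rank onto (data, pipeline, tensor) coordinates."""
    assert 0 <= rank < dp * pp * tp
    tp_rank = rank % tp                # fastest-varying axis: tensor parallel
    pp_rank = (rank // tp) % pp        # middle axis: pipeline stage
    dp_rank = rank // (tp * pp)        # slowest axis: data-parallel replica
    return dp_rank, pp_rank, tp_rank

if __name__ == "__main__":
    dp, pp, tp = 4, 2, 2               # 4 * 2 * 2 = 16 GPUs in total
    for r in range(dp * pp * tp):
        print(r, rank_to_3d_coords(r, dp, pp, tp))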

MegaScale: Scaling Large Language Model Training to More Than 10,000 GPUs

tag: MegaScale | veScale | ByteDance | Peking University

paper link: here

github link: here

citation:

@misc{jiang2024megascale,
      title={MegaScale: Scaling Large Language Model Training to More Than 10,000 GPUs}, 
      author={Ziheng Jiang and Haibin Lin and Yinmin Zhong and Qi Huang and Yangrui Chen and Zhi Zhang and Yanghua Peng and Xiang Li and Cong Xie and Shibiao Nong and Yulu Jia and Sun He and Hongmin Chen and Zhihao Bai and Qi Hou and Shipeng Yan and Ding Zhou and Yiyao Sheng and Zhuo Jiang and Haohan Xu and Haoran Wei and Zhang Zhang and Pengfei Nie and Leqi Zou and Sida Zhao and Liang Xiang and Zherui Liu and Zhe Li and Xiaoying Jia and Jianxi Ye and Xin Jin and Xin Liu},
      year={2024},
      eprint={2402.15627},
      archivePrefix={arXiv},
      primaryClass={cs.LG},
      url={https://arxiv.org/abs/2402.15627}, 
}

Fold3D: Rethinking and Parallelizing Computational and Communicational Tasks in the Training of Large DNN Models

tag: Fold3D | TPDS23 | HKU | Shanghai AILab

paper link: here

citation:

@article{li2023fold3d,
  author={Li, Fanxin and Zhao, Shixiong and Qing, Yuhao and Chen, Xusheng and Guan, Xiuxian and Wang, Sen and Zhang, Gong and Cui, Heming},
  journal={IEEE Transactions on Parallel and Distributed Systems}, 
  title={Fold3D: Rethinking and Parallelizing Computational and Communicational Tasks in the Training of Large DNN Models}, 
  year={2023},
  volume={34},
  number={5},
  pages={1432-1449},
  keywords={Task analysis;Training;Three-dimensional displays;Processor scheduling;Graphics processing units;Pipeline processing;Computational modeling;3D parallelism;DNN;deep learning;distributed training;GPU;machine learning;pipeline parallelism},
  doi={10.1109/TPDS.2023.3247883}
}

Hy-Fi: Hybrid Five-Dimensional Parallel DNN Training on High-Performance GPU Clusters

tag: Hy-Fi | ISC HP22

paper link: here

slides link: here

github link: here

citation:

@inproceedings{jain2022hyfi,
      author = {Jain, Arpan and Shafi, Aamir and Anthony, Quentin and Kousha, Pouya and Subramoni, Hari and Panda, Dhableswar K.},
      title = {Hy-Fi: Hybrid Five-Dimensional Parallel DNN Training on High-Performance GPU Clusters},
      year = {2022},
      isbn = {978-3-031-07311-3},
      publisher = {Springer-Verlag},
      address = {Berlin, Heidelberg},
      url = {https://doi.org/10.1007/978-3-031-07312-0_6},
      doi = {10.1007/978-3-031-07312-0_6},
      booktitle = {High Performance Computing: 37th International Conference, ISC High Performance 2022, Hamburg, Germany, May 29 – June 2, 2022, Proceedings},
      pages = {109–130},
      numpages = {22},
      keywords = {DNN, Model-parallelism, Distributed training, Hybrid parallelism, MPI, GPU},
      location = {Hamburg, Germany}
}

Using DeepSpeed and Megatron to Train Megatron-Turing NLG 530B, A Large-Scale Generative Language Model

tag: Megatron-Turing | Megatron | DeepSpeed

paper link: here

citation:

@misc{smith2022using,
      title={Using DeepSpeed and Megatron to Train Megatron-Turing NLG 530B, A Large-Scale Generative Language Model}, 
      author={Shaden Smith and Mostofa Patwary and Brandon Norick and Patrick LeGresley and Samyam Rajbhandari and Jared Casper and Zhun Liu and Shrimai Prabhumoye and George Zerveas and Vijay Korthikanti and Elton Zhang and Rewon Child and Reza Yazdani Aminabadi and Julie Bernauer and Xia Song and Mohammad Shoeybi and Yuxiong He and Michael Houston and Saurabh Tiwary and Bryan Catanzaro},
      year={2022},
      eprint={2201.11990},
      archivePrefix={arXiv},
      primaryClass={cs.CL}
}

OneFlow: Redesign the Distributed Deep Learning Framework from Scratch

tag: OneFlow | SBP | OneFlow Research

paper link: here

github link: here

citation:

@misc{yuan2022oneflow,
      title={OneFlow: Redesign the Distributed Deep Learning Framework from Scratch}, 
      author={Jinhui Yuan and Xinqi Li and Cheng Cheng and Juncheng Liu and Ran Guo and Shenghang Cai and Chi Yao and Fei Yang and Xiaodong Yi and Chuan Wu and Haoran Zhang and Jie Zhao},
      year={2022},
      eprint={2110.15032},
      archivePrefix={arXiv},
      primaryClass={cs.DC},
      url={https://arxiv.org/abs/2110.15032}, 
}
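
The SBP tag above refers to OneFlow's abstraction for describing how a logical tensor is laid out across devices: as I recall from the paper, each placement is Split along an axis, Broadcast (replicated), or Partial, whose per-device values sum to the logical tensor. The NumPy toy below is my own illustration of those three placements, not OneFlow code; the function names are hypothetical.

import numpy as np

def split(x, axis, n_devices):
    # Split placement: each device holds one contiguous slice along `axis`.
    return np.array_split(x, n_devices, axis=axis)

def broadcast(x, n_devices):
    # Broadcast placement: every device holds a full copy of the tensor.
    return [x.copy() for _ in range(n_devices)]

def reduce_partial(shards):
    # Partial placement: the logical tensor is the element-wise sum of the
    # per-device values (recovered with an all-reduce in a real system).
    return sum(shards)

logical = np.arange(12, dtype=float).reshape(3, 4)
col_shards = split(logical, axis=1, n_devices=2)     # two (3, 2) slices
replicas = broadcast(logical, n_devices=2)           # two full copies
partials = [0.5 * logical, 0.5 * logical]            # sums back to logical
assert np.allclose(reduce_partial(partials), logical)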

Colossal-AI: A Unified Deep Learning System for Large-Scale Parallel Training

tag: Colossal-AI | ICPP23 | NUS

paper link: here

github link: here

citation:

@inproceedings{li2023colossal,
      title={Colossal-ai: A unified deep learning system for large-scale parallel training},
      author={Li, Shenggui and Liu, Hongxin and Bian, Zhengda and Fang, Jiarui and Huang, Haichen and Liu, Yuliang and Wang, Boxiang and You, Yang},
      booktitle={Proceedings of the 52nd International Conference on Parallel Processing},
      pages={766--775},
      year={2023}
}

GSPMD: General and Scalable Parallelization for ML Computation Graphs

tag: GSPMD | TPU-v3 | Google

paper link: here

citation:

@misc{xu2021gspmd,
      title={GSPMD: General and Scalable Parallelization for ML Computation Graphs}, 
      author={Yuanzhong Xu and HyoukJoong Lee and Dehao Chen and Blake Hechtman and Yanping Huang and Rahul Joshi and Maxim Krikun and Dmitry Lepikhin and Andy Ly and Marcello Maggioni and Ruoming Pang and Noam Shazeer and Shibo Wang and Tao Wang and Yonghui Wu and Zhifeng Chen},
      year={2021},
      eprint={2105.04663},
      archivePrefix={arXiv},
      primaryClass={cs.DC}
}

Efficient Large-Scale Language Model Training on GPU Clusters Using Megatron-LM

tag: PTD-P | Megatron-LM | Nvidia | Microsoft | Stanford University

paper link: here

github link: here

citation:

@misc{narayanan2021efficient,
      title={Efficient Large-Scale Language Model Training on GPU Clusters Using Megatron-LM}, 
      author={Deepak Narayanan and Mohammad Shoeybi and Jared Casper and Patrick LeGresley and Mostofa Patwary and Vijay Anand Korthikanti and Dmitri Vainbrand and Prethvi Kashinkunti and Julie Bernauer and Bryan Catanzaro and Amar Phanishayee and Matei Zaharia},
      year={2021},
      eprint={2104.04473},
      archivePrefix={arXiv},
      primaryClass={cs.CL}
}
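
As I read the PTD-P analysis, the pipeline bubble (idle time relative to ideal compute time) with p stages and m microbatches is (p - 1) / m, and the interleaved schedule with v model chunks per device reduces it to (p - 1) / (v * m) at the cost of extra communication. The snippet below just evaluates those formulas; treat it as a sketch of the paper's argument rather than a reproduction of its exact performance model.

def bubble_fraction(p: int, m: int, v: int = 1) -> float:
    # Ratio of pipeline idle time to ideal (bubble-free) compute time for
    # p stages, m microbatches, and v interleaved model chunks per device.
    return (p - 1) / (v * m)

p, m = 8, 64                                                  # 8 stages, 64 microbatches
print(f"non-interleaved:    {bubble_fraction(p, m):.3f}")     # ~0.109
print(f"interleaved, v = 2: {bubble_fraction(p, m, v=2):.3f}")# ~0.055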

DeepSpeed: System Optimizations Enable Training Deep Learning Models with Over 100 Billion Parameters

tag: DeepSpeed | KDD20 | Microsoft

paper link: here

github link: here

blog link: here

docs link: here

citation:

@inproceedings{rasley2020deepspeed,
  title={Deepspeed: System optimizations enable training deep learning models with over 100 billion parameters},
  author={Rasley, Jeff and Rajbhandari, Samyam and Ruwase, Olatunji and He, Yuxiong},
  booktitle={Proceedings of the 26th ACM SIGKDD International Conference on Knowledge Discovery \& Data Mining},
  pages={3505--3506},
  year={2020}
}
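
For orientation, a minimal sketch of the usual DeepSpeed entry point as described in its public documentation: deepspeed.initialize wraps a plain PyTorch model and returns an engine that owns the optimizer, ZeRO partitioning, and mixed-precision handling. The toy model, config values, and dummy loss below are placeholders of mine; a real run assumes a GPU and is launched with the deepspeed CLI launcher.

import torch
import deepspeed

model = torch.nn.Sequential(torch.nn.Linear(1024, 4096),
                            torch.nn.ReLU(),
                            torch.nn.Linear(4096, 1024))

ds_config = {
    "train_batch_size": 4,
    "fp16": {"enabled": True},
    "zero_optimization": {"stage": 1},   # ZeRO stage 1: partition optimizer states
    "optimizer": {"type": "Adam", "params": {"lr": 1e-4}},
}

engine, optimizer, _, _ = deepspeed.initialize(
    model=model, model_parameters=model.parameters(), config=ds_config)

for _ in range(10):
    x = torch.randn(4, 1024, device=engine.device, dtype=torch.float16)
    loss = engine(x).float().pow(2).mean()   # dummy loss, for illustration only
    engine.backward(loss)                    # engine handles scaling / all-reduce
    engine.step()                            # optimizer step + gradient zeroing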

Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism

tag: Megatron-LM | Nvidia

paper link: here

github link: here

follow-up work: here

citation:

@misc{shoeybi2020megatronlm,
      title={Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism}, 
      author={Mohammad Shoeybi and Mostofa Patwary and Raul Puri and Patrick LeGresley and Jared Casper and Bryan Catanzaro},
      year={2020},
      eprint={1909.08053},
      archivePrefix={arXiv},
      primaryClass={cs.CL}
}
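
The core technique in this paper is intra-layer (tensor) model parallelism: for the transformer MLP, the first GEMM is split column-wise and the second row-wise, so each rank works only on its own shard and a single all-reduce recovers the layer output. The sketch below simulates the shards inside one process with plain PyTorch (no torch.distributed) just to check that the split reproduces the unsharded result; it illustrates the idea and is not Megatron-LM's actual implementation.

import torch

torch.manual_seed(0)
d_model, d_ff, tp = 8, 32, 4
x = torch.randn(2, d_model)

W1 = torch.randn(d_model, d_ff)      # full first weight  (column-parallel)
W2 = torch.randn(d_ff, d_model)      # full second weight (row-parallel)
reference = torch.relu(x @ W1) @ W2  # unsharded MLP computation

# Each simulated rank holds one column slice of W1 and the matching row slice of W2.
W1_shards = W1.chunk(tp, dim=1)
W2_shards = W2.chunk(tp, dim=0)

partials = [torch.relu(x @ w1) @ w2 for w1, w2 in zip(W1_shards, W2_shards)]
output = sum(partials)               # stands in for the all-reduce across ranks

assert torch.allclose(output, reference, atol=1e-4)
print("tensor-parallel MLP matches the unsharded reference")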