
Distributed Communication for LLM Training

Here are some resources about Distributed Communication for LLM Training

Method

OpenDiLoCo: An Open-Source Framework for Globally Distributed Low-Communication Training

tag: OpenDiLoCo

paper link: here

github link: here

citation:

@misc{jaghouar2024opendilocoopensourceframeworkglobally,
      title={OpenDiLoCo: An Open-Source Framework for Globally Distributed Low-Communication Training}, 
      author={Sami Jaghouar and Jack Min Ong and Johannes Hagemann},
      year={2024},
      eprint={2407.07852},
      archivePrefix={arXiv},
      primaryClass={cs.LG},
      url={https://arxiv.org/abs/2407.07852}, 
}

DiLoCo: Distributed Low-Communication Training of Language Models

tag: DiLoCo | Google DeepMind

paper link: here

follow-up work: here

citation:

@misc{douillard2023diloco,
      title={DiLoCo: Distributed Low-Communication Training of Language Models}, 
      author={Arthur Douillard and Qixuan Feng and Andrei A. Rusu and Rachita Chhaparia and Yani Donchev and Adhiguna Kuncoro and Marc'Aurelio Ranzato and Arthur Szlam and Jiajun Shen},
      year={2023},
      eprint={2311.08105},
      archivePrefix={arXiv},
      primaryClass={cs.LG}
}
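
The DiLoCo recipe is simple enough to sketch: each worker runs H local inner-optimizer steps, then a single collective averages the parameter deltas ("pseudo-gradients") and an outer optimizer applies them. Below is a minimal, illustrative PyTorch sketch of one such round, not the authors' code: `model`, `inner_opt`, `loader`, `loss_fn`, `H` and `outer_lr` are assumed names, and plain outer SGD stands in for the Nesterov-momentum outer optimizer used in the paper.

```python
# Minimal sketch of one DiLoCo-style round (illustrative, not the authors' code).
# Assumes torch.distributed is already initialized and each rank holds its own
# model replica, inner optimizer (e.g. AdamW) and data shard.
import torch
import torch.distributed as dist

def diloco_round(model, inner_opt, loader, loss_fn, H=500, outer_lr=1.0):
    # Snapshot of the globally synchronized parameters at the start of the round.
    global_params = [p.detach().clone() for p in model.parameters()]

    # Inner phase: H purely local optimizer steps, no communication at all.
    for _, (x, y) in zip(range(H), loader):
        inner_opt.zero_grad()
        loss_fn(model(x), y).backward()
        inner_opt.step()

    # Outer phase: one collective per round averages the parameter deltas.
    world = dist.get_world_size()
    for p, g0 in zip(model.parameters(), global_params):
        delta = g0 - p.detach()                      # outer "pseudo-gradient"
        dist.all_reduce(delta, op=dist.ReduceOp.SUM)
        delta /= world
        # Plain outer SGD: with outer_lr=1 this reduces to parameter averaging;
        # the paper applies Nesterov momentum to this pseudo-gradient instead.
        p.data.copy_(g0 - outer_lr * delta)
```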

ZeRO++: Extremely Efficient Collective Communication for Giant Model Training

tag: ZeRO++ | DeepSpeed | ICLR24 | Microsoft

paper link: here

blog link: here

slides link: here

github link: here

citation:

@misc{wang2023zeroextremelyefficientcollective,
      title={ZeRO++: Extremely Efficient Collective Communication for Giant Model Training}, 
      author={Guanhua Wang and Heyang Qin and Sam Ade Jacobs and Connor Holmes and Samyam Rajbhandari and Olatunji Ruwase and Feng Yan and Lei Yang and Yuxiong He},
      year={2023},
      eprint={2306.10209},
      archivePrefix={arXiv},
      primaryClass={cs.DC},
      url={https://arxiv.org/abs/2306.10209}, 
}
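
A large part of ZeRO++'s saving (the qwZ component) comes from quantizing weights block by block to int8 before the all-gather, roughly quartering that traffic relative to fp32. The sketch below is a plain-PyTorch illustration of such block-wise quantization, not DeepSpeed's fused CUDA kernels; the block size and symmetric int8 scheme are assumptions for the example.

```python
# Illustrative block-wise int8 quantization of the kind ZeRO++ (qwZ) applies to
# weights before all-gather. This is a toy PyTorch sketch, not DeepSpeed's
# fused kernels; block size and symmetric scaling are assumptions.
import torch

def block_quantize(w: torch.Tensor, block_size: int = 2048):
    flat = w.reshape(-1).float()
    pad = (-flat.numel()) % block_size
    blocks = torch.nn.functional.pad(flat, (0, pad)).view(-1, block_size)
    scale = blocks.abs().amax(dim=1, keepdim=True).clamp_min(1e-8) / 127.0
    q = torch.clamp((blocks / scale).round(), -127, 127).to(torch.int8)
    return q, scale          # int8 payload + one fp32 scale per block

def block_dequantize(q: torch.Tensor, scale: torch.Tensor, shape):
    flat = (q.float() * scale).reshape(-1)
    return flat[: torch.Size(shape).numel()].reshape(shape)
```

In the actual system each rank would communicate `q` and `scale` instead of the fp32 weights and dequantize on arrival; the hierarchical-partitioning (hpZ) and quantized-gradient (qgZ) pieces of ZeRO++ are orthogonal to this sketch.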

MSCCLang: Microsoft Collective Communication Language

tag: MSCCLang | ASPLOS23 | Microsoft

paper link: here

github link: here

citation:

@inproceedings{10.1145/3575693.3575724,
      author = {Cowan, Meghan and Maleki, Saeed and Musuvathi, Madanlal and Saarikivi, Olli and Xiong, Yifan},
      title = {MSCCLang: Microsoft Collective Communication Language},
      year = {2023},
      isbn = {9781450399166},
      publisher = {Association for Computing Machinery},
      address = {New York, NY, USA},
      url = {https://doi.org/10.1145/3575693.3575724},
      doi = {10.1145/3575693.3575724},
      booktitle = {Proceedings of the 28th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 2},
      pages = {502–514},
      numpages = {13},
      keywords = {Collective Communication, Compilers, GPU},
      location = {Vancouver, BC, Canada},
      series = {ASPLOS 2023}
}
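
MSCCLang expresses collectives as chunk-level programs that a compiler lowers to an executable schedule. The snippet below is plain Python rather than the MSCCLang DSL itself: it merely enumerates the chunk-level steps of a ring all-reduce, to show the granularity at which such programs are written.

```python
# Plain-Python enumeration of a ring all-reduce at chunk granularity
# (NOT the MSCCLang DSL; just an illustration of the abstraction level).
def ring_allreduce_schedule(num_ranks: int):
    steps = []
    # Phase 1: reduce-scatter — each rank forwards one chunk to its right
    # neighbor, which accumulates a partial sum, for num_ranks - 1 steps.
    for s in range(num_ranks - 1):
        for r in range(num_ranks):
            chunk = (r - s) % num_ranks
            steps.append(("reduce", r, (r + 1) % num_ranks, chunk))
    # Phase 2: all-gather — the fully reduced chunks circulate around the ring.
    for s in range(num_ranks - 1):
        for r in range(num_ranks):
            chunk = (r + 1 - s) % num_ranks
            steps.append(("copy", r, (r + 1) % num_ranks, chunk))
    return steps  # (op, src_rank, dst_rank, chunk_index) tuples

# e.g. ring_allreduce_schedule(4) yields 2 * 4 * 3 = 24 chunk-level steps.
```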

TACCL: Guiding Collective Algorithm Synthesis using Communication Sketches

tag: TACCL | NSDI23 | Microsoft

paper link: here

citation:

@inproceedings{shah2023taccl,
      author = {Aashaka Shah and Vijay Chidambaram and Meghan Cowan and Saeed Maleki and Madan Musuvathi and Todd Mytkowicz and Jacob Nelson and Olli Saarikivi and Rachee Singh},
      title = {{TACCL}: Guiding Collective Algorithm Synthesis using Communication Sketches},
      booktitle = {20th USENIX Symposium on Networked Systems Design and Implementation (NSDI 23)},
      year = {2023},
      isbn = {978-1-939133-33-5},
      address = {Boston, MA},
      pages = {593--612},
      url = {https://www.usenix.org/conference/nsdi23/presentation/shah},
      publisher = {USENIX Association},
      month = apr
}

On Optimizing the Communication of Model Parallelism

tag: AlpaComm | MBZUAI | CMU | Tsinghua University | UCB

paper link: here

citation:

@misc{zhuang2024optimizingcommunicationmodelparallelism,
      title={On Optimizing the Communication of Model Parallelism}, 
      author={Yonghao Zhuang and Hexu Zhao and Lianmin Zheng and Zhuohan Li and Eric P. Xing and Qirong Ho and Joseph E. Gonzalez and Ion Stoica and Hao Zhang},
      year={2024},
      eprint={2211.05322},
      archivePrefix={arXiv},
      primaryClass={cs.LG},
      url={https://arxiv.org/abs/2211.05322}, 
}

Enabling Compute-Communication Overlap in Distributed Deep Learning Training Platforms

tag: ACE | ISCA21

paper link: here

citation:

@inproceedings{rashidi2021enabling,
      author = {Rashidi, Saeed and Denton, Matthew and Sridharan, Srinivas and Srinivasan, Sudarshan and Suresh, Amoghavarsha and Nie, Jade and Krishna, Tushar},
      title = {Enabling compute-communication overlap in distributed deep learning training platforms},
      year = {2021},
      isbn = {9781450390866},
      publisher = {IEEE Press},
      url = {https://doi.org/10.1109/ISCA52012.2021.00049},
      doi = {10.1109/ISCA52012.2021.00049},
      pages = {540–553},
      numpages = {14},
      keywords = {deep learning training, communication accelerator, collective communication, accelerator fabric},
      location = {Virtual Event, Spain},
      series = {ISCA '21}
}
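
ACE argues for offloading collectives to the accelerator fabric so they can overlap with compute. The snippet below is not the ACE hardware design; it only shows the software-level overlap pattern (asynchronous collectives plus a late wait) that such proposals build on, using torch.distributed calls that exist today.

```python
# Software-level illustration of compute/communication overlap (not the ACE
# hardware design): launch gradient all-reduces asynchronously and block only
# when the results are needed. Assumes torch.distributed is initialized.
import torch.distributed as dist

def launch_grad_allreduce(grads):
    """Kick off one async all-reduce per gradient tensor; returns work handles."""
    return [dist.all_reduce(g, op=dist.ReduceOp.SUM, async_op=True) for g in grads]

def wait_and_average(handles, grads):
    """Call right before the optimizer step, after independent compute is done."""
    for h in handles:
        h.wait()
    world = dist.get_world_size()
    for g in grads:
        g /= world

# Usage pattern: as soon as a layer's gradients are ready, call
# launch_grad_allreduce on them, keep computing backward for earlier layers
# while the collective moves data, then call wait_and_average before
# optimizer.step().
```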

Breaking the Computation and Communication Abstraction Barrier in Distributed Machine Learning Workloads

tag: CoCoNet | ASPLOS22 | Microsoft

paper link: here

citation:

@inproceedings{10.1145/3503222.3507778,
      author = {Jangda, Abhinav and Huang, Jun and Liu, Guodong and Sabet, Amir Hossein Nodehi and Maleki, Saeed and Miao, Youshan and Musuvathi, Madanlal and Mytkowicz, Todd and Saarikivi, Olli},
      title = {Breaking the computation and communication abstraction barrier in distributed machine learning workloads},
      year = {2022},
      isbn = {9781450392051},
      publisher = {Association for Computing Machinery},
      address = {New York, NY, USA},
      url = {https://doi.org/10.1145/3503222.3507778},
      doi = {10.1145/3503222.3507778},
      pages = {402–416},
      numpages = {15},
      keywords = {CUDA, Code Generation, Collective Communication, Compiler Optimizations, Distributed Machine Learning, MPI},
      location = {Lausanne, Switzerland},
      series = {ASPLOS '22}
}

Efficient sparse collective communication and its application to accelerate distributed deep learning

tag: OmniReduce | SIGCOMM21 | NUDT

paper link: here

citation:

@inproceedings{fei2021efficient,
      author = {Fei, Jiawei and Ho, Chen-Yu and Sahu, Atal N. and Canini, Marco and Sapio, Amedeo},
      title = {Efficient sparse collective communication and its application to accelerate distributed deep learning},
      year = {2021},
      isbn = {9781450383837},
      publisher = {Association for Computing Machinery},
      address = {New York, NY, USA},
      url = {https://doi.org/10.1145/3452296.3472904},
      doi = {10.1145/3452296.3472904},
      booktitle = {Proceedings of the 2021 ACM SIGCOMM 2021 Conference},
      pages = {676–691},
      numpages = {16},
      keywords = {distributed training, deep learning},
      location = {Virtual Event, USA},
      series = {SIGCOMM '21}
}
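
OmniReduce exploits the fact that gradients are often block-sparse: only blocks that contain non-zero values need to reach the aggregator. The toy, single-process sketch below illustrates that block-sparse aggregation idea; it is not the paper's streaming-aggregation protocol or its networked implementation, and the block size is an assumption.

```python
# Toy illustration of block-sparse gradient aggregation in the spirit of
# OmniReduce (single-process; the real system streams blocks over the network).
import torch

def to_sparse_blocks(grad: torch.Tensor, block_size: int = 256):
    flat = grad.reshape(-1)
    pad = (-flat.numel()) % block_size
    blocks = torch.nn.functional.pad(flat, (0, pad)).view(-1, block_size)
    nonzero = blocks.abs().sum(dim=1) > 0          # which blocks carry data
    idx = nonzero.nonzero(as_tuple=True)[0]
    return idx, blocks[idx], blocks.shape          # indices + non-zero payload only

def aggregate(per_worker):
    """per_worker: list of (idx, payload, block_shape) tuples, one per worker."""
    block_shape = per_worker[0][2]
    out = torch.zeros(block_shape)
    for idx, payload, _ in per_worker:
        out[idx] += payload        # dense work only where some worker is non-zero
    return out.reshape(-1)         # padded, flattened aggregate
```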

Survey

Communication-Efficient Distributed Deep Learning: A Comprehensive Survey

tag: Communication-Efficient | Distributed DL | Survey | HKBU

paper link: here

citation:

@misc{tang2023communicationefficientdistributeddeeplearning,
      title={Communication-Efficient Distributed Deep Learning: A Comprehensive Survey}, 
      author={Zhenheng Tang and Shaohuai Shi and Wei Wang and Bo Li and Xiaowen Chu},
      year={2023},
      eprint={2003.06307},
      archivePrefix={arXiv},
      primaryClass={cs.DC},
      url={https://arxiv.org/abs/2003.06307}, 
}