Here are some resources about Distributed Communication for LLM Training
tag: OpenDiLoCo
paper link: here
github link: here
citation:
@misc{jaghouar2024opendilocoopensourceframeworkglobally,
title={OpenDiLoCo: An Open-Source Framework for Globally Distributed Low-Communication Training},
author={Sami Jaghouar and Jack Min Ong and Johannes Hagemann},
year={2024},
eprint={2407.07852},
archivePrefix={arXiv},
primaryClass={cs.LG},
url={https://arxiv.org/abs/2407.07852},
}
tag: DiLoCo
| Google DeepMind
paper link: here
follow-up work: here
citation:
@misc{douillard2023diloco,
title={DiLoCo: Distributed Low-Communication Training of Language Models},
author={Arthur Douillard and Qixuan Feng and Andrei A. Rusu and Rachita Chhaparia and Yani Donchev and Adhiguna Kuncoro and Marc'Aurelio Ranzato and Arthur Szlam and Jiajun Shen},
year={2023},
eprint={2311.08105},
archivePrefix={arXiv},
primaryClass={cs.LG},
url={https://arxiv.org/abs/2311.08105},
}
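DiLoCo's training loop is simple enough to sketch: each of K workers runs H inner optimizer steps on its own data shard, then all workers average their drift from the last synchronized parameters and apply that average as a pseudo-gradient with an outer Nesterov-momentum optimizer, so communication happens once per H steps instead of every step. Below is a minimal single-process NumPy simulation of that schedule on a toy quadratic objective; the worker count, step counts, learning rates, and the plain-SGD inner optimizer (the paper uses AdamW) are illustrative assumptions, not the paper's configuration.

```python
import numpy as np

rng = np.random.default_rng(0)

def grad(theta, batch):
    # gradient of the toy loss 0.5 * ||theta - x||^2, averaged over the batch
    return (theta - batch).mean(axis=0)

K, H, T = 4, 50, 10                  # workers, inner steps, outer rounds
inner_lr, outer_lr, mu = 0.1, 0.7, 0.9
shards = [rng.normal(k, 1.0, size=(256, 8)) for k in range(K)]  # non-IID data

theta = np.zeros(8)                  # last synchronized parameters
velocity = np.zeros(8)               # outer Nesterov momentum buffer

for t in range(T):
    replicas = [theta.copy() for _ in range(K)]
    for k in range(K):               # H local steps, no communication
        for _ in range(H):
            batch = shards[k][rng.integers(0, 256, size=32)]
            replicas[k] -= inner_lr * grad(replicas[k], batch)
    # one communication round: the averaged drift is the outer pseudo-gradient
    delta = theta - np.mean(replicas, axis=0)
    velocity = mu * velocity + delta
    theta -= outer_lr * (delta + mu * velocity)   # Nesterov-style update
    print(f"round {t}: theta[:3] = {theta[:3].round(3)}")
```

Because only `delta` crosses the network once per round, the communication volume drops by roughly a factor of H relative to per-step gradient all-reduce, which is what makes the scheme viable across slow inter-datacenter links.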
tag: ZeRO++
| DeepSpeed
| ICLR24
| Microsoft
paper link: here
blog link: here
slides link: here
github link: here
citation:
@misc{wang2023zeroextremelyefficientcollective,
title={ZeRO++: Extremely Efficient Collective Communication for Giant Model Training},
author={Guanhua Wang and Heyang Qin and Sam Ade Jacobs and Connor Holmes and Samyam Rajbhandari and Olatunji Ruwase and Feng Yan and Lei Yang and Yuxiong He},
year={2023},
eprint={2306.10209},
archivePrefix={arXiv},
primaryClass={cs.DC},
url={https://arxiv.org/abs/2306.10209},
}
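One of ZeRO++'s three components is block-quantized weight communication (qwZ): parameters are all-gathered in int8 with one scale per block instead of fp16, roughly halving the gathered volume. Below is a NumPy sketch of the quantize/dequantize round trip behind that idea; the block size and the symmetric int8 scheme are illustrative assumptions, not DeepSpeed's exact kernels, and the real system additionally uses hierarchical partitioning (hpZ) and quantized gradients (qgZ).

```python
import numpy as np

def blockwise_quantize(x, block=256):
    """Symmetric int8 quantization with one fp16 scale per block."""
    flat = np.pad(x.ravel(), (0, (-x.size) % block))   # pad to block multiple
    blocks = flat.reshape(-1, block)
    scales = np.abs(blocks).max(axis=1, keepdims=True) / 127.0
    scales[scales == 0] = 1.0                          # guard all-zero blocks
    q = np.clip(np.rint(blocks / scales), -127, 127).astype(np.int8)
    return q, scales.astype(np.float16)

def blockwise_dequantize(q, scales, shape):
    flat = q.astype(np.float32) * scales.astype(np.float32)
    return flat.ravel()[: np.prod(shape)].reshape(shape)

w = np.random.default_rng(0).standard_normal((1024, 1024)).astype(np.float32)
q, s = blockwise_quantize(w)
w_hat = blockwise_dequantize(q, s, w.shape)

fp16_bytes = w.size * 2                       # baseline all-gather payload
q_bytes = q.nbytes + s.nbytes                 # int8 payload + per-block scales
print(f"communicated volume: {q_bytes / fp16_bytes:.2f}x of fp16")
print(f"max abs reconstruction error: {np.abs(w - w_hat).max():.4f}")
```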
tag: MSCCLang
| ASPLOS23
| Microsoft
paper link: here
github link: here
citation:
@inproceedings{10.1145/3575693.3575724,
author = {Cowan, Meghan and Maleki, Saeed and Musuvathi, Madanlal and Saarikivi, Olli and Xiong, Yifan},
title = {MSCCLang: Microsoft Collective Communication Language},
year = {2023},
isbn = {9781450399166},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
url = {https://doi.org/10.1145/3575693.3575724},
doi = {10.1145/3575693.3575724},
booktitle = {Proceedings of the 28th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 2},
pages = {502--514},
numpages = {13},
keywords = {Collective Communication, Compilers, GPU},
location = {Vancouver, BC, Canada},
series = {ASPLOS 2023}
}
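MSCCLang expresses collectives as chunk-level programs (which chunk moves over which link at which step) and compiles them to executable GPU schedules. The sketch below is not the MSCCLang DSL; it is a plain-Python simulation of the classic ring all-reduce chunk schedule, the kind of algorithm such a DSL lets you author and check, with an assert confirming every simulated rank ends up holding the full sum.

```python
import numpy as np

N = 4                                         # simulated ranks in a ring
rng = np.random.default_rng(0)
data = rng.standard_normal((N, N))            # data[r, c]: rank r's chunk c
buf = data.copy()

# Reduce-scatter phase: in step s, rank r forwards chunk (r - s) mod N to
# rank (r + 1) mod N, which accumulates it into its own partial sum.
for s in range(N - 1):
    sends = [(r, (r - s) % N) for r in range(N)]
    vals = [buf[r, c] for r, c in sends]      # snapshot: sends are concurrent
    for (r, c), v in zip(sends, vals):
        buf[(r + 1) % N, c] += v
# Invariant: rank r now holds the fully reduced chunk (r + 1) mod N.

# All-gather phase: circulate the reduced chunks around the ring.
for s in range(N - 1):
    sends = [(r, (r + 1 - s) % N) for r in range(N)]
    vals = [buf[r, c] for r, c in sends]
    for (r, c), v in zip(sends, vals):
        buf[(r + 1) % N, c] = v

assert np.allclose(buf, data.sum(axis=0))     # every rank has every reduced chunk
print("ring all-reduce chunk schedule verified")
```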
tag: TACCL
| NSDI23
| Microsoft
paper link: here
citation:
@inproceedings{285084,
author = {Aashaka Shah and Vijay Chidambaram and Meghan Cowan and Saeed Maleki and Madan Musuvathi and Todd Mytkowicz and Jacob Nelson and Olli Saarikivi and Rachee Singh},
title = {{TACCL}: Guiding Collective Algorithm Synthesis using Communication Sketches},
booktitle = {20th USENIX Symposium on Networked Systems Design and Implementation (NSDI 23)},
year = {2023},
isbn = {978-1-939133-33-5},
address = {Boston, MA},
pages = {593--612},
url = {https://www.usenix.org/conference/nsdi23/presentation/shah},
publisher = {USENIX Association},
month = apr
}
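TACCL synthesizes collective algorithms for a given topology from user-provided communication sketches, guided by a cost model. As a flavor of why the best algorithm depends on message size, here is the textbook alpha-beta comparison of ring versus recursive-doubling all-reduce; the latency and bandwidth numbers are made-up illustrations, and this is emphatically not TACCL's solver.

```python
import math

# Alpha-beta cost model: each step costs alpha + beta * bytes_on_the_wire.
def ring_allreduce(n, size, alpha, beta):
    # reduce-scatter + all-gather: 2(n-1) steps, each moving size/n bytes
    return 2 * (n - 1) * (alpha + beta * size / n)

def recursive_doubling_allreduce(n, size, alpha, beta):
    # log2(n) steps (power-of-two ranks), each exchanging the full vector
    return math.log2(n) * (alpha + beta * size)

alpha, beta, n = 5e-6, 1e-9, 16     # toy: 5 us link latency, 1 GB/s links
for size in (1_000, 100_000, 100_000_000):
    r = ring_allreduce(n, size, alpha, beta)
    d = recursive_doubling_allreduce(n, size, alpha, beta)
    best = "ring" if r < d else "recursive doubling"
    print(f"{size:>11,} B: ring {r*1e3:.3f} ms, doubling {d*1e3:.3f} ms -> {best}")
```

Small messages favor the latency-optimal recursive-doubling schedule while large messages favor the bandwidth-optimal ring; a synthesis tool searches this kind of trade-off space automatically, per topology.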
tag: AlpaComm
| MBZUAI
| CMU
| Tsinghua University
| UCB
paper link: here
citation:
@misc{zhuang2024optimizingcommunicationmodelparallelism,
title={On Optimizing the Communication of Model Parallelism},
author={Yonghao Zhuang and Hexu Zhao and Lianmin Zheng and Zhuohan Li and Eric P. Xing and Qirong Ho and Joseph E. Gonzalez and Ion Stoica and Hao Zhang},
year={2024},
eprint={2211.05322},
archivePrefix={arXiv},
primaryClass={cs.LG},
url={https://arxiv.org/abs/2211.05322},
}
tag: ACE
| ISCA21
paper link: here
citation:
@inproceedings{rashidi2021enabling,
author = {Rashidi, Saeed and Denton, Matthew and Sridharan, Srinivas and Srinivasan, Sudarshan and Suresh, Amoghavarsha and Nie, Jade and Krishna, Tushar},
title = {Enabling compute-communication overlap in distributed deep learning training platforms},
year = {2021},
isbn = {9781450390866},
publisher = {IEEE Press},
url = {https://doi.org/10.1109/ISCA52012.2021.00049},
doi = {10.1109/ISCA52012.2021.00049},
booktitle = {Proceedings of the 48th Annual International Symposium on Computer Architecture},
pages = {540--553},
numpages = {14},
keywords = {deep learning training, communication accelerator, collective communication, accelerator fabric},
location = {Virtual Event, Spain},
series = {ISCA '21}
}
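ACE's premise is that collectives should run concurrently with compute by offloading them to the accelerator fabric. The toy below demonstrates only the scheduling idea, overlapping each layer's gradient "all-reduce" with the next layer's backward pass; sleeps stand in for real kernels, and the layer count and timings are arbitrary assumptions.

```python
import time
from concurrent.futures import ThreadPoolExecutor

LAYERS = 4
COMPUTE_S, COMM_S = 0.05, 0.04      # per-layer backward and all-reduce cost

def backward(layer):                # stand-in for per-layer backprop
    time.sleep(COMPUTE_S)

def allreduce(layer):               # stand-in for a gradient collective
    time.sleep(COMM_S)

# Serial baseline: all compute, then all communication (no overlap).
t0 = time.perf_counter()
for l in reversed(range(LAYERS)):
    backward(l)
for l in reversed(range(LAYERS)):
    allreduce(l)
serial = time.perf_counter() - t0

# Overlapped: launch layer l's all-reduce while layer l-1's backward runs.
t0 = time.perf_counter()
with ThreadPoolExecutor(max_workers=1) as comm:   # one "comm stream"
    pending = []
    for l in reversed(range(LAYERS)):
        backward(l)
        pending.append(comm.submit(allreduce, l))
    for f in pending:
        f.result()
overlapped = time.perf_counter() - t0

print(f"serial {serial:.3f}s vs overlapped {overlapped:.3f}s")
```

The single-worker executor mimics a dedicated communication stream: collectives stay ordered among themselves but hide behind compute, which is the overlap ACE provides in hardware.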
tag: CoCoNet
| ASPLOS22
| Microsoft
paper link: here
citation:
@inproceedings{10.1145/3503222.3507778,
author = {Jangda, Abhinav and Huang, Jun and Liu, Guodong and Sabet, Amir Hossein Nodehi and Maleki, Saeed and Miao, Youshan and Musuvathi, Madanlal and Mytkowicz, Todd and Saarikivi, Olli},
title = {Breaking the computation and communication abstraction barrier in distributed machine learning workloads},
year = {2022},
isbn = {9781450392051},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
url = {https://doi.org/10.1145/3503222.3507778},
doi = {10.1145/3503222.3507778},
booktitle = {Proceedings of the 27th ACM International Conference on Architectural Support for Programming Languages and Operating Systems},
pages = {402--416},
numpages = {15},
keywords = {CUDA, Code Generation, Collective Communication, Compiler Optimizations, Distributed Machine Learning, MPI},
location = {Lausanne, Switzerland},
series = {ASPLOS '22}
}
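A representative CoCoNet-style optimization is to break an all-reduce into reduce-scatter plus all-gather and sink a following pointwise epilogue between the two, so each rank applies the epilogue to only its 1/N shard instead of the whole tensor. The NumPy sketch below simulates why the rewrite is math-preserving; a simple bias-add stands in for the paper's fused epilogues, and the rank and tensor sizes are illustrative.

```python
import numpy as np

N, D = 4, 1024                       # simulated ranks, tensor size
rng = np.random.default_rng(0)
grads = rng.standard_normal((N, D)).astype(np.float32)   # grads[r]: rank r
bias = rng.standard_normal(D).astype(np.float32)

# Unfused pipeline: all-reduce the full tensor, then every rank runs the
# pointwise epilogue over all D elements (N * D epilogue ops in total).
unfused = grads.sum(axis=0) + bias

# Fused pipeline: reduce-scatter leaves each rank one D/N shard of the sum;
# the epilogue runs on the shard only (D ops in total across ranks), then an
# all-gather reassembles the result. Same math, less redundant compute.
shards = grads.sum(axis=0).reshape(N, D // N)       # shard r lives on rank r
bias_shards = bias.reshape(N, D // N)
epilogued = [shards[r] + bias_shards[r] for r in range(N)]
fused = np.concatenate(epilogued)                   # simulated all-gather

assert np.allclose(unfused, fused)
print("reduce-scatter + sharded epilogue + all-gather matches the baseline")
```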
tag: OmniReduce
| SIGCOMM21
| NUDT
paper link: here
citation:
@inproceedings{fei2021efficient,
author = {Fei, Jiawei and Ho, Chen-Yu and Sahu, Atal N. and Canini, Marco and Sapio, Amedeo},
title = {Efficient sparse collective communication and its application to accelerate distributed deep learning},
year = {2021},
isbn = {9781450383837},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
url = {https://doi.org/10.1145/3452296.3472904},
doi = {10.1145/3452296.3472904},
booktitle = {Proceedings of the 2021 ACM SIGCOMM 2021 Conference},
pages = {676--691},
numpages = {16},
keywords = {distributed training, deep learning},
location = {Virtual Event, USA},
series = {SIGCOMM '21}
}
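OmniReduce exploits the block sparsity of gradients: an aggregation that skips all-zero blocks moves a fraction of the dense all-reduce volume while computing the same sum. The NumPy sketch below simulates that accounting on synthetic block-sparse gradients; the block size, sparsity level, and centralized aggregation loop are illustrative assumptions, not OmniReduce's streaming aggregator protocol.

```python
import numpy as np

N, D, BLOCK = 4, 4096, 256           # workers, gradient size, block size
rng = np.random.default_rng(0)

# Synthetic block-sparse gradients: 2 of 16 blocks nonzero per worker.
grads = np.zeros((N, D), dtype=np.float32)
for r in range(N):
    hot = rng.choice(D // BLOCK, size=2, replace=False)
    for b in hot:
        grads[r, b * BLOCK:(b + 1) * BLOCK] = rng.standard_normal(BLOCK)

# A dense all-reduce would move N * D values; block-sparse aggregation
# sends only the nonzero blocks (plus their indices, ignored here).
dense_volume = N * D
sent = 0
total = np.zeros(D, dtype=np.float32)
for r in range(N):
    for b, blk in enumerate(grads[r].reshape(-1, BLOCK)):
        if np.any(blk):              # skip all-zero blocks entirely
            total[b * BLOCK:(b + 1) * BLOCK] += blk
            sent += BLOCK

assert np.allclose(total, grads.sum(axis=0))
print(f"sparse volume: {sent}/{dense_volume} = {sent / dense_volume:.1%} of dense")
```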
tag: Communication-Efficient
| Distributed DL
| Survey
| HKBU
paper link: here
citation:
@misc{tang2023communicationefficientdistributeddeeplearning,
title={Communication-Efficient Distributed Deep Learning: A Comprehensive Survey},
author={Zhenheng Tang and Shaohuai Shi and Wei Wang and Bo Li and Xiaowen Chu},
year={2023},
eprint={2003.06307},
archivePrefix={arXiv},
primaryClass={cs.DC},
url={https://arxiv.org/abs/2003.06307},
}
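The survey taxonomizes communication-efficient methods such as quantization, sparsification, and communication scheduling. As one concrete instance of the family it covers, here is top-k gradient sparsification with error feedback (entries not sent are banked in a residual and added back at the next step); the vector size and k are arbitrary, and this is a generic sketch rather than any single surveyed system.

```python
import numpy as np

def topk_with_error_feedback(grad, residual, k):
    """Keep the k largest-magnitude entries; bank the rest for next step."""
    corrected = grad + residual                       # add back unsent mass
    idx = np.argpartition(np.abs(corrected), -k)[-k:]
    sparse = np.zeros_like(corrected)
    sparse[idx] = corrected[idx]                      # what gets communicated
    return sparse, corrected - sparse                 # new residual

rng = np.random.default_rng(0)
residual = np.zeros(1000)
for step in range(5):
    grad = rng.standard_normal(1000)
    sparse, residual = topk_with_error_feedback(grad, residual, k=10)
    # only `sparse` (1% of entries, plus indices) would cross the network
    print(f"step {step}: sent {np.count_nonzero(sparse)}/1000 values, "
          f"residual norm {np.linalg.norm(residual):.2f}")
```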