Here are some resources about Model Pruning Strategies for LLMs
paper link: https://arxiv.org/abs/2407.14679
github link: here
citation:
@misc{muralidharan2024compactlanguagemodelspruning,
title={Compact Language Models via Pruning and Knowledge Distillation},
author={Saurav Muralidharan and Sharath Turuvekere Sreenivas and Raviraj Joshi and Marcin Chochowski and Mostofa Patwary and Mohammad Shoeybi and Bryan Catanzaro and Jan Kautz and Pavlo Molchanov},
year={2024},
eprint={2407.14679},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2407.14679},
}
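The Minitron recipe prunes a large model along width/depth axes and then retrains the pruned student with knowledge distillation from the original model. Below is a minimal sketch of a logit-distillation loss of the kind used for such recovery training; the temperature, weighting, and toy shapes are illustrative assumptions, not the paper's exact recipe.

```python
import torch
import torch.nn.functional as F

def distillation_loss(student_logits, teacher_logits, temperature=2.0):
    """KL divergence between temperature-softened teacher and student
    distributions, scaled by T^2 so gradient magnitudes stay comparable."""
    s = F.log_softmax(student_logits / temperature, dim=-1)
    t = F.softmax(teacher_logits / temperature, dim=-1)
    return F.kl_div(s, t, reduction="batchmean") * temperature ** 2

# Toy usage: a pruned "student" imitating the original "teacher".
student_logits = torch.randn(4, 32000, requires_grad=True)  # (batch, vocab)
teacher_logits = torch.randn(4, 32000)
loss = distillation_loss(student_logits, teacher_logits)
loss.backward()
```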
paper link: https://arxiv.org/abs/2306.11695
citation:
@misc{sun2023simple,
title={A Simple and Effective Pruning Approach for Large Language Models},
author={Mingjie Sun and Zhuang Liu and Anna Bair and J. Zico Kolter},
year={2023},
eprint={2306.11695},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
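Wanda scores each weight by |W_ij| * ||X_j||_2, the product of the weight's magnitude and the L2 norm of the matching input activation over a small calibration set, then drops the lowest-scoring weights within each output row with no retraining. A minimal sketch, assuming a (out_features, in_features) weight layout; `wanda_prune` and the calibration setup are illustrative, not the authors' code.

```python
import torch

def wanda_prune(weight, calib_inputs, sparsity=0.5):
    """Zero out the lowest-scoring fraction of weights per output row,
    where score = |W_ij| * ||X_j||_2 (Wanda's pruning metric).

    weight:       (out_features, in_features) linear layer weight
    calib_inputs: (num_tokens, in_features) calibration activations
    """
    act_norm = calib_inputs.norm(p=2, dim=0)   # per-input-channel L2 norm
    scores = weight.abs() * act_norm           # broadcast across rows
    k = int(weight.shape[1] * sparsity)
    # Indices of the k lowest scores within each row.
    prune_idx = torch.topk(scores, k, dim=1, largest=False).indices
    mask = torch.ones_like(weight)
    mask.scatter_(1, prune_idx, 0.0)
    return weight * mask

w = torch.randn(8, 16)
x = torch.randn(128, 16)
pruned = wanda_prune(w, x, sparsity=0.5)
```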
paper link: https://arxiv.org/abs/2305.13245
citation:
@article{ainslie2023gqa,
title={GQA: Training Generalized Multi-Query Transformer Models from Multi-Head Checkpoints},
author={Ainslie, Joshua and Lee-Thorp, James and de Jong, Michiel and Zemlyanskiy, Yury and Lebr{\'o}n, Federico and Sanghai, Sumit},
journal={arXiv preprint arXiv:2305.13245},
year={2023}
}
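GQA sits between multi-head and multi-query attention: query heads are split into groups that each share one KV head, and an existing MHA checkpoint is converted by mean-pooling the key/value projection heads within each group before a short uptraining run. A sketch of that pooling step, assuming a row-major (num_heads * head_dim, hidden) projection layout:

```python
import torch

def mean_pool_kv_heads(kv_proj, num_heads, num_groups):
    """Convert a multi-head K (or V) projection into a grouped one by
    mean-pooling the heads inside each group, as in GQA checkpoint
    conversion.

    kv_proj: (num_heads * head_dim, hidden) K or V projection weight
    """
    head_dim = kv_proj.shape[0] // num_heads
    hidden = kv_proj.shape[1]
    heads = kv_proj.view(num_heads, head_dim, hidden)
    heads_per_group = num_heads // num_groups
    grouped = heads.view(num_groups, heads_per_group, head_dim, hidden).mean(dim=1)
    return grouped.reshape(num_groups * head_dim, hidden)

k_proj = torch.randn(32 * 128, 4096)   # 32 heads, head_dim 128
k_grouped = mean_pool_kv_heads(k_proj, num_heads=32, num_groups=8)
print(k_grouped.shape)                 # torch.Size([1024, 4096])
```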
paper link: https://arxiv.org/abs/2305.11627
citation:
@misc{ma2023llmpruner,
title={LLM-Pruner: On the Structural Pruning of Large Language Models},
author={Xinyin Ma and Gongfan Fang and Xinchao Wang},
year={2023},
eprint={2305.11627},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
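LLM-Pruner removes coupled structural groups (an attention head or MLP channel together with everything that depends on it), ranked by a gradient-based importance estimated on a small calibration set, and then recovers accuracy with a brief LoRA fine-tune. A minimal sketch of the first-order Taylor importance |w * dL/dw| at the per-row level; the cross-layer dependency grouping is omitted here.

```python
import torch

# First-order Taylor importance: |w * dL/dw|, summed over each structural
# group (here, one output row of a linear layer stands in for a coupled
# group such as an attention head).
layer = torch.nn.Linear(16, 8)
x = torch.randn(4, 16)
loss = layer(x).pow(2).mean()        # stand-in for the language-model loss
loss.backward()                      # populates layer.weight.grad

scores = (layer.weight * layer.weight.grad).abs().sum(dim=1)  # one score per row
prune_order = scores.argsort()       # lowest-importance rows are pruned first
print(prune_order)
```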
paper link: https://arxiv.org/abs/2010.10732
citation:
@misc{tang2021scop,
title={SCOP: Scientific Control for Reliable Neural Network Pruning},
author={Yehui Tang and Yunhe Wang and Yixing Xu and Dacheng Tao and Chunjing Xu and Chao Xu and Chang Xu},
year={2021},
eprint={2010.10732},
archivePrefix={arXiv},
primaryClass={cs.CV}
}
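SCOP frames pruning as a controlled experiment: alongside real inputs it feeds synthetic "knockoff" inputs, and a filter whose response is driven as much by the knockoffs as by real data is deemed redundant. The paper learns real/knockoff scaling factors jointly; the sketch below only contrasts mean response magnitudes to illustrate the idea, and the random "knockoff" tensor is a placeholder (real knockoffs are constructed to mimic the data's statistics).

```python
import torch

def scop_style_scores(conv, real_x, knockoff_x):
    """Loose sketch of SCOP's scientific-control idea: a filter that
    responds to knockoff (control) inputs about as strongly as to real
    inputs carries little real signal and is a pruning candidate."""
    real_feat = conv(real_x).abs().mean(dim=(0, 2, 3))       # per-filter
    knock_feat = conv(knockoff_x).abs().mean(dim=(0, 2, 3))  # per-filter
    # Lower score => response dominated by the control => prune first.
    return real_feat - knock_feat

conv = torch.nn.Conv2d(3, 16, kernel_size=3, padding=1)
real = torch.randn(8, 3, 32, 32)
knockoff = torch.randn(8, 3, 32, 32)   # placeholder knockoff batch
scores = scop_style_scores(conv, real, knockoff)
```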
paper link: https://arxiv.org/abs/1911.02150
citation:
@article{shazeer2019fast,
title={Fast transformer decoding: One write-head is all you need},
author={Shazeer, Noam},
journal={arXiv preprint arXiv:1911.02150},
year={2019}
}
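Shazeer's multi-query attention keeps many query heads but shares a single key/value head across all of them, shrinking the KV cache (and decode-time memory traffic) by a factor of num_heads. A self-contained sketch of the forward pass, with the toy weight shapes as assumptions:

```python
import torch

def multi_query_attention(x, w_q, w_k, w_v, num_heads):
    """Multi-query attention: num_heads query heads all attend over one
    shared key/value head.

    x:        (batch, seq, hidden)
    w_q:      (hidden, num_heads * head_dim)
    w_k, w_v: (hidden, head_dim)  -- one shared head each
    """
    b, s, _ = x.shape
    head_dim = w_k.shape[1]
    q = (x @ w_q).view(b, s, num_heads, head_dim).transpose(1, 2)  # (b, h, s, d)
    k = (x @ w_k).unsqueeze(1)                                     # (b, 1, s, d)
    v = (x @ w_v).unsqueeze(1)                                     # (b, 1, s, d)
    attn = torch.softmax(q @ k.transpose(-2, -1) / head_dim ** 0.5, dim=-1)
    out = attn @ v                      # shared K/V broadcasts over heads
    return out.transpose(1, 2).reshape(b, s, num_heads * head_dim)

x = torch.randn(2, 10, 64)
out = multi_query_attention(x, torch.randn(64, 8 * 16),
                            torch.randn(64, 16), torch.randn(64, 16), num_heads=8)
print(out.shape)   # torch.Size([2, 10, 128])
```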
paper link: https://arxiv.org/abs/1905.10650
citation:
@article{michel2019sixteen,
title={Are sixteen heads really better than one?},
author={Michel, Paul and Levy, Omer and Neubig, Graham},
journal={Advances in neural information processing systems},
volume={32},
year={2019}
}
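Michel et al. estimate a head's importance by attaching a mask variable xi_h = 1 to each head's output and measuring the expected magnitude of dL/dxi_h over data; low-scoring heads can be removed, often many per layer, with little quality loss. A minimal sketch with a stand-in loss; `head_importance` and the tensor layout are assumptions for illustration.

```python
import torch

def head_importance(attn_output_per_head, loss_fn):
    """Score each head by |dL/dxi_h|, where xi_h is a gate fixed at 1 on
    that head's output. Heads with the smallest scores are pruned first.

    attn_output_per_head: (batch, num_heads, seq, head_dim)
    """
    b, h, s, d = attn_output_per_head.shape
    gates = torch.ones(h, requires_grad=True)
    gated = attn_output_per_head * gates.view(1, h, 1, 1)
    loss_fn(gated).backward()
    return gates.grad.abs()            # one importance score per head

# Toy usage with a stand-in loss.
out = torch.randn(2, 12, 10, 64)
scores = head_importance(out, loss_fn=lambda y: y.pow(2).mean())
print(scores.argsort())                # ascending: least important heads first
```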