
Model Pruning Strategies for LLMs

Here are some resources about Model Pruning Strategies for LLMs.

Compact Language Models via Pruning and Knowledge Distillation (Minitron)

paper link: [here](https://arxiv.org/abs/2407.14679)

github link: here

citation:

@misc{muralidharan2024compactlanguagemodelspruning,
      title={Compact Language Models via Pruning and Knowledge Distillation}, 
      author={Saurav Muralidharan and Sharath Turuvekere Sreenivas and Raviraj Joshi and Marcin Chochowski and Mostofa Patwary and Mohammad Shoeybi and Bryan Catanzaro and Jan Kautz and Pavlo Molchanov},
      year={2024},
      eprint={2407.14679},
      archivePrefix={arXiv},
      primaryClass={cs.CL},
      url={https://arxiv.org/abs/2407.14679}, 
}
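
The Minitron recipe prunes width (MLP neurons, attention heads, embedding channels) and depth using activation-based importance estimated on a small calibration set, then recovers accuracy by distilling from the unpruned model. Below is a minimal sketch of the width-pruning step, assuming a simple mean-absolute-activation score; the function names and shapes are illustrative, not the authors' code.

```python
import torch

def mlp_neuron_importance(hidden_acts: torch.Tensor) -> torch.Tensor:
    """Activation-based importance for MLP hidden neurons.

    hidden_acts: (num_tokens, ffn_hidden) activations of the MLP hidden layer,
    collected on a small calibration set. Low-scoring neurons are candidates
    for width pruning; the pruned model is then retrained with distillation.
    """
    return hidden_acts.abs().mean(dim=0)              # one score per hidden neuron

def prune_mlp_width(up_proj: torch.Tensor, down_proj: torch.Tensor,
                    importance: torch.Tensor, keep: int):
    """Keep the `keep` highest-scoring hidden neurons of an MLP block.

    up_proj:   (ffn_hidden, hidden) weight producing the hidden activations.
    down_proj: (hidden, ffn_hidden) weight consuming them.
    """
    idx = torch.topk(importance, keep).indices
    return up_proj[idx, :], down_proj[:, idx]
```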

A Simple and Effective Pruning Approach for Large Language Models (Wanda)

paper link: [here](https://arxiv.org/abs/2306.11695)

citation:

@misc{sun2023simple,
      title={A Simple and Effective Pruning Approach for Large Language Models}, 
      author={Mingjie Sun and Zhuang Liu and Anna Bair and J. Zico Kolter},
      year={2023},
      eprint={2306.11695},
      archivePrefix={arXiv},
      primaryClass={cs.CL}
}
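
Wanda scores each weight by its magnitude times the norm of the corresponding input activation, |W_ij| * ||X_j||_2, and zeroes the lowest-scoring weights within each output row, with no retraining. Here is a minimal PyTorch sketch; the function name and calibration handling are illustrative, not the authors' code.

```python
import torch

def wanda_prune_linear(weight: torch.Tensor, calib_acts: torch.Tensor,
                       sparsity: float) -> torch.Tensor:
    """Return a pruned copy of `weight` using the Wanda criterion.

    weight:     (out_features, in_features) matrix of a linear layer.
    calib_acts: (num_tokens, in_features) inputs to this layer, collected
                from a small calibration set.
    sparsity:   fraction of weights zeroed in each output row.
    """
    act_norm = calib_acts.norm(p=2, dim=0)             # (in_features,)
    score = weight.abs() * act_norm.unsqueeze(0)       # |W_ij| * ||X_j||_2
    k = int(weight.shape[1] * sparsity)
    pruned = weight.clone()
    if k > 0:
        # Compare scores within each output row and zero out the lowest k.
        idx = torch.topk(score, k, dim=1, largest=False).indices
        pruned.scatter_(1, idx, 0.0)
    return pruned
```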

GQA: Training Generalized Multi-Query Transformer Models from Multi-Head Checkpoints

paper link: [here](https://arxiv.org/abs/2305.13245)

citation:

@article{ainslie2023gqa,
  title={GQA: Training Generalized Multi-Query Transformer Models from Multi-Head Checkpoints},
  author={Ainslie, Joshua and Lee-Thorp, James and de Jong, Michiel and Zemlyanskiy, Yury and Lebr{\'o}n, Federico and Sanghai, Sumit},
  journal={arXiv preprint arXiv:2305.13245},
  year={2023}
}
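
GQA sits between multi-head and multi-query attention: query heads are split into groups that share one key/value head, and an existing MHA checkpoint is converted by mean-pooling the K/V heads of each group before a short uptraining phase. A rough sketch of that conversion for one projection matrix, under the layout assumption noted in the docstring:

```python
import torch

def mha_to_gqa_kv(kv_proj: torch.Tensor, num_heads: int, num_kv_groups: int) -> torch.Tensor:
    """Mean-pool the key (or value) projection heads of an MHA checkpoint into GQA groups.

    kv_proj: (num_heads * head_dim, hidden_size) K or V projection weight,
             assuming rows are ordered head-by-head with consecutive heads
             falling into the same group.
    Returns: (num_kv_groups * head_dim, hidden_size) grouped projection.
    """
    head_dim = kv_proj.shape[0] // num_heads
    heads_per_group = num_heads // num_kv_groups
    w = kv_proj.view(num_kv_groups, heads_per_group, head_dim, -1)
    return w.mean(dim=1).reshape(num_kv_groups * head_dim, -1)  # pool heads within each group
```

The converted checkpoint is then uptrained briefly to recover quality.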

LLM-Pruner: On the Structural Pruning of Large Language Models

paper link: [here](https://arxiv.org/abs/2305.11627)

citation:

@misc{ma2023llmpruner,
      title={LLM-Pruner: On the Structural Pruning of Large Language Models}, 
      author={Xinyin Ma and Gongfan Fang and Xinchao Wang},
      year={2023},
      eprint={2305.11627},
      archivePrefix={arXiv},
      primaryClass={cs.CL}
}
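
LLM-Pruner builds a dependency graph to identify coupled structures (for example, an attention head together with all weights that feed or consume it), ranks them with gradient-based salience, removes the least important groups, and recovers accuracy with LoRA. A heavily simplified sketch of the first-order Taylor salience, ignoring the grouping and recovery stages; the function name is illustrative.

```python
import torch

def taylor_importance(weight: torch.Tensor, grad: torch.Tensor, dim: int = 0) -> torch.Tensor:
    """First-order Taylor salience of each slice of a weight matrix.

    Removing parameters W_g changes the loss by roughly |W_g * dL/dW_g| summed
    over the slice, so slices (rows here, for dim=0) with small salience are
    cheaper to prune.
    """
    salience = (weight * grad).abs()
    reduce_dims = [d for d in range(weight.dim()) if d != dim]
    return salience.sum(dim=reduce_dims)
```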

SCOP: Scientific Control for Reliable Neural Network Pruning

paper link: [here](https://arxiv.org/abs/2010.10732)

citation:

@misc{tang2021scop,
      title={SCOP: Scientific Control for Reliable Neural Network Pruning}, 
      author={Yehui Tang and Yunhe Wang and Yixing Xu and Dacheng Tao and Chunjing Xu and Chao Xu and Chang Xu},
      year={2021},
      eprint={2010.10732},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}
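
SCOP targets filter pruning in vision networks: each channel gets a pair of scaling factors that trade off the real feature against a "knockoff" feature carrying no useful information about the labels, and channels whose knockoff factor wins after training are treated as redundant. Below is a toy, heavily simplified illustration of that control test; the `loss_fn` callable and the precomputed, detached feature tensors are assumptions of this sketch, not the paper's implementation.

```python
import torch

def scop_redundancy_mask(real_feats, knockoff_feats, loss_fn, steps=100, lr=0.05):
    """Per-channel scientific-control test (toy version).

    real_feats / knockoff_feats: (batch, channels, ...) detached feature maps from
    real inputs and their knockoff counterparts. Each channel gets factors
    (beta, beta_tilde) with beta + beta_tilde = 1; the mixture is fed to the rest
    of the network (abstracted here as `loss_fn`), and channels where the knockoff
    factor dominates look redundant.
    """
    num_channels = real_feats.shape[1]
    bshape = (1, num_channels) + (1,) * (real_feats.dim() - 2)
    logits = torch.zeros(2, num_channels, requires_grad=True)
    opt = torch.optim.SGD([logits], lr=lr)
    for _ in range(steps):
        beta = torch.softmax(logits, dim=0)            # beta[0] + beta[1] == 1 per channel
        mixed = beta[0].view(bshape) * real_feats + beta[1].view(bshape) * knockoff_feats
        loss = loss_fn(mixed)                          # differentiable task loss downstream
        opt.zero_grad()
        loss.backward()
        opt.step()
    beta = torch.softmax(logits, dim=0).detach()
    return beta[1] > beta[0]                           # True -> channel looks redundant
```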

Fast transformer decoding: One write-head is all you need (MQA)

paper link: [here](https://arxiv.org/abs/1911.02150)

citation:

@article{shazeer2019fast,
  title={Fast transformer decoding: One write-head is all you need},
  author={Shazeer, Noam},
  journal={arXiv preprint arXiv:1911.02150},
  year={2019}
}
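
Multi-query attention keeps one query projection per head but shares a single key/value head across all of them, which shrinks the KV cache (and its memory traffic at decode time) by a factor of the head count. A minimal sketch of the attention computation with a shared K/V head; the tensor layout is an assumption of this example.

```python
import torch

def multi_query_attention(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor) -> torch.Tensor:
    """Scaled dot-product attention with one shared K/V head (MQA).

    q: (batch, num_heads, seq, head_dim)  -- one query head per attention head
    k: (batch, seq, head_dim)             -- single key head shared by all query heads
    v: (batch, seq, head_dim)             -- single value head shared by all query heads
    """
    scale = q.shape[-1] ** -0.5
    # Broadcast the shared K/V across the head dimension.
    attn = torch.softmax(q @ k.unsqueeze(1).transpose(-2, -1) * scale, dim=-1)
    return attn @ v.unsqueeze(1)          # (batch, num_heads, seq, head_dim)
```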

Are sixteen heads really better than one?

paper link: here

citation:

@article{michel2019sixteen,
  title={Are sixteen heads really better than one?},
  author={Michel, Paul and Levy, Omer and Neubig, Graham},
  journal={Advances in neural information processing systems},
  volume={32},
  year={2019}
}
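
The paper estimates how much each attention head matters by attaching a gate to every head's output and using the magnitude of the loss gradient with respect to that gate (averaged over data) as an importance proxy, then shows that many heads can be removed at test time with little drop in quality. A minimal single-batch sketch of that proxy follows; `loss_fn` and the tensor layout are assumptions of this example.

```python
import torch

def head_importance(head_outputs: torch.Tensor, loss_fn) -> torch.Tensor:
    """Gradient-based head importance for one batch.

    head_outputs: (batch, num_heads, seq, head_dim) per-head attention outputs.
    loss_fn:      callable mapping (gated) head outputs to a scalar loss.

    Each head gets a gate xi_h = 1; |dL/dxi_h| approximates how much the loss
    would change if that head were masked out. Average over batches in practice.
    """
    gates = torch.ones(head_outputs.shape[1], requires_grad=True)
    loss = loss_fn(head_outputs * gates.view(1, -1, 1, 1))
    loss.backward()
    return gates.grad.abs()
```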