Here are some resources about Quantization Strategies for LLMs.
paper link: here
citation:
@article{li2023loftq,
title={LoftQ: LoRA-Fine-Tuning-Aware Quantization for Large Language Models},
author={Li, Yixiao and Yu, Yifan and Liang, Chen and He, Pengcheng and Karampatziakis, Nikos and Chen, Weizhu and Zhao, Tuo},
journal={arXiv preprint arXiv:2310.08659},
year={2023}
}
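For intuition, LoftQ initializes the quantized backbone and the LoRA factors jointly by alternating quantization with a low-rank fit of the residual. A minimal PyTorch sketch of that initialization loop, using a plain uniform quantizer in place of NF4 (illustrative only, not the authors' implementation):

```python
import torch

def uniform_quantize(w, bits=4):
    # Stand-in for NF4: per-tensor symmetric round-to-nearest, returned already dequantized.
    qmax = 2 ** (bits - 1) - 1
    scale = w.abs().max().clamp(min=1e-8) / qmax
    return torch.clamp(torch.round(w / scale), -qmax - 1, qmax) * scale

def loftq_init(w, rank=16, iters=5):
    # Alternate: quantize (W - A B), then refit A, B to the residual W - dequant(Q).
    a = torch.zeros(w.shape[0], rank)
    b = torch.zeros(rank, w.shape[1])
    for _ in range(iters):
        q = uniform_quantize(w - a @ b)
        u, s, vh = torch.linalg.svd(w - q, full_matrices=False)
        a = u[:, :rank] * s[:rank]
        b = vh[:rank, :]
    return q, a, b

w = torch.randn(256, 256)
q, a, b = loftq_init(w)
print((w - (q + a @ b)).norm() / w.norm())  # relative reconstruction error after the joint init
```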
paper link: here
citation:
@misc{li2023qft,
title={QFT: Quantized Full-parameter Tuning of LLMs with Affordable Resources},
author={Zhikai Li and Xiaoxuan Liu and Banghua Zhu and Zhen Dong and Qingyi Gu and Kurt Keutzer},
year={2023},
eprint={2310.07147},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
paper link: here
citation:
@article{xu2023qa,
title={QA-LoRA: Quantization-Aware Low-Rank Adaptation of Large Language Models},
author={Xu, Yuhui and Xie, Lingxi and Gu, Xiaotao and Chen, Xin and Chang, Heng and Zhang, Hengheng and Chen, Zhensu and Zhang, Xiaopeng and Tian, Qi},
journal={arXiv preprint arXiv:2309.14717},
year={2023}
}
paper link: here
citation:
@article{kim2023squeezellm,
title={SqueezeLLM: Dense-and-Sparse Quantization},
author={Kim, Sehoon and Hooper, Coleman and Gholami, Amir and Dong, Zhen and Li, Xiuyu and Shen, Sheng and Mahoney, Michael W and Keutzer, Kurt},
journal={arXiv preprint arXiv:2306.07629},
year={2023}
}
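SqueezeLLM splits each weight matrix into a small sparse matrix of outliers kept in full precision plus a densely quantized remainder. A rough PyTorch sketch of that dense-and-sparse split, using a uniform grid instead of the paper's sensitivity-aware k-means codebooks:

```python
import torch

def dense_and_sparse_split(w, outlier_frac=0.005, bits=3):
    # Keep the ~0.5% largest-magnitude weights as a sparse full-precision matrix, quantize the rest.
    k = max(1, int(outlier_frac * w.numel()))
    thresh = w.abs().flatten().kthvalue(w.numel() - k + 1).values
    outlier_mask = w.abs() >= thresh
    sparse = (w * outlier_mask).to_sparse()

    dense = w * ~outlier_mask
    qmax = 2 ** (bits - 1) - 1
    scale = dense.abs().amax(dim=1, keepdim=True).clamp(min=1e-8) / qmax
    dense_q = torch.clamp(torch.round(dense / scale), -qmax - 1, qmax) * scale
    return dense_q, sparse

w = torch.randn(512, 512)
dense_q, sparse = dense_and_sparse_split(w)
w_hat = dense_q + sparse.to_dense()
print((w - w_hat).abs().max())  # outliers are exact, the dense part carries the rounding error
```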
paper link: here
citation:
@article{dettmers2023spqr,
title={SpQR: A Sparse-Quantized Representation for Near-Lossless LLM Weight Compression},
author={Dettmers, Tim and Svirschevski, Ruslan and Egiazarian, Vage and Kuznedelev, Denis and Frantar, Elias and Ashkboos, Saleh and Borzunov, Alexander and Hoefler, Torsten and Alistarh, Dan},
journal={arXiv preprint arXiv:2306.03078},
year={2023}
}
paper link: here
citation:
@article{kim2024memory,
title={Memory-efficient fine-tuning of compressed large language models via sub-4-bit integer quantization},
author={Kim, Jeonghoon and Lee, Jung Hyun and Kim, Sungdong and Park, Joonsuk and Yoo, Kang Min and Kwon, Se Jung and Lee, Dongsoo},
journal={Advances in Neural Information Processing Systems},
volume={36},
year={2024}
}
paper link: here
github link: here
tutorial link: here
citation:
@article{dettmers2023qlora,
title={QLoRA: Efficient Finetuning of Quantized LLMs},
author={Dettmers, Tim and Pagnoni, Artidoro and Holtzman, Ari and Zettlemoyer, Luke},
journal={arXiv preprint arXiv:2305.14314},
year={2023}
}
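QLoRA is directly usable through the Hugging Face bitsandbytes integration: load the base model in 4-bit NF4 with bf16 compute, then attach LoRA adapters. A minimal sketch, assuming transformers, peft, bitsandbytes, and accelerate are installed; the model id and target module names are placeholders:

```python
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

# 4-bit NF4 base weights, double quantization, bf16 compute -- the QLoRA recipe.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-hf",            # placeholder model id
    quantization_config=bnb_config,
    device_map="auto",
)

# Attach trainable LoRA adapters on top of the frozen 4-bit backbone.
model = prepare_model_for_kbit_training(model)
lora = LoraConfig(r=16, lora_alpha=32, lora_dropout=0.05,
                  target_modules=["q_proj", "v_proj"],   # placeholder module names
                  task_type="CAUSAL_LM")
model = get_peft_model(model, lora)
model.print_trainable_parameters()
```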
paper link: here
citation:
@inproceedings{zafrir2019q8bert,
author = "Zafrir, Ofir and Boudoukh, Guy and Izsak, Peter and Wasserblat, Moshe",
title = "Q8bert: Quantized 8bit bert",
booktitle = "2019 Fifth Workshop on Energy Efficient Machine Learning and Cognitive Computing-NeurIPS Edition (EMC2-NIPS)",
pages = "36--39",
year = "2019",
organization = "IEEE"
}
tag: eXmY
paper link: here
citation:
@misc{agrawal2024exmydatatypetechnique,
title={eXmY: A Data Type and Technique for Arbitrary Bit Precision Quantization},
author={Aditya Agrawal and Matthew Hedlund and Blake Hechtman},
year={2024},
eprint={2405.13938},
archivePrefix={arXiv},
primaryClass={cs.LG},
url={https://arxiv.org/abs/2405.13938},
}
blog link: here
github link: here
citation:
@misc{badri2023hqq,
title = {Half-Quadratic Quantization of Large Machine Learning Models},
url = {https://mobiusml.github.io/hqq_blog/},
author = {Hicham Badri and Appu Shaji},
month = {November},
year = {2023}
}
paper link: here
github link: here
citation:
@article{wu2023zeroquant,
title={ZeroQuant-FP: A Leap Forward in LLMs Post-Training W4A8 Quantization Using Floating-Point Formats},
author={Wu, Xiaoxia and Yao, Zhewei and He, Yuxiong},
journal={arXiv preprint arXiv:2307.09782},
year={2023}
}
paper link: here
citation:
@misc{lin2023awq,
title={AWQ: Activation-aware Weight Quantization for LLM Compression and Acceleration},
author={Ji Lin and Jiaming Tang and Haotian Tang and Shang Yang and Xingyu Dang and Chuang Gan and Song Han},
year={2023},
eprint={2306.00978},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
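AWQ protects salient weight channels by scaling them up in proportion to activation magnitude before quantization and dividing the matching activations by the same factor. A simplified PyTorch sketch of that per-channel scaling (not the released llm-awq kernels, and without the paper's grid search over alpha):

```python
import torch

def awq_style_scale(w, act_samples, alpha=0.5, bits=4):
    # w: (out_features, in_features); act_samples: (tokens, in_features) calibration activations.
    # Salient input channels (large activations) get larger scales, so after folding the
    # scale into the weights they occupy more of the quantization grid.
    act_scale = act_samples.abs().mean(dim=0).clamp(min=1e-5)
    s = act_scale ** alpha
    s = s / (s.max() * s.min()).sqrt()            # keep the scales centered around 1

    w_scaled = w * s                              # fold s into the weights ...
    qmax = 2 ** (bits - 1) - 1
    step = w_scaled.abs().amax(dim=1, keepdim=True).clamp(min=1e-8) / qmax
    w_q = torch.clamp(torch.round(w_scaled / step), -qmax - 1, qmax) * step
    return w_q / s                                # ... and divide back out: (W_q / s) @ (s * x) ~= W @ x

x = torch.randn(4096, 512) * (torch.rand(512) * 5)   # synthetic activations with uneven channel magnitudes
w = torch.randn(1024, 512)
w_awq = awq_style_scale(w, x)
print((x @ w.T - x @ w_awq.T).norm() / (x @ w.T).norm())  # relative output error after quantization
```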
github link: here
citation:
@misc{gguf,
author = {Georgi Gerganov},
title = {GGUF},
year = {2023},
month = {Aug},
url = {https://github.com/ggerganov/llama.cpp},
}
ZeroQuant-V2: Exploring Post-training Quantization in LLMs from Comprehensive Study to Low Rank Compensation
paper link: here
github link: here
follow-up work: here
citation:
@misc{yao2023zeroquantv2exploringposttrainingquantization,
title={ZeroQuant-V2: Exploring Post-training Quantization in LLMs from Comprehensive Study to Low Rank Compensation},
author={Zhewei Yao and Xiaoxia Wu and Cheng Li and Stephen Youn and Yuxiong He},
year={2023},
eprint={2303.08302},
archivePrefix={arXiv},
primaryClass={cs.LG},
url={https://arxiv.org/abs/2303.08302},
}
paper link: here
github link: here
citation:
@inproceedings{xiao2023smoothquant,
title={SmoothQuant: Accurate and Efficient Post-Training Quantization for Large Language Models},
author={Xiao, Guangxuan and Lin, Ji and Seznec, Mickael and Wu, Hao and Demouth, Julien and Han, Song},
booktitle={International Conference on Machine Learning},
pages={38087--38099},
year={2023},
organization={PMLR}
}
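SmoothQuant migrates quantization difficulty from activations to weights with per-input-channel smoothing factors s_j = max|X_j|^alpha / max|W_j|^(1 - alpha). A small PyTorch sketch of the smoothing step; the identity X W^T = (X / s)(W s)^T is what makes it a free transformation:

```python
import torch

def smoothquant_scales(act_samples, weight, alpha=0.5):
    # act_samples: (tokens, in_features); weight: (out_features, in_features).
    act_max = act_samples.abs().amax(dim=0).clamp(min=1e-5)
    w_max = weight.abs().amax(dim=0).clamp(min=1e-5)
    return (act_max ** alpha) / (w_max ** (1 - alpha))

def smooth(act_samples, weight, alpha=0.5):
    # Activations are divided by s (folded into the preceding LayerNorm in practice) and
    # weights are multiplied by s, so X W^T is unchanged while activation outliers shrink.
    s = smoothquant_scales(act_samples, weight, alpha)
    return act_samples / s, weight * s

x = torch.randn(128, 512) * (torch.rand(512) * 10)   # synthetic activations with per-channel outliers
w = torch.randn(1024, 512)
x_s, w_s = smooth(x, w)
assert torch.allclose(x @ w.T, x_s @ w_s.T, rtol=1e-3, atol=1e-3)
print(x.abs().max(), x_s.abs().max())                # the smoothed activations have a reduced range
```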
paper link: here
github link: here
citation:
@misc{frantar2023gptq,
title={GPTQ: Accurate Post-Training Quantization for Generative Pre-trained Transformers},
author={Elias Frantar and Saleh Ashkboos and Torsten Hoefler and Dan Alistarh},
year={2023},
eprint={2210.17323},
archivePrefix={arXiv},
primaryClass={cs.LG}
}
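GPTQ quantizes weights column by column and compensates each column's rounding error using inverse-Hessian information from calibration data. A readable, unoptimized sketch of that error-feedback loop; the real algorithm uses a Cholesky factorization, block updates, and lazy batching:

```python
import torch

def gptq_like_quantize(w, hessian, bits=4):
    # w: (out_features, in_features); hessian: (in_features, in_features), roughly 2 X^T X
    # from calibration inputs X. Columns are quantized left to right and each column's
    # rounding error is spread over the not-yet-quantized columns via the inverse Hessian.
    w = w.clone()
    qmax = 2 ** (bits - 1) - 1
    scale = w.abs().amax(dim=1, keepdim=True).clamp(min=1e-8) / qmax
    h_inv = torch.linalg.inv(hessian + 1e-2 * torch.eye(hessian.shape[0]))
    for j in range(w.shape[1]):
        q = torch.clamp(torch.round(w[:, j] / scale[:, 0]), -qmax - 1, qmax) * scale[:, 0]
        err = (w[:, j] - q) / h_inv[j, j]
        w[:, j + 1:] -= err.unsqueeze(1) * h_inv[j, j + 1:].unsqueeze(0)
        w[:, j] = q
    return w

x = torch.randn(2048, 512)          # calibration activations
h = 2 * x.T @ x / x.shape[0]
w = torch.randn(1024, 512)
w_q = gptq_like_quantize(w, h)
```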
paper link: here
blog link: here
github link: here
citation:
@misc{dettmers2022llmint8,
title={LLM.int8(): 8-bit Matrix Multiplication for Transformers at Scale},
author={Tim Dettmers and Mike Lewis and Younes Belkada and Luke Zettlemoyer},
year={2022},
eprint={2208.07339},
archivePrefix={arXiv},
primaryClass={cs.LG}
}
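LLM.int8() is available out of the box through bitsandbytes in transformers: weights are stored in int8 while outlier feature dimensions are handled in fp16. A minimal sketch; the model id is a placeholder:

```python
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

model = AutoModelForCausalLM.from_pretrained(
    "facebook/opt-1.3b",   # placeholder model id
    quantization_config=BitsAndBytesConfig(load_in_8bit=True, llm_int8_threshold=6.0),
    device_map="auto",
)
```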
github link: here
citation:
@misc{ggml,
author = {Georgi Gerganov},
title = {GGML: GPT-Generated Model Language},
year = {2022},
url = {https://github.com/ggerganov/ggml},
}
paper link: here
github link: here
citation:
@misc{kuzmin2022fp8,
title={FP8 Quantization: The Power of the Exponent},
author={Andrey Kuzmin and Mart van Baalen and Yuwei Ren and Markus Nagel and Jorn Peters and Tijmen Blankevoort},
year={2022},
eprint={2208.09225},
archivePrefix={arXiv},
primaryClass={cs.LG}
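For a feel of the exponent/mantissa trade-off the paper studies, recent PyTorch builds (>= 2.1) ship float8 dtypes that can simulate the cast; actual FP8 matmuls still need suitable hardware and kernels:

```python
import torch

x = torch.randn(4, 4)
x_e4m3 = x.to(torch.float8_e4m3fn)   # 4 exponent bits, 3 mantissa bits: more precision
x_e5m2 = x.to(torch.float8_e5m2)     # 5 exponent bits, 2 mantissa bits: more dynamic range
print((x - x_e4m3.float()).abs().max())
print((x - x_e5m2.float()).abs().max())
```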
paper link: here
github link: here
follow-up work: here
citation:
@misc{yao2022zeroquantefficientaffordableposttraining,
title={ZeroQuant: Efficient and Affordable Post-Training Quantization for Large-Scale Transformers},
author={Zhewei Yao and Reza Yazdani Aminabadi and Minjia Zhang and Xiaoxia Wu and Conglong Li and Yuxiong He},
year={2022},
eprint={2206.01861},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2206.01861},
}
paper link: here
github link: here
citation:
@misc{zhang2021trainingdeepneuralnetworks,
title={Training Deep Neural Networks with Joint Quantization and Pruning of Weights and Activations},
author={Xinyu Zhang and Ian Colbert and Ken Kreutz-Delgado and Srinjoy Das},
year={2021},
eprint={2110.08271},
archivePrefix={arXiv},
primaryClass={cs.LG},
url={https://arxiv.org/abs/2110.08271},
}
paper link: here
citation:
@misc{wu2020integerquantizationdeeplearning,
title={Integer Quantization for Deep Learning Inference: Principles and Empirical Evaluation},
author={Hao Wu and Patrick Judd and Xiaojie Zhang and Mikhail Isaev and Paulius Micikevicius},
year={2020},
eprint={2004.09602},
archivePrefix={arXiv},
primaryClass={cs.LG},
url={https://arxiv.org/abs/2004.09602},
}
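The whitepaper's basic building block is affine (asymmetric) integer quantization: q = clamp(round(x / scale) + zero_point, 0, 2^b - 1). A self-contained PyTorch sketch of the quantize/dequantize pair:

```python
import torch

def affine_quantize(x, bits=8):
    # q = clamp(round(x / scale) + zero_point, 0, 2^bits - 1)
    qmin, qmax = 0, 2 ** bits - 1
    scale = (x.max() - x.min()).clamp(min=1e-8) / (qmax - qmin)
    zero_point = torch.clamp(qmin - torch.round(x.min() / scale), qmin, qmax)
    q = torch.clamp(torch.round(x / scale) + zero_point, qmin, qmax)
    return q, scale, zero_point

def affine_dequantize(q, scale, zero_point):
    return (q - zero_point) * scale

x = torch.randn(1000)
q, scale, zp = affine_quantize(x)
print((x - affine_dequantize(q, scale, zp)).abs().max())   # bounded by roughly scale / 2
```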
paper link: here
citation:
@misc{yin2019understandingstraightthroughestimatortraining,
title={Understanding Straight-Through Estimator in Training Activation Quantized Neural Nets},
author={Penghang Yin and Jiancheng Lyu and Shuai Zhang and Stanley Osher and Yingyong Qi and Jack Xin},
year={2019},
eprint={1903.05662},
archivePrefix={arXiv},
primaryClass={cs.LG},
url={https://arxiv.org/abs/1903.05662},
}
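The straight-through estimator analyzed here is what makes quantization-aware training differentiable: round in the forward pass, treat the rounding as the identity in the backward pass. A minimal PyTorch example:

```python
import torch

class RoundSTE(torch.autograd.Function):
    # Forward: round to the nearest integer. Backward: pass the gradient straight through.

    @staticmethod
    def forward(ctx, x):
        return torch.round(x)

    @staticmethod
    def backward(ctx, grad_output):
        # Straight-through estimator: treat round() as the identity when differentiating.
        return grad_output

x = torch.randn(8, requires_grad=True)
y = RoundSTE.apply(x * 4) / 4        # fake-quantize to a grid of step 0.25
y.sum().backward()
print(x.grad)                        # all ones, as if round() were the identity
```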
paper link: here
citation:
@misc{krishnamoorthi2018quantizing,
title={Quantizing deep convolutional networks for efficient inference: A whitepaper},
author={Raghuraman Krishnamoorthi},
year={2018},
eprint={1806.08342},
archivePrefix={arXiv},
primaryClass={cs.LG}
}