Here are some resources about the evaluation of LLMs
On the Workflows and Smells of Leaderboard Operations (LBOps): An Exploratory Study of Foundation Model Leaderboards
paper link: here
github link: here
citation:
@misc{zhao2024workflowssmellsleaderboardoperations,
  author        = {Zhimin Zhao and Abdul Ali Bangash and Filipe Roseiro C{\^o}go and Bram Adams and Ahmed E. Hassan},
  title         = {On the Workflows and Smells of Leaderboard Operations ({LBOps}): An Exploratory Study of Foundation Model Leaderboards},
  year          = {2024},
  eprint        = {2407.04065},
  archivePrefix = {arXiv},
  primaryClass  = {cs.SE},
  url           = {https://arxiv.org/abs/2407.04065},
}
paper link: here
citation:
@misc{yang2023rethinking,
  author        = {Yang, Shuo and Chiang, Wei-Lin and Zheng, Lianmin and Gonzalez, Joseph E. and Stoica, Ion},
  title         = {Rethinking Benchmark and Contamination for Language Models with Rephrased Samples},
  year          = {2023},
  eprint        = {2311.04850},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
}
paper link: here
citation:
@misc{zhou2023dont,
  author        = {Kun Zhou and Yutao Zhu and Zhipeng Chen and Wentong Chen and Wayne Xin Zhao and Xu Chen and Yankai Lin and Ji-Rong Wen and Jiawei Han},
  title         = {Don't Make Your {LLM} an Evaluation Benchmark Cheater},
  year          = {2023},
  eprint        = {2311.01964},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
}
GPT-Fathom: Benchmarking Large Language Models to Decipher the Evolutionary Path towards GPT-4 and Beyond
paper link: here
citation:
@misc{zheng2023gpt,
  author        = {Zheng, Shen and Zhang, Yuyu and Zhu, Yijie and Xi, Chenguang and Gao, Pengyang and Zhou, Xun and Chang, Kevin Chen-Chuan},
  title         = {{GPT-Fathom}: Benchmarking Large Language Models to Decipher the Evolutionary Path towards {GPT-4} and Beyond},
  year          = {2023},
  eprint        = {2309.16583},
  archivePrefix = {arXiv},
  url           = {https://arxiv.org/abs/2309.16583},
}
paper link: here
citation:
@misc{chen2023benchmarking,
  author        = {Chen, Jiawei and Lin, Hongyu and Han, Xianpei and Sun, Le},
  title         = {Benchmarking Large Language Models in Retrieval-Augmented Generation},
  year          = {2023},
  eprint        = {2309.01431},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
}
paper link: here
homepage link (Chatbot Arena): here
citation:
@misc{zheng2023judging,
  author        = {Lianmin Zheng and Wei-Lin Chiang and Ying Sheng and Siyuan Zhuang and Zhanghao Wu and Yonghao Zhuang and Zi Lin and Zhuohan Li and Dacheng Li and Eric P. Xing and Hao Zhang and Joseph E. Gonzalez and Ion Stoica},
  title         = {Judging {LLM-as-a-Judge} with {MT-Bench} and {Chatbot Arena}},
  year          = {2023},
  eprint        = {2306.05685},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
}
paper link: here
citation:
@misc{chia2023instructeval,
  author        = {Yew Ken Chia and Pengfei Hong and Lidong Bing and Soujanya Poria},
  title         = {{INSTRUCTEVAL}: Towards Holistic Evaluation of Instruction-Tuned Large Language Models},
  year          = {2023},
  eprint        = {2306.04757},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
}
paper link: here
citation:
@misc{liang2022holistic,
  author        = {Liang, Percy and Bommasani, Rishi and Lee, Tony and Tsipras, Dimitris and Soylu, Dilara and Yasunaga, Michihiro and Zhang, Yian and Narayanan, Deepak and Wu, Yuhuai and Kumar, Ananya and others},
  title         = {Holistic Evaluation of Language Models},
  year          = {2022},
  eprint        = {2211.09110},
  archivePrefix = {arXiv},
  url           = {https://arxiv.org/abs/2211.09110},
}
paper link: here
citation:
@misc{ge2023openagi,
  author        = {Ge, Yingqiang and Hua, Wenyue and Ji, Jianchao and Tan, Juntao and Xu, Shuyuan and Zhang, Yongfeng},
  title         = {{OpenAGI}: When {LLM} Meets Domain Experts},
  year          = {2023},
  eprint        = {2304.04370},
  archivePrefix = {arXiv},
  url           = {https://arxiv.org/abs/2304.04370},
}
paper link: here
github link: here
citation:
@misc{hendrycks2020measuring,
  author        = {Hendrycks, Dan and Burns, Collin and Basart, Steven and Zou, Andy and Mazeika, Mantas and Song, Dawn and Steinhardt, Jacob},
  title         = {Measuring Massive Multitask Language Understanding},
  year          = {2020},
  eprint        = {2009.03300},
  archivePrefix = {arXiv},
  url           = {https://arxiv.org/abs/2009.03300},
}
paper link: here
github link: here
dataset link: here
citation:
@misc{cobbe2021training,
  author        = {Cobbe, Karl and Kosaraju, Vineet and Bavarian, Mohammad and Chen, Mark and Jun, Heewoo and Kaiser, Lukasz and Plappert, Matthias and Tworek, Jerry and Hilton, Jacob and Nakano, Reiichiro and Hesse, Christopher and Schulman, John},
  title         = {Training Verifiers to Solve Math Word Problems},
  year          = {2021},
  eprint        = {2110.14168},
  archivePrefix = {arXiv},
  primaryClass  = {cs.LG},
}
paper link: here
github link: here
dataset link: here
citation:
@article{hendrycksmath2021,
  author  = {Dan Hendrycks and Collin Burns and Saurav Kadavath and Akul Arora and Steven Basart and Eric Tang and Dawn Song and Jacob Steinhardt},
  title   = {Measuring Mathematical Problem Solving With the {MATH} Dataset},
  journal = {NeurIPS},
  year    = {2021},
}
paper link: here
github link: here
citation:
@misc{zheng2021minif2f,
  author        = {Zheng, Kunhao and Han, Jesse Michael and Polu, Stanislas},
  title         = {{MiniF2F}: A Cross-System Benchmark for Formal {Olympiad}-Level Mathematics},
  year          = {2021},
  eprint        = {2109.00110},
  archivePrefix = {arXiv},
  url           = {https://arxiv.org/abs/2109.00110},
}
paper link: here
github link: here
leaderboard link: here
citation:
@article{cassano2023multipl,
  author    = {Cassano, Federico and Gouwar, John and Nguyen, Daniel and Nguyen, Sydney and Phipps-Costin, Luna and Pinckney, Donald and Yee, Ming-Ho and Zi, Yangtian and Anderson, Carolyn Jane and Feldman, Molly Q and others},
  title     = {{MultiPL-E}: A Scalable and Polyglot Approach to Benchmarking Neural Code Generation},
  journal   = {IEEE Transactions on Software Engineering},
  year      = {2023},
  publisher = {IEEE},
}
paper link: here
dataset link: here
citation:
@misc{chen2021evaluating,
  author        = {Mark Chen and Jerry Tworek and Heewoo Jun and Qiming Yuan
                   and Henrique Ponde de Oliveira Pinto and Jared Kaplan and Harri Edwards
                   and Yuri Burda and Nicholas Joseph and Greg Brockman and Alex Ray
                   and Raul Puri and Gretchen Krueger and Michael Petrov and Heidy Khlaaf
                   and Girish Sastry and Pamela Mishkin and Brooke Chan and Scott Gray
                   and Nick Ryder and Mikhail Pavlov and Alethea Power and Lukasz Kaiser
                   and Mohammad Bavarian and Clemens Winter and Philippe Tillet
                   and Felipe Petroski Such and Dave Cummings and Matthias Plappert
                   and Fotios Chantzis and Elizabeth Barnes and Ariel Herbert-Voss
                   and William Hebgen Guss and Alex Nichol and Alex Paino and Nikolas Tezak
                   and Jie Tang and Igor Babuschkin and Suchir Balaji and Shantanu Jain
                   and William Saunders and Christopher Hesse and Andrew N. Carr
                   and Jan Leike and Josh Achiam and Vedant Misra and Evan Morikawa
                   and Alec Radford and Matthew Knight and Miles Brundage and Mira Murati
                   and Katie Mayer and Peter Welinder and Bob McGrew and Dario Amodei
                   and Sam McCandlish and Ilya Sutskever and Wojciech Zaremba},
  title         = {Evaluating Large Language Models Trained on Code},
  year          = {2021},
  eprint        = {2107.03374},
  archivePrefix = {arXiv},
  primaryClass  = {cs.LG},
}
paper link: here
citation:
@misc{huang2023c,
  author        = {Huang, Yuzhen and Bai, Yuzhuo and Zhu, Zhihao and Zhang, Junlei and Zhang, Jinghan and Su, Tangjun and Liu, Junteng and Lv, Chuancheng and Zhang, Yikai and Lei, Jiayi and others},
  title         = {{C-Eval}: A Multi-Level Multi-Discipline {Chinese} Evaluation Suite for Foundation Models},
  year          = {2023},
  eprint        = {2305.08322},
  archivePrefix = {arXiv},
  url           = {https://arxiv.org/abs/2305.08322},
}
paper link: here
citation:
@misc{zhang2023cgce,
  author        = {Zhang, Xuanyu and Li, Bingbing and Yang, Qing},
  title         = {{CGCE}: A {Chinese} Generative Chat Evaluation Benchmark for General and Financial Domains},
  year          = {2023},
  eprint        = {2305.14471},
  archivePrefix = {arXiv},
  url           = {https://arxiv.org/abs/2305.14471},
}
paper link: here
github link: here
citation:
@misc{bai2023longbench,
  author        = {Yushi Bai and Xin Lv and Jiajie Zhang and Hongchang Lyu and Jiankai Tang and Zhidian Huang and Zhengxiao Du and Xiao Liu and Aohan Zeng and Lei Hou and Yuxiao Dong and Jie Tang and Juanzi Li},
  title         = {{LongBench}: A Bilingual, Multitask Benchmark for Long Context Understanding},
  year          = {2023},
  eprint        = {2308.14508},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
}
paper link: here
citation:
@misc{meister2021language,
  author        = {Meister, Clara and Cotterell, Ryan},
  title         = {Language Model Evaluation Beyond Perplexity},
  year          = {2021},
  eprint        = {2106.00085},
  archivePrefix = {arXiv},
  url           = {https://arxiv.org/abs/2106.00085},
}