{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,16]],"date-time":"2026-01-16T10:33:29Z","timestamp":1768559609041,"version":"3.49.0"},"publisher-location":"New York, NY, USA","reference-count":50,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,21]],"date-time":"2024-10-21T00:00:00Z","timestamp":1729468800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"funder":[{"name":"Quan Cheng Laboratory","award":["QCLZD202301"],"award-info":[{"award-number":["QCLZD202301"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,21]]},"DOI":"10.1145\/3627673.3679677","type":"proceedings-article","created":{"date-parts":[[2024,10,20]],"date-time":"2024-10-20T19:34:21Z","timestamp":1729452861000},"page":"384-393","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":2,"title":["Automatic Large Language Model Evaluation via Peer Review"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-1233-0184","authenticated-orcid":false,"given":"Zhumin","family":"Chu","sequence":"first","affiliation":[{"name":"DCST, Tsinghua University &amp; Quan Cheng Laboratory, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5030-709X","authenticated-orcid":false,"given":"Qingyao","family":"Ai","sequence":"additional","affiliation":[{"name":"Quan Cheng Laboratory &amp; DCST, Tsinghua University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-7392-0524","authenticated-orcid":false,"given":"Yiteng","family":"Tu","sequence":"additional","affiliation":[{"name":"DCST, Tsinghua University &amp; Zhongguancun Laboratory, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-8766-8610","authenticated-orcid":false,"given":"Haitao","family":"Li","sequence":"additional","affiliation":[{"name":"DCST, Tsinghua University &amp; Zhongguancun Laboratory, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0140-4512","authenticated-orcid":false,"given":"Yiqun","family":"Liu","sequence":"additional","affiliation":[{"name":"DCST, Tsinghua University &amp; Zhongguancun Laboratory, Beijing, China"}]}],"member":"320","published-online":{"date-parts":[[2024,10,21]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"John H Shin, Jared S Fridley, Wael F Asaad, Deus Cielo, Adetokunbo A Oyelese, Curtis E Doberstein, et al.","author":"Ali Rohaid","year":"2022","unstructured":"Rohaid Ali, Oliver Y Tang, Ian D Connolly, Patricia L Zadnik Sullivan, John H Shin, Jared S Fridley, Wael F Asaad, Deus Cielo, Adetokunbo A Oyelese, Curtis E Doberstein, et al. 2022. Performance of ChatGPT and GPT-4 on neurosurgery written board examinations. Neurosurgery (2022), 10--1227."},{"key":"e_1_3_2_1_2_1","unstructured":"Anthropic. 2023. Claude 2. https:\/\/www.anthropic.com\/index\/claude-2."},{"key":"e_1_3_2_1_3_1","unstructured":"Anthropic. 2023. Introducing Claude. https:\/\/www.anthropic.com\/index\/introducing-claude."},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","unstructured":"Valeriia Bolotova Vladislav Blinov Falk Scholer W. Bruce Croft and Mark Sanderson. 2022. A Non-Factoid Question-Answering Taxonomy. 12 pages. https:\/\/doi.org\/10.1145\/3477495.3531926","DOI":"10.1145\/3477495.3531926"},{"key":"e_1_3_2_1_5_1","unstructured":"Tom Brown Benjamin Mann Nick Ryder Melanie Subbiah Jared D Kaplan Prafulla Dhariwal Arvind Neelakantan Pranav Shyam Girish Sastry Amanda Askell et al. 2020. Language models are few-shot learners. Advances in neural information processing systems Vol. 33 (2020) 1877--1901."},{"key":"e_1_3_2_1_6_1","volume-title":"Chateval: Towards better llm-based evaluators through multi-agent debate. arXiv preprint arXiv:2308.07201","author":"Chan Chi-Min","year":"2023","unstructured":"Chi-Min Chan, Weize Chen, Yusheng Su, Jianxuan Yu, Wei Xue, Shanghang Zhang, Jie Fu, and Zhiyuan Liu. 2023. Chateval: Towards better llm-based evaluators through multi-agent debate. arXiv preprint arXiv:2308.07201 (2023)."},{"key":"e_1_3_2_1_7_1","volume-title":"Vicuna: An open-source chatbot impressing gpt-4 with 90%* chatgpt quality. See https:\/\/vicuna. lmsys. org (accessed","author":"Chiang Wei-Lin","year":"2023","unstructured":"Wei-Lin Chiang, Zhuohan Li, Zi Lin, Ying Sheng, Zhanghao Wu, Hao Zhang, Lianmin Zheng, Siyuan Zhuang, Yonghao Zhuang, Joseph E Gonzalez, et al. 2023. Vicuna: An open-source chatbot impressing gpt-4 with 90%* chatgpt quality. See https:\/\/vicuna. lmsys. org (accessed 14 April 2023) (2023)."},{"key":"e_1_3_2_1_8_1","volume-title":"Charles Sutton, Sebastian Gehrmann, et al.","author":"Chowdhery Aakanksha","year":"2022","unstructured":"Aakanksha Chowdhery, Sharan Narang, Jacob Devlin, Maarten Bosma, Gaurav Mishra, Adam Roberts, Paul Barham, Hyung Won Chung, Charles Sutton, Sebastian Gehrmann, et al. 2022. Palm: Scaling language modeling with pathways. arXiv preprint arXiv:2204.02311 (2022)."},{"key":"e_1_3_2_1_9_1","unstructured":"Hyung Won Chung Le Hou Shayne Longpre Barret Zoph Yi Tay William Fedus Eric Li Xuezhi Wang Mostafa Dehghani Siddhartha Brahma et al. 2022. Scaling instruction-finetuned language models. arXiv preprint arXiv:2210.11416 (2022)."},{"key":"e_1_3_2_1_10_1","volume-title":"Alexis Chevalier, and Julius Berner.","author":"Frieder Simon","year":"2023","unstructured":"Simon Frieder, Luca Pinchetti, Ryan-Rhys Griffiths, Tommaso Salvatori, Thomas Lukasiewicz, Philipp Christian Petersen, Alexis Chevalier, and Julius Berner. 2023. Mathematical capabilities of chatgpt. arXiv preprint arXiv:2301.13867 (2023)."},{"key":"e_1_3_2_1_11_1","volume-title":"Gptscore: Evaluate as you desire. arXiv preprint arXiv:2302.04166","author":"Fu Jinlan","year":"2023","unstructured":"Jinlan Fu, See-Kiong Ng, Zhengbao Jiang, and Pengfei Liu. 2023. Gptscore: Evaluate as you desire. arXiv preprint arXiv:2302.04166 (2023)."},{"key":"e_1_3_2_1_12_1","volume-title":"April","volume":"1","author":"Geng Xinyang","year":"2023","unstructured":"Xinyang Geng, Arnav Gudibande, Hao Liu, Eric Wallace, Pieter Abbeel, Sergey Levine, and Dawn Song. 2023. Koala: A dialogue model for academic research. Blog post, April, Vol. 1 (2023)."},{"key":"e_1_3_2_1_13_1","volume-title":"Xiezhi: An Ever-Updating Benchmark for Holistic Domain Knowledge Evaluation. arXiv preprint arXiv:2306.05783","author":"Gu Zhouhong","year":"2023","unstructured":"Zhouhong Gu, Xiaoxuan Zhu, Haoning Ye, Lin Zhang, Jianchen Wang, Sihang Jiang, Zhuozhi Xiong, Zihan Li, Qianyu He, Rui Xu, et al. 2023. Xiezhi: An Ever-Updating Benchmark for Holistic Domain Knowledge Evaluation. arXiv preprint arXiv:2306.05783 (2023)."},{"key":"e_1_3_2_1_14_1","volume-title":"Deberta: Decoding-enhanced bert with disentangled attention. arXiv preprint arXiv:2006.03654","author":"He Pengcheng","year":"2020","unstructured":"Pengcheng He, Xiaodong Liu, Jianfeng Gao, and Weizhu Chen. 2020. Deberta: Decoding-enhanced bert with disentangled attention. arXiv preprint arXiv:2006.03654 (2020)."},{"key":"e_1_3_2_1_15_1","volume-title":"Measuring massive multitask language understanding. arXiv preprint arXiv:2009.03300","author":"Hendrycks Dan","year":"2020","unstructured":"Dan Hendrycks, Collin Burns, Steven Basart, Andy Zou, Mantas Mazeika, Dawn Song, and Jacob Steinhardt. 2020. Measuring massive multitask language understanding. arXiv preprint arXiv:2009.03300 (2020)."},{"key":"e_1_3_2_1_16_1","volume-title":"Proceedings of the 29th International Conference on Computational Linguistics. 3680--3696","author":"Jang Myeongjun","year":"2022","unstructured":"Myeongjun Jang, Deuk Sin Kwon, and Thomas Lukasiewicz. 2022. BECEL: Benchmark for Consistency Evaluation of Language Models. In Proceedings of the 29th International Conference on Computational Linguistics. 3680--3696."},{"key":"e_1_3_2_1_17_1","volume-title":"A review of key Likert scale development advances: 1995--2019. Frontiers in psychology","author":"Jebb Andrew T","year":"2021","unstructured":"Andrew T Jebb, Vincent Ng, and Louis Tay. 2021. A review of key Likert scale development advances: 1995--2019. Frontiers in psychology, Vol. 12 (2021), 637547."},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1093\/biomet\/30.1-2.81"},{"key":"e_1_3_2_1_19_1","volume-title":"Large language models are state-of-the-art evaluators of translation quality. arXiv preprint arXiv:2302.14520","author":"Kocmi Tom","year":"2023","unstructured":"Tom Kocmi and Christian Federmann. 2023. Large language models are state-of-the-art evaluators of translation quality. arXiv preprint arXiv:2302.14520 (2023)."},{"key":"e_1_3_2_1_20_1","unstructured":"Klaus Krippendorff. 2011. Computing Krippendorff's alpha-reliability. (2011)."},{"key":"e_1_3_2_1_21_1","volume-title":"Evaluating the factual consistency of abstractive text summarization. arXiv preprint arXiv:1910.12840","author":"Kry'sci'nski Wojciech","year":"2019","unstructured":"Wojciech Kry'sci'nski, Bryan McCann, Caiming Xiong, and Richard Socher. 2019. Evaluating the factual consistency of abstractive text summarization. arXiv preprint arXiv:1910.12840 (2019)."},{"key":"e_1_3_2_1_22_1","volume-title":"JMP for basic univariate and multivariate statistics: methods for researchers and social scientists","author":"Lehman Ann","unstructured":"Ann Lehman, Norm O'Rourke, Larry Hatcher, and Edward Stepanski. 2013. JMP for basic univariate and multivariate statistics: methods for researchers and social scientists. Sas Institute."},{"key":"e_1_3_2_1_23_1","volume-title":"Prd: Peer rank and discussion improve large language model based evaluations. arXiv preprint arXiv:2307.02762","author":"Li Ruosen","year":"2023","unstructured":"Ruosen Li, Teerth Patel, and Xinya Du. 2023. Prd: Peer rank and discussion improve large language model based evaluations. arXiv preprint arXiv:2307.02762 (2023)."},{"key":"e_1_3_2_1_24_1","volume-title":"Prefix-tuning: Optimizing continuous prompts for generation. arXiv preprint arXiv:2101.00190","author":"Li Xiang Lisa","year":"2021","unstructured":"Xiang Lisa Li and Percy Liang. 2021. Prefix-tuning: Optimizing continuous prompts for generation. arXiv preprint arXiv:2101.00190 (2021)."},{"key":"e_1_3_2_1_25_1","volume-title":"Rouge: A package for automatic evaluation of summaries. In Text summarization branches out. 74--81.","author":"Lin Chin-Yew","year":"2004","unstructured":"Chin-Yew Lin. 2004. Rouge: A package for automatic evaluation of summaries. In Text summarization branches out. 74--81."},{"key":"e_1_3_2_1_26_1","volume-title":"Gpteval: Nlg evaluation using gpt-4 with better human alignment. arXiv preprint arXiv:2303.16634","author":"Liu Yang","year":"2023","unstructured":"Yang Liu, Dan Iter, Yichong Xu, Shuohang Wang, Ruochen Xu, and Chenguang Zhu. 2023. Gpteval: Nlg evaluation using gpt-4 with better human alignment. arXiv preprint arXiv:2303.16634 (2023)."},{"key":"e_1_3_2_1_27_1","volume-title":"Roberta: A robustly optimized bert pretraining approach. arXiv preprint arXiv:1907.11692","author":"Liu Yinhan","year":"2019","unstructured":"Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, and Veselin Stoyanov. 2019. Roberta: A robustly optimized bert pretraining approach. arXiv preprint arXiv:1907.11692 (2019)."},{"key":"e_1_3_2_1_28_1","unstructured":"LMSYS. 2023. FastChat. https:\/\/github.com\/lm-sys\/FastChat."},{"key":"e_1_3_2_1_29_1","volume-title":"GPTEval: A survey on assessments of ChatGPT and GPT-4. arXiv preprint arXiv:2308.12488","author":"Mao Rui","year":"2023","unstructured":"Rui Mao, Guanyi Chen, Xulang Zhang, Frank Guerin, and Erik Cambria. 2023. GPTEval: A survey on assessments of ChatGPT and GPT-4. arXiv preprint arXiv:2308.12488 (2023)."},{"key":"e_1_3_2_1_30_1","volume-title":"Just the Summary! Topic-Aware Convolutional Neural Networks for Extreme Summarization. ArXiv","author":"Narayan Shashi","year":"2018","unstructured":"Shashi Narayan, Shay B. Cohen, and Mirella Lapata. 2018. Don't Give Me the Details, Just the Summary! Topic-Aware Convolutional Neural Networks for Extreme Summarization. ArXiv, Vol. abs\/1808.08745 (2018)."},{"key":"e_1_3_2_1_31_1","unstructured":"OpenAI. 2022. Introducing ChatGPT. https:\/\/openai.com\/blog\/chatgpt."},{"key":"e_1_3_2_1_33_1","volume-title":"Proceedings of the 40th annual meeting of the Association for Computational Linguistics. 311--318","author":"Papineni Kishore","year":"2002","unstructured":"Kishore Papineni, Salim Roukos, Todd Ward, and Wei-Jing Zhu. 2002. Bleu: a method for automatic evaluation of machine translation. In Proceedings of the 40th annual meeting of the Association for Computational Linguistics. 311--318."},{"key":"e_1_3_2_1_34_1","volume-title":"Kranthi Kiran GV, et al","author":"Peng Bo","year":"2023","unstructured":"Bo Peng, Eric Alcaide, Quentin Anthony, Alon Albalak, Samuel Arcadinho, Huanqi Cao, Xin Cheng, Michael Chung, Matteo Grella, Kranthi Kiran GV, et al. 2023. RWKV: Reinventing RNNs for the Transformer Era. arXiv preprint arXiv:2305.13048 (2023)."},{"key":"e_1_3_2_1_35_1","volume-title":"Learning representations by back-propagating errors. nature","author":"Rumelhart David E","year":"1986","unstructured":"David E Rumelhart, Geoffrey E Hinton, and Ronald J Williams. 1986. Learning representations by back-propagating errors. nature, Vol. 323, 6088 (1986), 533--536."},{"key":"e_1_3_2_1_36_1","volume-title":"Safety Assessment of Chinese Large Language Models. arXiv preprint arXiv:2304.10436","author":"Sun Hao","year":"2023","unstructured":"Hao Sun, Zhexin Zhang, Jiawen Deng, Jiale Cheng, and Minlie Huang. 2023. Safety Assessment of Chinese Large Language Models. arXiv preprint arXiv:2304.10436 (2023)."},{"key":"e_1_3_2_1_37_1","volume-title":"Alpaca: A strong, replicable instruction-following model","author":"Taori Rohan","year":"2023","unstructured":"Rohan Taori, Ishaan Gulrajani, Tianyi Zhang, Yann Dubois, Xuechen Li, Carlos Guestrin, Percy Liang, and Tatsunori B Hashimoto. 2023. Alpaca: A strong, replicable instruction-following model. Stanford Center for Research on Foundation Models. https:\/\/crfm. stanford. edu\/2023\/03\/13\/alpaca. html, Vol. 3, 6 (2023), 7."},{"key":"e_1_3_2_1_38_1","unstructured":"Gemini Team Rohan Anil Sebastian Borgeaud Yonghui Wu Jean-Baptiste Alayrac Jiahui Yu Radu Soricut Johan Schalkwyk Andrew M Dai Anja Hauth et al. 2023. Gemini: a family of highly capable multimodal models. arXiv preprint arXiv:2312.11805 (2023)."},{"key":"e_1_3_2_1_39_1","unstructured":"Baichuan Intelligent Technology. 2023. Baichuan-7B. https:\/\/github.com\/baichuan-inc\/Baichuan-7B."},{"key":"e_1_3_2_1_40_1","volume-title":"Llama: Open and efficient foundation language models. arXiv preprint arXiv:2302.13971","author":"Touvron Hugo","year":"2023","unstructured":"Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timoth\u00e9e Lacroix, Baptiste Rozi\u00e8re, Naman Goyal, Eric Hambro, Faisal Azhar, et al. 2023. Llama: Open and efficient foundation language models. arXiv preprint arXiv:2302.13971 (2023)."},{"key":"e_1_3_2_1_41_1","unstructured":"Hugo Touvron Louis Martin Kevin Stone Peter Albert Amjad Almahairi Yasmine Babaei Nikolay Bashlykov Soumya Batra Prajjwal Bhargava Shruti Bhosale et al. 2023. Llama 2: Open foundation and fine-tuned chat models. arXiv preprint arXiv:2307.09288 (2023)."},{"key":"e_1_3_2_1_42_1","volume-title":"Self-instruct: Aligning language model with self generated instructions. arXiv preprint arXiv:2212.10560","author":"Wang Yizhong","year":"2022","unstructured":"Yizhong Wang, Yeganeh Kordi, Swaroop Mishra, Alisa Liu, Noah A Smith, Daniel Khashabi, and Hannaneh Hajishirzi. 2022. Self-instruct: Aligning language model with self generated instructions. arXiv preprint arXiv:2212.10560 (2022)."},{"key":"e_1_3_2_1_43_1","unstructured":"Yidong Wang Zhuohao Yu Zhengran Zeng Linyi Yang Cunxiang Wang Hao Chen Chaoya Jiang Rui Xie Jindong Wang Xing Xie et al. 2023. PandaLM: An Automatic Evaluation Benchmark for LLM Instruction Tuning Optimization. arXiv preprint arXiv:2306.05087 (2023)."},{"key":"e_1_3_2_1_44_1","unstructured":"Jason Wei Yi Tay Rishi Bommasani Colin Raffel Barret Zoph Sebastian Borgeaud Dani Yogatama Maarten Bosma Denny Zhou Donald Metzler et al. 2022. Emergent abilities of large language models. arXiv preprint arXiv:2206.07682 (2022)."},{"key":"e_1_3_2_1_45_1","volume-title":"SuperCLUE: A Comprehensive Chinese Large Language Model Benchmark. arXiv preprint arXiv:2307.15020","author":"Xu Liang","year":"2023","unstructured":"Liang Xu, Anqi Li, Lei Zhu, Hang Xue, Changtai Zhu, Kangkang Zhao, Haonan He, Xuanwei Zhang, Qiyue Kang, and Zhenzhong Lan. 2023. SuperCLUE: A Comprehensive Chinese Large Language Model Benchmark. arXiv preprint arXiv:2307.15020 (2023)."},{"key":"e_1_3_2_1_46_1","unstructured":"Aiyuan Yang Bin Xiao Bingning Wang Borong Zhang Chao Yin Chenxu Lv Da Pan Dian Wang Dong Yan Fan Yang et al. 2023. Baichuan 2: Open Large-scale Language Models. arXiv preprint arXiv:2309.10305 (2023)."},{"key":"e_1_3_2_1_47_1","volume-title":"Automatic evaluation of attribution by large language models. arXiv preprint arXiv:2305.06311","author":"Yue Xiang","year":"2023","unstructured":"Xiang Yue, Boshi Wang, Kai Zhang, Ziru Chen, Yu Su, and Huan Sun. 2023. Automatic evaluation of attribution by large language models. arXiv preprint arXiv:2305.06311 (2023)."},{"key":"e_1_3_2_1_48_1","unstructured":"Aohan Zeng Xiao Liu Zhengxiao Du Zihan Wang Hanyu Lai Ming Ding Zhuoyi Yang Yifan Xu Wendi Zheng Xiao Xia et al. 2022. Glm-130b: An open bilingual pre-trained model. arXiv preprint arXiv:2210.02414 (2022)."},{"key":"e_1_3_2_1_49_1","volume-title":"Bertscore: Evaluating text generation with bert. arXiv preprint arXiv:1904.09675","author":"Zhang Tianyi","year":"2019","unstructured":"Tianyi Zhang, Varsha Kishore, Felix Wu, Kilian Q Weinberger, and Yoav Artzi. 2019. Bertscore: Evaluating text generation with bert. arXiv preprint arXiv:1904.09675 (2019)."},{"key":"e_1_3_2_1_50_1","volume-title":"Evaluating the Performance of Large Language Models on GAOKAO Benchmark. arXiv preprint arXiv:2305.12474","author":"Zhang Xiaotian","year":"2023","unstructured":"Xiaotian Zhang, Chunyang Li, Yi Zong, Zhengyu Ying, Liang He, and Xipeng Qiu. 2023. Evaluating the Performance of Large Language Models on GAOKAO Benchmark. arXiv preprint arXiv:2305.12474 (2023)."},{"key":"e_1_3_2_1_51_1","unstructured":"Lianmin Zheng Wei-Lin Chiang Ying Sheng Siyuan Zhuang Zhanghao Wu Yonghao Zhuang Zi Lin Zhuohan Li Dacheng Li Eric Xing et al. 2023. Judging LLM-as-a-judge with MT-Bench and Chatbot Arena. arXiv preprint arXiv:2306.05685 (2023)."}],"event":{"name":"CIKM '24: The 33rd ACM International Conference on Information and Knowledge Management","location":"Boise ID USA","acronym":"CIKM '24","sponsor":["SIGIR ACM Special Interest Group on Information Retrieval"]},"container-title":["Proceedings of the 33rd ACM International Conference on Information and Knowledge Management"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3627673.3679677","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3627673.3679677","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T00:58:13Z","timestamp":1750294693000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3627673.3679677"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,21]]},"references-count":50,"alternative-id":["10.1145\/3627673.3679677","10.1145\/3627673"],"URL":"https:\/\/doi.org\/10.1145\/3627673.3679677","relation":{},"subject":[],"published":{"date-parts":[[2024,10,21]]},"assertion":[{"value":"2024-10-21","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}