{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,21]],"date-time":"2026-02-21T18:25:22Z","timestamp":1771698322396,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":49,"publisher":"ACM","funder":[{"name":"Dutch Research Council","award":["024.004.022, NWA.1389.20.-183, KICH3.LTP.20.006"],"award-info":[{"award-number":["024.004.022, NWA.1389.20.-183, KICH3.LTP.20.006"]}]},{"DOI":"10.13039\/501100006374","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62372431, 62472408 and 62441229"],"award-info":[{"award-number":["62372431, 62472408 and 62441229"]}],"id":[{"id":"10.13039\/501100006374","id-type":"DOI","asserted-by":"publisher"}]},{"name":"the Strategic Priority Research Program of the CAS","award":["XDB0680102, XDB0680301"],"award-info":[{"award-number":["XDB0680102, XDB0680301"]}]},{"DOI":"10.13039\/501100006374","name":"National Key Research and Development Program of China","doi-asserted-by":"publisher","award":["2023YFA1011602"],"award-info":[{"award-number":["2023YFA1011602"]}],"id":[{"id":"10.13039\/501100006374","id-type":"DOI","asserted-by":"publisher"}]},{"name":"European Union's Horizon Europe program","award":["101070212"],"award-info":[{"award-number":["101070212"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,7,13]]},"DOI":"10.1145\/3726302.3729895","type":"proceedings-article","created":{"date-parts":[[2025,7,14]],"date-time":"2025-07-14T14:55:26Z","timestamp":1752504926000},"page":"1141-1151","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["An Empirical Study of Evaluating Long-form Question Answering"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0004-1220-3021","authenticated-orcid":false,"given":"Ning","family":"Xian","sequence":"first","affiliation":[{"name":"Institute of Computing Technology, Chinese Academy of Sciences, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4317-2702","authenticated-orcid":false,"given":"Yixing","family":"Fan","sequence":"additional","affiliation":[{"name":"Institute of Computing Technology, Chinese Academy of Sciences, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4294-2541","authenticated-orcid":false,"given":"Ruqing","family":"Zhang","sequence":"additional","affiliation":[{"name":"Institute of Computing Technology, Chinese Academy of Sciences, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1086-0202","authenticated-orcid":false,"given":"Maarten","family":"de Rijke","sequence":"additional","affiliation":[{"name":"University of Amsterdam, Amsterdam, Netherlands"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9509-8674","authenticated-orcid":false,"given":"Jiafeng","family":"Guo","sequence":"additional","affiliation":[{"name":"Institute of Computing Technology, Chinese Academy of Sciences, Beijing, China"}]}],"member":"320","published-online":{"date-parts":[[2025,7,13]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.trustnlp-1.5"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.2309.08210"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1145\/3477495.3531926"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.5555\/3495724.3495883"},{"key":"e_1_3_2_1_5_1","volume-title":"Yuanzhi Li, Scott Lundberg, et al.","author":"Bubeck S\u00e9bastien","year":"2023","unstructured":"S\u00e9bastien Bubeck, Varun Chandrasekaran, Ronen Eldan, Johannes Gehrke, Eric Horvitz, Ece Kamar, Peter Lee, Yin Tat Lee, Yuanzhi Li, Scott Lundberg, et al., 2023. Sparks of artificial general intelligence: Early experiments with GPT-4. arXiv. arXiv preprint arXiv:2303.12712 (2023)."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.findings-ijcnlp.32"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.18653\/V1\/2023.ACL-LONG.870"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.18653\/V1\/2023.FINDINGS-EMNLP.599"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-main.253"},{"key":"e_1_3_2_1_10_1","volume-title":"Proceedings of the 18th Conference of the European Chapter of the Association for Computational Linguistics, EACL 2024 - System Demonstrations, St. Julians","author":"Jithin James Shahul ES","year":"2024","unstructured":"Shahul ES, Jithin James, Luis Espinosa Anke, and Steven Schockaert. 2024. RAGAs: Automated Evaluation of Retrieval Augmented Generation. In Proceedings of the 18th Conference of the European Chapter of the Association for Computational Linguistics, EACL 2024 - System Demonstrations, St. Julians, Malta, March 17-22, 2024. Association for Computational Linguistics, 150--158. https:\/\/aclanthology.org\/2024.eacl-demo.16"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P19-1346"},{"key":"e_1_3_2_1_12_1","volume-title":"EVA-Score: Evaluating Abstractive Long-form Summarization on Informativeness through Extraction and Validation. arXiv preprint arXiv:2407.04969","author":"Fan Yuchen","year":"2024","unstructured":"Yuchen Fan, Xin Zhong, Yazhe Wan, Chengsi Wang, Haonan Cheng, Gaoche Wu, Ning Ding, and Bowen Zhou. 2024. EVA-Score: Evaluating Abstractive Long-form Summarization on Informativeness through Extraction and Validation. arXiv preprint arXiv:2407.04969 (2024)."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.naacl-long.365"},{"key":"e_1_3_2_1_14_1","unstructured":"Team GLM Aohan Zeng Bin Xu Bowen Wang Chenhui Zhang Da Yin Diego Rojas Guanyu Feng Hanlin Zhao Hanyu Lai Hao Yu Hongning Wang Jiadai Sun Jiajie Zhang Jiale Cheng Jiayi Gui Jie Tang Jing Zhang Juanzi Li Lei Zhao Lindong Wu Lucen Zhong Mingdao Liu Minlie Huang Peng Zhang Qinkai Zheng Rui Lu Shuaiqi Duan Shudan Zhang Shulin Cao Shuxun Yang Weng Lam Tam Wenyi Zhao Xiao Liu Xiao Xia Xiaohan Zhang Xiaotao Gu Xin Lv Xinghan Liu Xinyi Liu Xinyue Yang Xixuan Song Xunkai Zhang Yifan An Yifan Xu Yilin Niu Yuantao Yang Yueyan Li Yushi Bai Yuxiao Dong Zehan Qi Zhaoyu Wang Zhen Yang Zhengxiao Du Zhenyu Hou and Zihan Wang. 2024. ChatGLM: A Family of Large Language Models from GLM-130B to GLM-4 All Tools. arxiv:2406.12793"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.18653\/V1\/2020.EMNLP-MAIN.6"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.2411.15594"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-45442-5_21"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1145\/3571730"},{"key":"e_1_3_2_1_19_1","volume-title":"Diego de las Casas, Florian Bressand, Gianna Lengyel, Guillaume Lample, Lucile Saulnier, et al.","author":"Jiang Albert Q","year":"2023","unstructured":"Albert Q Jiang, Alexandre Sablayrolles, Arthur Mensch, Chris Bamford, Devendra Singh Chaplot, Diego de las Casas, Florian Bressand, Gianna Lengyel, Guillaume Lample, Lucile Saulnier, et al., 2023. Mistral 7B. arXiv preprint arXiv:2310.06825 (2023)."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.18653\/V1\/2024.NAACL-INDUSTRY.3"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.naacl-main.393"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.18653\/V1\/2023.FINDINGS-ACL.29"},{"key":"e_1_3_2_1_23_1","volume-title":"Shibani Santurkar, Surya Ganguli, Tatsunori Hashimoto, Thomas Icard, Tianyi Zhang, Vishrav Chaudhary, William Wang, Xuechen Li, Yifan Mai, Yuhui Zhang, and Yuta Koreeda.","author":"Liang Percy","year":"2022","unstructured":"Percy Liang, Rishi Bommasani, Tony Lee, Dimitris Tsipras, Dilara Soylu, Michihiro Yasunaga, Yian Zhang, Deepak Narayanan, Yuhuai Wu, Ananya Kumar, Benjamin Newman, Binhang Yuan, Bobby Yan, Ce Zhang, Christian Cosgrove, Christopher D. Manning, Christopher R\u00e9, Diana Acosta-Navas, Drew A. Hudson, Eric Zelikman, Esin Durmus, Faisal Ladhak, Frieda Rong, Hongyu Ren, Huaxiu Yao, Jue Wang, Keshav Santhanam, Laurel Orr, Lucia Zheng, Mert Yuksekgonul, Mirac Suzgun, Nathan Kim, Neel Guha, Niladri Chatterji, Omar Khattab, Peter Henderson, Qian Huang, Ryan Chi, Sang Michael Xie, Shibani Santurkar, Surya Ganguli, Tatsunori Hashimoto, Thomas Icard, Tianyi Zhang, Vishrav Chaudhary, William Wang, Xuechen Li, Yifan Mai, Yuhui Zhang, and Yuta Koreeda. 2022. Holistic Evaluation of Language Models. arXiv:2211.09110 (Nov. 2022). http:\/\/arxiv.org\/abs\/2211.09110 arXiv:2211.09110 [cs]."},{"key":"e_1_3_2_1_24_1","volume-title":"Rouge: A package for automatic evaluation of summaries. In Text summarization branches out. 74--81.","author":"Lin Chin-Yew","year":"2004","unstructured":"Chin-Yew Lin. 2004. Rouge: A package for automatic evaluation of summaries. In Text summarization branches out. 74--81."},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.nlp4convai-1.5"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.18653\/V1\/2023.EMNLP-MAIN.153"},{"key":"e_1_3_2_1_27_1","volume-title":"Proceedings of the 18th Conference of the European Chapter of the Association for Computational Linguistics, EACL 2024 -","volume":"22","author":"Liusie Adian","year":"2024","unstructured":"Adian Liusie, Potsawee Manakul, and Mark J. F. Gales. 2024. LLM Comparative Assessment: Zero-shot NLG Evaluation through Pairwise Comparisons using Large Language Models. In Proceedings of the 18th Conference of the European Chapter of the Association for Computational Linguistics, EACL 2024 - Volume 1: Long Papers, St. Julian's, Malta, March 17-22, 2024. Association for Computational Linguistics, 139--151. https:\/\/aclanthology.org\/2024.eacl-long.8"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-main.741"},{"key":"e_1_3_2_1_29_1","unstructured":"OpenAI. 2023. GPT-3.5-Turbo-Instruct. https:\/\/www.openai.com Accessed: 2024-07--18."},{"key":"e_1_3_2_1_30_1","unstructured":"OpenAI. 2025. Explore developer resources tutorials API docs and dynamic examples to get the most out of OpenAI's platform. https:\/\/platform.openai.com Accessed: 2024-01-24."},{"key":"e_1_3_2_1_31_1","volume-title":"Proceedings of the 40th annual meeting of the Association for Computational Linguistics. 311--318","author":"Papineni Kishore","year":"2002","unstructured":"Kishore Papineni, Salim Roukos, Todd Ward, and Wei-Jing Zhu. 2002. Bleu: a method for automatic evaluation of machine translation. In Proceedings of the 40th annual meeting of the Association for Computational Linguistics. 311--318."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1162\/coli.2009.35.4.35405"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.emnlp-main.566"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.findings-acl.61"},{"key":"e_1_3_2_1_35_1","volume-title":"Proceedings of the 31st International Conference on Computational Linguistics. Association for Computational Linguistics, Abu Dhabi, UAE, 11070--11085","author":"Tao Mingxu","year":"2025","unstructured":"Mingxu Tao, Dongyan Zhao, and Yansong Feng. 2025. Chain-of-Discussion: A Multi-Model Framework for Complex Evidence-Based Question Answering. In Proceedings of the 31st International Conference on Computational Linguistics. Association for Computational Linguistics, Abu Dhabi, UAE, 11070--11085. https:\/\/aclanthology.org\/2025.coling-main.734\/"},{"key":"e_1_3_2_1_36_1","unstructured":"Meta LLaMA Team. 2024. Introducing Meta Llama 3: The most capable openly available LLM to date. https:\/\/ai.meta.com\/blog\/meta-llama-3\/"},{"key":"e_1_3_2_1_37_1","unstructured":"Hugo Touvron Louis Martin Kevin Stone Peter Albert Amjad Almahairi Yasmine Babaei Nikolay Bashlykov Soumya Batra Prajjwal Bhargava Shruti Bhosale et al. 2023. Llama 2: Open foundation and fine-tuned chat models. arXiv preprint arXiv:2307.09288 (2023)."},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.emnlp-main.949"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.newsum-1.1"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.511"},{"key":"e_1_3_2_1_41_1","volume-title":"PandaLM: An Automatic Evaluation Benchmark for LLM Instruction Tuning Optimization. In The Twelfth International Conference on Learning Representations, ICLR 2024","author":"Wang Yidong","year":"2024","unstructured":"Yidong Wang, Zhuohao Yu, Wenjin Yao, Zhengran Zeng, Linyi Yang, Cunxiang Wang, Hao Chen, Chaoya Jiang, Rui Xie, Jindong Wang, Xing Xie, Wei Ye, Shikun Zhang, and Yue Zhang. 2024b. PandaLM: An Automatic Evaluation Benchmark for LLM Instruction Tuning Optimization. In The Twelfth International Conference on Learning Representations, ICLR 2024, Vienna, Austria, May 7-11, 2024. OpenReview.net. https:\/\/openreview.net\/forum?id=5Nn2BLV7SB"},{"key":"e_1_3_2_1_42_1","volume-title":"Lidia Sam Chao, and Derek Fai Wong","author":"Wu Junchao","year":"2025","unstructured":"Junchao Wu, Shu Yang, Runzhe Zhan, Yulin Yuan, Lidia Sam Chao, and Derek Fai Wong. 2025. A survey on LLM-generated text detection: Necessity, methods, and future directions. Computational Linguistics (2025), 1--65."},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.acl-long.181"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.findings-acl.21"},{"key":"e_1_3_2_1_45_1","volume-title":"The Twelfth International Conference on Learning Representations, ICLR 2024","author":"Ye Seonghyeon","year":"2024","unstructured":"Seonghyeon Ye, Doyoung Kim, Sungdong Kim, Hyeonbin Hwang, Seungone Kim, Yongrae Jo, James Thorne, Juho Kim, and Minjoon Seo. 2024b. FLASK: Fine-grained Language Model Evaluation based on Alignment Skill Sets. In The Twelfth International Conference on Learning Representations, ICLR 2024, Vienna, Austria, May 7-11, 2024. OpenReview.net. https:\/\/openreview.net\/forum?id=CYmF38ysDa"},{"key":"e_1_3_2_1_46_1","volume-title":"BERTScore: Evaluating Text Generation with BERT. In 8th International Conference on Learning Representations, ICLR 2020","author":"Zhang Tianyi","year":"2020","unstructured":"Tianyi Zhang, Varsha Kishore, Felix Wu, Kilian Q. Weinberger, and Yoav Artzi. 2020. BERTScore: Evaluating Text Generation with BERT. In 8th International Conference on Learning Representations, ICLR 2020, Addis Ababa, Ethiopia, April 26-30, 2020. OpenReview.net. https:\/\/openreview.net\/forum?id=SkeHuCVFDr"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1609\/AAAI.V38I17.29934"},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19-1053"},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.5555\/3666122.3668142"}],"event":{"name":"SIGIR '25: The 48th International ACM SIGIR Conference on Research and Development in Information Retrieval","location":"Padua Italy","acronym":"SIGIR '25","sponsor":["SIGIR ACM Special Interest Group on Information Retrieval"]},"container-title":["Proceedings of the 48th International ACM SIGIR Conference on Research and Development in Information Retrieval"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3726302.3729895","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T18:35:38Z","timestamp":1755887738000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3726302.3729895"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,7,13]]},"references-count":49,"alternative-id":["10.1145\/3726302.3729895","10.1145\/3726302"],"URL":"https:\/\/doi.org\/10.1145\/3726302.3729895","relation":{},"subject":[],"published":{"date-parts":[[2025,7,13]]},"assertion":[{"value":"2025-07-13","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}