{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,25]],"date-time":"2026-03-25T00:53:12Z","timestamp":1774399992788,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":34,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,8,2]],"date-time":"2024-08-02T00:00:00Z","timestamp":1722556800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc-nd\/4.0\/"}],"funder":[{"DOI":"10.13039\/501100006374","name":"National Science Foundation","doi-asserted-by":"publisher","award":["1846017"],"award-info":[{"award-number":["1846017"]}],"id":[{"id":"10.13039\/501100006374","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,8,2]]},"DOI":"10.1145\/3664190.3672511","type":"proceedings-article","created":{"date-parts":[[2024,8,5]],"date-time":"2024-08-05T12:39:41Z","timestamp":1722861581000},"page":"175-184","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":17,"title":["Pencils Down! Automatic Rubric-based Evaluation of Retrieve\/Generate Systems"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0000-3297-8888","authenticated-orcid":false,"given":"Naghmeh","family":"Farzi","sequence":"first","affiliation":[{"name":"University of New Hampshire, Durham, NH, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1624-3907","authenticated-orcid":false,"given":"Laura","family":"Dietz","sequence":"additional","affiliation":[{"name":"University of New Hampshire, Durham, NH, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2024,8,5]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Adapting Standard Retrieval Benchmarks to Evaluate Generated Answers. arXiv preprint arXiv:2401.04842","author":"Arabzadeh Negar","year":"2024","unstructured":"Negar Arabzadeh, Amin Bigdeli, and Charles LA Clarke. 2024. Adapting Standard Retrieval Benchmarks to Evaluate Generated Answers. arXiv preprint arXiv:2401.04842 (2024)."},{"key":"e_1_3_2_1_2_1","unstructured":"Tom B Brown Benjamin Mann Nick Ryder Melanie Subbiah Jared Kaplan Prafulla Dhariwal Arvind Neelakantan Pranav Shyam Girish Sastry Amanda Askell et al. 2020. Language models are few-shot learners. arXiv preprint arXiv:2005.14165 (2020)."},{"key":"e_1_3_2_1_3_1","unstructured":"Yupeng Chang Xu Wang Jindong Wang Yuan Wu Kaijie Zhu Hao Chen Linyi Yang Xiaoyuan Yi Cunxiang Wang Yidong Wang et al. 2023. A survey on evaluation of large language models. arXiv preprint arXiv:2307.03109 (2023)."},{"key":"e_1_3_2_1_4_1","first-page":"70","article-title":"2024. Scaling instruction-finetuned language models","volume":"25","author":"Chung Hyung Won","year":"2024","unstructured":"Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Yunxuan Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, et al. 2024. Scaling instruction-finetuned language models. Journal of Machine Learning Research, Vol. 25, 70 (2024), 1--53.","journal-title":"Journal of Machine Learning Research"},{"key":"e_1_3_2_1_5_1","volume-title":"Computational Linguistics","volume":"36","author":"Clarke James","year":"2010","unstructured":"James Clarke and Mirella Lapata. 2010. Discourse Constraints for Document Compression. Computational Linguistics, Vol. 36, 3 (2010)."},{"key":"e_1_3_2_1_6_1","volume-title":"Overview of the TREC 2020 deep learning track. arXiv preprint arXiv:2102","author":"Craswell Nick","year":"2021","unstructured":"Nick Craswell, Bhaskar Mitra, Emine Yilmaz, and Daniel Campos. 2021. Overview of the TREC 2020 deep learning track. arXiv preprint arXiv:2102.07662 (2021)."},{"key":"e_1_3_2_1_7_1","volume-title":"Overview of the TREC 2019 deep learning track. arXiv preprint arXiv:2003","author":"Craswell Nick","year":"2020","unstructured":"Nick Craswell, Bhaskar Mitra, Emine Yilmaz, Daniel Campos, and Ellen M Voorhees. 2020. Overview of the TREC 2019 deep learning track. arXiv preprint arXiv:2003.07820 (2020)."},{"key":"e_1_3_2_1_8_1","volume-title":"Towards Question-Answering as an Automatic Metric for Evaluating the Content Quality of a Summary. arXiv preprint arXiv:2010.00490","author":"Deutsch Daniel","year":"2020","unstructured":"Daniel Deutsch, Tania Bedrax-Weiss, and Dan Roth. 2020. Towards Question-Answering as an Automatic Metric for Evaluating the Content Quality of a Summary. arXiv preprint arXiv:2010.00490 (2020)."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1145\/3626772.3657871"},{"key":"e_1_3_2_1_10_1","volume-title":"Proceedings of Text REtrieval Conference (TREC).","author":"Dietz Laura","year":"2019","unstructured":"Laura Dietz and John Foley. 2019. TREC CAR Y3: Complex Answer Retrieval Overview. In Proceedings of Text REtrieval Conference (TREC)."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/N19-1395"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"crossref","unstructured":"Guglielmo Faggioli Laura Dietz Charles Clarke Gianluca Demartini Matthias Hagen Claudia Hauff Noriko Kando Evangelos Kanoulas Martin Potthast Benno Stein et al. 2024. Who Determines What Is Relevant? Humans or AI? Why Not Both? A spectrum of human-AI collaboration in assessing relevance. Commun. ACM (2024).","DOI":"10.1145\/3624730"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1145\/3578337.3605136"},{"key":"e_1_3_2_1_14_1","volume-title":"Proceedings of LLM4Eval: The First Workshop on Large Language Models for Evaluation in Information Retrieval.","author":"Farzi Naghmeh","year":"2024","unstructured":"Naghmeh Farzi and Laura Dietz. 2024. EXAM: LLM-based Answerability Metrics for IR Evaluation. In Proceedings of LLM4Eval: The First Workshop on Large Language Models for Evaluation in Information Retrieval."},{"key":"e_1_3_2_1_15_1","unstructured":"Raymond Fok and Daniel S Weld. 2023. In Search of Verifiability: Explanations Rarely Enable Complementary Performance in AI-Advised Decision Making. arXiv preprint arXiv:2305.07722 (2023)."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-main.457"},{"key":"e_1_3_2_1_17_1","volume-title":"Textbook Question Answering for Multimodal Machine Comprehension. 2017 IEEE Conference on Computer Vision and Pattern Recognition (CVPR)","author":"Kembhavi Aniruddha","year":"2017","unstructured":"Aniruddha Kembhavi, Minjoon Seo, Dustin Schwenk, Jonghyun Choi, Ali Farhadi, and Hannaneh Hajishirzi. 2017. Are You Smarter Than a Sixth Grader? Textbook Question Answering for Multimodal Machine Comprehension. 2017 IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2017), 5376--5384."},{"key":"e_1_3_2_1_18_1","volume-title":"LLatrieval: LLM-Verified Retrieval for Verifiable Generation. arXiv preprint arXiv:2311.07838","author":"Li Xiaonan","year":"2023","unstructured":"Xiaonan Li, Changtai Zhu, Linyang Li, Zhangyue Yin, Tianxiang Sun, and Xipeng Qiu. 2023. LLatrieval: LLM-Verified Retrieval for Verifiable Generation. arXiv preprint arXiv:2311.07838 (2023)."},{"key":"e_1_3_2_1_19_1","unstructured":"Percy Liang Rishi Bommasani Tony Lee Dimitris Tsipras Dilara Soylu Michihiro Yasunaga Yian Zhang Deepak Narayanan Yuhuai Wu Ananya Kumar et al. 2022. Holistic evaluation of language models. arXiv preprint arXiv:2211.09110 (2022)."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.3115\/1220835.1220884"},{"key":"e_1_3_2_1_21_1","volume-title":"Nafise Sadat Moosavi, and Chenghua Lin","author":"Liu Yiqi","year":"2023","unstructured":"Yiqi Liu, Nafise Sadat Moosavi, and Chenghua Lin. 2023. Llms as narcissistic evaluators: When ego inflates evaluation scores. arXiv preprint arXiv:2311.09766 (2023)."},{"key":"e_1_3_2_1_22_1","volume-title":"One-Shot Labeling for Automatic Relevance Estimation. arXiv preprint arXiv:2302.11266","author":"MacAvaney Sean","year":"2023","unstructured":"Sean MacAvaney and Luca Soldaini. 2023. One-Shot Labeling for Automatic Relevance Estimation. arXiv preprint arXiv:2302.11266 (2023)."},{"key":"e_1_3_2_1_23_1","unstructured":"Richard McCreadie and Cody Buntain. 2023. CrisisFACTS: Buidling and Evaluating Crisis Timelines. Technical Report. Univerity of Glasgow."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.findings-naacl.111"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1145\/2124295.2124343"},{"key":"e_1_3_2_1_26_1","volume-title":"Improving passage retrieval with zero-shot question generation. arXiv preprint arXiv:2204.07496","author":"Sachan Devendra Singh","year":"2022","unstructured":"Devendra Singh Sachan, Mike Lewis, Mandar Joshi, Armen Aghajanyan, Wen-tau Yih, Joelle Pineau, and Luke Zettlemoyer. 2022. Improving passage retrieval with zero-shot question generation. arXiv preprint arXiv:2204.07496 (2022)."},{"key":"e_1_3_2_1_27_1","volume-title":"EXAM: How to Evaluate Retrieve-and-Generate Systems for Users Who Do Not (Yet) Know What They Want.. In DESIRES. 136--146.","author":"Sander David P","year":"2021","unstructured":"David P Sander and Laura Dietz. 2021. EXAM: How to Evaluate Retrieve-and-Generate Systems for Users Who Do Not (Yet) Know What They Want.. In DESIRES. 136--146."},{"key":"e_1_3_2_1_28_1","volume-title":"Human Question Answering Performance using an Interactive Information Retrieval System. (01","author":"Smucker Mark","year":"2008","unstructured":"Mark Smucker, James Allan, and Blagovest Dachev. 2008. Human Question Answering Performance using an Interactive Information Retrieval System. (01 2008)."},{"key":"e_1_3_2_1_29_1","volume-title":"Is ChatGPT Good at Search? Investigating Large Language Models as Re-Ranking Agent. arXiv e-prints","author":"Sun Weiwei","year":"2023","unstructured":"Weiwei Sun, Lingyong Yan, Xinyu Ma, Pengjie Ren, Dawei Yin, and Zhaochun Ren. 2023. Is ChatGPT Good at Search? Investigating Large Language Models as Re-Ranking Agent. arXiv e-prints (2023), arXiv--2304."},{"key":"e_1_3_2_1_30_1","unstructured":"Paul Thomas Seth Spielman Nick Craswell and Bhaskar Mitra. 2023. Large language models can accurately predict searcher preferences. arxiv: 2309.10621 [cs.IR]"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-main.450"},{"key":"e_1_3_2_1_32_1","volume-title":"Large language models are not fair evaluators. arXiv preprint arXiv:2305.17926","author":"Wang Peiyi","year":"2023","unstructured":"Peiyi Wang, Lei Li, Liang Chen, Dawei Zhu, Binghuai Lin, Yunbo Cao, Qi Liu, Tianyu Liu, and Zhifang Sui. 2023. Large language models are not fair evaluators. arXiv preprint arXiv:2305.17926 (2023)."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.findings-emnlp.167"},{"key":"e_1_3_2_1_34_1","volume-title":"Towards llm-based fact verification on news claims with a hierarchical step-by-step prompting method. arXiv preprint arXiv:2310.00305","author":"Zhang Xuan","year":"2023","unstructured":"Xuan Zhang and Wei Gao. 2023. Towards llm-based fact verification on news claims with a hierarchical step-by-step prompting method. arXiv preprint arXiv:2310.00305 (2023)."}],"event":{"name":"ICTIR '24: The 2024 ACM SIGIR International Conference on the Theory of Information Retrieval","location":"Washington DC USA","acronym":"ICTIR '24","sponsor":["SIGIR ACM Special Interest Group on Information Retrieval"]},"container-title":["Proceedings of the 2024 ACM SIGIR International Conference on Theory of Information Retrieval"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664190.3672511","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664190.3672511","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T23:58:21Z","timestamp":1755907101000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664190.3672511"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,8,2]]},"references-count":34,"alternative-id":["10.1145\/3664190.3672511","10.1145\/3664190"],"URL":"https:\/\/doi.org\/10.1145\/3664190.3672511","relation":{},"subject":[],"published":{"date-parts":[[2024,8,2]]},"assertion":[{"value":"2024-08-05","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}