{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,25]],"date-time":"2026-03-25T00:53:12Z","timestamp":1774399992957,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":53,"publisher":"ACM","funder":[{"name":"Natural Sciences and Engineering Research Council (NSERC) of Canada"},{"name":"Snowflake"},{"name":"Microsoft"},{"DOI":"10.13039\/501100006374","name":"Korean Government","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100006374","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,7,13]]},"DOI":"10.1145\/3726302.3730090","type":"proceedings-article","created":{"date-parts":[[2025,7,14]],"date-time":"2025-07-14T01:18:36Z","timestamp":1752455916000},"page":"180-190","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":7,"title":["The Great Nugget Recall: Automating Fact Extraction and RAG Evaluation with Large Language Models"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-6296-601X","authenticated-orcid":false,"given":"Ronak","family":"Pradeep","sequence":"first","affiliation":[{"name":"University of Waterloo, Waterloo, Canada"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6107-2460","authenticated-orcid":false,"given":"Nandan","family":"Thakur","sequence":"additional","affiliation":[{"name":"University of Waterloo, Waterloo, Canada"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-7071-2344","authenticated-orcid":false,"given":"Shivani","family":"Upadhyay","sequence":"additional","affiliation":[{"name":"University of Waterloo, Waterloo, Canada"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5138-8426","authenticated-orcid":false,"given":"Daniel","family":"Campos","sequence":"additional","affiliation":[{"name":"Snowflake, San Mateo, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9351-8137","authenticated-orcid":false,"given":"Nick","family":"Craswell","sequence":"additional","affiliation":[{"name":"Microsoft, Seattle, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2363-3014","authenticated-orcid":false,"given":"Ian","family":"Soboroff","sequence":"additional","affiliation":[{"name":"NIST, Gaithersburg, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-3009-3465","authenticated-orcid":false,"given":"Hoa Trang","family":"Dang","sequence":"additional","affiliation":[{"name":"NIST, Gaithersburg, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0661-7189","authenticated-orcid":false,"given":"Jimmy","family":"Lin","sequence":"additional","affiliation":[{"name":"University of Waterloo, Waterloo, Canada"}]}],"member":"320","published-online":{"date-parts":[[2025,7,13]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Charles LA Clarke, and Mark Sanderson","author":"Alaofi Marwah","year":"2024","unstructured":"Marwah Alaofi, Negar Arabzadeh, Charles LA Clarke, and Mark Sanderson. 2024. Generative Information Retrieval Evaluation. In Information Access in the Era of Generative AI. Springer, 135-159."},{"key":"e_1_3_2_1_2_1","volume-title":"A Comparison of Methods for Evaluating Generative IR. arXiv:2404.04044","author":"Arabzadeh Negar","year":"2024","unstructured":"Negar Arabzadeh and Charles LA Clarke. 2024. A Comparison of Methods for Evaluating Generative IR. arXiv:2404.04044 (2024)."},{"key":"e_1_3_2_1_3_1","first-page":"2206","volume-title":"Proceedings of the 39th International Conference on Machine Learning (ICML","author":"Borgeaud Sebastian","year":"2022","unstructured":"Sebastian Borgeaud, Arthur Mensch, Jordan Hoffmann, Trevor Cai, Eliza Rutherford, et al., 2022. Improving Language Models by Retrieving from Trillions of Tokens. In Proceedings of the 39th International Conference on Machine Learning (ICML 2022). Baltimore, Maryland, 2206-2240."},{"key":"e_1_3_2_1_4_1","first-page":"33","volume-title":"Proceedings of the 23rd Annual International ACM SIGIR Conference on Research and Development in Information Retrieval (SIGIR","author":"Buckley Chris","year":"2000","unstructured":"Chris Buckley and Ellen M. Voorhees. 2000. Evaluating Evaluation Measure Stability. In Proceedings of the 23rd Annual International ACM SIGIR Conference on Research and Development in Information Retrieval (SIGIR 2000). Athens, Greece, 33-40."},{"key":"e_1_3_2_1_5_1","first-page":"25","volume-title":"Proceedings of the 27th Annual International ACM SIGIR Conference on Research and Development in Information Retrieval (SIGIR","author":"Buckley Chris","year":"2004","unstructured":"Chris Buckley and Ellen M. Voorhees. 2004. Retrieval Evaluation with Incomplete Information. In Proceedings of the 27th Annual International ACM SIGIR Conference on Research and Development in Information Retrieval (SIGIR 2004). Sheffield, United Kingdom, 25-32."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i16.29728"},{"key":"e_1_3_2_1_7_1","first-page":"768","volume-title":"Proceedings of the 45th Annual Meeting of the Association for Computational Linguistics (ACL","author":"Dang Hoa Trang","year":"2007","unstructured":"Hoa Trang Dang and Jimmy Lin. 2007. Different Structures for Evaluating Answers to Complex Questions: Pyramids Won't Topple, and Neither Will Human Assessors. In Proceedings of the 45th Annual Meeting of the Association for Computational Linguistics (ACL 2007). Prague, Czech Republic, 768-775."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.eacl-demo.16"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664190.3672511"},{"key":"e_1_3_2_1_10_1","volume-title":"CypherBench: Towards Precise Retrieval over Full-scale Modern Knowledge Graphs in the LLM Era. arXiv:2412.18702","author":"Feng Yanlin","year":"2024","unstructured":"Yanlin Feng, Simone Papicchio, and Sajjadur Rahman. 2024. CypherBench: Towards Precise Retrieval over Full-scale Modern Knowledge Graphs in the LLM Era. arXiv:2412.18702 (2024)."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-main.398"},{"key":"e_1_3_2_1_12_1","first-page":"3929","volume-title":"Proceedings of the 37th International Conference on Machine Learning (ICML","author":"Guu Kelvin","year":"2020","unstructured":"Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat, and Ming-Wei Chang. 2020. REALM: Retrieval-Augmented Language Model Pre-training. In Proceedings of the 37th International Conference on Machine Learning (ICML 2020). 3929-3938."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.emnlp-main.249"},{"key":"e_1_3_2_1_14_1","volume-title":"Information Retrieval Evaluation","author":"Harman Donna","unstructured":"Donna Harman. 2011. Information Retrieval Evaluation. Morgan & Claypool Publishers."},{"key":"e_1_3_2_1_15_1","volume-title":"MRAG-Bench: Vision-Centric Evaluation for Retrieval-Augmented Multimodal Models. arXiv:2410.08182","author":"Hu Wenbo","year":"2024","unstructured":"Wenbo Hu, Jia-Chen Gu, Zi-Yi Dou, Mohsen Fayyaz, Pan Lu, Kai-Wei Chang, and Nanyun Peng. 2024. MRAG-Bench: Vision-Centric Evaluation for Retrieval-Augmented Multimodal Models. arXiv:2410.08182 (2024)."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.eacl-main.74"},{"key":"e_1_3_2_1_17_1","volume-title":"Proceedings of the NeurIPS 2024 Third Table Representation Learning Workshop.","author":"Ji Xingyu","year":"2024","unstructured":"Xingyu Ji, Aditya Parameswaran, and Madelon Hulsebos. 2024. TARGET: Benchmarking Table Retrieval for Generative Tasks. In Proceedings of the NeurIPS 2024 Third Table Representation Learning Workshop."},{"key":"e_1_3_2_1_18_1","volume-title":"Proceedings of the 8th International Conference on Learning Representations (ICLR","author":"Khandelwal Urvashi","year":"2020","unstructured":"Urvashi Khandelwal, Omer Levy, Dan Jurafsky, Luke Zettlemoyer, and Mike Lewis. 2020. Generalization through Memorization: Nearest Neighbor Language Models. In Proceedings of the 8th International Conference on Learning Representations (ICLR 2020). Addis Ababa, Ethiopia."},{"key":"e_1_3_2_1_19_1","volume-title":"A Unified Evaluation of Retrieval-Augmented Generation. arXiv:2409.12941","author":"Krishna Satyapriya","year":"2024","unstructured":"Satyapriya Krishna, Kalpesh Krishna, Anhad Mohananey, Steven Schwarcz, Adam Stambler, Shyam Upadhyay, and Manaal Faruqui. 2024. Fact, Fetch, and Reason: A Unified Evaluation of Retrieval-Augmented Generation. arXiv:2409.12941 (2024)."},{"key":"e_1_3_2_1_20_1","first-page":"9459","article-title":"Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks","volume":"33","author":"Lewis Patrick","year":"2020","unstructured":"Patrick Lewis, Ethan Perez, Aleksandra Piktus, Fabio Petroni, Vladimir Karpukhin, Naman Goyal, Heinrich K\u00fcttler, Mike Lewis, Wen-tau Yih, Tim Rockt\u00e4schel, Sebastian Riedel, and Douwe Kiela. 2020. Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks. In Advances in Neural Information Processing Systems 33. 9459-9474.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.3115\/1220575.1220692"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1007\/s10791-006-9003-7"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.3115\/1220835.1220884"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1145\/1277741.1277799"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1145\/3626772.3657846"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-main.741"},{"key":"e_1_3_2_1_27_1","first-page":"145","volume-title":"Proceedings of the Human Language Technology Conference of the North American Chapter of the Association for Computational Linguistics: HLT-NAACL","author":"Nenkova Ani","year":"2004","unstructured":"Ani Nenkova and Rebecca Passonneau. 2004. Evaluating Content Selection in Summarization: The Pyramid Method. In Proceedings of the Human Language Technology Conference of the North American Chapter of the Association for Computational Linguistics: HLT-NAACL 2004. Boston, Massachusetts, 145-152."},{"key":"e_1_3_2_1_28_1","volume-title":"Hailin Chen, Yifei Ming, Zixuan Ke, Silvio Savarese, Caiming Xong, and Shafiq Joty.","author":"Nguyen Xuan-Phi","year":"2024","unstructured":"Xuan-Phi Nguyen, Shrey Pandit, Senthil Purushwalkam, Austin Xu, Hailin Chen, Yifei Ming, Zixuan Ke, Silvio Savarese, Caiming Xong, and Shafiq Joty. 2024. SFR-RAG: Towards Contextually Faithful LLMs. arXiv:2409.09916 (2024)."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1145\/2124295.2124343"},{"key":"e_1_3_2_1_30_1","first-page":"1176","volume-title":"Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing: Industry Track","author":"Pradeep Ronak","year":"2024","unstructured":"Ronak Pradeep, Daniel Lee, Ali Mousavi, Jeffrey Pound, Yisi Sang, Jimmy Lin, Ihab Ilyas, Saloni Potdar, Mostafa Arefiyan, and Yunyao Li. 2024a. ConvKGYarn: Spinning Configurable and Scalable Conversational Knowledge Graph QA Datasets with Large Language Models. In Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing: Industry Track. Miami, Florida, 1176-1206."},{"key":"e_1_3_2_1_31_1","first-page":"132","volume-title":"Proceedings of the 47th European Conference on Information Retrieval (ECIR","author":"Pradeep Ronak","year":"2025","unstructured":"Ronak Pradeep, Nandan Thakur, Sahel Sharifymoghaddam, Eric Zhang, Ryan Nguyen, Daniel Campos, Nick Craswell, and Jimmy Lin. 2025. Ragnar\u00f6k: A Reusable RAG Framework and Baselines for TREC 2024 Retrieval-Augmented Generation Track. In Proceedings of the 47th European Conference on Information Retrieval (ECIR 2025), Part I. Lucca, Italy, 132-148."},{"key":"e_1_3_2_1_32_1","volume-title":"Initial Nugget Evaluation Results for the TREC 2024 RAG Track with the AutoNuggetizer Framework. arXiv:2411","author":"Pradeep Ronak","year":"2024","unstructured":"Ronak Pradeep, Nandan Thakur, Shivani Upadhyay, Daniel Campos, Nick Craswell, and Jimmy Lin. 2024b. Initial Nugget Evaluation Results for the TREC 2024 RAG Track with the AutoNuggetizer Framework. arXiv:2411.09607 (2024)."},{"key":"e_1_3_2_1_33_1","volume-title":"Evaluating RAG-Fusion with RAGElo: an Automated Elo-based Framework. arXiv:2406.14783","author":"Rackauckas Zackary","year":"2024","unstructured":"Zackary Rackauckas, Arthur C\u00e2mara, and Jakub Zavrel. 2024. Evaluating RAG-Fusion with RAGElo: an Automated Elo-based Framework. arXiv:2406.14783 (2024)."},{"key":"e_1_3_2_1_34_1","volume-title":"Researchy Questions: A Dataset of Multi-Perspective, Decompositional Questions for LLM Web Agents. arXiv:2402.17896","author":"Rosset Corby","year":"2024","unstructured":"Corby Rosset, Ho-Lam Chung, Guanghui Qin, Ethan C. Chau, Zhuo Feng, Ahmed Awadallah, Jennifer Neville, and Nikhil Rao. 2024. Researchy Questions: A Dataset of Multi-Perspective, Decompositional Questions for LLM Web Agents. arXiv:2402.17896 (2024)."},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.naacl-long.20"},{"key":"e_1_3_2_1_36_1","volume-title":"Evaluating Retrieval Quality in Retrieval-Augmented Generation. arXiv:2404.13781","author":"Salemi Alireza","year":"2024","unstructured":"Alireza Salemi and Hamed Zamani. 2024. Evaluating Retrieval Quality in Retrieval-Augmented Generation. arXiv:2404.13781 (2024)."},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1145\/1076034.1076064"},{"key":"e_1_3_2_1_38_1","volume-title":"Chatbot Arena Meets Nuggets: Towards Explanations and Diagnostics in the Evaluation of LLM Responses. arXiv:2504.20006","author":"Sharifymoghaddam Sahel","year":"2025","unstructured":"Sahel Sharifymoghaddam, Shivani Upadhyay, Nandan Thakur, Ronak Pradeep, and Jimmy Lin. 2025. Chatbot Arena Meets Nuggets: Towards Explanations and Diagnostics in the Evaluation of LLM Responses. arXiv:2504.20006 (2025)."},{"key":"e_1_3_2_1_39_1","volume-title":"RAGProbe: An Automated Approach for Evaluating RAG Applications. arXiv:2409.19019","author":"Sivasothy Shangeetha","year":"2024","unstructured":"Shangeetha Sivasothy, Scott Barnett, Stefanus Kurniawan, Zafaryab Rasool, and Rajesh Vasa. 2024. RAGProbe: An Automated Approach for Evaluating RAG Applications. arXiv:2409.19019 (2024)."},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.findings-emnlp.730"},{"key":"e_1_3_2_1_41_1","volume-title":"2025 a. MIRAGE-Bench: Automatic Multilingual Benchmark Arena for Retrieval-Augmented Generation Systems. arXiv:2410.13716","author":"Thakur Nandan","year":"2025","unstructured":"Nandan Thakur, Suleman Kazi, Ge Luo, Jimmy Lin, and Amin Ahmad. 2025 a. MIRAGE-Bench: Automatic Multilingual Benchmark Arena for Retrieval-Augmented Generation Systems. arXiv:2410.13716 (2025)."},{"key":"e_1_3_2_1_42_1","volume-title":"2025 b. FreshStack: Building Realistic Benchmarks for Evaluating Retrieval on Technical Documents. arXiv:2504.13128","author":"Thakur Nandan","year":"2025","unstructured":"Nandan Thakur, Jimmy Lin, Sam Havens, Michael Carbin, Omar Khattab, and Andrew Drozdov. 2025 b. FreshStack: Building Realistic Benchmarks for Evaluating Retrieval on Technical Documents. arXiv:2504.13128 (2025)."},{"key":"e_1_3_2_1_43_1","volume-title":"Hoa Trang Dang, and Jimmy Lin","author":"Upadhyay Shivani","year":"2024","unstructured":"Shivani Upadhyay, Ronak Pradeep, Nandan Thakur, Daniel Campos, Nick Craswell, Ian Soboroff, Hoa Trang Dang, and Jimmy Lin. 2024a. A Large-Scale Study of Relevance Assessments with Large Language Models: An Initial Look. arXiv:2411.08275 (2024)."},{"key":"e_1_3_2_1_44_1","volume-title":"UMBRELA: UMbrela is the (Open-Source Reproduction of the) Bing RELevance Assessor. arXiv:2406.06519","author":"Upadhyay Shivani","year":"2024","unstructured":"Shivani Upadhyay, Ronak Pradeep, Nandan Thakur, Nick Craswell, and Jimmy Lin. 2024b. UMBRELA: UMbrela is the (Open-Source Reproduction of the) Bing RELevance Assessor. arXiv:2406.06519 (2024)."},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1145\/290941.291017"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.3115\/1073483.1073520"},{"key":"e_1_3_2_1_47_1","volume-title":"Proceedings of the Twelfth Text REtrieval Conference (TREC","author":"Voorhees Ellen M.","year":"2003","unstructured":"Ellen M. Voorhees. 2003b. Overview of the TREC 2003 Question Answering Track. In Proceedings of the Twelfth Text REtrieval Conference (TREC 2003). Gaithersburg, Maryland."},{"key":"e_1_3_2_1_48_1","volume-title":"Roman Kyslyi, and Nicholas Kersting.","author":"Wang Yang","year":"2024","unstructured":"Yang Wang, Alberto Garcia Hernandez, Roman Kyslyi, and Nicholas Kersting. 2024. Evaluating Quality of Answers for Retrieval-Augmented Generation: A Strong LLM Is All You Need. arXiv:2406.18064 (2024)."},{"key":"e_1_3_2_1_49_1","volume-title":"Frank F. Xu","author":"Wang Zora Zhiruo","year":"2025","unstructured":"Zora Zhiruo Wang, Akari Asai, Xinyan Velocity Yu, Frank F. Xu, Yiqing Xie, Graham Neubig, and Daniel Fried. 2025. CodeRAG-Bench: Can Retrieval Augment Code Generation? arXiv:2406.14497 (2025)."},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.acl-long.181"},{"key":"e_1_3_2_1_51_1","unstructured":"Xiao Yang Kai Sun Hao Xin Yushi Sun Nikita Bhalla Xiangsen Chen Sajal Choudhary Rongze Daniel Gui Ziran Will Jiang Ziyu Jiang Lingkun Kong Brian Moran Jiaqi Wang Yifan Ethan Xu An Yan Chenyu Yang Eting Yuan Hanwen Zha Nan Tang Lei Chen Nicolas Scheffer Yue Liu Nirav Shah Rakesh Wanga Anuj Kumar Wen tau Yih and Xin Luna Dong. 2024. CRAG - Comprehensive RAG Benchmark. arXiv:2406.04744 (2024)."},{"key":"e_1_3_2_1_52_1","first-page":"46595","article-title":"Judging LLM-as-a-Judge with MT-Bench and Chatbot Arena. In Advances in Neural Information Processing Systems 36 (NeurIPS 2023) Datasets and Benchmarks Track. New Orleans","author":"Zheng Lianmin","year":"2023","unstructured":"Lianmin Zheng, Wei-Lin Chiang, Ying Sheng, Siyuan Zhuang, Zhanghao Wu, Yonghao Zhuang, Zi Lin, Zhuohan Li, Dacheng Li, Eric P. Xing, Hao Zhang, Joseph E. Gonzalez, and Ion Stoica. 2023. Judging LLM-as-a-Judge with MT-Bench and Chatbot Arena. In Advances in Neural Information Processing Systems 36 (NeurIPS 2023) Datasets and Benchmarks Track. New Orleans, Louisiana, 46595-46623.","journal-title":"Louisiana"},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1145\/290941.291014"}],"event":{"name":"SIGIR '25: The 48th International ACM SIGIR Conference on Research and Development in Information Retrieval","location":"Padua Italy","acronym":"SIGIR '25","sponsor":["SIGIR ACM Special Interest Group on Information Retrieval"]},"container-title":["Proceedings of the 48th International ACM SIGIR Conference on Research and Development in Information Retrieval"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3726302.3730090","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T18:31:32Z","timestamp":1755887492000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3726302.3730090"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,7,13]]},"references-count":53,"alternative-id":["10.1145\/3726302.3730090","10.1145\/3726302"],"URL":"https:\/\/doi.org\/10.1145\/3726302.3730090","relation":{},"subject":[],"published":{"date-parts":[[2025,7,13]]},"assertion":[{"value":"2025-07-13","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}