{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,25]],"date-time":"2026-03-25T01:02:43Z","timestamp":1774400563969,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":42,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,7,18]]},"DOI":"10.1145\/3731120.3744605","type":"proceedings-article","created":{"date-parts":[[2025,7,18]],"date-time":"2025-07-18T13:34:06Z","timestamp":1752845646000},"page":"358-368","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":5,"title":["A Large-Scale Study of Relevance Assessments with Large Language Models Using UMBRELA"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0007-7071-2344","authenticated-orcid":false,"given":"Shivani","family":"Upadhyay","sequence":"first","affiliation":[{"name":"University of Waterloo, Waterloo, Canada"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6296-601X","authenticated-orcid":false,"given":"Ronak","family":"Pradeep","sequence":"additional","affiliation":[{"name":"University of Waterloo, Waterloo, Canada"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6107-2460","authenticated-orcid":false,"given":"Nandan","family":"Thakur","sequence":"additional","affiliation":[{"name":"University of Waterloo, Waterloo, Canada"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5138-8426","authenticated-orcid":false,"given":"Daniel","family":"Campos","sequence":"additional","affiliation":[{"name":"Snowflake, San Mateo, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9351-8137","authenticated-orcid":false,"given":"Nick","family":"Craswell","sequence":"additional","affiliation":[{"name":"Microsoft, Seattle, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2363-3014","authenticated-orcid":false,"given":"Ian","family":"Soboroff","sequence":"additional","affiliation":[{"name":"NIST, Gaithersburg, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0661-7189","authenticated-orcid":false,"given":"Jimmy","family":"Lin","sequence":"additional","affiliation":[{"name":"University of Waterloo, Waterloo, Canada"}]}],"member":"320","published-online":{"date-parts":[[2025,7,18]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Can We Use Large Language Models to Fill Relevance Judgment Holes? arXiv:2405.05600","author":"Abbasiantaeb Zahra","year":"2024","unstructured":"Zahra Abbasiantaeb, Chuan Meng, Leif Azzopardi, and Mohammad Aliannejadi. 2024. Can We Use Large Language Models to Fill Relevance Judgment Holes? arXiv:2405.05600 (2024)."},{"key":"e_1_3_2_1_2_1","first-page":"32","volume-title":"Proceedings of the Annual International ACM SIGIR Conference on Research and Development in Information Retrieval in the Asia Pacific Region (SIGIR-AP","author":"Alaofi Marwah","year":"2024","unstructured":"Marwah Alaofi, Paul Thomas, Falk Scholer, and Mark Sanderson. 2024. LLMs can be Fooled into Labelling a Document as Relevant. In Proceedings of the Annual International ACM SIGIR Conference on Research and Development in Information Retrieval in the Asia Pacific Region (SIGIR-AP 2024). Tokyo, Japan, 32-41."},{"key":"e_1_3_2_1_3_1","volume-title":"Clarke","author":"Arabzadeh Negar","year":"2024","unstructured":"Negar Arabzadeh and Charles L. A. Clarke. 2024. A Comparison of Methods for Evaluating Generative IR. arXiv:2404.04044 (2024)."},{"key":"e_1_3_2_1_4_1","unstructured":"Yuntao Bai Saurav Kadavath Sandipan Kundu Amanda Askell Jackson Kernion Andy Jones Anna Chen Anna Goldie Azalia Mirhoseini Cameron McKinnon Carol Chen Catherine Olsson Christopher Olah Danny Hernandez Dawn Drain Deep Ganguli Dustin Li Eli Tran-Johnson Ethan Perez Jamie Kerr Jared Mueller Jeffrey Ladish Joshua Landau Kamal Ndousse Kamile Lukosuite Liane Lovitt Michael Sellitto Nelson Elhage Nicholas Schiefer Noemi Mercado Nova DasSarma Robert Lasenby Robin Larson Sam Ringer Scott Johnston Shauna Kravec Sheer El Showk Stanislav Fort Tamera Lanham Timothy Telleen-Lawton Tom Conerly Tom Henighan Tristan Hume Samuel R. Bowman Zac Hatfield-Dodds Ben Mann Dario Amodei Nicholas Joseph Sam McCandlish Tom Brown and Jared Kaplan. 2022. Constitutional AI: Harmlessness from AI Feedback. arXiv:2212.08073 (2022)."},{"key":"e_1_3_2_1_5_1","volume-title":"MS MARCO: A Human Generated MAchine Reading COmprehension Dataset. arXiv:1611.09268","author":"Bajaj Payal","year":"2018","unstructured":"Payal Bajaj, Daniel Campos, Nick Craswell, Li Deng, Jianfeng Gao, Xiaodong Liu, Rangan Majumder, Andrew McNamara, Bhaskar Mitra, Tri Nguyen, Mir Rosenberg, Xia Song, Alina Stoica, Saurabh Tiwary, and Tong Wang. 2018. MS MARCO: A Human Generated MAchine Reading COmprehension Dataset. arXiv:1611.09268 (2018)."},{"key":"e_1_3_2_1_6_1","first-page":"2387","volume-title":"Proceedings of the 45th International ACM SIGIR Conference on Research and Development in Information Retrieval (SIGIR","author":"Bonifacio Luiz","year":"2022","unstructured":"Luiz Bonifacio, Hugo Abonizio, Marzieh Fadaee, and Rodrigo Nogueira. 2022. InPars: Data Augmentation for Information Retrieval using Large Language Models. In Proceedings of the 45th International ACM SIGIR Conference on Research and Development in Information Retrieval (SIGIR 2022). Madrid, Spain, 2387-2392."},{"key":"e_1_3_2_1_7_1","first-page":"33","volume-title":"Proceedings of the 23rd Annual International ACM SIGIR Conference on Research and Development in Information Retrieval (SIGIR","author":"Buckley Chris","year":"2000","unstructured":"Chris Buckley and Ellen M. Voorhees. 2000. Evaluating Evaluation Measure Stability. In Proceedings of the 23rd Annual International ACM SIGIR Conference on Research and Development in Information Retrieval (SIGIR 2000). Athens, Greece, 33-40."},{"key":"e_1_3_2_1_8_1","first-page":"25","volume-title":"Proceedings of the 27th Annual International ACM SIGIR Conference on Research and Development in Information Retrieval (SIGIR","author":"Buckley Chris","year":"2004","unstructured":"Chris Buckley and Ellen M. Voorhees. 2004. Retrieval Evaluation with Incomplete Information. In Proceedings of the 27th Annual International ACM SIGIR Conference on Research and Development in Information Retrieval (SIGIR 2004). Sheffield, United Kingdom, 25-32."},{"key":"e_1_3_2_1_9_1","volume-title":"Language Models Trained on Media Diets Can Predict Public Opinion. arXiv:2303.16779","author":"Chu Eric","year":"2023","unstructured":"Eric Chu, Jacob Andreas, Stephen Ansolabehere, and Deb Roy. 2023. Language Models Trained on Media Diets Can Predict Public Opinion. arXiv:2303.16779 (2023)."},{"key":"e_1_3_2_1_10_1","volume-title":"Clarke and Laura Dietz","author":"Charles L.","year":"2025","unstructured":"Charles L. A. Clarke and Laura Dietz. 2025. LLM-based Relevance Assessment Still Can't Replace Human Relevance Assessment. arXiv:2412.17156 (2025)."},{"key":"e_1_3_2_1_11_1","volume-title":"LM vs LM: Detecting Factual Errors via Cross Examination. arXiv:2305.13281","author":"Cohen Roi","year":"2023","unstructured":"Roi Cohen, May Hamri, Mor Geva, and Amir Globerson. 2023. LM vs LM: Detecting Factual Errors via Cross Examination. arXiv:2305.13281 (2023)."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1016\/S0306-4573(99)00048-5"},{"key":"e_1_3_2_1_13_1","volume-title":"Proceedings of the Thirty-First Text REtrieval Conference (TREC","author":"Craswell Nick","year":"2022","unstructured":"Nick Craswell, Bhaskar Mitra, Emine Yilmaz, Daniel Campos, Jimmy Lin, Ellen M. Voorhees, and Ian Soboroff. 2022. Overview of the TREC 2022 Deep Learning Track. In Proceedings of the Thirty-First Text REtrieval Conference (TREC 2022). Gaithersburg, Maryland."},{"key":"e_1_3_2_1_14_1","volume-title":"Proceedings of the Thirty-Second Text REtrieval Conference (TREC","author":"Craswell Nick","year":"2023","unstructured":"Nick Craswell, Bhaskar Mitra, Emine Yilmaz, Hossein A. Rahmani, Daniel Campos, Jimmy Lin, Ellen M. Voorhees, and Ian Soboroff. 2023. Overview of the TREC 2023 Deep Learning Track. In Proceedings of the Thirty-Second Text REtrieval Conference (TREC 2023). Gaithersburg, Maryland."},{"key":"e_1_3_2_1_15_1","volume-title":"Proceedings of the 11th International Conference on Learning Representations (ICLR","author":"Dai Zhuyun","year":"2023","unstructured":"Zhuyun Dai, Vincent Y. Zhao, Ji Ma, Yi Luan, Jianmo Ni, Jing Lu, Anton Bakalov, Kelvin Guu, Keith B. Hall, and Ming-Wei Chang. 2023. Promptagator: Few-shot Dense Retrieval From 8 Examples. In Proceedings of the 11th International Conference on Learning Representations (ICLR 2023)."},{"key":"e_1_3_2_1_16_1","volume-title":"Investigating Data Contamination in Modern Benchmarks for Large Language Models. arXiv:2311.09783","author":"Deng Chunyuan","year":"2024","unstructured":"Chunyuan Deng, Yilun Zhao, Xiangru Tang, Mark Gerstein, and Arman Cohan. 2024. Investigating Data Contamination in Modern Benchmarks for Large Language Models. arXiv:2311.09783 (2024)."},{"key":"e_1_3_2_1_17_1","volume-title":"Questioning the Survey Responses of Large Language Models. arXiv:2306.07951","author":"Dominguez-Olmedo Ricardo","year":"2024","unstructured":"Ricardo Dominguez-Olmedo, Moritz Hardt, and Celestine Mendler-D\u00fcnner. 2024. Questioning the Survey Responses of Large Language Models. arXiv:2306.07951 (2024)."},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1145\/3578337.3605136"},{"key":"e_1_3_2_1_19_1","volume-title":"GPTScore: Evaluate as You Desire. arXiv:2302.04166","author":"Fu Jinlan","year":"2023","unstructured":"Jinlan Fu, See-Kiong Ng, Zhengbao Jiang, and Pengfei Liu. 2023. GPTScore: Evaluate as You Desire. arXiv:2302.04166 (2023)."},{"key":"e_1_3_2_1_20_1","volume-title":"Human-like Summarization Evaluation with ChatGPT. arXiv:2304.02554","author":"Gao Mingqi","year":"2023","unstructured":"Mingqi Gao, Jie Ruan, Renliang Sun, Xunjian Yin, Shiping Yang, and Xiaojun Wan. 2023. Human-like Summarization Evaluation with ChatGPT. arXiv:2304.02554 (2023)."},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1073\/pnas.2305016120"},{"key":"e_1_3_2_1_22_1","volume-title":"Information Retrieval Evaluation","author":"Harman Donna","unstructured":"Donna Harman. 2011. Information Retrieval Evaluation. Morgan & Claypool Publishers."},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1145\/1076034.1076063"},{"key":"e_1_3_2_1_24_1","volume-title":"AI-Augmented Surveys: Leveraging Large Language Models and Surveys for Opinion Prediction. arXiv:2305.09620","author":"Kim Junsol","year":"2024","unstructured":"Junsol Kim and Byungkyu Lee. 2024. AI-Augmented Surveys: Leveraging Large Language Models and Surveys for Opinion Prediction. arXiv:2305.09620 (2024)."},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1016\/0020-0271(68)90029-6"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1145\/1076034.1076102"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1145\/3539618.3592032"},{"key":"e_1_3_2_1_28_1","volume-title":"Query Performance Prediction using Relevance Judgments Generated by Large Language Models. arXiv:2404.01012","author":"Meng Chuan","year":"2024","unstructured":"Chuan Meng, Negar Arabzadeh, Arian Askari, Mohammad Aliannejadi, and Maarten de Rijke. 2024. Query Performance Prediction using Relevance Judgments Generated by Large Language Models. arXiv:2404.01012 (2024)."},{"key":"e_1_3_2_1_29_1","first-page":"132","volume-title":"Proceedings of the 47th European Conference on Information Retrieval (ECIR","author":"Pradeep Ronak","year":"2025","unstructured":"Ronak Pradeep, Nandan Thakur, Sahel Sharifymoghaddam, Eric Zhang, Ryan Nguyen, Daniel Campos, Nick Craswell, and Jimmy Lin. 2025. Ragnar\u00f6k: A Reusable RAG Framework and Baselines for TREC 2024 Retrieval-Augmented Generation Track. In Proceedings of the 47th European Conference on Information Retrieval (ECIR 2025), Part I. Lucca, Italy, 132-148."},{"key":"e_1_3_2_1_30_1","volume-title":"LLMJudge: LLMs for Relevance Judgments. arXiv:2408.08896","author":"Rahmani Hossein A.","year":"2024","unstructured":"Hossein A. Rahmani, Emine Yilmaz, Nick Craswell, Bhaskar Mitra, Paul Thomas, Charles L. A. Clarke, Mohammad Aliannejadi, Clemencia Siro, and Guglielmo Faggioli. 2024. LLMJudge: LLMs for Relevance Judgments. arXiv:2408.08896 (2024)."},{"key":"e_1_3_2_1_31_1","volume-title":"Researchy Questions: A Dataset of Multi-Perspective, Decompositional Questions for LLM Web Agents. arXiv:2402.17896","author":"Rosset Corby","year":"2024","unstructured":"Corby Rosset, Ho-Lam Chung, Guanghui Qin, Ethan C. Chau, Zhuo Feng, Ahmed Awadallah, Jennifer Neville, and Nikhil Rao. 2024. Researchy Questions: A Dataset of Multi-Perspective, Decompositional Questions for LLM Web Agents. arXiv:2402.17896 (2024)."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1145\/1076034.1076064"},{"key":"e_1_3_2_1_33_1","volume-title":"Don't Use LLMs to Make Relevance Judgments. arXiv:2409.15133","author":"Soboroff Ian","year":"2024","unstructured":"Ian Soboroff. 2024. Don't Use LLMs to Make Relevance Judgments. arXiv:2409.15133 (2024)."},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1145\/3626772.3657707"},{"key":"e_1_3_2_1_35_1","volume-title":"LLMs Can Patch Up Missing Relevance Judgments in Evaluation. arXiv:2405.04727","author":"Upadhyay Shivani","year":"2024","unstructured":"Shivani Upadhyay, Ehsan Kamalloo, and Jimmy Lin. 2024a. LLMs Can Patch Up Missing Relevance Judgments in Evaluation. arXiv:2405.04727 (2024)."},{"key":"e_1_3_2_1_36_1","volume-title":"Hoa Trang Dang, and Jimmy Lin","author":"Upadhyay Shivani","year":"2024","unstructured":"Shivani Upadhyay, Ronak Pradeep, Nandan Thakur, Daniel Campos, Nick Craswell, Ian Soboroff, Hoa Trang Dang, and Jimmy Lin. 2024b. A Large-Scale Study of Relevance Assessments with Large Language Models: An Initial Look. arXiv:2411.08275 (2024)."},{"key":"e_1_3_2_1_37_1","volume-title":"UMBRELA: UMbrela is the (Open-Source Reproduction of the) Bing RELevance Assessor. arXiv:2406.06519","author":"Upadhyay Shivani","year":"2024","unstructured":"Shivani Upadhyay, Ronak Pradeep, Nandan Thakur, Nick Craswell, and Jimmy Lin. 2024c. UMBRELA: UMbrela is the (Open-Source Reproduction of the) Bing RELevance Assessor. arXiv:2406.06519 (2024)."},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1145\/2566486.2567989"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1145\/290941.291017"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.3115\/1073445.1073479"},{"key":"e_1_3_2_1_41_1","volume-title":"Judging LLM-as-a-Judge with MT-Bench and Chatbot Arena. arXiv:2306.05685","author":"Zheng Lianmin","year":"2023","unstructured":"Lianmin Zheng, Wei-Lin Chiang, Ying Sheng, Siyuan Zhuang, Zhanghao Wu, Yonghao Zhuang, Zi Lin, Zhuohan Li, Dacheng Li, Eric P. Xing, Hao Zhang, Joseph E. Gonzalez, and Ion Stoica. 2023. Judging LLM-as-a-Judge with MT-Bench and Chatbot Arena. arXiv:2306.05685 (2023)."},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1145\/290941.291014"}],"event":{"name":"ICTIR '25: International ACM SIGIR Conference on Innovative Concepts and Theories in Information Retrieval","location":"Padua Italy","acronym":"ICTIR '25","sponsor":["SIGIR ACM Special Interest Group on Information Retrieval"]},"container-title":["Proceedings of the 2025 International ACM SIGIR Conference on Innovative Concepts and Theories in Information Retrieval (ICTIR)"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3731120.3744605","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T13:18:32Z","timestamp":1755868712000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3731120.3744605"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,7,18]]},"references-count":42,"alternative-id":["10.1145\/3731120.3744605","10.1145\/3731120"],"URL":"https:\/\/doi.org\/10.1145\/3731120.3744605","relation":{},"subject":[],"published":{"date-parts":[[2025,7,18]]},"assertion":[{"value":"2025-07-18","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}