{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,25]],"date-time":"2026-03-25T01:02:41Z","timestamp":1774400561708,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":39,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,7,13]]},"DOI":"10.1145\/3726302.3730218","type":"proceedings-article","created":{"date-parts":[[2025,7,14]],"date-time":"2025-07-14T01:38:52Z","timestamp":1752457132000},"page":"2858-2863","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":3,"title":["Large Language Model Relevance Assessors Agree With One Another More Than With Human Assessors"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-1003-981X","authenticated-orcid":false,"given":"Maik","family":"Fr\u00f6be","sequence":"first","affiliation":[{"name":"Friedrich-Schiller-Universit\u00e4t Jena, Jena, Germany"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5446-8328","authenticated-orcid":false,"given":"Andrew","family":"Parry","sequence":"additional","affiliation":[{"name":"University of Glasgow, Glasgow, United Kingdom"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6032-909X","authenticated-orcid":false,"given":"Ferdinand","family":"Schlatt","sequence":"additional","affiliation":[{"name":"Friedrich-Schiller-Universit\u00e4t Jena, Jena, Germany"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8914-2659","authenticated-orcid":false,"given":"Sean","family":"MacAvaney","sequence":"additional","affiliation":[{"name":"University of Glasgow, Glasgow, UK, United Kingdom"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9033-2217","authenticated-orcid":false,"given":"Benno","family":"Stein","sequence":"additional","affiliation":[{"name":"Bauhaus Universit\u00e4t Weimar, Weimar, Germany"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2451-0665","authenticated-orcid":false,"given":"Martin","family":"Potthast","sequence":"additional","affiliation":[{"name":"University of Kassel, hessian.AI, and ScaDS.AI, Leipzig, Germany"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9733-2890","authenticated-orcid":false,"given":"Matthias","family":"Hagen","sequence":"additional","affiliation":[{"name":"Friedrich-Schiller-Universit\u00e4t Jena, Jena, Germany"}]}],"member":"320","published-online":{"date-parts":[[2025,7,13]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.2503.19092"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.3115\/1225403.1225421"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1016\/0306-4573(92)90031-T"},{"key":"e_1_3_2_1_4_1","volume-title":"Clarke and Laura Dietz","author":"Charles L.","year":"2024","unstructured":"Charles L. A. Clarke and Laura Dietz. 2024. LLM-based relevance assessment still can't replace human relevance assessment. arxiv:2412.17156 [cs.IR] https:\/\/arxiv.org\/abs\/2412.17156"},{"key":"e_1_3_2_1_5_1","volume-title":"Technical Report PB169574. Association","author":"Cleverdon C.","year":"1966","unstructured":"C. Cleverdon, J. Mills, and M. Keen. 1966. Factors Determining the Performance of Indexing Systems. Volume I. Design. Part 2. Appendices. Technical Report PB169574. Association of Special Libraries and Information Bureau, Cranfield (England). https:\/\/ntrl.ntis.gov\/NTRL\/dashboard\/searchResults\/titleDetail\/PB169574.xhtml Num Pages: 261."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.6028\/NIST.SP.1266.deep-overview"},{"key":"e_1_3_2_1_7_1","volume-title":"Overview of the TREC 2019 deep learning track. CoRR","volume":"2003","author":"Craswell Nick","year":"2020","unstructured":"Nick Craswell, Bhaskar Mitra, Emine Yilmaz, Daniel Campos, and Ellen M. Voorhees. 2020b. Overview of the TREC 2019 deep learning track. CoRR, Vol. abs\/2003.07820 (2020). arXiv:2003.07820"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1108\/eb026436"},{"key":"e_1_3_2_1_9_1","volume-title":"Proceedings of The First Workshop on Large Language Models for Evaluation in Information Retrieval (LLM4Eval 2024) co-located with 10th International Conference on Online Publishing (SIGIR","author":"de Jesus Gabriel","year":"2024","unstructured":"Gabriel de Jesus and S\u00e9rgio Sobral Nunes. 2024. Exploring Large Language Models for Relevance Judgments in Tetun. In Proceedings of The First Workshop on Large Language Models for Evaluation in Information Retrieval (LLM4Eval 2024) co-located with 10th International Conference on Online Publishing (SIGIR 2024), Washington D.C., USA, July 18, 2024 (CEUR Workshop Proceedings, Vol. 3752), Clemencia Siro, Mohammad Aliannejadi, Hossein A. Rahmani, Nick Craswell, Charles L. A. Clarke, Guglielmo Faggioli, Bhaskar Mitra, Paul Thomas, and Emine Yilmaz (Eds.). CEUR-WS.org, 19-30."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1145\/3576840.3578327"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1145\/3578337.3605136"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1145\/3624730"},{"key":"e_1_3_2_1_13_1","unstructured":"Naghmeh Farzi and Laura Dietz. 2024. EXAM: LLM-based Answerability Metrics for IR Evaluation. In Proceedings of The First Workshop on Large Language Models for Evaluation in Information Retrieval (LLM4Eval 2024) co-located with 10th International Conference on Online Publishing (SIGIR 2024) Washington D.C. USA July 18 2024 (CEUR Workshop Proceedings Vol. 3752) Clemencia Siro Mohammad Aliannejadi Hossein A. Rahmani Nick Craswell Charles L. A. Clarke Guglielmo Faggioli Bhaskar Mitra Paul Thomas and Emine Yilmaz (Eds.). CEUR-WS.org 31-50. https:\/\/ceur-ws.org\/Vol-3752\/paper3.pdf"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-88708-6_29"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1145\/3626772.3657849"},{"key":"e_1_3_2_1_16_1","volume-title":"Proceedings of The First Workshop on Large Language Models for Evaluation in Information Retrieval (LLM4Eval 2024) co-located with 10th International Conference on Online Publishing (SIGIR","author":"Huang Jia-Hong","year":"2024","unstructured":"Jia-Hong Huang, Hongyi Zhu, Yixian Shen, Stevan Rudinac, Alessio M. Pacces, and Evangelos Kanoulas. 2024. A Novel Evaluation Framework for Image2Text Generation. In Proceedings of The First Workshop on Large Language Models for Evaluation in Information Retrieval (LLM4Eval 2024) co-located with 10th International Conference on Online Publishing (SIGIR 2024), Washington D.C., USA, July 18, 2024 (CEUR Workshop Proceedings, Vol. 3752), Clemencia Siro, Mohammad Aliannejadi, Hossein A. Rahmani, Nick Craswell, Charles L. A. Clarke, Guglielmo Faggioli, Bhaskar Mitra, Paul Thomas, and Emine Yilmaz (Eds.). CEUR-WS.org, 51-65."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1016\/0020-0271(68)90029-6"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1145\/3539618.3592032"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1145\/3404835.3463254"},{"key":"e_1_3_2_1_20_1","unstructured":"Tri Nguyen Mir Rosenberg Xia Song Jianfeng Gao Saurabh Tiwary Rangan Majumder and Li Deng. 2016. MS MARCO: A Human Generated MAchine Reading COmprehension Dataset. In Proceedings of the Workshop on Cognitive Computation: Integrating neural and symbolic approaches 2016 co-located with the 30th Annual Conference on Neural Information Processing Systems (NIPS 2016) Barcelona Spain December 9 2016 (CEUR Workshop Proceedings Vol. 1773) Tarek Richard Besold Antoine Bordes Artur S. d'Avila Garcez and Greg Wayne (Eds.). CEUR-WS.org. https:\/\/ceur-ws.org\/Vol-1773\/CoCoNIPS_2016_paper9.pdf"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.ipm.2005.03.023"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1145\/3331184.3331399"},{"key":"e_1_3_2_1_23_1","volume-title":"Proceedings of The First Workshop on Large Language Models for Evaluation in Information Retrieval (LLM4Eval 2024) co-located with 10th International Conference on Online Publishing (SIGIR","author":"Rackauckas Zackary","year":"2024","unstructured":"Zackary Rackauckas, Arthur C\u00e2mara, and Jakub Zavrel. 2024. Evaluating RAG-Fusion with RAGElo: an Automated Elo-based Framework. In Proceedings of The First Workshop on Large Language Models for Evaluation in Information Retrieval (LLM4Eval 2024) co-located with 10th International Conference on Online Publishing (SIGIR 2024), Washington D.C., USA, July 18, 2024 (CEUR Workshop Proceedings, Vol. 3752), Clemencia Siro, Mohammad Aliannejadi, Hossein A. Rahmani, Nick Craswell, Charles L. A. Clarke, Guglielmo Faggioli, Bhaskar Mitra, Paul Thomas, and Emine Yilmaz (Eds.). CEUR-WS.org, 92-112."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1145\/3626772.3657942"},{"key":"e_1_3_2_1_25_1","volume-title":"Proceedings of The First Workshop on Large Language Models for Evaluation in Information Retrieval (LLM4Eval 2024) co-located with 10th International Conference on Online Publishing (SIGIR","author":"Rahmani Hossein A.","year":"2024","unstructured":"Hossein A. Rahmani, Emine Yilmaz, Nick Craswell, Bhaskar Mitra, Paul Thomas, Charles L. A. Clarke, Mohammad Aliannejadi, Clemencia Siro, and Guglielmo Faggioli. 2024b. LLMJudge: LLMs for Relevance Judgments. In Proceedings of The First Workshop on Large Language Models for Evaluation in Information Retrieval (LLM4Eval 2024) co-located with 10th International Conference on Online Publishing (SIGIR 2024), Washington D.C., USA, July 18, 2024 (CEUR Workshop Proceedings, Vol. 3752), , Clemencia Siro, Mohammad Aliannejadi, Hossein A. Rahmani, Nick Craswell, Charles L. A. Clarke, Guglielmo Faggioli, Bhaskar Mitra, Paul Thomas, and Emine Yilmaz (Eds.). CEUR-WS.org, 1-3."},{"key":"e_1_3_2_1_26_1","volume-title":"Schultz","author":"Rees Alan M.","year":"1967","unstructured":"Alan M. Rees and Douglas G. Schultz. 1967. A Field Experimental Approach to the Study of Relevance Assessments in Relation to Document Searching. Final Report to the National Science Foundation. Volume I. Technical Report. Clearinghouse for Federal Scientific and Technical Information, Springfield, Va."},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1016\/0306-4573(90)90050-C"},{"key":"e_1_3_2_1_28_1","unstructured":"Clemencia Siro Mohammad Aliannejadi Hossein A. Rahmani Nick Craswell Charles L. A. Clarke Guglielmo Faggioli Bhaskar Mitra Paul Thomas and Emine Yilmaz (Eds.). 2024. Proceedings of The First Workshop on Large Language Models for Evaluation in Information Retrieval (LLM4Eval 2024) co-located with 10th International Conference on Online Publishing (SIGIR 2024) Washington D.C. USA July 18 2024. CEUR Workshop Proceedings Vol. 3752. CEUR-WS.org."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.2409.15133"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1145\/383952.383961"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1002\/asi.5090160204"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1145\/3626772.3657707"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.2411.08275"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.2406.06519"},{"key":"e_1_3_2_1_35_1","volume-title":"Variations in Relevance Judgments and the Measurement of Retrieval Effectiveness. 36 No. 5 (2000-01-01","author":"Voorhees Ellen","year":"2000","unstructured":"Ellen Voorhees. 2000. Variations in Relevance Judgments and the Measurement of Retrieval Effectiveness. 36 No. 5 (2000-01-01 2000)."},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1145\/290941.291017"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-22948-1_2"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1145\/584792.584908"},{"key":"e_1_3_2_1_39_1","volume-title":"Proceedings of The First Workshop on Large Language Models for Evaluation in Information Retrieval (LLM4Eval 2024) co-located with 10th International Conference on Online Publishing (SIGIR","author":"Yang Jheng-Hong","year":"2024","unstructured":"Jheng-Hong Yang and Jimmy Lin. 2024. Toward Automatic Relevance Judgment using Vision-Language Models for Image-Text Retrieval Evaluation. In Proceedings of The First Workshop on Large Language Models for Evaluation in Information Retrieval (LLM4Eval 2024) co-located with 10th International Conference on Online Publishing (SIGIR 2024), Washington D.C., USA, July 18, 2024 (CEUR Workshop Proceedings, Vol. 3752), , Clemencia Siro, Mohammad Aliannejadi, Hossein A. Rahmani, Nick Craswell, Charles L. A. Clarke, Guglielmo Faggioli, Bhaskar Mitra, Paul Thomas, and Emine Yilmaz (Eds.). CEUR-WS.org, 113-123."}],"event":{"name":"SIGIR '25: The 48th International ACM SIGIR Conference on Research and Development in Information Retrieval","location":"Padua Italy","acronym":"SIGIR '25","sponsor":["SIGIR ACM Special Interest Group on Information Retrieval"]},"container-title":["Proceedings of the 48th International ACM SIGIR Conference on Research and Development in Information Retrieval"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3726302.3730218","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T12:08:31Z","timestamp":1755864511000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3726302.3730218"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,7,13]]},"references-count":39,"alternative-id":["10.1145\/3726302.3730218","10.1145\/3726302"],"URL":"https:\/\/doi.org\/10.1145\/3726302.3730218","relation":{},"subject":[],"published":{"date-parts":[[2025,7,13]]},"assertion":[{"value":"2025-07-13","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}