{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,12]],"date-time":"2025-12-12T02:07:34Z","timestamp":1765505254114,"version":"3.48.0"},"publisher-location":"New York, NY, USA","reference-count":21,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,11,10]]},"DOI":"10.1145\/3746252.3760934","type":"proceedings-article","created":{"date-parts":[[2025,11,8]],"date-time":"2025-11-08T00:36:36Z","timestamp":1762562196000},"page":"5186-5190","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Open-Source LLM-based Relevance Assessment vs. Highly Reliable Manual Relevance Assessment: A Case Study"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-6720-963X","authenticated-orcid":false,"given":"Tetsuya","family":"Sakai","sequence":"first","affiliation":[{"name":"Waseda University\/Naver Corporation, Tokyo, Japan"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-2301-9537","authenticated-orcid":false,"given":"Khant","family":"Myoe Rain","sequence":"additional","affiliation":[{"name":"Waseda University, Tokyo, Japan"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-6336-8064","authenticated-orcid":false,"given":"Rikiya","family":"Takehi","sequence":"additional","affiliation":[{"name":"Waseda University, Tokyo, Japan"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6751-5303","authenticated-orcid":false,"given":"Sijie","family":"Tao","sequence":"additional","affiliation":[{"name":"Waseda University, Tokyo, Japan"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0669-005X","authenticated-orcid":false,"given":"Young-In","family":"Song","sequence":"additional","affiliation":[{"name":"Naver Corporation, Seoul, Republic of Korea"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,11,10]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1145\/3673791.3698431"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1145\/2094072.2094076"},{"key":"e_1_3_2_1_3_1","volume-title":"Proceedings of EVIA","author":"Charles","year":"2025","unstructured":"Charles L.A. Clarke and Laura Dietz. 2025. LLM-based relevance assessment still can't replace human relevance assessment. In Proceedings of EVIA 2025. to appear."},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1145\/3578337.3605136"},{"key":"e_1_3_2_1_5_1","volume-title":"The TREC Test Collections. TREC","author":"Harman Donna","year":"2005","unstructured":"Donna Harman. 2005. The TREC Test Collections. TREC Chapter to be published in: TREC: Experiment and Evaluation in Information Retrieval, 2005,."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"crossref","unstructured":"David Otero Javier Parapar and \u00c1lvaro Barreiro. 2025. Limitations of Automatic Relevance Assessments with Large Language Models for Fair and Reliable Retrieval Evaluation. arXiv:2411.13212 [cs.IR] https:\/\/arxiv.org\/abs\/2411.13212","DOI":"10.1145\/3726302.3730221"},{"key":"e_1_3_2_1_7_1","unstructured":"Arjun Panickssery Samuel R. Bowman and Shi Feng. 2024. LLM Evaluators Recognize and Favor Their Own Generations. arXiv:2404.13076 [cs.CL] https:\/\/arxiv.org\/abs\/2404.13076"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1145\/1148170.1148261"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1145\/1277741.1277756"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-54798-0_6"},{"volume-title":"Effect Sizes, and Statistical Power","author":"Sakai Tetsuya","key":"e_1_3_2_1_11_1","unstructured":"Tetsuya Sakai. 2018. Laboratory Experiments in Information Retrieval: Sample Sizes, Effect Sizes, and Statistical Power. Springer."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-22948-1_3"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1145\/3726302.3730345"},{"key":"e_1_3_2_1_14_1","unstructured":"Tetsuya Sakai Sijie Tao Maria Maistro Zhumin Chu Yujing Li Nuo Chen Nicola Ferro Junjie Wang Ian Soboroff and Yiqun Liu. 2022. Corrected Evaluation Results of the NTCIR WWW-2 WWW-3 and WWW-4 English Subtasks. arXiv:2210.10266 [cs.IR] https:\/\/arxiv.org\/abs\/2210.10266"},{"key":"e_1_3_2_1_15_1","first-page":"219","volume-title":"Proceedings of NTCIR-15","author":"Sakai Tetsuya","year":"2020","unstructured":"Tetsuya Sakai, Sijie Tao, Zhaohao Zeng, Yukun Zheng, Jiaxin Mao, Zhumin Chu, Yiqun Liu, Maria Maistro, Zhicheng Dou, Nicola Ferro, and Ian Soboroff. 2020. Overview of the NTCIR-15 We Want Web with CENTRE (WWW-3) Task. In Proceedings of NTCIR-15. 219-234."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.54195\/irrj.19625"},{"key":"e_1_3_2_1_17_1","volume-title":"Proceedings of ACM SIGIR","author":"Takehi Rikiya","year":"2025","unstructured":"Rikiya Takehi, Ellen M. Voorhees, Tetsuya Sakai, and Ian Soboroff. 2025. Building Large Test Collections with LLM Assistance. In Proceedings of ACM SIGIR 2025. to appear."},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1145\/3626772.3657707"},{"key":"e_1_3_2_1_19_1","volume-title":"Hoa Trang Dang, and Jimmy Lin","author":"Upadhyay Shivani","year":"2024","unstructured":"Shivani Upadhyay, Ronak Pradeep, Nandan Thakur, Daniel Campos, Nick Craswell, Ian Soboroff, Hoa Trang Dang, and Jimmy Lin. 2024a. A Large-Scale Study of Relevance Assessments with Large Language Models: An Initial Look. arXiv:2411.08275 [cs.IR] https:\/\/arxiv.org\/abs\/2411.08275"},{"key":"e_1_3_2_1_20_1","volume-title":"UMBRELA: UMbrela is the (Open-Source Reproduction of the) Bing RELevance Assessor. arXiv:2406.06519 [cs.IR] https:\/\/arxiv.org\/abs\/2406.06519","author":"Upadhyay Shivani","year":"2024","unstructured":"Shivani Upadhyay, Ronak Pradeep, Nandan Thakur, Nick Craswell, and Jimmy Lin. 2024b. UMBRELA: UMbrela is the (Open-Source Reproduction of the) Bing RELevance Assessor. arXiv:2406.06519 [cs.IR] https:\/\/arxiv.org\/abs\/2406.06519"},{"volume-title":"Testing Statistical Hypotheses of Equivalence and Noninferiority","author":"Wellek Stefan","key":"e_1_3_2_1_21_1","unstructured":"Stefan Wellek. 2010. Testing Statistical Hypotheses of Equivalence and Noninferiority (Second Edition). CRC Press."}],"event":{"name":"CIKM '25: The 34th ACM International Conference on Information and Knowledge Management","sponsor":["SIGIR ACM Special Interest Group on Information Retrieval","SIGWEB ACM Special Interest Group on Hypertext, Hypermedia, and Web"],"location":"Seoul Republic of Korea","acronym":"CIKM '25"},"container-title":["Proceedings of the 34th ACM International Conference on Information and Knowledge Management"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746252.3760934","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,12]],"date-time":"2025-12-12T02:05:10Z","timestamp":1765505110000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746252.3760934"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,11,10]]},"references-count":21,"alternative-id":["10.1145\/3746252.3760934","10.1145\/3746252"],"URL":"https:\/\/doi.org\/10.1145\/3746252.3760934","relation":{},"subject":[],"published":{"date-parts":[[2025,11,10]]},"assertion":[{"value":"2025-11-10","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}