{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,14]],"date-time":"2026-04-14T15:29:00Z","timestamp":1776180540054,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":23,"publisher":"ACM","funder":[{"DOI":"10.13039\/501100000266","name":"Engineering and Physical Sciences Research Council","doi-asserted-by":"publisher","award":["EP\/S021566\/1"],"award-info":[{"award-number":["EP\/S021566\/1"]}],"id":[{"id":"10.13039\/501100000266","id-type":"DOI","asserted-by":"publisher"}]},{"name":"The Alan Turing Institute\u2019s Enrichment scheme"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,11,10]]},"DOI":"10.1145\/3746252.3760908","type":"proceedings-article","created":{"date-parts":[[2025,11,8]],"date-time":"2025-11-08T00:36:36Z","timestamp":1762562196000},"page":"5166-5170","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":3,"title":["Towards Understanding Bias in Synthetic Data for Evaluation"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-2779-4942","authenticated-orcid":false,"given":"Hossein A.","family":"Rahmani","sequence":"first","affiliation":[{"name":"University College London, London, United Kingdom and The Alan Turing Institute, London, United Kingdom"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0330-3184","authenticated-orcid":false,"given":"Varsha","family":"Ramineni","sequence":"additional","affiliation":[{"name":"University College London, London, United Kingdom"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4734-4532","authenticated-orcid":false,"given":"Emine","family":"Yilmaz","sequence":"additional","affiliation":[{"name":"University College London, London, United Kingdom"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9351-8137","authenticated-orcid":false,"given":"Nick","family":"Craswell","sequence":"additional","affiliation":[{"name":"Microsoft, Seattle, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5270-5550","authenticated-orcid":false,"given":"Bhaskar","family":"Mitra","sequence":"additional","affiliation":[{"name":"Research, Microsoft, Montreal, Canada"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,11,10]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-main.623"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1145\/1148170.1148263"},{"key":"e_1_3_2_1_3_1","volume-title":"Towards Understanding the Interplay of LLMs in Information Retrieval Evaluation. arXiv preprint arXiv:2503.19092","author":"Balog Krisztian","year":"2025","unstructured":"Krisztian Balog, Donald Metzler, and Zhen Qin. 2025. Rankers, Judges, and Assistants: Towards Understanding the Interplay of LLMs in Information Retrieval Evaluation. arXiv preprint arXiv:2503.19092 (2025)."},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1145\/3477495.3531863"},{"key":"e_1_3_2_1_5_1","volume-title":"Aslib proceedings","author":"Cleverdon Cyril","unstructured":"Cyril Cleverdon. 1967. The Cranfield tests on index language devices. In Aslib proceedings, Vol. 19. MCB UP Ltd, 173-194."},{"key":"e_1_3_2_1_6_1","volume-title":"Overview of the TREC 2023 Deep Learning Track. In Text REtrieval Conference (TREC). NIST, TREC.","author":"Craswell Nick","year":"2024","unstructured":"Nick Craswell, Bhaskar Mitra, Emine Yilmaz, Hossein A. Rahmani, Daniel Campos, Jimmy Lin, Ellen M. Voorhees, and Ian Soboroff. 2024. Overview of the TREC 2023 Deep Learning Track. In Text REtrieval Conference (TREC). NIST, TREC."},{"key":"e_1_3_2_1_7_1","volume-title":"LLM-Evaluation Tropes: Perspectives on the Validity of LLM-Evaluations. arXiv preprint arXiv:2504.19076","author":"Dietz Laura","year":"2025","unstructured":"Laura Dietz, Oleg Zendel, Peter Bailey, Charles Clarke, Ellese Cotterill, Jeff Dalton, Faegheh Hasibi, Mark Sanderson, and Nick Craswell. 2025. LLM-Evaluation Tropes: Perspectives on the Validity of LLM-Evaluations. arXiv preprint arXiv:2504.19076 (2025)."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1145\/3578337.3605136"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664190.3672511"},{"key":"e_1_3_2_1_10_1","volume-title":"Llm-based nlg evaluation: Current status and challenges. arXiv preprint arXiv:2402.01383","author":"Gao Mingqi","year":"2024","unstructured":"Mingqi Gao, Xinyu Hu, Jie Ruan, Xiao Pu, and Xiaojun Wan. 2024. Llm-based nlg evaluation: Current status and challenges. arXiv preprint arXiv:2402.01383 (2024)."},{"key":"e_1_3_2_1_11_1","volume-title":"Information retrieval evaluation","author":"Harman Donna","unstructured":"Donna Harman. 2011. Information retrieval evaluation. Morgan & Claypool Publishers."},{"key":"e_1_3_2_1_12_1","unstructured":"Zhen Li Xiaohan Xu Tao Shen Can Xu Jia-Chen Gu Yuxuan Lai Chongyang Tao and Shuai Ma. 2024. Leveraging Large Language Models for NLG Evaluation: Advances and Challenges. arXiv:2401.07103 [cs.CL] https:\/\/arxiv.org\/abs\/2401.07103"},{"key":"e_1_3_2_1_13_1","volume-title":"Nafise Sadat Moosavi, and Chenghua Lin","author":"Liu Yiqi","year":"2023","unstructured":"Yiqi Liu, Nafise Sadat Moosavi, and Chenghua Lin. 2023. Llms as narcissistic evaluators: When ego inflates evaluation scores. arXiv preprint arXiv:2311.09766 (2023)."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1145\/3626772.3657942"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1145\/3701551.3705706"},{"key":"e_1_3_2_1_16_1","first-page":"1","volume-title":"Report on the 1st workshop on large language model for evaluation in information retrieval (llm4eval 2024)","volume":"58","author":"Rahmani Hossein A.","year":"2025","unstructured":"Hossein A. Rahmani, Clemencia Siro, Mohammad Aliannejadi, Nick Craswell, Charles LA Clarke, Guglielmo Faggioli, Bhaskar Mitra, Paul Thomas, and Emine Yilmaz. 2025b. Report on the 1st workshop on large language model for evaluation in information retrieval (llm4eval 2024) at sigir 2024. In ACM SIGIR Forum, Vol. 58. ACM New York, NY, USA, 1-12."},{"key":"e_1_3_2_1_17_1","unstructured":"Hossein A. Rahmani Xi Wang Emine Yilmaz Nick Craswell Bhaskar Mitra and Paul Thomas. 2024b. SynDL: A Large-Scale Synthetic Test Collection for Passage Retrieval. arXiv:2408.16312 [cs.IR] https:\/\/arxiv.org\/abs\/2408.16312"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1145\/3701716.3715536"},{"key":"e_1_3_2_1_19_1","volume-title":"Mohammad Aliannejadi, Clemencia Siro, and Guglielmo Faggioli.","author":"Rahmani Hossein A.","year":"2024","unstructured":"Hossein A. Rahmani, Emine Yilmaz, Nick Craswell, Bhaskar Mitra, Paul Thomas, Charles LA Clarke, Mohammad Aliannejadi, Clemencia Siro, and Guglielmo Faggioli. 2024c. LLMJudge: LLMs for Relevance Judgments. arXiv preprint arXiv:2408.08896 (2024)."},{"key":"e_1_3_2_1_20_1","first-page":"247","volume-title":"Foundations and Trends\u00ae in Information Retrieval","volume":"4","author":"Mark","year":"2010","unstructured":"Mark Sanderson et al., 2010. Test collection based evaluation of information retrieval systems. Foundations and Trends\u00ae in Information Retrieval, Vol. 4, 4 (2010), 247-375."},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1145\/3626772.3657707"},{"key":"e_1_3_2_1_22_1","volume-title":"Is chatgpt a good nlg evaluator? a preliminary study. arXiv preprint arXiv:2303.04048","author":"Wang Jiaan","year":"2023","unstructured":"Jiaan Wang, Yunlong Liang, Fandong Meng, Zengkui Sun, Haoxiang Shi, Zhixu Li, Jinan Xu, Jianfeng Qu, and Jie Zhou. 2023b. Is chatgpt a good nlg evaluator? a preliminary study. arXiv preprint arXiv:2303.04048 (2023)."},{"key":"e_1_3_2_1_23_1","volume-title":"Large language models are not fair evaluators. arXiv preprint arXiv:2305.17926","author":"Wang Peiyi","year":"2023","unstructured":"Peiyi Wang, Lei Li, Liang Chen, Zefan Cai, Dawei Zhu, Binghuai Lin, Yunbo Cao, Qi Liu, Tianyu Liu, and Zhifang Sui. 2023a. Large language models are not fair evaluators. arXiv preprint arXiv:2305.17926 (2023)."}],"event":{"name":"CIKM '25: The 34th ACM International Conference on Information and Knowledge Management","location":"Seoul Republic of Korea","acronym":"CIKM '25","sponsor":["SIGIR ACM Special Interest Group on Information Retrieval","SIGWEB ACM Special Interest Group on Hypertext, Hypermedia, and Web"]},"container-title":["Proceedings of the 34th ACM International Conference on Information and Knowledge Management"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746252.3760908","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,12]],"date-time":"2025-12-12T02:09:16Z","timestamp":1765505356000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746252.3760908"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,11,10]]},"references-count":23,"alternative-id":["10.1145\/3746252.3760908","10.1145\/3746252"],"URL":"https:\/\/doi.org\/10.1145\/3746252.3760908","relation":{},"subject":[],"published":{"date-parts":[[2025,11,10]]},"assertion":[{"value":"2025-11-10","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}