{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T13:40:11Z","timestamp":1755870011536,"version":"3.44.0"},"publisher-location":"New York, NY, USA","reference-count":84,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,7,18]]},"DOI":"10.1145\/3731120.3744574","type":"proceedings-article","created":{"date-parts":[[2025,7,18]],"date-time":"2025-07-18T13:34:06Z","timestamp":1752845646000},"page":"92-102","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Reliable Annotations with Less Effort: Evaluating LLM-Human Collaboration in Search Clarifications"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-5951-4052","authenticated-orcid":false,"given":"Leila","family":"Tavakoli","sequence":"first","affiliation":[{"name":"Service Australia, Melbourne, Australia"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0800-3340","authenticated-orcid":false,"given":"Hamed","family":"Zamani","sequence":"additional","affiliation":[{"name":"University of Massachusetts Amherst, Amherst, MA, USA"}]}],"member":"320","published-online":{"date-parts":[[2025,7,18]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Can We Use Large Language Models to Fill Relevance Judgment Holes? arXiv preprint arXiv:2405.05600","author":"Abbasiantaeb Zahra","year":"2024","unstructured":"Zahra Abbasiantaeb, Chuan Meng, Leif Azzopardi, and Mohammad Aliannejadi. 2024. Can We Use Large Language Models to Fill Relevance Judgment Holes? arXiv preprint arXiv:2405.05600 (2024)."},{"key":"e_1_3_2_1_2_1","volume-title":"Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, et al.","author":"Achiam Josh","year":"2023","unstructured":"Josh Achiam, Steven Adler, Sandhini Agarwal, Lama Ahmad, Ilge Akkaya, Florencia Leoni Aleman, Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, et al. 2023. Gpt-4 technical report. arXiv preprint arXiv:2303.08774 (2023)."},{"key":"e_1_3_2_1_3_1","volume-title":"Mohammed M Abdelsamea, Shadi Basurra, Sima Iranmanesh, Hadeel Saadany, and Edlira Vakaj.","author":"Al-Turki Dhoyazan","year":"2024","unstructured":"Dhoyazan Al-Turki, Hansi Hettiarachchi, Mohamed Medhat Gaber, Mohammed M Abdelsamea, Shadi Basurra, Sima Iranmanesh, Hadeel Saadany, and Edlira Vakaj. 2024. Human-in-the-Loop Learning with LLMs for Efficient RASE Tagging in Building Compliance Regulations. IEEE Access (2024)."},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1145\/3673791.3698431"},{"key":"e_1_3_2_1_5_1","volume-title":"ConvAI3: Generating Clarifying Questions for Open-Domain Dialogue Systems (ClariQ). arXiv preprint arXiv:2009.11352","author":"Aliannejadi Mohammad","year":"2020","unstructured":"Mohammad Aliannejadi, Julia Kiseleva, Aleksandr Chuklin, Jeff Dalton, and Mikhail Burtsev. 2020. ConvAI3: Generating Clarifying Questions for Open-Domain Dialogue Systems (ClariQ). arXiv preprint arXiv:2009.11352 (2020)."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1145\/3331184.3331265"},{"key":"e_1_3_2_1_7_1","volume-title":"Maria Korobeynikova, and Fabrizio Gilardi.","author":"Alizadeh Meysam","year":"2023","unstructured":"Meysam Alizadeh, Ma\u00ebl Kubli, Zeynab Samei, Shirin Dehghani, Juan Diego Bermeo, Maria Korobeynikova, and Fabrizio Gilardi. 2023. Open-source large language models outperform crowd workers and approach ChatGPT in textannotation tasks. arXiv preprint arXiv:2307.02179 101 (2023)."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1007\/s42001-024-00345-9"},{"key":"e_1_3_2_1_9_1","unstructured":"Anthropic. 2024. Claude 3: A Next-Generation Large Language Model. https: \/\/www.anthropic.com Accessed: 2024-11-25."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1596\/1813-9450-10597"},{"key":"e_1_3_2_1_11_1","volume-title":"Cycles of Thought: Measuring LLM Confidence through Stable Explanations. arXiv e-prints","author":"Becker Evan","year":"2024","unstructured":"Evan Becker and Stefano Soatto. 2024. Cycles of Thought: Measuring LLM Confidence through Stable Explanations. arXiv e-prints (2024), arXiv-2406."},{"key":"e_1_3_2_1_12_1","unstructured":"Tom Brown Benjamin Mann Nick Ryder Melanie Subbiah Jared D Kaplan Prafulla Dhariwal Arvind Neelakantan Pranav Shyam Girish Sastry Amanda Askell et al. 2020. Language models are few-shot learners. Advances in neural information processing systems 33 (2020) 1877-1901."},{"key":"e_1_3_2_1_13_1","volume-title":"Humanin-the-Loop through Chain-of-Thought. arXiv preprint arXiv:2306.07932","author":"Cai Zefan","year":"2023","unstructured":"Zefan Cai, Baobao Chang, and Wenjuan Han. 2023. Humanin-the-Loop through Chain-of-Thought. arXiv preprint arXiv:2306.07932 (2023)."},{"key":"e_1_3_2_1_14_1","volume-title":"From human-in-the-loop to LLM-in-the-loop for high quality legal dataset. i-lex 17, 1","author":"Carnat Irina","year":"2024","unstructured":"Irina Carnat, Giovanni Comand\u00e9, Daniele Licari, and Chiara De Nigris. 2024. From human-in-the-loop to LLM-in-the-loop for high quality legal dataset. i-lex 17, 1 (2024), 27-40."},{"key":"e_1_3_2_1_15_1","volume-title":"Jack Hessel, Lijuan Wang, and Yejin Choi.","author":"Chandu Khyathi Raghavi","year":"2024","unstructured":"Khyathi Raghavi Chandu, Linjie Li, AnasAwadalla, Ximing Lu, Jae Sung Park, Jack Hessel, Lijuan Wang, and Yejin Choi. 2024. Certainly Uncertain: A Benchmark and Metric for Multimodal Epistemic and Aleatoric Awareness. arXiv preprint arXiv:2407.01942 (2024)."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.acl-long.870"},{"key":"e_1_3_2_1_17_1","volume-title":"Deep reinforcement learning from human preferences. Advances in neural information processing systems 30","author":"Christiano Paul F","year":"2017","unstructured":"Paul F Christiano, Jan Leike, Tom Brown, Miljan Martic, Shane Legg, and Dario Amodei. 2017. Deep reinforcement learning from human preferences. Advances in neural information processing systems 30 (2017)."},{"key":"e_1_3_2_1_18_1","volume-title":"LLM-based relevance assessment still can't replace human relevance assessment. arXiv preprint arXiv:2412.17156","author":"Clarke Charles LA","year":"2024","unstructured":"Charles LA Clarke and Laura Dietz. 2024. LLM-based relevance assessment still can't replace human relevance assessment. arXiv preprint arXiv:2412.17156 (2024)."},{"key":"e_1_3_2_1_19_1","volume-title":"Weighted kappa: Nominal scale agreement provision for scaled disagreement or partial credit. Psychological bulletin 70, 4","author":"Cohen Jacob","year":"1968","unstructured":"Jacob Cohen. 1968. Weighted kappa: Nominal scale agreement provision for scaled disagreement or partial credit. Psychological bulletin 70, 4 (1968), 213."},{"key":"e_1_3_2_1_20_1","unstructured":"Cohere. 2024. Cohere Command R: Retrieval-Optimized Large Language Model. https:\/\/cohere.ai Accessed: 2024-11-25."},{"key":"e_1_3_2_1_21_1","volume-title":"A fast and elitist multiobjective genetic algorithm: NSGA-II","author":"Deb Kalyanmoy","year":"2002","unstructured":"Kalyanmoy Deb, Amrit Pratap, Sameer Agarwal, and TAMT Meyarivan. 2002. A fast and elitist multiobjective genetic algorithm: NSGA-II. IEEE transactions on evolutionary computation 6, 2 (2002), 182-197."},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.acl-long.626"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1038\/s41431-023-01396-8"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1145\/3578337.3605136"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1145\/3624730"},{"key":"e_1_3_2_1_26_1","volume-title":"Human-like summarization evaluation with chatgpt. arXiv preprint arXiv:2304.02554","author":"Gao Mingqi","year":"2023","unstructured":"Mingqi Gao, Jie Ruan, Renliang Sun, Xunjian Yin, Shiping Yang, and Xiaojun Wan. 2023. Human-like summarization evaluation with chatgpt. arXiv preprint arXiv:2304.02554 (2023)."},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1073\/pnas.2305016120"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11192-020-03614-2"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1038\/s42256-023-00718-1"},{"key":"e_1_3_2_1_30_1","volume-title":"Nan Duan, Weizhu Chen, et al.","author":"He Xingwei","year":"2023","unstructured":"Xingwei He, Zhenghao Lin, Yeyun Gong, Alex Jin, Hang Zhang, Chen Lin, Jian Jiao, Siu Ming Yiu, Nan Duan, Weizhu Chen, et al. 2023. Annollm: Making large language models to be better crowdsourced annotators. arXiv preprint arXiv:2303.16854 (2023)."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1177\/20531680241236239"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.3389\/fonc.2023.1219326"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1145\/3543873.3587368"},{"key":"e_1_3_2_1_34_1","volume-title":"Diego de las Casas, Florian Bressand, Gianna Lengyel, Guillaume Lample, Lucile Saulnier, et al.","author":"Jiang Albert Q","year":"2023","unstructured":"Albert Q Jiang, Alexandre Sablayrolles, Arthur Mensch, Chris Bamford, Devendra Singh Chaplot, Diego de las Casas, Florian Bressand, Gianna Lengyel, Guillaume Lample, Lucile Saulnier, et al. 2023. Mistral 7B. arXiv preprint arXiv:2310.06825 (2023)."},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P17-1147"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1023\/A:1025603704680"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.eacl-demo.18"},{"key":"e_1_3_2_1_38_1","volume-title":"Keith Stevens, Abdullah Barhoum, Duc Nguyen, Oliver Stanley, Rich\u00e1rd Nagyfi, et al.","author":"K\u00f6pf Andreas","year":"2024","unstructured":"Andreas K\u00f6pf, Yannic Kilcher, Dimitri von R\u00fctte, Sotiris Anagnostidis, Zhi Rui Tam, Keith Stevens, Abdullah Barhoum, Duc Nguyen, Oliver Stanley, Rich\u00e1rd Nagyfi, et al. 2024. Openassistant conversations-democratizing large language model alignment. Advances in Neural Information Processing Systems 36 (2024)."},{"key":"e_1_3_2_1_39_1","volume-title":"The measurement of observer agreement for categorical data. biometrics","author":"Richard Landis J","year":"1977","unstructured":"J Richard Landis and Gary G Koch. 1977. The measurement of observer agreement for categorical data. biometrics (1977), 159-174."},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1145\/3643829"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.eswa.2012.02.191"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-main.92"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.acl-long.229"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-main.153"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1145\/3539618.3592032"},{"key":"e_1_3_2_1_46_1","volume-title":"Proceedings on. PMLR, 38-48","author":"Mohta Jay","year":"2023","unstructured":"Jay Mohta, Kenan Ak, Yan Xu, and Mingwei Shen. 2023. Are large language models good annotators?. In Proceedings on. PMLR, 38-48."},{"key":"e_1_3_2_1_47_1","volume-title":"Chat- GPT for Text Annotation? Mind the Hype. SocArXiv preprint","author":"Ollion Etienne","year":"2023","unstructured":"Etienne Ollion, Rubing Shen, Ana Macanovic, and Arnault Chatelain. 2023. Chat- GPT for Text Annotation? Mind the Hype. SocArXiv preprint (2023)."},{"key":"e_1_3_2_1_48_1","volume-title":"Evaluating large language models: Chatgpt- 4, mistral 8x7b, and google gemini benchmarked against mmlu. Authorea Preprints","author":"Ono Kensuke","year":"2024","unstructured":"Kensuke Ono and Akira Morita. 2024. Evaluating large language models: Chatgpt- 4, mistral 8x7b, and google gemini benchmarked against mmlu. Authorea Preprints (2024)."},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.sigdial-1.23"},{"key":"e_1_3_2_1_50_1","unstructured":"Long Ouyang Jeffrey Wu Xu Jiang Diogo Almeida Carroll Wainwright Pamela Mishkin Chong Zhang Sandhini Agarwal Katarina Slama Alex Ray et al. 2022. Training language models to follow instructions with human feedback. Advances in neural information processing systems 35 (2022) 27730-27744."},{"key":"e_1_3_2_1_51_1","volume-title":"Keeping Humans in the Loop: Human-Centered Automated Annotation with Generative AI. arXiv preprint arXiv:2409.09467","author":"Pangakis Nicholas","year":"2024","unstructured":"Nicholas Pangakis and Samuel Wolken. 2024. Keeping Humans in the Loop: Human-Centered Automated Annotation with Generative AI. arXiv preprint arXiv:2409.09467 (2024)."},{"key":"e_1_3_2_1_52_1","volume-title":"Automated annotation with generative ai requires validation. arXiv preprint arXiv:2306.00176","author":"Pangakis Nicholas","year":"2023","unstructured":"Nicholas Pangakis, Samuel Wolken, and Neil Fasching. 2023. Automated annotation with generative ai requires validation. arXiv preprint arXiv:2306.00176 (2023)."},{"key":"e_1_3_2_1_53_1","volume-title":"Large Language Model Confidence Estimation via Black-Box Access. arXiv preprint arXiv:2406.04370","author":"Pedapati Tejaswini","year":"2024","unstructured":"Tejaswini Pedapati, Amit Dhurandhar, Soumya Ghosh, Soham Dan, and Prasanna Sattigeri. 2024. Large Language Model Confidence Estimation via Black-Box Access. arXiv preprint arXiv:2406.04370 (2024)."},{"key":"e_1_3_2_1_54_1","volume-title":"Mohammad Aliannejadi, Clemencia Siro, and Guglielmo Faggioli.","author":"Rahmani Hossein A","year":"2024","unstructured":"Hossein A Rahmani, Emine Yilmaz, Nick Craswell, Bhaskar Mitra, Paul Thomas, Charles LA Clarke, Mohammad Aliannejadi, Clemencia Siro, and Guglielmo Faggioli. 2024. Llmjudge: Llms for relevance judgments. arXiv preprint arXiv:2408.08896 (2024)."},{"key":"e_1_3_2_1_55_1","volume-title":"Testing the reliability of chatgpt for text annotation and classification: A cautionary remark. arXiv preprint arXiv:2304.11085","author":"Reiss Michael V","year":"2023","unstructured":"Michael V Reiss. 2023. Testing the reliability of chatgpt for text annotation and classification: A cautionary remark. arXiv preprint arXiv:2304.11085 (2023)."},{"key":"e_1_3_2_1_56_1","volume-title":"Proceedings of The 18th Linguistic Annotation Workshop (LAW-XVIII). 98-111","author":"Rouzegar Hamidreza","year":"2024","unstructured":"Hamidreza Rouzegar and Masoud Makrehchi. 2024. Enhancing Text Classification through LLM-Driven Active Learning and Human Annotation. In Proceedings of The 18th Linguistic Annotation Workshop (LAW-XVIII). 98-111."},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"publisher","DOI":"10.3389\/frai.2023.1279794"},{"key":"e_1_3_2_1_58_1","unstructured":"Akram Sheriff. 2025. TECHNIQUES FOR DERIVING AN LLM AGENT TRUST SCORE FOR DYNAMICALLY TRIGGERING HUMAN-IN-THE-LOOP (HIL) FEEDBACK IN REALTIME FOR AN LLM AGENTIC WORKFLOW. (2025)."},{"key":"e_1_3_2_1_59_1","volume-title":"Don't Use LLMs to Make Relevance Judgments. arXiv preprint arXiv:2409.15133","author":"Soboroff Ian","year":"2024","unstructured":"Ian Soboroff. 2024. Don't Use LLMs to Make Relevance Judgments. arXiv preprint arXiv:2409.15133 (2024)."},{"key":"e_1_3_2_1_60_1","volume-title":"Confidence Estimation for LLM-Based Dialogue State Tracking. arXiv preprint arXiv:2409.09629","author":"Sun Yi-Jyun","year":"2024","unstructured":"Yi-Jyun Sun, Suvodip Dey, Dilek Hakkani-Tur, and Gokhan Tur. 2024. Confidence Estimation for LLM-Based Dialogue State Tracking. arXiv preprint arXiv:2409.09629 (2024)."},{"key":"e_1_3_2_1_61_1","volume-title":"LLM-Assisted Relevance Assessments: When Should We Ask LLMs for Help? arXiv preprint arXiv:2411.06877","author":"Takehi Rikiya","year":"2024","unstructured":"Rikiya Takehi, Ellen M Voorhees, and Tetsuya Sakai. 2024. LLM-Assisted Relevance Assessments: When Should We Ask LLMs for Help? arXiv preprint arXiv:2411.06877 (2024)."},{"key":"e_1_3_2_1_62_1","doi-asserted-by":"publisher","DOI":"10.1145\/3477495.3531750"},{"key":"e_1_3_2_1_63_1","volume-title":"Workshop Proceedings of the 17th International AAAI Conference on Web and Social Media.","author":"Thapa Surendrabikram","year":"2023","unstructured":"Surendrabikram Thapa, Usman Naseem, and Mehwish Nasim. 2023. From humans to machines: can chatgpt-like llms effectively replace human annotators in nlp tasks. In Workshop Proceedings of the 17th International AAAI Conference on Web and Social Media."},{"key":"e_1_3_2_1_64_1","volume-title":"Large language models can accurately predict searcher preferences","author":"Thomas Paul","year":"2023","unstructured":"Paul Thomas, Seth Spielman, Nick Craswell, and Bhaskar Mitra. 2023. Large language models can accurately predict searcher preferences, 2023. URL https:\/\/arxiv.org\/abs\/2309.10621 (2023)."},{"key":"e_1_3_2_1_65_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-main.330"},{"key":"e_1_3_2_1_66_1","volume-title":"Chatgpt-4 outperforms experts and crowd workers in annotating political twitter messages with zero-shot learning. arXiv preprint arXiv:2304.06588","author":"T\u00f6rnberg Petter","year":"2023","unstructured":"Petter T\u00f6rnberg. 2023. Chatgpt-4 outperforms experts and crowd workers in annotating political twitter messages with zero-shot learning. arXiv preprint arXiv:2304.06588 (2023)."},{"key":"e_1_3_2_1_67_1","unstructured":"Hugo Touvron Louis Martin Kevin Stone Peter Albert Amjad Almahairi Yasmine Babaei Nikolay Bashlykov Soumya Batra Prajjwal Bhargava Shruti Bhosale et al. 2023. Llama 2: Open foundation and fine-tuned chat models. arXiv preprint arXiv:2307.09288 (2023)."},{"key":"e_1_3_2_1_68_1","volume-title":"Hoa Trang Dang, and Jimmy Lin","author":"Upadhyay Shivani","year":"2024","unstructured":"Shivani Upadhyay, Ronak Pradeep, Nandan Thakur, Daniel Campos, Nick Craswell, Ian Soboroff, Hoa Trang Dang, and Jimmy Lin. 2024. A Large-Scale Study of Relevance Assessments with Large Language Models: An Initial Look. arXiv preprint arXiv:2411.08275 (2024)."},{"key":"e_1_3_2_1_69_1","doi-asserted-by":"publisher","DOI":"10.1016\/S0306-4573(00)00010-8"},{"key":"e_1_3_2_1_70_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.newsum-1.1"},{"key":"e_1_3_2_1_71_1","volume-title":"Large language models are not fair evaluators. arXiv preprint arXiv:2305.17926","author":"Wang Peiyi","year":"2023","unstructured":"Peiyi Wang, Lei Li, Liang Chen, Zefan Cai, Dawei Zhu, Binghuai Lin, Yunbo Cao, Qi Liu, Tianyu Liu, and Zhifang Sui. 2023. Large language models are not fair evaluators. arXiv preprint arXiv:2305.17926 (2023)."},{"key":"e_1_3_2_1_72_1","first-page":"4195","article-title":"Want To Reduce Labeling Cost? GPT-3 Can Help","volume":"2021","author":"Liu Yang","year":"2021","unstructured":"ShuohangWang, Yang Liu, Yichong Xu, Chenguang Zhu, and Michael Zeng. 2021. Want To Reduce Labeling Cost? GPT-3 Can Help. In Findings of the Association for Computational Linguistics: EMNLP 2021. 4195-4205.","journal-title":"Findings of the Association for Computational Linguistics: EMNLP"},{"key":"e_1_3_2_1_73_1","doi-asserted-by":"publisher","DOI":"10.1145\/3613904.3641960"},{"key":"e_1_3_2_1_74_1","volume-title":"Aligning large language models with human: A survey. arXiv preprint arXiv:2307.12966","author":"Wang Yufei","year":"2023","unstructured":"Yufei Wang, Wanjun Zhong, Liangyou Li, Fei Mi, Xingshan Zeng, Wenyong Huang, Lifeng Shang, Xin Jiang, and Qun Liu. 2023. Aligning large language models with human: A survey. arXiv preprint arXiv:2307.12966 (2023)."},{"key":"e_1_3_2_1_75_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/W17-4413"},{"key":"e_1_3_2_1_76_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.trustnlp-1.28"},{"key":"e_1_3_2_1_77_1","volume-title":"Cranfield experiments. https:\/\/en.wikipedia.org\/wiki\/Cranfield_experiments. https:\/\/en.wikipedia.org\/wiki\/Cranfield_experiments [Online","author":"Wikipedia","year":"2025","unstructured":"Wikipedia contributors. 2025. Cranfield experiments. https:\/\/en.wikipedia.org\/wiki\/Cranfield_experiments. https:\/\/en.wikipedia.org\/wiki\/Cranfield_experiments [Online; accessed 24-April-2025]."},{"key":"e_1_3_2_1_78_1","doi-asserted-by":"publisher","DOI":"10.1145\/3366423.3380126"},{"key":"e_1_3_2_1_79_1","doi-asserted-by":"publisher","DOI":"10.1145\/3340531.3412772"},{"key":"e_1_3_2_1_80_1","first-page":"46595","article-title":"Judging llm-as-a-judge with mt-bench and chatbot arena","volume":"36","author":"Zheng Lianmin","year":"2023","unstructured":"Lianmin Zheng, Wei-Lin Chiang, Ying Sheng, Siyuan Zhuang, Zhanghao Wu, Yonghao Zhuang, Zi Lin, Zhuohan Li, Dacheng Li, Eric Xing, et al. 2023. Judging llm-as-a-judge with mt-bench and chatbot arena. Advances in Neural Information Processing Systems 36 (2023), 46595-46623.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_81_1","volume-title":"Can chatgpt understand too? a comparative study on chatgpt and fine-tuned bert. arXiv preprint arXiv:2302.10198","author":"Zhong Qihuang","year":"2023","unstructured":"Qihuang Zhong, Liang Ding, Juhua Liu, Bo Du, and Dacheng Tao. 2023. Can chatgpt understand too? a comparative study on chatgpt and fine-tuned bert. arXiv preprint arXiv:2302.10198 (2023)."},{"key":"e_1_3_2_1_82_1","volume-title":"Ziwen Han, Keiran Paster, Silviu Pitis, Harris Chan, and Jimmy Ba.","author":"Zhou Yongchao","year":"2022","unstructured":"Yongchao Zhou, Andrei Ioan Muresanu, Ziwen Han, Keiran Paster, Silviu Pitis, Harris Chan, and Jimmy Ba. 2022. Large language models are human-level prompt engineers. arXiv preprint arXiv:2211.01910 (2022)."},{"key":"e_1_3_2_1_83_1","volume-title":"Can ChatGPT reproduce human-generated labels. A Study of Social Computing Tasks","author":"Zhu Yiming","year":"2023","unstructured":"Yiming Zhu, Peixian Zhang, EU Haq, Pan Hui, and Gareth Tyson. 2023. Can ChatGPT reproduce human-generated labels. A Study of Social Computing Tasks (2023), 92-101."},{"key":"e_1_3_2_1_84_1","volume-title":"INTERS: unlocking the power of large language models in search with instruction tuning. arXiv preprint arXiv:2401.06532","author":"Zhu Yutao","year":"2024","unstructured":"Yutao Zhu, Peitian Zhang, Chenghao Zhang, Yifei Chen, Binyu Xie, Zheng Liu, Ji-RongWen, and Zhicheng Dou. 2024. INTERS: unlocking the power of large language models in search with instruction tuning. arXiv preprint arXiv:2401.06532 (2024)."}],"event":{"name":"ICTIR '25: International ACM SIGIR Conference on Innovative Concepts and Theories in Information Retrieval","sponsor":["SIGIR ACM Special Interest Group on Information Retrieval"],"location":"Padua Italy","acronym":"ICTIR '25"},"container-title":["Proceedings of the 2025 International ACM SIGIR Conference on Innovative Concepts and Theories in Information Retrieval (ICTIR)"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3731120.3744574","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T13:19:16Z","timestamp":1755868756000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3731120.3744574"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,7,18]]},"references-count":84,"alternative-id":["10.1145\/3731120.3744574","10.1145\/3731120"],"URL":"https:\/\/doi.org\/10.1145\/3731120.3744574","relation":{},"subject":[],"published":{"date-parts":[[2025,7,18]]},"assertion":[{"value":"2025-07-18","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}