{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,5,30]],"date-time":"2025-05-30T04:11:16Z","timestamp":1748578276633,"version":"3.41.0"},"publisher-location":"Cham","reference-count":55,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031934148","type":"print"},{"value":"9783031934155","type":"electronic"}],"license":[{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-93415-5_21","type":"book-chapter","created":{"date-parts":[[2025,5,29]],"date-time":"2025-05-29T22:04:30Z","timestamp":1748556270000},"page":"353-372","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Automating Dialogue Evaluation: LLMs Vs Human Judgment"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0009-0008-7228-1434","authenticated-orcid":false,"given":"Ebubechukwu","family":"Ike","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4010-4491","authenticated-orcid":false,"given":"Johane","family":"Takeuchi","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4421-1737","authenticated-orcid":false,"given":"Frank","family":"Joublin","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1075-459X","authenticated-orcid":false,"given":"Antonello","family":"Ceravola","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7156-0596","authenticated-orcid":false,"given":"Marc","family":"Tanti","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,5,30]]},"reference":[{"key":"21_CR1","unstructured":"Call for proposals - dialog system technology challenge (DSTC) (2024). Association for Computational Linguistics. Available at: https:\/\/www.aclweb.org"},{"key":"21_CR2","unstructured":"Adiwardana, D., et\u00a0al.: Towards a human-like open-domain chatbot. arXiv preprint arXiv:2001.09977 (2020)"},{"key":"21_CR3","unstructured":"Libraria AI. BERT and GPT explained: How these models are shaping the future of AI (2023). Available at: https:\/\/libraria.ai\/bert-and-gpt-explained\/"},{"key":"21_CR4","doi-asserted-by":"crossref","unstructured":"AlMutairi, I.A., Qamar, A.M.: Evaluating neural dialogue systems using deep learning and conversation history. J. Artif. Intell. 4(3), 0021\u20132579 (2022)","DOI":"10.32604\/jai.2022.032390"},{"key":"21_CR5","doi-asserted-by":"crossref","unstructured":"Bender, E.M., Gebru, T., McMillan-Major, A., Shmitchell, S.: On the dangers of stochastic parrots: can language models be too big?. In: Proceedings of the 2021 ACM Conference on Fairness, Accountability, and Transparency, pp. 610\u2013623 (2021)","DOI":"10.1145\/3442188.3445922"},{"key":"21_CR6","unstructured":"Brown, T., et\u00a0al.: Language models are few-shot learners. Adv. Neural Inf. Process. Syst. 33, 1877\u20131901 (2020)"},{"key":"21_CR7","unstructured":"Brown, T. B, et\u00a0al.: Language models are few-shot learners (2020). arxiv . arXiv preprint arXiv:2005.14165"},{"key":"21_CR8","doi-asserted-by":"crossref","unstructured":"Budzianowski, P., et al.: Multiwoz\u2013a large-scale multi-domain wizard-of-oz dataset for task-oriented dialogue modelling. arXiv preprint arXiv:1810.00278 (2018)","DOI":"10.18653\/v1\/D18-1547"},{"key":"21_CR9","unstructured":"Chen, Y.T., Huang, H.H., Chen, H.H.: MPDD: a multi-party dialogue dataset for analysis of emotions and interpersonal relationships. In: Proceedings of the Twelfth Language Resources and Evaluation Conference, pp. 610\u2013614 (2020)"},{"key":"21_CR10","doi-asserted-by":"publisher","unstructured":"Chen, Y., Zhou, Y., Zhu, S., Xu, H.: Detecting offensive language in social media to protect adolescent online safety. In: 2012 International Conference on Privacy, Security, Risk and Trust and 2012 International Confernece on Social Computing, pp. 71\u201380 (2012). https:\/\/doi.org\/10.1109\/SocialCom-PASSAT.2012.55","DOI":"10.1109\/SocialCom-PASSAT.2012.55"},{"key":"21_CR11","unstructured":"Creutz, M.: Open subtitles paraphrase corpus for six languages. arXiv preprintarXiv:1809.06142 (2018)"},{"key":"21_CR12","doi-asserted-by":"crossref","unstructured":"Dalvi, B., et al.: Explaining answers with entailment trees. arXiv preprint arXiv:2104.08661 (2021)","DOI":"10.18653\/v1\/2021.emnlp-main.585"},{"key":"21_CR13","doi-asserted-by":"publisher","first-page":"755","DOI":"10.1007\/s10462-020-09866-x","volume":"54","author":"J Deriu","year":"2021","unstructured":"Deriu, J., et al.: Survey on evaluation methods for dialogue systems. Artif. Intell. Rev. 54, 755\u2013810 (2021)","journal-title":"Artif. Intell. Rev."},{"key":"21_CR14","doi-asserted-by":"crossref","unstructured":"Dobrin, D.N.: A new grammar checker. Comput. Hum. 24, 67\u201380 (1990)","DOI":"10.1007\/BF00115029"},{"key":"21_CR15","unstructured":"Erdmann, M., Maedche, A., Schnurr, H.P., Staab, S.: From manual to semi-automatic semantic annotation: about ontology-based text annotation tools. In: Proceedings of the COLING-2000 Workshop on Semantic Annotation and Intelligent Content, pp. 79\u201385 (2000)"},{"key":"21_CR16","doi-asserted-by":"crossref","unstructured":"Feng, Y., Lu, Z., Liu, B., Zhan, L., Wu, X.M.: Towards LLM-driven dialogue state tracking. arXiv preprintarXiv:2310.14970 (2023)","DOI":"10.18653\/v1\/2023.emnlp-main.48"},{"key":"21_CR17","doi-asserted-by":"crossref","unstructured":"Finch, S.E., Finch, J.D., Choi, J.D.: Don\u2019t forget your ABC\u2019s: evaluating the state-of-the-art in chat-oriented dialogue systems (2022)","DOI":"10.18653\/v1\/2023.acl-long.839"},{"key":"21_CR18","doi-asserted-by":"crossref","unstructured":"Finch, S.E., Paek, E.S., Choi, J.D.: Leveraging large language models for automated dialogue analysis. arXiv preprint arXiv:2309.06490 (2023)","DOI":"10.18653\/v1\/2023.sigdial-1.20"},{"key":"21_CR19","unstructured":"Gartner. 58 conversational AI statistics you can\u2019t ignore (2021). https:\/\/www.netomi.com\/blog\/conversational-ai-statistics"},{"key":"21_CR20","first-page":"7789","volume":"34","author":"S Ghazarian","year":"2020","unstructured":"Ghazarian, S., Weischedel, R., Galstyan, A., Peng, N.: Predictive engagement: an efficient metric for automatic evaluation of open-domain dialogue systems. Proc. AAAI Conf. Artif. Intell. 34, 7789\u20137796 (2020)","journal-title":"Proc. AAAI Conf. Artif. Intell."},{"key":"21_CR21","doi-asserted-by":"crossref","unstructured":"Gill, S.S., et al.: Transformative effects of ChatGPT on modern education: emerging era of AI chatbots. Internet Things Cyber Phy. Syst. 4, 19\u201323 (2024)","DOI":"10.1016\/j.iotcps.2023.06.002"},{"key":"21_CR22","unstructured":"Herbert Paul Grice: logic and conversation. Syntax Seman. 3, 43\u201358 (1975)"},{"key":"21_CR23","doi-asserted-by":"crossref","unstructured":"Gupta, P., Mehri, S., Zhao, T., Pavel, A., Eskenazi, M., Bigham, J.P.: Investigating evaluation of open-domain dialogue systems with human generated multiple references. arXiv preprint arXiv:1907.10568 (2019)","DOI":"10.18653\/v1\/W19-5944"},{"key":"21_CR24","unstructured":"Haist, C.: An evaluation of Microsoft word 97\u2019s grammar checker (2000)"},{"key":"21_CR25","doi-asserted-by":"crossref","unstructured":"Higashinaka, R., Araki, M., Tsukahara, H., Mizukami, M.: Integrated taxonomy of errors in chat-oriented dialogue systems. In: Proceedings of the 22nd Annual Meeting of the Special Interest Group on Discourse and Dialogue, pp. 89\u201398 (2021)","DOI":"10.18653\/v1\/2021.sigdial-1.10"},{"key":"21_CR26","doi-asserted-by":"crossref","unstructured":"Hung, V., Elvir, M., Gonzalez, A., DeMara, R.: Towards a method for evaluating naturalness in conversational dialog systems. In: 2009 IEEE International Conference on Systems, Man and Cybernetics, pp. 1236\u20131241. IEEE (2009)","DOI":"10.1109\/ICSMC.2009.5345904"},{"key":"21_CR27","unstructured":"Joublin, F., Ceravola, A., Sandu, C.: Introducing brain-like concepts to embodied hand-crafted dialog management system. arXiv preprint arXiv:2406.08996 (2024)"},{"issue":"2","key":"21_CR28","first-page":"106","volume":"41","author":"B Kilby","year":"2021","unstructured":"Kilby, B.: Dialogic pedagogies: defining and analyzing four types of dialogue in education. Anal. Teach. Philos. Praxis 41(2), 106\u2013121 (2021)","journal-title":"Anal. Teach. Philos. Praxis"},{"key":"21_CR29","unstructured":"Kloker, S., Bazanya, M., Kateete, T.: I don\u2019t trust you (anymore)!\u2013the effect of students\u2019 LLM use on lecturer-student-trust in higher education. arXiv preprintarXiv:2406.14871 (2024)"},{"key":"21_CR30","doi-asserted-by":"crossref","unstructured":"Kumar, S., Bhatia, S., Aggarwal, M., Chakraborty, T.: Dialogue agents 101: a beginner\u2019s guide to critical ingredients for designing effective conversational systems. arXiv preprint arXiv:2307.07255 (2023)","DOI":"10.1017\/nlp.2024.42"},{"key":"21_CR31","unstructured":"Li, J., Monroe, W., Jurafsky, D.: Understanding neural networks through representation erasure. arXiv preprintarXiv:1612.08220 (2016)"},{"key":"21_CR32","unstructured":"Li, R., Ebrahimi Kahou, S., Schulz, H., Michalski, V., Charlin, L., Pal, C.: Towards deep conversational recommendations. Adv. Neural Inf. Process. Syst. 31 (2018)"},{"key":"21_CR33","doi-asserted-by":"crossref","unstructured":"Lowe, R., Noseworthy, M., Serban, I.V., Angelard-Gontier, N., Bengio, Y., Pineau, J.: Towards an automatic turing test: learning to evaluate dialogue responses. arXiv preprint arXiv:1708.07149 (2017)","DOI":"10.18653\/v1\/P17-1103"},{"key":"21_CR34","doi-asserted-by":"crossref","unstructured":"Lucas, G.M., et al.: Getting to know each other: the role of social dialogue in recovery from errors in social robots. In: Proceedings of the 2018 ACM\/IEEE International Conference on Human-robot Interaction, pp. 344\u2013351 (2018)","DOI":"10.1145\/3171221.3171258"},{"key":"21_CR35","doi-asserted-by":"crossref","unstructured":"Mahajan, K., Santhanam, S., Shaikh, S.: Towards evaluation of multi-party dialogue systems. In: Proceedings of the 15th International Conference on Natural Language Generation, pp. 278\u2013287 (2022)","DOI":"10.18653\/v1\/2022.inlg-main.23"},{"key":"21_CR36","doi-asserted-by":"crossref","unstructured":"McTear, M.: Conversational AI: dialogue systems, conversational agents, and chatbots. Springer Nature (2022)","DOI":"10.1007\/978-3-031-02176-3"},{"key":"21_CR37","unstructured":"AI\u00a0Meta. Introducing llama: a foundational, 65-billion-parameter large language model. Meta AI (2023)"},{"key":"21_CR38","doi-asserted-by":"crossref","unstructured":"Nakano, M.: A robot that can engage in both task-oriented and non-task-oriented dialogues. In: 2006 6th IEEE-RAS International Conference on Humanoid Robots, pp. 404\u2013411. IEEE (2006)","DOI":"10.1109\/ICHR.2006.321304"},{"issue":"4","key":"21_CR39","doi-asserted-by":"publisher","first-page":"3055","DOI":"10.1007\/s10462-022-10248-8","volume":"56","author":"J Ni","year":"2023","unstructured":"Ni, J., Young, T., Pandelea, V., Xue, F., Cambria, E.: Recent advances in deep learning based dialogue systems: a systematic survey. Artif. Intell. Rev. 56(4), 3055\u20133155 (2023)","journal-title":"Artif. Intell. Rev."},{"key":"21_CR40","unstructured":"Journal of\u00a0Petroleum\u00a0Technology. As hype fades, LLMs gaining acceptance in upstream as new-age research and coding tool (2023). https:\/\/jpt.spe.org\/as-hype-fades-llms-gaining-acceptance-in-upstream-as-new-age-research-and-coding-tool. Accessed: 2024-07-03"},{"key":"21_CR41","doi-asserted-by":"crossref","unstructured":"Pang, B., Lee, L., Vaithyanathan, S.: Thumbs up? Sentiment classification using machine learning techniques. In: Proceedings of the ACL-02 Conference on Empirical Methods in Natural Language Processing (EMNLP), Association for Computational Linguistics, pp. 79\u201386 (2002)","DOI":"10.3115\/1118693.1118704"},{"key":"21_CR42","doi-asserted-by":"crossref","unstructured":"Reddy, S.: Automating human evaluation of dialogue systems. In: Proceedings of the 2022 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies: Student Research Workshop, pp. 229\u2013234 (2022)","DOI":"10.18653\/v1\/2022.naacl-srw.29"},{"key":"21_CR43","unstructured":"Rodr\u00edguez-Cantelar, M., et al.: Overview of robust and multilingual automatic evaluation metrics for open-domain dialogue systems at DSTC 11 track 4. arXiv preprint arXiv:2306.12794 (2023)"},{"key":"21_CR44","doi-asserted-by":"crossref","unstructured":"Sai, A.B., Gupta, M.D., Khapra, M.M. and Srinivasan, M.: Re-evaluating ADEM: a deeper look at scoring dialogue responses. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol. 33, pp. 6220\u20136227 (2019)","DOI":"10.1609\/aaai.v33i01.33016220"},{"key":"21_CR45","doi-asserted-by":"crossref","unstructured":"De\u00a0Witt\u00a0T Starnes and Gertrude\u00a0E Noyes. The English dictionary from Cawdrey to Johnson, pp. 1604\u20131755 (1991)","DOI":"10.1075\/sihols.57"},{"key":"21_CR46","unstructured":"Sun, K., Yu, D., Chen, J., Yu, D., Choi, Y., Cardie, C.: Dream: a challenge dataset and models for dialogue-based reading comprehension. corr abs\/1902.00164 (2019). http:\/\/arxivorg\/abs\/1902.00164, 1902"},{"key":"21_CR47","unstructured":"Tsiakoulis, P., et al.: Statistical methods for building robust spoken dialogue systems in an automobile. In: Proceedings of the 4th Applied Human Factors and Ergonomics (2012)"},{"issue":"3","key":"21_CR48","doi-asserted-by":"publisher","first-page":"329","DOI":"10.1016\/S8755-4615(00)00038-4","volume":"17","author":"A Vernon","year":"2000","unstructured":"Vernon, A.: Computerized grammar checkers 2000: capabilities, limitations, and pedagogical possibilities. Comput. Compos. 17(3), 329\u2013349 (2000)","journal-title":"Comput. Compos."},{"key":"21_CR49","unstructured":"Walton, D.: Types of dialogue and burdens of proof. In: COMMA, pp. 13\u201324 (2010)"},{"key":"21_CR50","doi-asserted-by":"crossref","unstructured":"Watanabe, H., Bouazizi, M., Ohtsuki, T.: Hate speech on Twitter: a pragmatic approach to collect hateful and offensive expressions and perform hate speech detection. IEEE Access 6, 13825\u201313835 (2018). DOIurlhttps:\/\/doi.org\/10.1109\/ACCESS.2018.2806394","DOI":"10.1109\/ACCESS.2018.2806394"},{"key":"21_CR51","unstructured":"Wei, J., Shuster, K., Szlam, A., Weston, J., Urbanek, J., Komeili, M.: Multi-party chat: conversational agents in group settings with humans and models. arXiv preprint arXiv:2304.13835 (2023)"},{"issue":"1","key":"21_CR52","doi-asserted-by":"publisher","first-page":"36","DOI":"10.1145\/365153.365168","volume":"9","author":"J Weizenbaum","year":"1966","unstructured":"Weizenbaum, J.: Eliza-a computer program for the study of natural language communication between man and machine. Commun. ACM 9(1), 36\u201345 (1966)","journal-title":"Commun. ACM"},{"key":"21_CR53","unstructured":"Yule, G., Brown, G.R.: Discourse Analysis. Cambridge University Press (1986)"},{"key":"21_CR54","doi-asserted-by":"crossref","unstructured":"Zhang, S., Dinan, E., Urbanek, J., Szlam, A., Kiela, D., Weston, J.: Personalizing dialogue agents: i have a dog, do you have pets too? arXiv preprint arXiv:1801.07243 (2018)","DOI":"10.18653\/v1\/P18-1205"},{"key":"21_CR55","doi-asserted-by":"crossref","unstructured":"Zhang, Y, et al.: DialoGPT: large-scale generative pre-training for conversational response generation. arXiv preprint arXiv:1911.00536 (2019)","DOI":"10.18653\/v1\/2020.acl-demos.30"}],"container-title":["Lecture Notes in Computer Science","Artificial Intelligence in HCI"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-93415-5_21","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,5,29]],"date-time":"2025-05-29T22:04:41Z","timestamp":1748556281000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-93415-5_21"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025]]},"ISBN":["9783031934148","9783031934155"],"references-count":55,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-93415-5_21","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025]]},"assertion":[{"value":"30 May 2025","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"HCII","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Human-Computer Interaction","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Gothenburg","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Sweden","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2025","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"22 June 2025","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"27 June 2025","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"27","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"hcii2025","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/2025.hci.international\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}