{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,24]],"date-time":"2026-06-24T15:57:22Z","timestamp":1782316642625,"version":"3.54.5"},"publisher-location":"Singapore","reference-count":27,"publisher":"Springer Nature Singapore","isbn-type":[{"value":"9789819620739","type":"print"},{"value":"9789819620746","type":"electronic"}],"license":[{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-981-96-2074-6_1","type":"book-chapter","created":{"date-parts":[[2024,12,31]],"date-time":"2024-12-31T11:09:30Z","timestamp":1735643370000},"page":"3-17","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":1,"title":["Multimodal Engagement Prediction in\u00a0Human-Robot Interaction Using Transformer Neural Networks"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0009-0006-5709-8493","authenticated-orcid":false,"given":"Jia Yap","family":"Lim","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3005-4109","authenticated-orcid":false,"given":"John","family":"See","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4821-5871","authenticated-orcid":false,"given":"Christian","family":"Dondrup","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"297","published-online":{"date-parts":[[2025,1,1]]},"reference":[{"key":"1_CR1","doi-asserted-by":"crossref","unstructured":"Bartneck, C., Belpaeme, T., Eyssel, F., Kanda, T., Keijsers, M., \u0160abanovi\u0107, S.: Human-Robot Interaction: An Introduction. Cambridge University Press (2020)","DOI":"10.1017\/9781108676649"},{"issue":"3","key":"1_CR2","doi-asserted-by":"publisher","first-page":"776","DOI":"10.1109\/TAFFC.2019.2898399","volume":"12","author":"A Ben-Youssef","year":"2019","unstructured":"Ben-Youssef, A., Clavel, C., Essid, S.: Early detection of user engagement breakdown in spontaneous human-humanoid interaction. IEEE Trans. Affect. Comput. 12(3), 776\u2013787 (2019)","journal-title":"IEEE Trans. Affect. Comput."},{"key":"1_CR3","doi-asserted-by":"crossref","unstructured":"Ben-Youssef, A., Clavel, C., Essid, S., Bilac, M., Chamoux, M., Lim, A.: UE-HRI: a new dataset for the study of user engagement in spontaneous human-robot interactions. In: Proceedings of the 19th ACM International Conference on Multimodal Interaction, pp. 464\u2013472 (2017)","DOI":"10.1145\/3136755.3136814"},{"issue":"5","key":"1_CR4","doi-asserted-by":"publisher","first-page":"815","DOI":"10.1007\/s12369-019-00591-2","volume":"11","author":"A Ben-Youssef","year":"2019","unstructured":"Ben-Youssef, A., Varni, G., Essid, S., Clavel, C.: On-the-fly detection of user engagement decrease in spontaneous human-robot interaction using recurrent and deep neural networks. Int. J. Soc. Robot. 11(5), 815\u2013828 (2019)","journal-title":"Int. J. Soc. Robot."},{"key":"1_CR5","doi-asserted-by":"crossref","unstructured":"Castellano, G., Pereira, A., Leite, I., Paiva, A., McOwan, P.W.: Detecting user engagement with a robot companion using task and social interaction-based features. In: Proceedings of the 2009 International Conference on Multimodal Interfaces, pp. 119\u2013126 (2009)","DOI":"10.1145\/1647314.1647336"},{"key":"1_CR6","doi-asserted-by":"crossref","unstructured":"Clavel, C., Cafaro, A., Campano, S., Pelachaud, C.: Fostering user engagement in face-to-face human-agent interactions: a survey. In: Toward Robotic Socially Believable Behaving Systems-Volume II: Modeling Social Signals, pp. 93\u2013120 (2016)","DOI":"10.1007\/978-3-319-31053-4_7"},{"key":"1_CR7","doi-asserted-by":"publisher","first-page":"116","DOI":"10.3389\/frobt.2020.00116","volume":"7","author":"F Del Duchetto","year":"2020","unstructured":"Del Duchetto, F., Baxter, P., Hanheide, M.: Are you still with me? continuous engagement assessment from a robot\u2019s point of view. Front. Robot. AI 7, 116 (2020)","journal-title":"Front. Robot. AI"},{"key":"1_CR8","unstructured":"Dosovitskiy, A., et\u00a0al.: An image is worth $$16 \\times 16$$ words: transformers for image recognition at scale. arXiv preprint arXiv:2010.11929 (2020)"},{"issue":"5","key":"1_CR9","doi-asserted-by":"publisher","first-page":"659","DOI":"10.1007\/s12369-017-0414-y","volume":"9","author":"ME Foster","year":"2017","unstructured":"Foster, M.E., Gaschler, A., Giuliani, M.: Automatically classifying user engagement for dynamic multi-party human-robot interaction. Int. J. Soc. Robot. 9(5), 659\u2013674 (2017)","journal-title":"Int. J. Soc. Robot."},{"key":"1_CR10","doi-asserted-by":"crossref","unstructured":"Gong, Y., Chung, Y.A., Glass, J.: AST: audio spectrogram transformer. arXiv preprint arXiv:2104.01778 (2021)","DOI":"10.21437\/Interspeech.2021-698"},{"key":"1_CR11","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 770\u2013778 (2016)","DOI":"10.1109\/CVPR.2016.90"},{"issue":"8","key":"1_CR12","doi-asserted-by":"publisher","first-page":"1735","DOI":"10.1162\/neco.1997.9.8.1735","volume":"9","author":"S Hochreiter","year":"1997","unstructured":"Hochreiter, S., Schmidhuber, J.: Long short-term memory. Neural Comput. 9(8), 1735\u20131780 (1997)","journal-title":"Neural Comput."},{"key":"1_CR13","doi-asserted-by":"crossref","unstructured":"Huang, Y., Gilmartin, E., Campbell, N.: Conversational engagement recognition using auditory and visual cues. In: Interspeech, pp. 590\u2013594 (2016)","DOI":"10.21437\/Interspeech.2016-846"},{"key":"1_CR14","doi-asserted-by":"crossref","unstructured":"Ito, K., Kong, Q., Horiguchi, S., Sumiyoshi, T., Nagamatsu, K.: Anticipating the start of user interaction for service robot in the wild. In: 2020 IEEE International Conference on Robotics and Automation (ICRA), pp. 9687\u20139693. IEEE (2020)","DOI":"10.1109\/ICRA40945.2020.9196548"},{"key":"1_CR15","doi-asserted-by":"publisher","first-page":"63","DOI":"10.1007\/s12369-016-0357-8","volume":"9","author":"S Ivaldi","year":"2017","unstructured":"Ivaldi, S., Lefort, S., Peters, J., Chetouani, M., Provasi, J., Zibetti, E.: Towards engagement models that consider individual factors in HRI: on the relation of extroversion and negative attitude towards robots to gaze and speech during a human-robot assembly task: experiments with the icub humanoid. Int. J. Soc. Robot. 9, 63\u201386 (2017)","journal-title":"Int. J. Soc. Robot."},{"key":"1_CR16","doi-asserted-by":"crossref","unstructured":"Leite, I., McCoy, M., Ullman, D., Salomons, N., Scassellati, B.: Comparing models of disengagement in individual and group interactions. In: Proceedings of the Tenth Annual ACM\/IEEE International Conference on Human-Robot Interaction, pp. 99\u2013105 (2015)","DOI":"10.1145\/2696454.2696466"},{"key":"1_CR17","unstructured":"Liu, T., Kappas, A.: Predicting engagement breakdown in HRI using thin-slices of facial expressions. In: Workshops at the Thirty-Second AAAI Conference on Artificial Intelligence (2018)"},{"key":"1_CR18","doi-asserted-by":"crossref","unstructured":"Saheb\u00a0Jam, G., Rhim, J., Lim, A.: Developing a data-driven categorical taxonomy of emotional expressions in real world human robot interactions. In: Companion of the 2021 ACM\/IEEE International Conference on Human-Robot Interaction, pp. 479\u2013483 (2021)","DOI":"10.1145\/3434074.3447218"},{"key":"1_CR19","doi-asserted-by":"crossref","unstructured":"Sainath, T.N., Vinyals, O., Senior, A., Sak, H.: Convolutional, long short-term memory, fully connected deep neural networks. In: 2015 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 4580\u20134584. IEEE (2015)","DOI":"10.1109\/ICASSP.2015.7178838"},{"key":"1_CR20","doi-asserted-by":"crossref","unstructured":"Saleh, K., Yu, K., Chen, F.: Improving users engagement detection using end-to-end spatio-temporal convolutional neural networks. In: Companion of the 2021 ACM\/IEEE International Conference on Human-Robot Interaction, pp. 190\u2013194 (2021)","DOI":"10.1145\/3434074.3447157"},{"key":"1_CR21","doi-asserted-by":"crossref","unstructured":"Schuller, B., M\u00fceller, R., H\u00f6ernler, B., H\u00f6ethker, A., Konosu, H., Rigoll, G.: Audiovisual recognition of spontaneous interest within conversations. In: Proceedings of the 9th International Conference on Multimodal Interfaces, pp. 30\u201337 (2007)","DOI":"10.1145\/1322192.1322201"},{"key":"1_CR22","doi-asserted-by":"crossref","unstructured":"Serengil, S.I., Ozpinar, A.: Hyperextended lightface: a facial attribute analysis framework. In: 2021 International Conference on Engineering and Emerging Technologies (ICEET), pp.\u00a01\u20134. IEEE (2021)","DOI":"10.1109\/ICEET53442.2021.9659697"},{"key":"1_CR23","doi-asserted-by":"crossref","unstructured":"Sidner, C.L., Kidd, C.D., Lee, C., Lesh, N.: Where to look: a study of human-robot engagement. In: Proceedings of the 9th International Conference on Intelligent User Interfaces, pp. 78\u201384 (2004)","DOI":"10.1145\/964442.964458"},{"key":"1_CR24","doi-asserted-by":"crossref","unstructured":"Sidner, C., Dzikovska, M.: Human-robot interaction: engagement between humans and robots for hosting activities. In: Proceedings Fourth IEEE International Conference on Multimodal Interfaces, pp. 123\u2013128. IEEE (2002)","DOI":"10.1109\/ICMI.2002.1166980"},{"key":"1_CR25","doi-asserted-by":"crossref","unstructured":"Szegedy, C., Vanhoucke, V., Ioffe, S., Shlens, J., Wojna, Z.: Rethinking the inception architecture for computer vision. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 2818\u20132826 (2016)","DOI":"10.1109\/CVPR.2016.308"},{"key":"1_CR26","unstructured":"Vaswani, A., et al.: Attention is all you need. In: Advances in Neural Information Processing Systems, vol. 30 (2017)"},{"key":"1_CR27","doi-asserted-by":"crossref","unstructured":"Zhang, Z., Zheng, J., Thalmann, N.M.: Engagement intention estimation in multiparty human-robot interaction. In: 2021 30th IEEE International Conference on Robot & Human Interactive Communication (RO-MAN), pp. 117\u2013122. IEEE (2021)","DOI":"10.1109\/RO-MAN50785.2021.9515373"}],"container-title":["Lecture Notes in Computer Science","MultiMedia Modeling"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-981-96-2074-6_1","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,5,26]],"date-time":"2026-05-26T17:19:30Z","timestamp":1779815970000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-981-96-2074-6_1"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025]]},"ISBN":["9789819620739","9789819620746"],"references-count":27,"URL":"https:\/\/doi.org\/10.1007\/978-981-96-2074-6_1","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025]]},"assertion":[{"value":"1 January 2025","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"MMM","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Multimedia Modeling","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Nara","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Japan","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2025","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"9 January 2025","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"11 January 2025","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"31","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"mmm2025","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/mmm2025.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}