{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,10]],"date-time":"2026-03-10T14:56:48Z","timestamp":1773154608373,"version":"3.50.1"},"publisher-location":"Cham","reference-count":38,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031780134","type":"print"},{"value":"9783031780141","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,11,22]],"date-time":"2024-11-22T00:00:00Z","timestamp":1732233600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,11,22]],"date-time":"2024-11-22T00:00:00Z","timestamp":1732233600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-78014-1_1","type":"book-chapter","created":{"date-parts":[[2024,11,21]],"date-time":"2024-11-21T12:25:17Z","timestamp":1732191917000},"page":"3-17","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":1,"title":["A Cross-Multi-modal Fusion Approach for Enhanced Engagement Recognition"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-5141-2561","authenticated-orcid":false,"given":"Denis","family":"Dresvyanskiy","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3424-652X","authenticated-orcid":false,"given":"Alexey","family":"Karpov","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4531-0662","authenticated-orcid":false,"given":"Wolfgang","family":"Minker","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,11,22]]},"reference":[{"key":"1_CR1","doi-asserted-by":"crossref","unstructured":"Abedi, A., Khan, S.S.: Improving state-of-the-art in detecting student engagement with resnet and TCN hybrid network. In: 2021 18th Conference on Robots and Vision (CRV), pp. 151\u2013157 (2021)","DOI":"10.1109\/CRV52889.2021.00028"},{"key":"1_CR2","doi-asserted-by":"crossref","unstructured":"Al-Darraji, S., Zafar, Z., Berns, K.: Real-time perception of non-verbal human feedback in a gaming scenario. In: Electronic Workshops in Computing (2016)","DOI":"10.14236\/ewic\/HCI2016.27"},{"key":"1_CR3","doi-asserted-by":"crossref","unstructured":"Bagher\u00a0Zadeh, A., Liang, P.P., Poria, S., Cambria, E., Morency, L.P.: Multimodal language analysis in the wild: CMU-MOSEI dataset and interpretable dynamic fusion graph. In: Proceedings of the 56th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pp. 2236\u20132246 (2018)","DOI":"10.18653\/v1\/P18-1208"},{"key":"1_CR4","doi-asserted-by":"crossref","unstructured":"Bednarik, R., Eivazi, S., Hradis, M.: Gaze and conversational engagement in multiparty video conversation: an annotation scheme and classification of high and low levels of engagement. In: Proceedings of the 4th Workshop on Eye Gaze in Intelligent Human Machine Interaction. Gaze-In \u201912 (2012)","DOI":"10.1145\/2401836.2401846"},{"key":"1_CR5","doi-asserted-by":"crossref","unstructured":"Ben-Youssef, A., Clavel, C., Essid, S., Bilac, M., Chamoux, M., Lim, A.: UE-HRI: a new dataset for the study of user engagement in spontaneous human-robot interactions. In: Proceedings of the 19th ACM International Conference on Multimodal Interaction, pp. 464\u2013472. ICMI \u201917 (2017)","DOI":"10.1145\/3136755.3136814"},{"issue":"3","key":"1_CR6","doi-asserted-by":"publisher","first-page":"776","DOI":"10.1109\/TAFFC.2019.2898399","volume":"12","author":"A Ben-Youssef","year":"2021","unstructured":"Ben-Youssef, A., Clavel, C., Essid, S.: Early detection of user engagement breakdown in spontaneous human-humanoid interaction. IEEE Trans. Affect. Comput. 12(3), 776\u2013787 (2021)","journal-title":"IEEE Trans. Affect. Comput."},{"key":"1_CR7","doi-asserted-by":"crossref","unstructured":"Cafaro, A., et al.: The NoXi database: multimodal recordings of mediated novice-expert interactions. In: Proceedings of the 19th ACM International Conference on Multimodal Interaction, pp. 350\u2013359. ICMI \u201917 (2017)","DOI":"10.1145\/3136755.3136780"},{"key":"1_CR8","doi-asserted-by":"crossref","unstructured":"Cerrato, L., Campbell, N.: Engagement in Dialogue with Social Robots, pp. 313\u2013319 (2017)","DOI":"10.1007\/978-981-10-2585-3_25"},{"key":"1_CR9","doi-asserted-by":"crossref","unstructured":"Deng, J., Guo, J., Ververas, E., Kotsia, I., Zafeiriou, S.: RetinaFace: single-shot multi-level face localisation in the wild. In: 2020 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 5202\u20135211 (2020)","DOI":"10.1109\/CVPR42600.2020.00525"},{"key":"1_CR10","unstructured":"Dermouche, S., Pelachaud, C.: From analysis to modeling of engagement as sequences of multimodal behaviors. In: Proceedings of the Eleventh International Conference on Language Resources and Evaluation (LREC 2018) (2018)"},{"key":"1_CR11","unstructured":"Dresvyanskiy, D., Karpov, A., Minker, W.: The importance of data diversity for engagement recognition systems: a cross-cultural study. In: Proceedings of the 13th International Workshop on Spoken Dialogue Systems Technology (2023)"},{"key":"1_CR12","doi-asserted-by":"crossref","unstructured":"Dresvyanskiy, D., Minker, W., Karpov, A.: Deep learning based engagement recognition in highly imbalanced data. In: Speech and Computer, pp. 166\u2013178 (2021)","DOI":"10.1007\/978-3-030-87802-3_16"},{"key":"1_CR13","doi-asserted-by":"crossref","unstructured":"Galland, L., Pelachaud, C., Pecune, F.: Adapting conversational strategies to co-optimize agent\u2019s task performance and user\u2019s engagement. In: Proceedings of the 22nd ACM International Conference on Intelligent Virtual Agents. IVA \u201922 (2022)","DOI":"10.1145\/3514197.3549674"},{"key":"1_CR14","unstructured":"Gupta, A., Jaiswal, R., Adhikari, S., Balasubramanian, V.: DAISEE: dataset for affective states in e-learning environments. CoRR abs\/1609.01885 (2016)"},{"key":"1_CR15","unstructured":"Heimerl, A., Baur, T., Andr\u00e9, E.: A transparent framework towards the context-sensitive recognition of conversational engagement. In: Proceedings of the Eleventh International Workshop Modelling and Reasoning in Context MRC@ECAI 2020. CEUR Workshop Proceedings, vol.\u00a02787, pp. 7\u201316 (2020)"},{"key":"1_CR16","doi-asserted-by":"crossref","unstructured":"Holroyd, A., Rich, C., Sidner, C.L., Ponsler, B.: Generating connection events for human-robot collaboration. In: 2011 RO-MAN, pp. 241\u2013246 (2011)","DOI":"10.1109\/ROMAN.2011.6005245"},{"key":"1_CR17","doi-asserted-by":"crossref","unstructured":"Jayagopi, D.B., et al.: The vernissage corpus: a conversational human-robot-interaction dataset. In: 2013 8th ACM\/IEEE International Conference on Human-Robot Interaction (HRI), pp. 149\u2013150 (2013)","DOI":"10.1109\/HRI.2013.6483545"},{"key":"1_CR18","doi-asserted-by":"crossref","unstructured":"Kim, D., Song, B.C.: Emotion-aware multi-view contrastive learning for facial emotion recognition. In: Computer Vision \u2013 ECCV 2022, pp. 178\u2013195 (2022)","DOI":"10.1007\/978-3-031-19778-9_11"},{"key":"1_CR19","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"35","DOI":"10.1007\/978-3-319-46843-3_3","volume-title":"Human Behavior Understanding","author":"J Kim","year":"2016","unstructured":"Kim, J., Truong, K.P., Charisi, V., Zaga, C., Evers, V., Chetouani, M.: Multimodal detection of engagement in groups of children using rank learning. In: Chetouani, M., Cohn, J., Salah, A.A. (eds.) HBU 2016. LNCS, vol. 9997, pp. 35\u201348. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46843-3_3"},{"key":"1_CR20","doi-asserted-by":"crossref","unstructured":"Li, Y.Y., Hung, Y.P.: Feature fusion of face and body for engagement intensity detection. In: 2019 IEEE International Conference on Image Processing (ICIP), pp. 3312\u20133316 (2019)","DOI":"10.1109\/ICIP.2019.8803488"},{"key":"1_CR21","doi-asserted-by":"crossref","unstructured":"Lin, T.Y., Goyal, P., Girshick, R.B., He, K., Doll\u00e1r, P.: Focal loss for dense object detection. In: 2017 IEEE International Conference on Computer Vision (ICCV), pp. 2999\u20133007 (2017)","DOI":"10.1109\/ICCV.2017.324"},{"key":"1_CR22","unstructured":"Loshchilov, I., Hutter, F.: Decoupled weight decay regularization. In: International Conference on Learning Representations (2019)"},{"key":"1_CR23","doi-asserted-by":"crossref","unstructured":"Muller, P., et al.: MultiMediate \u201923: engagement estimation and bodily behaviour recognition in social interactions. In: Proceedings of the 31st ACM International Conference on Multimedia (2023)","DOI":"10.1145\/3581783.3613851"},{"key":"1_CR24","doi-asserted-by":"publisher","first-page":"50","DOI":"10.1002\/asi.21229","volume":"61","author":"HL O\u2019Brien","year":"2010","unstructured":"O\u2019Brien, H.L., Toms, E.: The development and evaluation of a survey to measure user engagement. J. Assoc. Inf. Sci. Technol. 61, 50\u201369 (2010)","journal-title":"J. Assoc. Inf. Sci. Technol."},{"key":"1_CR25","doi-asserted-by":"crossref","unstructured":"Olaniyi, B.Y., Fern\u00e1ndez\u00a0del R\u00edo, A., Peri\u00e1\u00f1ez, A., Bellhouse, L.: User engagement in mobile health applications. In: Proceedings of the 28th ACM SIGKDD Conference on Knowledge Discovery and Data Mining, pp. 4704\u20134712 (2022)","DOI":"10.1145\/3534678.3542681"},{"key":"1_CR26","doi-asserted-by":"publisher","first-page":"435","DOI":"10.1016\/j.neucom.2022.10.013","volume":"514","author":"E Ryumina","year":"2022","unstructured":"Ryumina, E., Dresvyanskiy, D., Karpov, A.: In search of a robust facial expressions recognition model: a large-scale visual cross-corpus study. Neurocomputing 514, 435\u2013450 (2022)","journal-title":"Neurocomputing"},{"key":"1_CR27","unstructured":"Salam, H., Celiktutan, O., Gunes, H., Chetouani, M.: Automatic context-aware inference of engagement in HMI: a survey. IEEE Transactions on Affective Computing, pp. 1\u201320 (2023)"},{"key":"1_CR28","doi-asserted-by":"crossref","unstructured":"Sun, K., Xiao, B., Liu, D., Wang, J.: Deep high-resolution representation learning for human pose estimation. In: 2019 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 5686\u20135696 (2019)","DOI":"10.1109\/CVPR.2019.00584"},{"key":"1_CR29","unstructured":"Tan, M., Le, Q.V.: EfficientNet: Rethinking model scaling for convolutional neural networks. ArXiv abs\/1905.11946 (2019)"},{"key":"1_CR30","doi-asserted-by":"crossref","unstructured":"Tsai, Y.H.H., Bai, S., Liang, P.P., Kolter, J.Z., Morency, L.P., Salakhutdinov, R.: Multimodal transformer for unaligned multimodal language sequences, pp. 6558\u20136569 (2019)","DOI":"10.18653\/v1\/P19-1656"},{"key":"1_CR31","doi-asserted-by":"crossref","unstructured":"Tu, V.N., et al.: DCTM: dilated convolutional transformer model for multimodal engagement estimation in conversation. In: Proceedings of the 31st ACM International Conference on Multimedia, pp. 9521\u20139525. MM \u201923 (2023)","DOI":"10.1145\/3581783.3612857"},{"key":"1_CR32","doi-asserted-by":"crossref","unstructured":"Tu, V.N., et al.: DCTM: dilated convolutional transformer model for multimodal engagement estimation in conversation. In: Proceedings of the 31st ACM International Conference on Multimedia, pp. 9521\u20139525. MM \u201923 (2023)","DOI":"10.1145\/3581783.3612857"},{"key":"1_CR33","doi-asserted-by":"crossref","unstructured":"Wang, H., et al.: Score-CAM: score-weighted visual explanations for convolutional neural networks. In: 2020 IEEE\/CVF Conference on Computer Vision and Pattern Recognition Workshops (CVPRW), pp. 111\u2013119 (2020)","DOI":"10.1109\/CVPRW50498.2020.00020"},{"key":"1_CR34","doi-asserted-by":"crossref","unstructured":"Yang, C., et al.: Multimediate 2023: engagement level detection using audio and video features. In: Proceedings of the 31st ACM International Conference on Multimedia, pp. 9601\u20139605. MM \u201923 (2023)","DOI":"10.1145\/3581783.3612873"},{"key":"1_CR35","doi-asserted-by":"crossref","unstructured":"Yang, D., et al.: Emotion recognition for multiple context awareness. In: Computer Vision \u2013 ECCV 2022, pp. 144\u2013162 (2022)","DOI":"10.1007\/978-3-031-19836-6_9"},{"key":"1_CR36","doi-asserted-by":"crossref","unstructured":"Yu, J., et al.: Sliding window seq2seq modeling for engagement estimation. In: Proceedings of the 31st ACM International Conference on Multimedia, pp. 9496\u20139500. MM \u201923 (2023)","DOI":"10.1145\/3581783.3612852"},{"key":"1_CR37","doi-asserted-by":"crossref","unstructured":"Yu, J., et al.: Sliding window seq2seq modeling for engagement estimation. In: Proceedings of the 31st ACM International Conference on Multimedia, pp. 9496\u20139500. MM \u201923 (2023)","DOI":"10.1145\/3581783.3612852"},{"issue":"4","key":"1_CR38","doi-asserted-by":"publisher","first-page":"696","DOI":"10.1109\/TAFFC.2018.2834350","volume":"11","author":"WH Yun","year":"2018","unstructured":"Yun, W.H., Lee, D., Park, C., Kim, J., Kim, J.: Automatic recognition of children engagement from facial video using convolutional neural networks. IEEE Trans. Affect. Comput. 11(4), 696\u2013707 (2018)","journal-title":"IEEE Trans. Affect. Comput."}],"container-title":["Lecture Notes in Computer Science","Speech and Computer"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-78014-1_1","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,1,8]],"date-time":"2025-01-08T15:05:31Z","timestamp":1736348731000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-78014-1_1"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,11,22]]},"ISBN":["9783031780134","9783031780141"],"references-count":38,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-78014-1_1","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,11,22]]},"assertion":[{"value":"22 November 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"The authors have no competing interests to declare that are relevant to the content of this article.","order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Disclosure of Interests"}},{"value":"SPECOM","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Speech and Computer","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Belgrade","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Serbia","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"25 November 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"28 November 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"26","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"specom2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/specom2024.ftn.uns.ac.rs\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}