{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,13]],"date-time":"2026-02-13T04:29:42Z","timestamp":1770956982187,"version":"3.50.1"},"reference-count":49,"publisher":"Springer Science and Business Media LLC","issue":"8","license":[{"start":{"date-parts":[[2025,1,27]],"date-time":"2025-01-27T00:00:00Z","timestamp":1737936000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,1,27]],"date-time":"2025-01-27T00:00:00Z","timestamp":1737936000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"DOI":"10.13039\/501100013139","name":"Humanities and Social Science Fund of Ministry of Education of China","doi-asserted-by":"publisher","award":["22YJAZH036"],"award-info":[{"award-number":["22YJAZH036"]}],"id":[{"id":"10.13039\/501100013139","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Vis Comput"],"published-print":{"date-parts":[[2025,6]]},"DOI":"10.1007\/s00371-024-03768-7","type":"journal-article","created":{"date-parts":[[2025,1,27]],"date-time":"2025-01-27T14:16:54Z","timestamp":1737987414000},"page":"6011-6025","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":5,"title":["FANN: a novel frame attention neural network for student engagement recognition in facial video"],"prefix":"10.1007","volume":"41","author":[{"given":"Hu","family":"Wang","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Hong-Mei","family":"Sun","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Wen-Long","family":"Zhang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yu-Xiang","family":"Chen","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Rui-Sheng","family":"Jia","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2025,1,27]]},"reference":[{"key":"3768_CR1","doi-asserted-by":"publisher","first-page":"1143947","DOI":"10.3389\/fpubh.2023.1143947","volume":"11","author":"SG Ali","year":"2023","unstructured":"Ali, S.G., Wang, X., Li, P., Jung, Y., Bi, L., Kim, J., Sheng, B.: A systematic review: virtual-reality-based techniques for human exercises and health improvement. Front. Public Health 11, 1143947 (2023). https:\/\/doi.org\/10.3389\/fpubh.2023.1143947","journal-title":"Front. Public Health"},{"issue":"5","key":"3768_CR2","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3234149","volume":"51","author":"K Doherty","year":"2018","unstructured":"Doherty, K., Doherty, G.: Engagement in HCI: conception, theory and measurement. ACM Comput. Surv. 51(5), 1\u201339 (2018). https:\/\/doi.org\/10.1145\/3234149","journal-title":"ACM Comput. Surv."},{"issue":"1","key":"3768_CR3","doi-asserted-by":"publisher","first-page":"532","DOI":"10.1109\/TNNLS.2022.3175775","volume":"35","author":"A Karambakhsh","year":"2022","unstructured":"Karambakhsh, A., Sheng, B., Li, P., Li, H., Kim, J., Jung, Y., Chen, C.P.: SparseVoxNet: 3-D object recognition with sparsely aggregation of 3-D dense blocks. IEEE Trans. Neural Netw. Learn. Syst. 35(1), 532\u2013546 (2022). https:\/\/doi.org\/10.1109\/TNNLS.2022.3175775","journal-title":"IEEE Trans. Neural Netw. Learn. Syst."},{"key":"3768_CR4","doi-asserted-by":"publisher","unstructured":"Mohamad Nezami, O., Dras, M., Hamey, L., Richards, D., Wan, S., Paris, C.: Automatic recognition of student engagement using deep learning and facial expression. In: Joint European Conference on Machine Learning and Knowledge Discovery in Databases, pp. 273\u2013289 (2020). https:\/\/doi.org\/10.1007\/978-3-030-46133-1_17","DOI":"10.1007\/978-3-030-46133-1_17"},{"issue":"1","key":"3768_CR5","doi-asserted-by":"publisher","first-page":"86","DOI":"10.1109\/TAFFC.2014.2316163","volume":"5","author":"J Whitehill","year":"2014","unstructured":"Whitehill, J., Serpell, Z., Lin, Y.C., Foster, A., Movellan, J.R.: The faces of engagement: automatic recognition of student engagementfrom facial expressions. IEEE Trans. Affect. Comput. 5(1), 86\u201398 (2014). https:\/\/doi.org\/10.1109\/TAFFC.2014.2316163","journal-title":"IEEE Trans. Affect. Comput."},{"issue":"2","key":"3768_CR6","doi-asserted-by":"publisher","first-page":"104","DOI":"10.1080\/00461520.2017.1281747","volume":"52","author":"S D'Mello","year":"2017","unstructured":"D\u2019Mello, S., Dieterle, E., Duckworth, A.: Advanced, analytic, automated (AAA) measurement of engagement during learning. Educ. Psychol. 52(2), 104\u2013123 (2017). https:\/\/doi.org\/10.1080\/00461520.2017.1281747","journal-title":"Educ. Psychol."},{"key":"3768_CR7","doi-asserted-by":"publisher","unstructured":"Gupta, A., D'Cunha, A., Awasthi, K., Balasubramanian, V.: Daisee: towards user engagement recognition in the wild. arXiv preprint http:\/\/arxiv.org\/abs\/1609.01885 (2016). https:\/\/doi.org\/10.48550\/arXiv.1609.01885","DOI":"10.48550\/arXiv.1609.01885"},{"key":"3768_CR8","doi-asserted-by":"publisher","unstructured":"Huang, T., Mei, Y., Zhang, H., Liu, S., Yang, H.: Fine-grained engagement recognition in online learning environment. In: IEEE 9th International Conference on Electronics Information and Emergency Communication (ICEIEC), pp. 338\u2013341 (2019). https:\/\/doi.org\/10.1109\/ICEIEC.2019.8784559","DOI":"10.1109\/ICEIEC.2019.8784559"},{"key":"3768_CR9","doi-asserted-by":"publisher","unstructured":"Wang, Y., Kotha, A., Hong, P. H., Qiu, M.: Automated student engagement monitoring and evaluation during learning in the wild. In: 7th IEEE International Conference on Cyber Security and Cloud Computing (CSCloud)\/6th IEEE International Conference on Edge Computing and Scalable Cloud (EdgeCom), pp. 270\u2013275 (2020). https:\/\/doi.org\/10.1109\/CSCloud-EdgeCom49738.2020.00054","DOI":"10.1109\/CSCloud-EdgeCom49738.2020.00054"},{"key":"3768_CR10","doi-asserted-by":"publisher","unstructured":"Geng, L., Xu, M., Wei, Z., Zhou, X.: Learning deep spatiotemporal feature for engagement recognition of online courses. In: IEEE Symposium Series on Computational Intelligence (SSCI), pp. 442\u2013447 (2019). https:\/\/doi.org\/10.1109\/SSCI44817.2019.9002713","DOI":"10.1109\/SSCI44817.2019.9002713"},{"key":"3768_CR11","doi-asserted-by":"publisher","unstructured":"Zhang, H., Xiao, X., Huang, T., Liu, S., Xia, Y., Li, J.: An novel end-to-end network for automatic student engagement recognition. In: IEEE 9th International Conference on Electronics Information and Emergency Communication (ICEIEC), pp. 342\u2013345 (2019). IEEE. https:\/\/doi.org\/10.1109\/ICEIEC.2019.8784507","DOI":"10.1109\/ICEIEC.2019.8784507"},{"key":"3768_CR12","doi-asserted-by":"publisher","unstructured":"Abedi, A., Khan, S. S.: Improving state-of-the-art in detecting student engagement with resnet and tcn hybrid network. In: 2021 18th Conference on Robots and Vision (CRV), pp. 151\u2013157 (2021). https:\/\/doi.org\/10.1109\/CRV52889.2021.00028","DOI":"10.1109\/CRV52889.2021.00028"},{"key":"3768_CR13","doi-asserted-by":"publisher","unstructured":"Zhu, B., Lan, X., Guo, X., Barner, K. E., Boncelet, C.: Multi-rate attention based gru model for engagement prediction. In: Proceedings of the 2020 International Conference on Multimodal Interaction, pp. 841\u2013848 (2020). https:\/\/doi.org\/10.1145\/3382507.3417965","DOI":"10.1145\/3382507.3417965"},{"key":"3768_CR14","doi-asserted-by":"publisher","unstructured":"Choi, M., Kim, H., Han, B., Xu, N., Lee, K. M.: Channel attention is all you need for video frame interpolation. In: Proceedings of the AAAI Conference on Artificial Intelligence, pp. 10663\u201310671 (2020). https:\/\/doi.org\/10.1609\/aaai.v34i07.6693","DOI":"10.1609\/aaai.v34i07.6693"},{"key":"3768_CR15","doi-asserted-by":"publisher","unstructured":"Yang, J., Ren, P., Zhang, D., Chen, D., Wen, F., Li, H., Hua, G.: Neural aggregation network for video face recognition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern recognition, pp. 4362\u20134371 (2017). https:\/\/doi.org\/10.1109\/CVPR.2017.554","DOI":"10.1109\/CVPR.2017.554"},{"issue":"8","key":"3768_CR16","doi-asserted-by":"publisher","first-page":"3635","DOI":"10.1007\/s00371-023-03035-1","volume":"39","author":"MM Rashid","year":"2023","unstructured":"Rashid, M.M., Wu, S., Nie, Y., Li, G.: High-fidelity facial expression transfer using part-based local\u2013global conditional gans. Vis. Comput. 39(8), 3635\u20133646 (2023). https:\/\/doi.org\/10.1007\/s00371-023-03035-1","journal-title":"Vis. Comput."},{"issue":"4","key":"3768_CR17","doi-asserted-by":"publisher","first-page":"230","DOI":"10.1177\/1088357610380137","volume":"25","author":"C Holifield","year":"2010","unstructured":"Holifield, C., Goodman, J., Hazelkorn, M., Heflin, L.J.: Using self-monitoring to increase attending to task and academic accuracy in children with autism. Foc. Autism Dev. Disabil. 25(4), 230\u2013238 (2010). https:\/\/doi.org\/10.1177\/1088357610380137","journal-title":"Foc. Autism Dev. Disabil."},{"issue":"12","key":"3768_CR18","doi-asserted-by":"publisher","first-page":"6205","DOI":"10.1007\/s00371-022-02721-w","volume":"39","author":"Y Yi","year":"2023","unstructured":"Yi, Y., Xu, Y., Ye, Z., Li, L., Hu, X., Tian, Y.: STAN: spatiotemporal attention network for video-based facial expression recognition. Vis. Comput. 39(12), 6205\u20136220 (2023). https:\/\/doi.org\/10.1007\/s00371-022-02721-w","journal-title":"Vis. Comput."},{"key":"3768_CR19","doi-asserted-by":"publisher","unstructured":"Bosch, N., D'Mello, S., Baker, R., Ocumpaugh, J., Shute, V., Ventura, M., Zhao, W.: Automatic detection of learning-centered affective states in the wild. In: Proceedings of the 20th International Conference on Intelligent User Interfaces, pp. 379\u2013388 (2015). https:\/\/doi.org\/10.1145\/2678025.2701397","DOI":"10.1145\/2678025.2701397"},{"issue":"1","key":"3768_CR20","doi-asserted-by":"publisher","first-page":"63","DOI":"10.1177\/0735633119825575","volume":"58","author":"Z Zhang","year":"2020","unstructured":"Zhang, Z., Li, Z., Liu, H., Cao, T., Liu, S.: Data-driven online learning engagement detection via facial expression and mouse behavior recognition technology. J. Educ. Comput. Res. 58(1), 63\u201386 (2020). https:\/\/doi.org\/10.1177\/0735633119825575","journal-title":"J. Educ. Comput. Res."},{"key":"3768_CR21","doi-asserted-by":"publisher","unstructured":"Kaur, A., Mustafa, A., Mehta, L., Dhall, A.: Prediction and localization of student engagement in the wild. In: 2018 Digital Image Computing: Techniques and Applications (DICTA), pp. 1\u20138 (2018). https:\/\/doi.org\/10.1109\/DICTA.2018.8615851","DOI":"10.1109\/DICTA.2018.8615851"},{"key":"3768_CR22","doi-asserted-by":"publisher","unstructured":"Lin, T. Y., Goyal, P., Girshick, R., He, K., Doll\u00e1r, P.: Focal loss for dense object detection. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 2980\u20132988 (2017). https:\/\/doi.org\/10.1109\/TPAMI.2018.2858826","DOI":"10.1109\/TPAMI.2018.2858826"},{"issue":"10","key":"3768_CR23","doi-asserted-by":"publisher","first-page":"6609","DOI":"10.1007\/s10489-020-02139-8","volume":"51","author":"J Liao","year":"2021","unstructured":"Liao, J., Liang, Y., Pan, J.: Deep facial spatiotemporal network for engagement prediction in online learning. Appl. Intell. 51(10), 6609\u20136621 (2021). https:\/\/doi.org\/10.1007\/s10489-020-02139-8","journal-title":"Appl. Intell."},{"key":"3768_CR24","doi-asserted-by":"publisher","unstructured":"Huang, T., Mei, Y., Zhang, H., Liu, S., Yang, H.: Fine-grained engagement recognition in online learning environment. In: 2019 IEEE 9th International Conference on Electronics Information and Emergency Communication (ICEIEC), pp. 338\u2013341 (2019). https:\/\/doi.org\/10.1109\/ICEIEC.2019.8784559","DOI":"10.1109\/ICEIEC.2019.8784559"},{"issue":"12","key":"3768_CR25","doi-asserted-by":"publisher","first-page":"13803","DOI":"10.1007\/s10489-022-03200-4","volume":"52","author":"NK Mehta","year":"2022","unstructured":"Mehta, N.K., Prasad, S.S., Saurav, S., Saini, R., Singh, S.: Three-dimensional DenseNet self-attention neural network for automatic detection of student\u2019s engagement. Appl. Intell. 52(12), 13803\u201313823 (2022). https:\/\/doi.org\/10.1007\/s10489-022-03200-4","journal-title":"Appl. Intell."},{"issue":"16","key":"3768_CR26","doi-asserted-by":"publisher","first-page":"8007","DOI":"10.3390\/app12168007","volume":"12","author":"Y Hu","year":"2022","unstructured":"Hu, Y., Jiang, Z., Zhu, K.: An optimized cnn model for engagement recognition in an e-learning environment. Appl. Sci. 12(16), 8007 (2022). https:\/\/doi.org\/10.3390\/app12168007","journal-title":"Appl. Sci."},{"issue":"4","key":"3768_CR27","doi-asserted-by":"publisher","first-page":"2132","DOI":"10.1109\/TAFFC.2022.3188390","volume":"13","author":"AV Savchenko","year":"2022","unstructured":"Savchenko, A.V., Savchenko, L.V., Makarov, I.: Classifying emotions and engagement in online learning based on a single facial expression recognition neural network. IEEE Trans. Affect. Comput. 13(4), 2132\u20132143 (2022). https:\/\/doi.org\/10.1109\/TAFFC.2022.3188390","journal-title":"IEEE Trans. Affect. Comput."},{"key":"3768_CR28","doi-asserted-by":"publisher","unstructured":"Zhu, B., Lan, X., Guo, X., Barner, K. E., Boncelet, C.: Multi-rate attention based gru model for engagement prediction. In: Proceedings of the 2020 International Conference on Multimodal Interaction, pp. 841\u2013848 (2020). https:\/\/doi.org\/10.1145\/3382507.3417965","DOI":"10.1145\/3382507.3417965"},{"key":"3768_CR29","doi-asserted-by":"publisher","unstructured":"Liu, Z., Ning, J., Cao, Y., Wei, Y., Zhang, Z., Lin, S., Hu, H.: Video swin transformer. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 3202\u20133211 (2022). https:\/\/doi.org\/10.1109\/CVPR52688.2022.00320","DOI":"10.1109\/CVPR52688.2022.00320"},{"key":"3768_CR30","doi-asserted-by":"publisher","DOI":"10.1111\/j.1460-2466.1975.tb00577.x","author":"JD Boucher","year":"1975","unstructured":"Boucher, J.D., Ekman, P.: Facial areas and emotional information. J. Commun. (1975). https:\/\/doi.org\/10.1111\/j.1460-2466.1975.tb00577.x","journal-title":"J. Commun."},{"key":"3768_CR31","doi-asserted-by":"publisher","unstructured":"Sung, F., Yang, Y., Zhang, L., Xiang, T., Torr, P. H., Hospedales, T. M.: Learning to compare: Relation network for few-shot learning. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 1199\u20131208 (2018). https:\/\/doi.org\/10.48550\/arXiv.1711.06025","DOI":"10.48550\/arXiv.1711.06025"},{"key":"3768_CR32","doi-asserted-by":"publisher","unstructured":"Guo, Y., Zhang, L., Hu, Y., He, X., Gao, J.: Ms-celeb-1m: A dataset and benchmark for large-scale face recognition. In: Computer Vision\u2013ECCV: 14th European Conference, pp. 87\u2013102 (2016). https:\/\/doi.org\/10.1007\/978-3-319-46487-9_6","DOI":"10.1007\/978-3-319-46487-9_6"},{"key":"3768_CR33","doi-asserted-by":"publisher","unstructured":"Barsoum, E., Zhang, C., Ferrer, C. C., Zhang, Z.: Training deep networks for facial expression recognition with crowd-sourced label distribution. In: Proceedings of the 18th ACM International Conference on Multimodal Interaction, pp. 279\u2013283 (2016). https:\/\/doi.org\/10.1145\/2993148.2993165","DOI":"10.1145\/2993148.2993165"},{"key":"3768_CR34","doi-asserted-by":"publisher","unstructured":"Parmar, P., Tran Morris, B.: Learning to score olympic events. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition Workshops, pp. 20\u201328 (2017). https:\/\/doi.org\/10.1109\/CVPRW.2017.16","DOI":"10.1109\/CVPRW.2017.16"},{"key":"3768_CR35","doi-asserted-by":"publisher","first-page":"76551","DOI":"10.1109\/ACCESS.2023.3297651","volume":"11","author":"R Miao","year":"2023","unstructured":"Miao, R., Kato, H., Hatori, Y., Sato, Y., Shioiri, S.: Analysis of facial expressions to estimate the level of engagement in online lectures. IEEE Access 11, 76551\u201376562 (2023). https:\/\/doi.org\/10.1109\/ACCESS.2023.3297651","journal-title":"IEEE Access"},{"issue":"4","key":"3768_CR36","doi-asserted-by":"publisher","first-page":"377","DOI":"10.1109\/TAFFC.2014.2336244","volume":"5","author":"H Cao","year":"2014","unstructured":"Cao, H., Cooper, D.G., Keutmann, M.K., Gur, R.C., Nenkova, A., Verma, R.: Crema-d: crowd-sourced emotional multimodal actors dataset. IEEE Trans. Affect. Comput. 5(4), 377\u2013390 (2014). https:\/\/doi.org\/10.1109\/TAFFC.2014.2336244","journal-title":"IEEE Trans. Affect. Comput."},{"issue":"5","key":"3768_CR37","doi-asserted-by":"publisher","first-page":"e0196391","DOI":"10.1371\/journal.pone.0196391","volume":"13","author":"SR Livingstone","year":"2018","unstructured":"Livingstone, S.R., Russo, F.A.: The ryerson audio-visual database of emotional speech and song (RAVDESS): a dynamic, multimodal set of facial and vocal expressions in North American English. PLoS ONE 13(5), e0196391 (2018). https:\/\/doi.org\/10.1371\/journal.pone.0196391","journal-title":"PLoS ONE"},{"key":"3768_CR38","doi-asserted-by":"publisher","unstructured":"Shukla, A., Vougioukas, K., Ma, P., Petridis, S., Pantic, M.: Visually guided self supervised learning of speech representations. In: ICASSP 2020\u20132020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 6299\u20136303 (2020). https:\/\/doi.org\/10.1109\/ICASSP40776.2020.9053415","DOI":"10.1109\/ICASSP40776.2020.9053415"},{"issue":"5","key":"3768_CR39","doi-asserted-by":"publisher","first-page":"1398","DOI":"10.1007\/s11263-019-01251-8","volume":"128","author":"K Vougioukas","year":"2020","unstructured":"Vougioukas, K., Petridis, S., Pantic, M.: Realistic speech-driven facial animation with gans. Int. J. Comput. Vision 128(5), 1398\u20131413 (2020). https:\/\/doi.org\/10.1007\/s11263-019-01251-8","journal-title":"Int. J. Comput. Vision"},{"key":"3768_CR40","doi-asserted-by":"publisher","unstructured":"He, G., Liu, X., Fan, F., You, J.: Image2audio: Facilitating semi-supervised audio emotion recognition with facial expression image. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition Workshops, pp. 912\u2013913 (2020). https:\/\/doi.org\/10.1109\/CVPRW50498.2020.00464","DOI":"10.1109\/CVPRW50498.2020.00464"},{"key":"3768_CR41","doi-asserted-by":"publisher","first-page":"3480","DOI":"10.1109\/TMM.2021.3099900","volume":"24","author":"SE Eskimez","year":"2021","unstructured":"Eskimez, S.E., Zhang, Y., Duan, Z.: Speech driven talking face generation from a single image and an emotion condition. IEEE Trans. Multimed. 24, 3480\u20133490 (2021). https:\/\/doi.org\/10.1109\/TMM.2021.3099900","journal-title":"IEEE Trans. Multimed."},{"key":"3768_CR42","doi-asserted-by":"publisher","unstructured":"Gong, Y., Chung, Y. A., Glass, J.: Ast: Audio spectrogram transformer. arXiv preprint http:\/\/arxiv.org\/abs\/2104.01778 (2021). https:\/\/doi.org\/10.48550\/arXiv.2104.01778","DOI":"10.48550\/arXiv.2104.01778"},{"key":"3768_CR43","doi-asserted-by":"publisher","unstructured":"Ristea, N. C., Ionescu, R. T., Khan, F. S.: Septr: Separable transformer for audio spectrogram processing. arXiv preprint http:\/\/arxiv.org\/abs\/2203.09581 (2022). https:\/\/doi.org\/10.48550\/arXiv.2203.09581","DOI":"10.48550\/arXiv.2203.09581"},{"key":"3768_CR44","doi-asserted-by":"publisher","unstructured":"Sinha, S., Biswas, S., Yadav, R., Bhowmick, B.: Emotion-controllable generalized talking face generation. arXiv preprint http:\/\/arxiv.org\/abs\/2205.01155 (2022). https:\/\/doi.org\/10.48550\/arXiv.2205.01155","DOI":"10.48550\/arXiv.2205.01155"},{"key":"3768_CR45","doi-asserted-by":"publisher","unstructured":"Ghaleb, E., Popa, M., Asteriadis, S.: Multimodal and temporal perception of audio-visual cues for emotion recognition. In: 8th International Conference on Affective Computing and Intelligent Interaction (ACII), pp. 552\u2013558 (2019). https:\/\/doi.org\/10.1109\/ACII.2019.8925444","DOI":"10.1109\/ACII.2019.8925444"},{"key":"3768_CR46","doi-asserted-by":"publisher","unstructured":"Su, L., Hu, C., Li, G., Cao, D.: Msaf: Multimodal split attention fusion. arXiv preprint http:\/\/arxiv.org\/abs\/2012.07175 (2020). https:\/\/doi.org\/10.48550\/arXiv.2012.07175","DOI":"10.48550\/arXiv.2012.07175"},{"key":"3768_CR47","doi-asserted-by":"publisher","unstructured":"Joze, H. R. V., Shaban, A., Iuzzolino, M. L., Koishida, K.: MMTM: Multimodal transfer module for CNN fusion. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 13289\u201313299 (2020). https:\/\/doi.org\/10.1109\/CVPR42600.2020.01330","DOI":"10.1109\/CVPR42600.2020.01330"},{"key":"3768_CR48","doi-asserted-by":"publisher","first-page":"38","DOI":"10.1016\/j.patrec.2022.07.012","volume":"161","author":"S Verbitskiy","year":"2022","unstructured":"Verbitskiy, S., Berikov, V., Vyshegorodtsev, V.: Eranns: efficient residual audio neural networks for audio pattern recognition. Patt. Recogn. Lett. 161, 38\u201344 (2022). https:\/\/doi.org\/10.1016\/j.patrec.2022.07.012","journal-title":"Patt. Recogn. Lett."},{"key":"3768_CR49","doi-asserted-by":"publisher","unstructured":"Fu, Z., Liu, F., Wang, H., Qi, J., Fu, X., Zhou, A., Li, Z.: A cross-modal fusion network based on self-attention and residual structure for multimodal emotion recognition. arXiv preprint http:\/\/arxiv.org\/abs\/2111.02172 (2021). https:\/\/doi.org\/10.48550\/arXiv.2111.02172","DOI":"10.48550\/arXiv.2111.02172"}],"container-title":["The Visual Computer"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00371-024-03768-7.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s00371-024-03768-7\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00371-024-03768-7.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,5,16]],"date-time":"2025-05-16T08:51:31Z","timestamp":1747385491000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s00371-024-03768-7"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,1,27]]},"references-count":49,"journal-issue":{"issue":"8","published-print":{"date-parts":[[2025,6]]}},"alternative-id":["3768"],"URL":"https:\/\/doi.org\/10.1007\/s00371-024-03768-7","relation":{},"ISSN":["0178-2789","1432-2315"],"issn-type":[{"value":"0178-2789","type":"print"},{"value":"1432-2315","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,1,27]]},"assertion":[{"value":"13 December 2024","order":1,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"27 January 2025","order":2,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare no competing interests.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}}]}}