{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,3]],"date-time":"2026-01-03T06:13:19Z","timestamp":1767420799867,"version":"3.48.0"},"reference-count":42,"publisher":"Springer Science and Business Media LLC","issue":"18","license":[{"start":{"date-parts":[[2025,12,1]],"date-time":"2025-12-01T00:00:00Z","timestamp":1764547200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,12,1]],"date-time":"2025-12-01T00:00:00Z","timestamp":1764547200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Appl Intell"],"published-print":{"date-parts":[[2025,12]]},"DOI":"10.1007\/s10489-025-07027-7","type":"journal-article","created":{"date-parts":[[2025,12,13]],"date-time":"2025-12-13T10:32:45Z","timestamp":1765621965000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Multimodal contextual transformer augmented fusion for emotion recognition"],"prefix":"10.1007","volume":"55","author":[{"given":"Wesagn Dawit","family":"Chemma","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Adane Letta","family":"Mamuye","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8545-3740","authenticated-orcid":false,"given":"Marco","family":"Piangerelli","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2025,12,13]]},"reference":[{"key":"7027_CR1","doi-asserted-by":"publisher","first-page":"532279","DOI":"10.3389\/frobt.2020.532279","volume":"7","author":"M Spezialetti","year":"2020","unstructured":"Spezialetti M, Placidi G, Rossi S (2020) Emotion recognition for human-robot interaction: recent advances and future perspectives. Front Robot AI 7:532279","journal-title":"Front Robot AI"},{"key":"7027_CR2","doi-asserted-by":"crossref","unstructured":"Kapoor A, Verma V (2024) Emotion AI: understanding emotions through artificial intelligence. Int J Eng Sci Hum 14(Special Issue 1):223\u2013232","DOI":"10.62904\/0vcbvb24"},{"issue":"9","key":"7027_CR3","doi-asserted-by":"publisher","first-page":"1370","DOI":"10.1109\/JPROC.2003.817122","volume":"91","author":"M Pantic","year":"2003","unstructured":"Pantic M, Rothkrantz LJM (2003) Toward an affect-sensitive multimodal human-computer interaction. Proc IEEE 91(9):1370\u20131390","journal-title":"Proc IEEE"},{"issue":"22","key":"7027_CR4","doi-asserted-by":"publisher","first-page":"4714","DOI":"10.3390\/electronics12224714","volume":"12","author":"Y Fu","year":"2023","unstructured":"Fu Y, Yuan S, Zhang C, Cao J (2023) Emotion recognition in conversations: a survey focusing on context, speaker dependencies, and fusion methods. Electronics 12(22):4714","journal-title":"Electronics"},{"issue":"1","key":"7027_CR5","doi-asserted-by":"publisher","first-page":"39","DOI":"10.1109\/TPAMI.2008.52","volume":"31","author":"Z Zeng","year":"2009","unstructured":"Zeng Z, Pantic M, Roisman GI, Huang TS (2009) A survey of affect recognition methods: audio, visual, and spontaneous expressions. IEEE Trans Pattern Anal Mach Intell 31(1):39\u201358","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"7027_CR6","doi-asserted-by":"publisher","first-page":"98","DOI":"10.1016\/j.inffus.2017.02.003","volume":"37","author":"S Poria","year":"2017","unstructured":"Poria S, Cambria E, Bajpai R, Hussain A (2017) A review of affective computing: from unimodal analysis to multimodal fusion. Inf Fusion 37:98\u2013125","journal-title":"Inf Fusion"},{"issue":"5","key":"7027_CR7","doi-asserted-by":"publisher","first-page":"90","DOI":"10.1145\/3129340","volume":"61","author":"BW Schuller","year":"2018","unstructured":"Schuller BW (2018) Speech emotion recognition: two decades in a nutshell, benchmarks, and ongoing trends. Commun ACM 61(5):90\u201399","journal-title":"Commun ACM"},{"issue":"6","key":"7027_CR8","doi-asserted-by":"publisher","first-page":"59","DOI":"10.1109\/MSP.2021.3106895","volume":"38","author":"S Zhao","year":"2021","unstructured":"Zhao S, Jia G, Yang J, Ding G, Keutzer K (2021) Emotion recognition from multiple modalities: fundamentals and methodologies. IEEE Signal Process Mag 38(6):59\u201373","journal-title":"IEEE Signal Process Mag"},{"key":"7027_CR9","unstructured":"Ngiam J, Khosla A, Kim M, Nam J, Lee H, Ng AY (2011) Multimodal deep learning. In: Proceedings of the 28th International Conference on Machine Learning (ICML), pp 689\u2013696"},{"key":"7027_CR10","doi-asserted-by":"crossref","unstructured":"Hazarika D, Poria S, Zadeh A, Cambria E, Morency L-P, Zimmermann R (2018) Conversational memory network for emotion recognition in dyadic dialogue videos. In: Proceedings of the 2018 conference of the north american chapter of the association for computational linguistics: human language technologies, vol. 1 (Long Papers), pp 2122\u20132132","DOI":"10.18653\/v1\/N18-1193"},{"key":"7027_CR11","doi-asserted-by":"crossref","unstructured":"Hazarika D, Poria S, Mihalcea R, Cambria E, Zimmermann R (2018) ICON: interactive conversational memory network for multimodal emotion detection. In: Proceedings of the 2018 conference on empirical methods in natural language processing, pp 2594\u20132604","DOI":"10.18653\/v1\/D18-1280"},{"key":"7027_CR12","doi-asserted-by":"crossref","unstructured":"Tsai Y-HH, Bai S, Liang PP, Kolter JZ, Morency L-P, Salakhutdinov R (2019) Multimodal transformer for unaligned multimodal language sequences. In: Proceedings of the 57th annual meeting of the association for computational linguistics, pp 6558\u20136569","DOI":"10.18653\/v1\/P19-1656"},{"key":"7027_CR13","doi-asserted-by":"crossref","unstructured":"Liang PP, Zadeh A, Shen T, Morency L-P (2018) Memory fusion network for multi-view sequential learning. In: Proceedings of the AAAI conference on artificial intelligence, pp 4344\u20134355","DOI":"10.1609\/aaai.v32i1.12021"},{"key":"7027_CR14","doi-asserted-by":"crossref","unstructured":"Majumder N, Poria S, Hazarika D, Mihalcea R, Gelbukh A, Cambria E (2019) Dialoguernn: An attentive rnn for emotion detection in conversations. In: Proceedings of the AAAI conference on artificial intelligence, pp 6818\u20136825","DOI":"10.1609\/aaai.v33i01.33016818"},{"key":"7027_CR15","unstructured":"Mao Y, Sun Q, Liu G, Wang X, Gao W, Li X, Shen J (2020) DialogueTRM: exploring the intra-modal and inter-modal emotional behaviors in the conversation. arXiv:2010.07637"},{"key":"7027_CR16","doi-asserted-by":"publisher","first-page":"335","DOI":"10.1007\/s10579-008-9076-6","volume":"42","author":"C Busso","year":"2008","unstructured":"Busso C, Bulut M, Lee C-C, Kazemzadeh A, Mower E, Kim S, Chang JN, Lee S, Narayanan SS (2008) IEMOCAP: Interactive emotional dyadic motion capture database. Lang Resour Eval 42:335\u2013359","journal-title":"Lang Resour Eval"},{"key":"7027_CR17","doi-asserted-by":"crossref","unstructured":"Poria S, Hazarika D, Majumder N, Naik G, Cambria E, Mihalcea R (2018) MELD: A multimodal multi-party dataset for emotion recognition in conversations. arXiv:1810.02508","DOI":"10.18653\/v1\/P19-1050"},{"key":"7027_CR18","doi-asserted-by":"crossref","unstructured":"Hu J, Liu Y, Zhao J, Jin Q (2021) MMGCN: multimodal fusion via deep graph convolution network for emotion recognition in conversation. In: Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics (ACL), pp 3358\u20133369","DOI":"10.18653\/v1\/2021.acl-long.440"},{"key":"7027_CR19","doi-asserted-by":"crossref","unstructured":"Ma H, Wang J, Lin H, Zhang B, Zhang Y, Xu B (2023) A transformer-based model with self-distillation for multimodal emotion recognition in conversations. IEEE Trans Multimed","DOI":"10.1109\/TMM.2023.3271019"},{"key":"7027_CR20","doi-asserted-by":"crossref","unstructured":"Maji B, Swain M, Guha R, Routray A (2023) Multimodal emotion recognition based on deep temporal features using cross-modal transformer and self-attention. In: ICASSP 2023-2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp 1\u20135","DOI":"10.1109\/ICASSP49357.2023.10096937"},{"key":"7027_CR21","unstructured":"Driess D, Xia F, Sajjadi MSM, Huang AZ, Tompson J, Ichter B, Shah A, Florence P, Huang TS, Goldberg K (2023) PaLM-E: An embodied multimodal language model. arXiv:2303.03378"},{"key":"7027_CR22","unstructured":"Alayrac J-B, Donahue J, Luc P, Mensch AM, Kandemir K, Mohtashami S, Ding K, Barham M, Touvron T, Ramirez C et al (2022) Flamingo: a visual language model for few-shot learning. arXiv:2204.14198"},{"key":"7027_CR23","unstructured":"Jaegle A, Borgeaud S, Tunyasuvunakool K, Zoran D, Rezende DJ, Vaswani A, Vinyals O, Pritzel A (2022) Perceiver AR: auto-regressive modality-agnostic transformers. arXiv:2202.07765"},{"key":"7027_CR24","doi-asserted-by":"crossref","unstructured":"Meng T, Shou Y, Ai W, Yin N, Li K (2024) Deep imbalanced learning for multimodal emotion recognition in conversations. IEEE Trans Artif Intell","DOI":"10.1109\/TAI.2024.3445325"},{"issue":"3","key":"7027_CR25","doi-asserted-by":"publisher","first-page":"3735","DOI":"10.1007\/s13369-023-08159-z","volume":"49","author":"A Abed","year":"2024","unstructured":"Abed A, Akrout B, Amous I (2024) Convolutional neural network for head segmentation and counting in crowded retail environment using top-view depth images. Arab J Sci Eng 49(3):3735\u20133749","journal-title":"Arab J Sci Eng"},{"key":"7027_CR26","doi-asserted-by":"crossref","unstructured":"Abed A, Akrout B, Amous I (2022) Shoppers interaction classification based on an improved densenet model using rgb-d data. In: 2022 8th International Conference on Systems and Informatics (ICSAI). IEEE, pp 1-6","DOI":"10.1109\/ICSAI57119.2022.10005508"},{"issue":"31","key":"7027_CR27","doi-asserted-by":"publisher","first-page":"19365","DOI":"10.1007\/s00521-024-10239-6","volume":"36","author":"A Abed","year":"2024","unstructured":"Abed A, Akrout B, Amous I (2024) Deep learning-based few-shot person re-identification from top-view rgb and depth images. Neural Comput Appl 36(31):19365\u201319382","journal-title":"Neural Comput Appl"},{"key":"7027_CR28","doi-asserted-by":"publisher","first-page":"102272","DOI":"10.1016\/j.inffus.2024.102272","volume":"106","author":"N Lu","year":"2024","unstructured":"Lu N, Han Z, Han M, Qian J (2024) Bi-stream graph learning based multimodal fusion for emotion recognition in conversation. Inf Fusion 106:102272","journal-title":"Inf Fusion"},{"key":"7027_CR29","doi-asserted-by":"publisher","first-page":"111969","DOI":"10.1016\/j.knosys.2024.111969","volume":"296","author":"L Guo","year":"2024","unstructured":"Guo L, Song Y, Ding S (2024) Speaker-aware cognitive network with cross-modal attention for multimodal emotion recognition in conversation. Knowl-Based Syst 296:111969","journal-title":"Knowl-Based Syst"},{"key":"7027_CR30","unstructured":"Tran Van C, Tran TV, Nguyen V, Son Hy T (2024) Effective context modeling framework for emotion recognition in conversations. arXiv e-prints, arXiv-2412"},{"key":"7027_CR31","doi-asserted-by":"crossref","unstructured":"Li J, Ding S, Guo L, Li X (2025) Multi-modal anchor gated transformer with knowledge distillation for emotion recognition in conversation. arXiv:2506.18716","DOI":"10.24963\/ijcai.2025\/905"},{"key":"7027_CR32","doi-asserted-by":"publisher","first-page":"1181598","DOI":"10.3389\/fnbot.2023.1181598","volume":"17","author":"Y Wang","year":"2023","unstructured":"Wang Y, Gu Y, Yin Y, Han Y, Zhang H, Wang S, Li C, Quan D (2023) Multimodal transformer augmented fusion for speech emotion recognition. Front Neurorobot 17:1181598","journal-title":"Front Neurorobot"},{"key":"7027_CR33","doi-asserted-by":"crossref","unstructured":"Dai W, Cahyawijaya S, Liu Z, Fung P (2021) Multimodal end-to-end sparse model for emotion recognition. arXiv:2103.09666","DOI":"10.18653\/v1\/2021.naacl-main.417"},{"key":"7027_CR34","doi-asserted-by":"crossref","unstructured":"Ghosal D, Majumder N, Poria S, Chhaya N, Gelbukh A (2019) DialogueGCN: A graph convolutional neural network for emotion recognition in conversation. arXiv:1908.11549","DOI":"10.18653\/v1\/D19-1015"},{"key":"7027_CR35","doi-asserted-by":"crossref","unstructured":"Wu Z, Lu Y, Dai X (2023) An empirical study and improvement for speech emotion recognition. In: ICASSP 2023-2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp 1\u20135","DOI":"10.1109\/ICASSP49357.2023.10095042"},{"key":"7027_CR36","unstructured":"Vaswani A, Shazeer N, Parmar N, Uszkoreit J, Jones L, Gomez AN, Kaiser \u0141, Polosukhin I (2017) Attention is all you need. Adv Neural Inf Process Syst 30"},{"key":"7027_CR37","unstructured":"Loshchilov I, Hutter F (2019) Decoupled weight decay regularization. In: Proceedings of the 7th International Conference on Learning Representations (ICLR)"},{"key":"7027_CR38","unstructured":"Loshchilov I, Hutter F (2017) SGDR: stochastic gradient descent with warm restarts. In: Proceedings of the 5th International Conference on Learning Representations (ICLR), preprint"},{"key":"7027_CR39","doi-asserted-by":"crossref","unstructured":"Patamia RA, Santos PE, Acheampong KN, Ekong F, Sarpong K, Kun S (2023) Multimodal speech emotion recognition using modality-specific self-supervised frameworks. In: 2023 IEEE International Conference on Systems, Man, and Cybernetics (SMC), pp 4134\u20134141","DOI":"10.1109\/SMC53992.2023.10394418"},{"key":"7027_CR40","doi-asserted-by":"crossref","unstructured":"Li J, Wang S, Chao Y, Liu X, Meng H (2022) Context-aware multimodal fusion for speech emotion recognition. In: Proceedings of Interspeech, pp 4218\u20134222","DOI":"10.21437\/Interspeech.2022-10592"},{"issue":"8","key":"7027_CR41","doi-asserted-by":"publisher","first-page":"1301","DOI":"10.1109\/JSTSP.2017.2764438","volume":"11","author":"P Tzirakis","year":"2017","unstructured":"Tzirakis P, Trigeorgis G, Nicolaou MA, Schuller BW, Zafeiriou S (2017) End-to-end multimodal emotion recognition using deep neural networks. IEEE J Sel Top Signal Process 11(8):1301\u20131309","journal-title":"IEEE J Sel Top Signal Process"},{"key":"7027_CR42","unstructured":"Dosovitskiy A, Beyer L, Kolesnikov A, Weissenborn D, Zhai X, Unterthiner T et\u00a0al (2020) An image is worth 16x16 words: transformers for image recognition at scale. arXiv:2010.11929"}],"container-title":["Applied Intelligence"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10489-025-07027-7.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s10489-025-07027-7","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10489-025-07027-7.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,1,3]],"date-time":"2026-01-03T06:11:30Z","timestamp":1767420690000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s10489-025-07027-7"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,12]]},"references-count":42,"journal-issue":{"issue":"18","published-print":{"date-parts":[[2025,12]]}},"alternative-id":["7027"],"URL":"https:\/\/doi.org\/10.1007\/s10489-025-07027-7","relation":{},"ISSN":["0924-669X","1573-7497"],"issn-type":[{"type":"print","value":"0924-669X"},{"type":"electronic","value":"1573-7497"}],"subject":[],"published":{"date-parts":[[2025,12]]},"assertion":[{"value":"5 August 2025","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"20 November 2025","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"13 December 2025","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare no conflict of interest.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of Interest"}}],"article-number":"1143"}}