{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,28]],"date-time":"2026-01-28T17:47:15Z","timestamp":1769622435331,"version":"3.49.0"},"reference-count":40,"publisher":"Springer Science and Business Media LLC","issue":"1","license":[{"start":{"date-parts":[[2025,11,26]],"date-time":"2025-11-26T00:00:00Z","timestamp":1764115200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc-nd\/4.0"},{"start":{"date-parts":[[2025,11,28]],"date-time":"2025-11-28T00:00:00Z","timestamp":1764288000000},"content-version":"vor","delay-in-days":2,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc-nd\/4.0"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int J Comput Intell Syst"],"DOI":"10.1007\/s44196-025-00811-w","type":"journal-article","created":{"date-parts":[[2025,11,26]],"date-time":"2025-11-26T17:25:57Z","timestamp":1764177957000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Cross-modal Synergy for Enhancing Emotion Recognition Through Integrated Audio\u2013Video Fusion Techniques"],"prefix":"10.1007","volume":"18","author":[{"given":"P.","family":"Santhiya","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Kogilavani","family":"Shanmugavadivel","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"R.","family":"Rajalakshmi","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"N.","family":"Krishnamoorthy","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2025,11,26]]},"reference":[{"key":"811_CR1","doi-asserted-by":"publisher","DOI":"10.1016\/j.eswa.2023.122946","volume":"245","author":"M Khan","year":"2024","unstructured":"Khan, M., Gueaieb, W., El Saddik, A., Kwon, S.: MSER: Multi-modal speech emotion recognition using cross-attention with deep fusion. Expert Syst. Appl. 245, 122946 (2024)","journal-title":"Expert Syst. Appl."},{"key":"811_CR2","doi-asserted-by":"publisher","first-page":"64516","DOI":"10.1109\/ACCESS.2022.3183587","volume":"10","author":"YC Yoon","year":"2022","unstructured":"Yoon, Y.C.: Can we exploit all datasets? Multi-modal emotion recognition using cross-modal translation. IEEE Access 10, 64516\u201364524 (2022)","journal-title":"IEEE Access"},{"issue":"10","key":"811_CR3","doi-asserted-by":"publisher","first-page":"1440","DOI":"10.3390\/e25101440","volume":"25","author":"H Lian","year":"2023","unstructured":"Lian, H., Lu, C., Li, S., Zhao, Y., Tang, C., Zong, Y.: A survey of deep learning-based multi-modal emotion recognition: speech, text, and face. Entropy 25(10), 1440 (2023)","journal-title":"Entropy"},{"key":"811_CR4","doi-asserted-by":"crossref","unstructured":"Chudasama, V., Kar, P., Gudmalwar, A., Shah, N., Wasnik, P., & Onoe, N.: M2FNET: multi-modal fusion network for emotion recognition in conversation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 4652\u20134661 (2022)","DOI":"10.1109\/CVPRW56347.2022.00511"},{"key":"811_CR5","doi-asserted-by":"publisher","DOI":"10.1016\/j.inffus.2023.102218","volume":"105","author":"AV Geetha","year":"2024","unstructured":"Geetha, A.V., Mala, T., Priyanka, D., Uma, E.: Multi-modal Emotion Recognition with deep learning: advancements, challenges, and future directions. Inf. Fusion 105, 102218 (2024)","journal-title":"Inf. Fusion"},{"key":"811_CR6","doi-asserted-by":"crossref","unstructured":"Liang, J., Li, R., Jin, Q.: Semi-supervised multi-modal emotion recognition with cross-modal distribution matching. In: Proceedings of the 28th ACM International Conference on Multimedia, pp. 2852\u20132861 (2020)","DOI":"10.1145\/3394171.3413579"},{"key":"811_CR7","unstructured":"Venkatraman, S., Sharma, V., Malarvannan, S.: Multi-modal Emotion Recognition using Audio-Video Transformer Fusion with Cross Attention.\u00a0arXiv preprint arXiv:2407.18552 (2024)"},{"key":"811_CR8","doi-asserted-by":"publisher","DOI":"10.1016\/j.bspc.2023.105052","volume":"85","author":"S Zhang","year":"2023","unstructured":"Zhang, S., Yang, Y., Chen, C., Liu, R., Tao, X., Guo, W., Zhao, X.: Multi-modal emotion recognition based on audio and text by using hybrid attention networks. Biomed. Signal Process. Control 85, 105052 (2023)","journal-title":"Biomed. Signal Process. Control"},{"key":"811_CR9","unstructured":"Fu, Z., Liu, F., Wang, H., Qi, J., Fu, X., Zhou, A., Li, Z.: A cross-modal fusion network based on self-attention and residual structure for multi-modal emotion recognition.\u00a0arXiv preprint arXiv:2111.02172 (2021)"},{"key":"811_CR10","doi-asserted-by":"crossref","unstructured":"Li, H., Ding, W., Wu, Z., Liu, Z.: Learning fine-grained cross modality excitement for speech emotion recognition.\u00a0arXiv preprint arXiv:2010.12733 (2020)","DOI":"10.21437\/Interspeech.2021-158"},{"issue":"1","key":"811_CR11","first-page":"4767437","volume":"2022","author":"Z Quan","year":"2022","unstructured":"Quan, Z., Sun, T., Su, M., Wei, J.: Multi-modal sentiment analysis based on cross-modal attention and gated cyclic hierarchical fusion networks. Comput. Intell. Neurosci. 2022(1), 4767437 (2022)","journal-title":"Comput. Intell. Neurosci."},{"key":"811_CR12","doi-asserted-by":"publisher","first-page":"14742","DOI":"10.1109\/ACCESS.2023.3244390","volume":"11","author":"HD Le","year":"2023","unstructured":"Le, H.D., Lee, G.S., Kim, S.H., Kim, S., Yang, H.J.: Multi-label multi-modal emotion recognition with transformer-based fusion and emotion-level representation learning. IEEE Access 11, 14742\u201314751 (2023)","journal-title":"IEEE Access"},{"key":"811_CR13","doi-asserted-by":"crossref","unstructured":"Praveen, R.G., Alam, J.: Recursive joint cross-modal attention for multi-modal fusion in dimensional emotion recognition. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 4803\u20134813 (2024)","DOI":"10.1109\/CVPRW63382.2024.00483"},{"key":"811_CR14","doi-asserted-by":"publisher","first-page":"176274","DOI":"10.1109\/ACCESS.2020.3026823","volume":"8","author":"S Siriwardhana","year":"2020","unstructured":"Siriwardhana, S., Kaluarachchi, T., Billinghurst, M., Nanayakkara, S.: Multi-modal emotion recognition with transformer-based self-supervised feature fusion. IEEE Access 8, 176274\u2013176285 (2020)","journal-title":"IEEE Access"},{"issue":"1","key":"811_CR15","doi-asserted-by":"publisher","first-page":"2000688","DOI":"10.1080\/08839514.2021.2000688","volume":"36","author":"X Yan","year":"2022","unstructured":"Yan, X., Xue, H., Jiang, S., Liu, Z.: Multi-modal sentiment analysis using multi-tensor fusion network with cross-modal modeling. Appl. Artif. Intell. 36(1), 2000688 (2022)","journal-title":"Appl. Artif. Intell."},{"key":"811_CR16","doi-asserted-by":"publisher","first-page":"94557","DOI":"10.1109\/ACCESS.2021.3092735","volume":"9","author":"S Lee","year":"2021","unstructured":"Lee, S., Han, D.K., Ko, H.: Multi-modal emotion recognition fusion analysis adapting BERT with heterogeneous feature unification. IEEE Access 9, 94557\u201394572 (2021)","journal-title":"IEEE Access"},{"issue":"17","key":"811_CR17","doi-asserted-by":"publisher","first-page":"7962","DOI":"10.3390\/app11177962","volume":"11","author":"P Koromilas","year":"2021","unstructured":"Koromilas, P., Giannakopoulos, T.: Deep multi-modal emotion recognition on human speech: a review. Appl. Sci. 11(17), 7962 (2021)","journal-title":"Appl. Sci."},{"issue":"14","key":"811_CR18","doi-asserted-by":"publisher","first-page":"4913","DOI":"10.3390\/s21144913","volume":"21","author":"B Xie","year":"2021","unstructured":"Xie, B., Sidulova, M., Park, C.H.: Robust multi-modal emotion recognition from conversation with transformer-based crossmodality fusion. Sensors 21(14), 4913 (2021)","journal-title":"Sensors"},{"key":"811_CR19","doi-asserted-by":"publisher","DOI":"10.1016\/j.bspc.2021.103029","volume":"70","author":"Y Tan","year":"2021","unstructured":"Tan, Y., Sun, Z., Duan, F., Sol\u00e9-Casals, J., Caiafa, C.F.: A multi-modal emotion recognitionmethod based on facial expressions and electroencephalography. Biomed. Signal Process. Control 70, 103029 (2021)","journal-title":"Biomed. Signal Process. Control"},{"issue":"3","key":"811_CR20","first-page":"1397","volume":"16","author":"PS Tomar","year":"2024","unstructured":"Tomar, P.S., Mathur, K., Suman, U.: Fusing facial and speech cues for enhanced multi-modal emotion recognition. Int. J. Inf. Technol. 16(3), 1397\u20131405 (2024)","journal-title":"Int. J. Inf. Technol."},{"issue":"1","key":"811_CR21","doi-asserted-by":"publisher","first-page":"142","DOI":"10.30574\/ijsra.2024.12.1.0723","volume":"12","author":"S Nithyasri","year":"2024","unstructured":"Nithyasri, S., Hemavarthini, B., Gopalsamy, B.N.: Multi-modal emotion recognition from audio and video. Int. J. Sci. Res. Arch. 12(1), 142\u2013149 (2024)","journal-title":"Int. J. Sci. Res. Arch."},{"key":"811_CR22","doi-asserted-by":"publisher","first-page":"1898","DOI":"10.1109\/LSP.2021.3112314","volume":"28","author":"K Zhang","year":"2021","unstructured":"Zhang, K., Li, Y., Wang, J., Wang, Z., Li, X.: Feature fusion for multi-modal emotion recognition based on deep canonical correlation analysis. IEEE Signal Process. Lett. 28, 1898\u20131902 (2021)","journal-title":"IEEE Signal Process. Lett."},{"key":"811_CR23","doi-asserted-by":"publisher","DOI":"10.1016\/j.knosys.2022.108580","volume":"244","author":"AI Middya","year":"2022","unstructured":"Middya, A.I., Nag, B., Roy, S.: Deep learning based multi-modal emotion recognition using model-level fusion of audio\u2013visual modalities. Knowl. Based Syst. 244, 108580 (2022)","journal-title":"Knowl. Based Syst."},{"key":"811_CR24","doi-asserted-by":"publisher","DOI":"10.1016\/j.imavis.2023.104676","volume":"133","author":"B Mocanu","year":"2023","unstructured":"Mocanu, B., Tapu, R., Zaharia, T.: Multi-modal emotion recognition using cross modal audio-video fusion with attention and deep metric learning. Image Vis. Comput. 133, 104676 (2023)","journal-title":"Image Vis. Comput."},{"key":"811_CR25","unstructured":"Sanku, S.R., Sandhya, B.: Multi-modal Emotion recognition using novel feature fusion mechanism with SVM"},{"key":"811_CR26","doi-asserted-by":"publisher","first-page":"16205","DOI":"10.1007\/s11042-020-08796-8","volume":"80","author":"W Nie","year":"2021","unstructured":"Nie, W., Yan, Y., Song, D., Wang, K.: Multi-modal feature fusion based on multi-layers LSTM for video emotion recognition. Multimedia Tools Appl. 80, 16205\u201316214 (2021)","journal-title":"Multimedia Tools Appl."},{"key":"811_CR27","doi-asserted-by":"crossref","unstructured":"Poria, S., Chaturvedi, I., Cambria, E., Hussain, A.: Convolutional MKL based multi-modal emotion recognition and sentiment analysis. In: IEEE 16th international conference on data mining (ICDM), pp. 439\u2013448. IEEE (2023)","DOI":"10.1109\/ICDM.2016.0055"},{"key":"811_CR28","doi-asserted-by":"publisher","DOI":"10.1016\/j.eswa.2023.122579","volume":"240","author":"C Dixit","year":"2024","unstructured":"Dixit, C., Satapathy, S.M.: Deep CNN with late fusion for real time multi-modal emotion recognition. Expert Syst. Appl. 240, 122579 (2024)","journal-title":"Expert Syst. Appl."},{"key":"811_CR29","doi-asserted-by":"publisher","first-page":"172948","DOI":"10.1109\/ACCESS.2019.2955637","volume":"7","author":"S Nemati","year":"2021","unstructured":"Nemati, S., Rohani, R., Basiri, M.E., Abdar, M., Yen, N.Y., Makarenkov, V.: A hybrid latent space data fusion method for multi-modal emotion recognition. IEEE Access 7, 172948\u2013172964 (2021)","journal-title":"IEEE Access"},{"key":"811_CR30","doi-asserted-by":"publisher","first-page":"102306","DOI":"10.1016\/j.inffus.2024.102306","volume":"106","author":"C Fan","year":"2024","unstructured":"Fan, C., Lin, J., Mao, R., Cambria, E.: Fusing pairwise modalities for emotion recognition in conversations. Inf. Fusion 106, 102306 (2024). https:\/\/doi.org\/10.1016\/j.inffus.2024.102306","journal-title":"Inf. Fusion"},{"key":"811_CR31","doi-asserted-by":"publisher","first-page":"90982","DOI":"10.1109\/ACCESS.2019.2926751","volume":"7","author":"JKP Seng","year":"2023","unstructured":"Seng, J.K.P., Ang, K.L.M.: Multi-modal emotion and sentiment modeling from unstructured Big data: challenges, architecture, & techniques. IEEE Access 7, 90982\u201390998 (2023)","journal-title":"IEEE Access"},{"key":"811_CR32","doi-asserted-by":"publisher","first-page":"120469","DOI":"10.1016\/j.eswa.2023.120469","volume":"228","author":"Y-K Li","year":"2023","unstructured":"Li, Y.-K., Meng, Q.-H., Wang, Y.-X., Hou, H.-R.: MMFN: emotion recognition by fusing touch gesture and facial expression information. Expert Syst. Appl. 228, 120469 (2023). https:\/\/doi.org\/10.1016\/j.eswa.2023.120469","journal-title":"Expert Syst. Appl."},{"key":"811_CR33","doi-asserted-by":"publisher","first-page":"115","DOI":"10.1016\/j.patrec.2021.07.005","volume":"150","author":"Q Chen","year":"2021","unstructured":"Chen, Q., Chaturvedi, I., Ji, S., Cambria, E.: Sequential fusion of facial appearance and dynamics for depression recognition. Pattern Recognit. Lett. 150, 115\u2013121 (2021). https:\/\/doi.org\/10.1016\/j.patrec.2021.07.005","journal-title":"Pattern Recognit. Lett."},{"issue":"4","key":"811_CR34","doi-asserted-by":"publisher","first-page":"39","DOI":"10.1007\/s13735-024-00347-3","volume":"13","author":"R Wang","year":"2024","unstructured":"Wang, R., Zhu, J., Wang, S., Wang, T., Huang, J., Zhu, X.: Multi-modal emotion recognition using tensor decomposition fusion and self-supervised multi-tasking. Int. J. Multimedia Inf. Retr. 13(4), 39 (2024)","journal-title":"Int. J. Multimedia Inf. Retr."},{"key":"811_CR35","doi-asserted-by":"crossref","unstructured":"Liu, R., Sisman, B., Li, H.: Reinforcement learning for emotional text-to-speech synthesis with improved emotion discriminability. arXiv preprint\u00a0arXiv:2104.01408 (2021)","DOI":"10.21437\/Interspeech.2021-1236"},{"key":"811_CR36","unstructured":"Shayaninasab, M., Babaali, B.: Multi-Modal Emotion Recognition by Text, Speech and Video Using Pretrained Transformers.\u00a0arXiv preprint arXiv:2402.07327 (2024)"},{"issue":"1","key":"811_CR37","doi-asserted-by":"publisher","first-page":"327","DOI":"10.3390\/app12010327","volume":"12","author":"C Luna-Jim\u00e9nez","year":"2021","unstructured":"Luna-Jim\u00e9nez, C., Kleinlein, R., Griol, D., Callejas, Z., Montero, J.M., Fern\u00e1ndez-Mart\u00ednez, F.: A proposal for multi-modal emotion recognition using aural transformers and action units on ravdess dataset. Appl. Sci. 12(1), 327 (2021)","journal-title":"Appl. Sci."},{"issue":"9","key":"811_CR38","doi-asserted-by":"publisher","first-page":"419","DOI":"10.3390\/info13090419","volume":"13","author":"R Pecoraro","year":"2022","unstructured":"Pecoraro, R., Basile, V., Bono, V.: Local multi-head channel self-attention for facial expression recognition. Information 13(9), 419 (2022)","journal-title":"Information"},{"key":"811_CR39","doi-asserted-by":"crossref","unstructured":"Gao, Y., Liu, J., Wang, L., Dang, J.: Metric learning based feature representation with gated fusion model for speech emotion recognition. In: Interspeech, pp. 4503\u20134507 (2021)","DOI":"10.21437\/Interspeech.2021-1133"},{"key":"811_CR40","unstructured":"https:\/\/www.kaggle.com\/datasets\/zaber666\/meld-dataset"}],"updated-by":[{"DOI":"10.1007\/s44196-025-01128-4","type":"correction","label":"Correction","source":"publisher","updated":{"date-parts":[[2026,1,27]],"date-time":"2026-01-27T00:00:00Z","timestamp":1769472000000}}],"container-title":["International Journal of Computational Intelligence Systems"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s44196-025-00811-w.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s44196-025-00811-w","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s44196-025-00811-w.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,1,28]],"date-time":"2026-01-28T06:37:20Z","timestamp":1769582240000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s44196-025-00811-w"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,11,26]]},"references-count":40,"journal-issue":{"issue":"1","published-online":{"date-parts":[[2025,12]]}},"alternative-id":["811"],"URL":"https:\/\/doi.org\/10.1007\/s44196-025-00811-w","relation":{},"ISSN":["1875-6883"],"issn-type":[{"value":"1875-6883","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,11,26]]},"assertion":[{"value":"28 September 2024","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"17 March 2025","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"26 March 2025","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"26 November 2025","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"11 January 2026","order":7,"name":"change_date","label":"Change Date","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"Update","order":8,"name":"change_type","label":"Change Type","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"The original online version of this article was revised: In this article, N. Krishnamoorthy was incorrectly denoted as a second corresponding author, but P. Santhiya should have been the sole corresponding author.","order":9,"name":"change_details","label":"Change Details","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"27 January 2026","order":10,"name":"change_date","label":"Change Date","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"Correction","order":11,"name":"change_type","label":"Change Type","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"A Correction to this paper has been published:","order":12,"name":"change_details","label":"Change Details","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"https:\/\/doi.org\/10.1007\/s44196-025-01128-4","URL":"https:\/\/doi.org\/10.1007\/s44196-025-01128-4","order":13,"name":"change_details","label":"Change Details","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare no competing interests.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}},{"value":"Not applicable.","order":3,"name":"Ethics","group":{"name":"EthicsHeading","label":"Consent for Publication"}},{"value":"Not applicable as the research was done on the publicly available dataset.","order":4,"name":"Ethics","group":{"name":"EthicsHeading","label":"Ethics Approval and Consent to Participate"}}],"article-number":"315"}}