{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,4]],"date-time":"2025-12-04T07:01:16Z","timestamp":1764831676032,"version":"3.46.0"},"reference-count":44,"publisher":"Tech Science Press","issue":"2","content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["CMC"],"published-print":{"date-parts":[[2025]]},"DOI":"10.32604\/cmc.2025.067103","type":"journal-article","created":{"date-parts":[[2025,9,4]],"date-time":"2025-09-04T08:54:20Z","timestamp":1756976060000},"page":"2851-2872","source":"Crossref","is-referenced-by-count":0,"title":["Robust Audio-Visual Fusion for Emotion Recognition Based on Cross-Modal Learning under Noisy Conditions"],"prefix":"10.32604","volume":"85","author":[{"given":"A-Seong","family":"Moon","sequence":"first","affiliation":[]},{"given":"Seungyeon","family":"Jeong","sequence":"additional","affiliation":[]},{"given":"Donghee","family":"Kim","sequence":"additional","affiliation":[]},{"given":"Mohd Asyraf","family":"Zulkifley","sequence":"additional","affiliation":[]},{"given":"Bong-Soo","family":"Sohn","sequence":"additional","affiliation":[]},{"given":"Jaesung","family":"Lee","sequence":"additional","affiliation":[]}],"member":"17807","published-online":{"date-parts":[[2025]]},"reference":[{"key":"ref1","series-title":"Proceedings of the 2022 IEEE\/CVF Conference on Computer Vision and Pattern Recognition; 2022 Jun 18\u201324","first-page":"2382","article-title":"Time-continuous audiovisual fusion with recurrence vs attention for in-the-wild affect recognition","author":"Karas"},{"key":"ref2","series-title":"2021 16th IEEE International Conference on Automatic Face and Gesture Recognition (FG 2021); 2021 Dec 15\u201318","first-page":"1","article-title":"Cross attentional audio-visual fusion for dimensional emotion recognition","author":"Praveen"},{"key":"ref3","doi-asserted-by":"crossref","first-page":"104676","DOI":"10.1016\/j.imavis.2023.104676","article-title":"Multimodal emotion recognition using cross modal audio-video fusion with attention and deep metric learning","volume":"133","author":"Mocanu","year":"2023","journal-title":"Image Vis Comput"},{"key":"ref4","doi-asserted-by":"crossref","first-page":"19","DOI":"10.1016\/j.inffus.2022.03.009","article-title":"A systematic review on affective computing: emotion models, databases, and recent advances","volume":"83","author":"Wang","year":"2022","journal-title":"Inform Fus"},{"key":"ref5","series-title":"2013 10th IEEE International Conference and Workshops on Automatic Face and Gesture Recognition (FG); 2013 Apr 22\u201326","first-page":"1","article-title":"Introducing the RECOLA multimodal corpus of remote collaborative and affective interactions","author":"Ringeval"},{"key":"ref6","doi-asserted-by":"crossref","first-page":"1022","DOI":"10.1109\/TPAMI.2019.2944808","article-title":"Sewa db: a rich database for audio-visual emotion and sentiment research in the wild","volume":"43","author":"Kossaifi","year":"2019","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"ref7","doi-asserted-by":"crossref","first-page":"5","DOI":"10.1109\/T-AFFC.2011.20","article-title":"The semaine database: annotated multimodal records of emotionally colored conversations between a person and a limited agent","volume":"3","author":"McKeown","year":"2011","journal-title":"IEEE Trans Affect Comput"},{"key":"ref8","doi-asserted-by":"crossref","first-page":"42","DOI":"10.1016\/j.neucom.2019.09.037","article-title":"An efficient model-level 
fusion approach for continuous affect recognition from audiovisual signals","volume":"376","author":"Pei","year":"2020","journal-title":"Neurocomputing"},{"key":"ref9","series-title":"Proceedings of the 2022 IEEE\/CVF Conference on Computer Vision and Pattern Recognition; 2022 Jun 18\u201324","first-page":"2328","article-title":"Abaw: valence-arousal estimation, expression recognition, action unit detection & multi-task learning challenges","author":"Kollias"},{"key":"ref10","unstructured":"Bai S, Kolter JZ, Koltun V. An empirical evaluation of generic convolutional and recurrent networks for sequence modeling. arXiv:1803.01271. 2018."},{"key":"ref11","unstructured":"Yu J, Zhao G, Wang Y, Wei Z, Zheng Y, Zhang Z, et al. Multimodal fusion method with spatiotemporal sequences and relationship learning for valence-arousal estimation. arXiv: 2403.12425. 2024."},{"key":"ref12","series-title":"Proceedings of the 2024 IEEE\/CVF Conference on Computer Vision and Pattern Recognition; 2024 Jun 16\u201322","first-page":"4773","article-title":"Multi-modal arousal and valence estimation under noisy conditions","author":"Dresvyanskiy"},{"key":"ref13","doi-asserted-by":"crossref","first-page":"1313","DOI":"10.1109\/TMM.2021.3063612","article-title":"Deep auto-encoders with sequential learning for multimodal dimensional emotion recognition","volume":"24","author":"Nguyen","year":"2021","journal-title":"IEEE Trans Multimed"},{"key":"ref14","series-title":"Proceedings of the 2023 IEEE\/CVF Conference on Computer Vision and Pattern Recognition; 2023 Jun 17\u201324","first-page":"5756","article-title":"Leveraging TCN and transformer for effective visual-audio fusion in continuous emotion recognition","author":"Zhou"},{"key":"ref15","unstructured":"Zhang Q, Wei Y, Han Z, Fu H, Peng X, Deng C, et al. Multimodal fusion on low-quality data: a comprehensive survey. arXiv: 2404.18947. 2024."},{"key":"ref16","series-title":"Proceedings of the 2021 IEEE\/CVF International Conference on Computer Vision; 2021 Oct 11\u201317","first-page":"3567","article-title":"Continuous emotion recognition with audio-visual leader-follower attentive fusion","author":"Zhang"},{"key":"ref17","doi-asserted-by":"crossref","first-page":"942","DOI":"10.1109\/LSP.2022.3160373","article-title":"Branch-fusion-net for multi-modal continuous dimensional emotion recognition","volume":"29","author":"Li","year":"2022","journal-title":"IEEE Signal Process Lett"},{"key":"ref18","series-title":"European Conference on Computer Vision","first-page":"157","article-title":"Abaw: learning from synthetic data & multi-task learning challenges","author":"Kollias","year":"2022"},{"key":"ref19","series-title":"Proceedings of the 2017 IEEE Conference on Computer Vision and Pattern Recognition Workshops; 2017 Jul 21\u201326","first-page":"61","article-title":"Emotic: emotions in context dataset","author":"Kosti"},{"key":"ref20","series-title":"Proceedings of the 2017 IEEE Conference on Computer Vision and Pattern Recognition; 2017 Jul 21\u201326","first-page":"2852","article-title":"Reliable crowdsourcing and deep locality-preserving learning for expression recognition in the wild","author":"Li"},{"key":"ref21","doi-asserted-by":"crossref","first-page":"360","DOI":"10.1109\/TBIOM.2022.3233083","article-title":"Audio-visual fusion for emotion recognition in the valence-arousal space using joint cross-attention","volume":"5","author":"Praveen","year":"2023","journal-title":"IEEE Trans Biom Behav Identity Sci"},{"key":"ref22","unstructured":"Kollias D, Zafeiriou S. 
Aff-Wild2: extending the Aff-Wild database for affect recognition. arXiv: 1811.07770. 2018."},{"key":"ref23","series-title":"ICASSP 2023-2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP); 2023 Jun 4\u20139","first-page":"1","article-title":"Recursive joint attention for audio-visual fusion in regression based emotion recognition","author":"Praveen"},{"key":"ref24","doi-asserted-by":"crossref","first-page":"108339","DOI":"10.1016\/j.engappai.2024.108339","article-title":"Using transformers for multimodal emotion recognition: taxonomies and state of the art review","volume":"133","author":"Hazmoune","year":"2024","journal-title":"Eng Appl Artif Intell"},{"key":"ref25","doi-asserted-by":"crossref","first-page":"121692","DOI":"10.1016\/j.eswa.2023.121692","article-title":"Deep learning-based multimodal emotion recognition from audio, visual, and text modalities: a systematic review of recent advancements and future prospects","volume":"237","author":"Zhang","year":"2024","journal-title":"Expert Syst Appl"},{"journal-title":"Surrey audio-visual expressed emotion (savee) database","year":"2014","author":"Jackson","key":"ref26"},{"key":"ref27","doi-asserted-by":"crossref","first-page":"126693","DOI":"10.1016\/j.neucom.2023.126693","article-title":"Survey on multimodal approaches to emotion recognition","volume":"556","author":"Gladys","year":"2023","journal-title":"Neurocomputing"},{"key":"ref28","doi-asserted-by":"crossref","first-page":"18","DOI":"10.1109\/TAFFC.2017.2740923","article-title":"Affectnet: a database for facial expression, valence, and arousal computing in the wild","volume":"10","author":"Mollahosseini","year":"2017","journal-title":"IEEE Trans Affect Comput"},{"key":"ref29","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1016\/j.patrec.2021.03.007","article-title":"Leveraging recent advances in deep learning for audio-visual emotion recognition","volume":"146","author":"Schoneveld","year":"2021","journal-title":"Pattern Recognit Lett"},{"key":"ref30","doi-asserted-by":"crossref","first-page":"1213","DOI":"10.1007\/s13042-023-01964-w","article-title":"CoDF-Net: coordinated-representation decision fusion network for emotion recognition with EEG and eye movement signals","volume":"15","author":"Gong","year":"2024","journal-title":"Int J Mach Learn Cybern"},{"key":"ref31","first-page":"360","author":"Sad","year":"2016","journal-title":"Iberoamerican congress on pattern recognition"},{"key":"ref32","doi-asserted-by":"crossref","first-page":"1082","DOI":"10.1109\/TAFFC.2021.3100868","article-title":"Behavioral and physiological signals-based deep multimodal approach for mobile emotion recognition","volume":"14","author":"Yang","year":"2021","journal-title":"IEEE Trans Affect Comput"},{"key":"ref33","series-title":"Proceedings of the 2021 IEEE\/CVF International Conference on Computer Vision; 2021 Oct 10\u201317","first-page":"3652","article-title":"Analysing affective behavior in the second abaw2 competition","author":"Kollias"},{"key":"ref34","doi-asserted-by":"crossref","first-page":"424","DOI":"10.1016\/j.inffus.2022.09.025","article-title":"Multimodal sentiment analysis: a systematic review of history, datasets, multimodal fusion methods, applications, challenges and future directions","volume":"91","author":"Gandhi","year":"2023","journal-title":"Inform Fus"},{"key":"ref35","series-title":"ICASSP 2024-2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP); 2024 Apr 
14\u201319","first-page":"12981","article-title":"Cross-subject EEG emotion recognition based on interconnected dynamic domain adaptation","author":"An"},{"key":"ref36","doi-asserted-by":"crossref","first-page":"192","DOI":"10.1016\/j.patrec.2025.02.024","article-title":"Multi-corpus emotion recognition method based on cross-modal gated attention fusion","volume":"190","author":"Ryumina","year":"2025","journal-title":"Pattern Recognit Lett"},{"key":"ref37","doi-asserted-by":"crossref","first-page":"1440","DOI":"10.3390\/e25101440","article-title":"A survey of deep learning-based multimodal emotion recognition: speech, text, and face","volume":"25","author":"Lian","year":"2023","journal-title":"Entropy"},{"key":"ref38","doi-asserted-by":"crossref","first-page":"23","DOI":"10.1016\/j.imavis.2017.02.001","article-title":"AFEW-VA database for valence and arousal estimation in-the-wild","volume":"65","author":"Kossaifi","year":"2017","journal-title":"Image Vis Comput"},{"key":"ref39","series-title":"2020 15th IEEE International Conference on Automatic Face and Gesture Recognition (FG 2020); 2020 Nov 16\u201320","first-page":"637","article-title":"Analysing affective behavior in the first abaw 2020 competition","author":"Kollias"},{"key":"ref40","series-title":"Proceedings of the 2017 IEEE Conference on Computer Vision and Pattern Recognition Workshops; 2017 Jul 21\u201326","first-page":"34","article-title":"Aff-Wild: valence and arousal \u2018in-the-wild\u2019 challenge","author":"Zafeiriou"},{"key":"ref41","first-page":"107547","article-title":"xlstm: extended long short-term memory","volume":"37","author":"Beck","year":"2024","journal-title":"Adv Neural Inform Process Syst"},{"key":"ref42","series-title":"First Conference on Language Modeling; 2024 Oct 7\u20139","first-page":"1","article-title":"Mamba: linear-time sequence modeling with selective state spaces","author":"Gu"},{"key":"ref43","series-title":"Proceedings of the 32th AAAI Conference on Artificial Intelligence; 2018 Feb 2\u20137","first-page":"5642","article-title":"Multi-attention recurrent network for human communication comprehension","author":"Zadeh"},{"key":"ref44","series-title":"Proceedings of the 56th Annual Meeting of the Association for Computational Linguistics; 2018 Jul 15\u201320","first-page":"2236","article-title":"Multimodal language analysis in the wild: Cmu-mosei dataset and interpretable dynamic fusion graph","author":"Zadeh"}],"container-title":["Computers, Materials &amp; Continua"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/cdn.techscience.cn\/files\/cmc\/2025\/TSP_CMC-85-2\/TSP_CMC_67103\/TSP_CMC_67103.pdf","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,4]],"date-time":"2025-12-04T05:35:38Z","timestamp":1764826538000},"score":1,"resource":{"primary":{"URL":"https:\/\/www.techscience.com\/cmc\/v85n2\/63810"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025]]},"references-count":44,"journal-issue":{"issue":"2","published-online":{"date-parts":[[2025]]},"published-print":{"date-parts":[[2025]]}},"URL":"https:\/\/doi.org\/10.32604\/cmc.2025.067103","relation":{},"ISSN":["1546-2226"],"issn-type":[{"type":"electronic","value":"1546-2226"}],"subject":[],"published":{"date-parts":[[2025]]}}}
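The record above is the response body that Crossref's public REST API returns for this DOI. A minimal sketch of retrieving it and reading a few of the fields shown, assuming only the Python standard library and the https://api.crossref.org/works/{doi} endpoint; the script name and contact address in the User-Agent header are hypothetical placeholders (Crossref asks API users to identify themselves):

# Fetch this work's Crossref record and print a few fields from it.
import json
import urllib.request

DOI = "10.32604/cmc.2025.067103"
req = urllib.request.Request(
    f"https://api.crossref.org/works/{DOI}",
    headers={"User-Agent": "example-script/0.1 (mailto:you@example.org)"},  # hypothetical contact
)
with urllib.request.urlopen(req) as resp:
    work = json.load(resp)["message"]  # the work record sits under "message"

print(work["title"][0])                              # article title
print(work["container-title"][0])                    # journal name
print(work["volume"], work["issue"], work["page"])   # 85 2 2851-2872
print(len(work["reference"]))                        # 44 deposited references
for ref in work["reference"][:3]:
    # "unstructured" entries carry a free-text citation; structured ones
    # expose fields such as "DOI" and "article-title" directly.
    print(ref.get("DOI") or ref.get("article-title") or ref.get("unstructured"))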