{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,6]],"date-time":"2025-12-06T08:09:30Z","timestamp":1765008570514,"version":"3.46.0"},"publisher-location":"New York, NY, USA","reference-count":45,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,12,9]]},"DOI":"10.1145\/3743093.3771083","type":"proceedings-article","created":{"date-parts":[[2025,12,6]],"date-time":"2025-12-06T08:06:16Z","timestamp":1765008376000},"page":"1-7","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Robust speech emotion recognition using conditional transformer-based architecture"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-4829-9799","authenticated-orcid":false,"given":"Hanwook","family":"Chung","sequence":"first","affiliation":[{"name":"Forvia Irystec Inc., Montreal, Canada"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-5337-4475","authenticated-orcid":false,"given":"Hyunjin","family":"Yoo","sequence":"additional","affiliation":[{"name":"Forvia Irystec Inc., Montreal, Canada"}]}],"member":"320","published-online":{"date-parts":[[2025,12,6]]},"reference":[{"key":"e_1_3_3_2_2_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9746679"},{"key":"e_1_3_3_2_3_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8683799"},{"key":"e_1_3_3_2_4_2","volume-title":"Pattern recognition and machine learning","author":"Bishop Christopher\u00a0M","year":"2006","unstructured":"Christopher\u00a0M Bishop and Nasser\u00a0M Nasrabadi. 2006. Pattern recognition and machine learning. Vol.\u00a04. Springer."},{"key":"e_1_3_3_2_5_2","doi-asserted-by":"crossref","unstructured":"Carlos Busso Murtaza Bulut Chi-Chun Lee Abe Kazemzadeh Emily Mower Samuel Kim Jeannette\u00a0N Chang Sungbok Lee and Shrikanth\u00a0S Narayanan. 2008. 
IEMOCAP: Interactive emotional dyadic motion capture database. Language resources and evaluation 42 4 (2008) 335\u2013359.","DOI":"10.1007\/s10579-008-9076-6"},{"key":"e_1_3_3_2_6_2","first-page":"3257","volume-title":"INTERSPEECH","author":"Chakraborty Rupayan","year":"2019","unstructured":"Rupayan Chakraborty, Ashish Panda, Meghna Pandharipande, Sonal Joshi, and Sunil\u00a0Kumar Kopparapu. 2019. Front-end feature compensation and denoising for noise robust speech emotion Recognition.. In INTERSPEECH. 3257\u20133261."},{"key":"e_1_3_3_2_7_2","doi-asserted-by":"crossref","unstructured":"Mingyi Chen Xuanji He Jing Yang and Han Zhang. 2018. 3-D convolutional recurrent neural networks with attention model for speech emotion recognition. IEEE Signal Processing Letters 25 10 (2018) 1440\u20131444.","DOI":"10.1109\/LSP.2018.2860246"},{"key":"e_1_3_3_2_8_2","doi-asserted-by":"crossref","unstructured":"Zebang Cheng Zhi-Qi Cheng Jun-Yan He Kai Wang Yuxiang Lin Zheng Lian Xiaojiang Peng and Alexander Hauptmann. 2024. Emotion-llama: Multimodal emotion recognition and reasoning with instruction tuning. Advances in Neural Information Processing Systems 37 (2024) 110805\u2013110853.","DOI":"10.52202\/079017-3518"},{"key":"e_1_3_3_2_9_2","doi-asserted-by":"publisher","DOI":"10.1109\/MLSP.2018.8517027"},{"key":"e_1_3_3_2_10_2","first-page":"1274","volume-title":"Proceedings of the 36th Intelligent Vehicles Symposium (IV)","author":"Chung Hanwook","year":"2025","unstructured":"Hanwook Chung and Hyunjin Yoo. 2025. Conditional transformer-based U-net architecture for speech emotion recognition. In Proceedings of the 36th Intelligent Vehicles Symposium (IV). IEEE, Cluj-Napoca, Romania, 1274\u20131279."},{"key":"e_1_3_3_2_11_2","first-page":"2457","volume-title":"Interspeech","author":"Deng Feng","year":"2020","unstructured":"Feng Deng, Tao Jiang, Xiao-Rui Wang, Chen Zhang, and Yan Li. 2020. NAAGN: Noise-aware attention-gated network for speech enhancement. In Interspeech. 
2457\u20132461."},{"key":"e_1_3_3_2_12_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9414060"},{"key":"e_1_3_3_2_13_2","first-page":"1060","volume-title":"Interspeech","author":"Gao Yuan","year":"2024","unstructured":"Yuan Gao, Hao Shi, Chenhui Chu, and Tatsuya Kawahara. 2024. Speech emotion recognition with multi-level acoustic and semantic information extraction and interaction. In Interspeech , Vol.\u00a02024. 1060\u20131064."},{"key":"e_1_3_3_2_14_2","doi-asserted-by":"crossref","unstructured":"S Girirajan and A Pandian. 2023. Real-time speech enhancement based on convolutional recurrent neural network. Intelligent Automation & Soft Computing 35 2 (2023).","DOI":"10.32604\/iasc.2023.028090"},{"key":"e_1_3_3_2_15_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9747157"},{"key":"e_1_3_3_2_16_2","doi-asserted-by":"crossref","unstructured":"Lucas Goncalves Seong-Gyun Leem Wei-Cheng Lin Berrak Sisman and Carlos Busso. 2024. Versatile audio-visual learning for emotion recognition. IEEE Transactions on Affective Computing (2024).","DOI":"10.1109\/TAFFC.2024.3433386"},{"key":"e_1_3_3_2_17_2","first-page":"2024","volume-title":"Interspeech","author":"Hu Yuchen","year":"2024","unstructured":"Yuchen Hu, Chen Chen, Ruizhe Li, Qiushi Zhu, and Eng\u00a0Siong Chng. 2024. Noise-aware speech enhancement using diffusion probabilistic model. In Interspeech. 2024\u2013929."},{"key":"e_1_3_3_2_18_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00037"},{"key":"e_1_3_3_2_19_2","doi-asserted-by":"crossref","unstructured":"Yan Li Yapeng Wang Xu Yang and Sio-Kei Im. 2023. Speech emotion recognition based on Graph-LSTM neural network. EURASIP Journal on Audio Speech and Music Processing 2023 1 (2023) 40.","DOI":"10.1186\/s13636-023-00303-9"},{"key":"e_1_3_3_2_20_2","doi-asserted-by":"crossref","unstructured":"Eva Lieskovska Maros Jakubec Roman Jarina and Michal Chmulik. 2021. 
A review on speech emotion recognition using deep learning and attention mechanism. Electronics 10 10 (2021) 1163.","DOI":"10.3390\/electronics10101163"},{"key":"e_1_3_3_2_21_2","doi-asserted-by":"crossref","unstructured":"Gang Liu Shifang Cai and Ce Wang. 2023. Speech emotion recognition based on emotion perception. EURASIP Journal on Audio Speech and Music Processing 2023 1 (2023) 22.","DOI":"10.1186\/s13636-023-00289-4"},{"key":"e_1_3_3_2_22_2","doi-asserted-by":"crossref","unstructured":"Yang Liu Xin Chen Yarong Li Jie Ma Xiaoqi Yang Yuan Song Xiaolei Meng Yongwei Li Xingfeng Li and Zhen Zhao. 2025. Enhanced speech emotion recognition in noisy environments: Adaptive emotion denoising diffusion approach with iterative confidence learning strategy. IEEE Internet of Things Journal (2025).","DOI":"10.1109\/JIOT.2025.3595096"},{"key":"e_1_3_3_2_23_2","doi-asserted-by":"crossref","unstructured":"Yang Liu Xin Chen Yongwei Li Longbiao Wang and Zhen Zhao. 2025. Multi-stage confidence-guided diffusion and emotional bidirectional Mamba for robust speech emotion recognition. IEEE Signal Processing Letters 32 (2025) 2184\u20132188.","DOI":"10.1109\/LSP.2025.3570245"},{"key":"e_1_3_3_2_24_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9052937"},{"key":"e_1_3_3_2_25_2","first-page":"3683","volume-title":"Interspeech","author":"Ma Xi","year":"2018","unstructured":"Xi Ma, Zhiyong Wu, Jia Jia, Mingxing Xu, Helen Meng, and Lianhong Cai. 2018. Emotion recognition from variable-length speech segments using deep learning on spectrograms.. In Interspeech. 3683\u20133687."},{"key":"e_1_3_3_2_26_2","doi-asserted-by":"crossref","unstructured":"Ambuj Mehrish Navonil Majumder Rishabh Bharadwaj Rada Mihalcea and Soujanya Poria. 2023. A review of deep learning techniques for speech processing. 
Information Fusion 99 (2023) 101869.","DOI":"10.1016\/j.inffus.2023.101869"},{"key":"e_1_3_3_2_27_2","doi-asserted-by":"publisher","DOI":"10.1145\/3696409.3700183"},{"key":"e_1_3_3_2_28_2","unstructured":"Vaishnav Potlapalli Syed\u00a0Waqas Zamir Salman\u00a0H Khan and Fahad\u00a0Shahbaz Khan. 2023. Promptir: Prompting for all-in-one image restoration. Advances in Neural Information Processing Systems 36 (2023) 71275\u201371293."},{"key":"e_1_3_3_2_29_2","doi-asserted-by":"crossref","unstructured":"Sumit Ranjan Rupayan Chakraborty and Sunil\u00a0Kumar Kopparapu. 2024. reinforcement learning based data augmentation for noise robust speech emotion recognition. INTERSPEECH (2024).","DOI":"10.21437\/Interspeech.2024-921"},{"key":"e_1_3_3_2_30_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-24574-4_28"},{"key":"e_1_3_3_2_31_2","doi-asserted-by":"crossref","unstructured":"Nasir Saleem Teddy\u00a0Surya Gunawan Mira Kartiwi Bambang\u00a0Setia Nugroho and Inung Wijayanto. 2023. NSE-CATNet: Deep neural speech enhancement using convolutional attention transformer network. IEEE Access 11 (2023) 66979\u201366994.","DOI":"10.1109\/ACCESS.2023.3290908"},{"key":"e_1_3_3_2_32_2","unstructured":"Yih-Liang Shen Pei-Chin Hsieh and Tai-Shih Chi. 2025. spectro-temporal modulations incorporated two-stream robust speech emotion recognition. IEEE Transactions on Affective Computing 14 8 (2025) 1\u201312."},{"key":"e_1_3_3_2_33_2","first-page":"451","volume-title":"International Conference on Pattern Recognition","author":"Shi Tong","year":"2024","unstructured":"Tong Shi, Xuri Ge, Joemon\u00a0M Jose, Nicolas Pugeault, and Paul Henderson. 2024. Detail-enhanced intra-and inter-modal interaction for audio-visual emotion recognition. In International Conference on Pattern Recognition. 
Springer, 451\u2013465."},{"key":"e_1_3_3_2_34_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2010.5496083"},{"key":"e_1_3_3_2_35_2","first-page":"035081","volume-title":"Proceedings of Meetings on Acoustics","volume":"19","author":"Thiemann Joachim","year":"2013","unstructured":"Joachim Thiemann, Nobutaka Ito, and Emmanuel Vincent. 2013. The diverse environments multi-channel acoustic noise database (demand): A database of multichannel environmental noise recordings. In Proceedings of Meetings on Acoustics , Vol.\u00a019. Acoustical Society of America, 035081."},{"key":"e_1_3_3_2_36_2","doi-asserted-by":"crossref","unstructured":"Andrew Varga and Herman\u00a0JM Steeneken. 1993. Assessment for automatic speech recognition: II. NOISEX-92: A database and an experiment to study the effect of additive noise on speech recognition systems. Speech communication 12 3 (1993) 247\u2013251.","DOI":"10.1016\/0167-6393(93)90095-3"},{"key":"e_1_3_3_2_37_2","unstructured":"Ashish Vaswani Noam Shazeer Niki Parmar Jakob Uszkoreit Llion Jones Aidan\u00a0N Gomez \u0141ukasz Kaiser and Illia Polosukhin. 2017. Attention is all you need. Advances in neural information processing systems 30 (2017)."},{"key":"e_1_3_3_2_38_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9054629"},{"key":"e_1_3_3_2_39_2","doi-asserted-by":"publisher","DOI":"10.23919\/EUSIPCO63174.2024.10715447"},{"key":"e_1_3_3_2_40_2","doi-asserted-by":"publisher","DOI":"10.1109\/APSIPAASC58517.2023.10317412"},{"key":"e_1_3_3_2_41_2","doi-asserted-by":"crossref","unstructured":"Chunsheng Xu Yunqing Liu Wenjun Song Zonglin Liang and Xing Chen. 2024. A new network structure for speech emotion recognition research. Sensors 24 5 (2024) 1429.","DOI":"10.3390\/s24051429"},{"key":"e_1_3_3_2_42_2","first-page":"2670","volume-title":"Interspeech","author":"Xu Yong","year":"2014","unstructured":"Yong Xu, Jun Du, Li-Rong Dai, and Chin-Hui Lee. 2014. 
Dynamic noise aware training for speech enhancement based on deep neural networks. In Interspeech , Vol.\u00a01. 2670\u20132674."},{"key":"e_1_3_3_2_43_2","first-page":"3688","volume-title":"Interspeech","author":"Yenigalla Promod","year":"2018","unstructured":"Promod Yenigalla, Abhay Kumar, Suraj Tripathi, Chirag Singh, Sibsambhu Kar, and Jithendra Vepa. 2018. Speech emotion recognition using spectrogram & phoneme embedding.. In Interspeech , Vol.\u00a02018. 3688\u20133692."},{"key":"e_1_3_3_2_44_2","doi-asserted-by":"crossref","unstructured":"Chenghao Zhang and Lei Xue. 2021. Autoencoder with emotion embedding for speech emotion recognition. IEEE access 9 (2021) 51231\u201351241.","DOI":"10.1109\/ACCESS.2021.3069818"},{"key":"e_1_3_3_2_45_2","doi-asserted-by":"publisher","DOI":"10.23919\/APSIPA.2018.8659587"},{"key":"e_1_3_3_2_46_2","first-page":"4098","volume-title":"Interspeech","author":"Zhou Hengshun","year":"2020","unstructured":"Hengshun Zhou, Jun Du, Yanhui Tu, and Chin-Hui Lee. 2020. Using speech enhancement preprocessing for speech emotion recognition in realistic noisy conditions.. In Interspeech. 
4098\u20134102."}],"event":{"name":"MMAsia '25: ACM Multimedia Asia","location":"Kuala Lumpur Malaysia","acronym":"MMAsia '25","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 7th ACM International Conference on Multimedia in Asia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3743093.3771083","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,6]],"date-time":"2025-12-06T08:06:28Z","timestamp":1765008388000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3743093.3771083"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,12,6]]},"references-count":45,"alternative-id":["10.1145\/3743093.3771083","10.1145\/3743093"],"URL":"https:\/\/doi.org\/10.1145\/3743093.3771083","relation":{},"subject":[],"published":{"date-parts":[[2025,12,6]]},"assertion":[{"value":"2025-12-06","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}