{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,6]],"date-time":"2025-12-06T08:14:39Z","timestamp":1765008879967,"version":"3.46.0"},"publisher-location":"New York, NY, USA","reference-count":26,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,12,9]]},"DOI":"10.1145\/3743093.3771046","type":"proceedings-article","created":{"date-parts":[[2025,12,6]],"date-time":"2025-12-06T08:06:16Z","timestamp":1765008376000},"page":"1-7","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Speech Emotion Recognition via Multi-Level Acoustic Modeling and Cross-Modal Temporal Fusion"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0004-1880-1604","authenticated-orcid":false,"given":"Xuan","family":"Zhang","sequence":"first","affiliation":[{"name":"Key Laboratory of Computing Power Network and Information Security, Ministry of Education, Shandong Computer Science Center (National Supercomputer Center in Jinan), Qilu University of Technology (Shandong Academy of Sciences); Shandong Provincial Key Laboratory of Industrial Network and Information System Security, Shandong Fundamental Research Center for Computer Science, Jinan, Shandong, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4851-2094","authenticated-orcid":false,"given":"Peng","family":"Zhang","sequence":"additional","affiliation":[{"name":"Key Laboratory of Computing Power Network and Information Security, Ministry of Education, Shandong Computer Science Center (National Supercomputer Center in Jinan), Qilu University of Technology (Shandong Academy of Sciences); Shandong Provincial Key Laboratory of Industrial Network and Information System Security, Shandong Fundamental Research Center for Computer Science, Jinan, Shandong, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-4670-2197","authenticated-orcid":false,"given":"Jianqiang","family":"Zhang","sequence":"additional","affiliation":[{"name":"Key Laboratory of Computing Power Network and Information Security, Ministry of Education, Shandong Computer Science Center (National Supercomputer Center in Jinan), Qilu University of Technology (Shandong Academy of Sciences); Shandong Provincial Key Laboratory of Industrial Network and Information System Security, Shandong Fundamental Research Center for Computer Science, Jinan, Shandong, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7624-1917","authenticated-orcid":false,"given":"Wei","family":"Zhao","sequence":"additional","affiliation":[{"name":"Shandong Computer Science Center (National Supercomputer Center in Jinan), Qilu University of Technology (Shandong Academy of Sciences), Jinan, Shandong, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2843-0136","authenticated-orcid":false,"given":"Fuqiang","family":"Wang","sequence":"additional","affiliation":[{"name":"Shandong Computer Science Center (National Supercomputer Center in Jinan), Qilu University of Technology (Shandong Academy of Sciences), Jinan, Shandong, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-3160-0620","authenticated-orcid":false,"given":"Xiaoming","family":"Wu","sequence":"additional","affiliation":[{"name":"Shandong Computer Science Center (National Supercomputer Center in Jinan), Qilu University of Technology (Shandong Academy of Sciences), Jinan, Shandong, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,12,6]]},"reference":[{"key":"e_1_3_3_1_2_2","volume-title":"Journal of Physics: Conference Series","volume":"1896","author":"Atmaja Bagus\u00a0Tris","year":"2020","unstructured":"Bagus\u00a0Tris Atmaja and Masato Akagi. 2020. Evaluation of Error- and Correlation-Based Loss Functions for Multitask Learning Dimensional Speech Emotion Recognition. In Journal of Physics: Conference Series , Vol.\u00a01896. https:\/\/api.semanticscholar.org\/CorpusID:214623473"},{"key":"e_1_3_3_1_3_2","unstructured":"Alexei Baevski Henry Zhou Abdel rahman Mohamed and Michael Auli. 2020. wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations. https:\/\/arxiv.org\/abs\/2006.11477. arXiv:https:\/\/arXiv.org\/abs\/2006.11477."},{"key":"e_1_3_3_1_4_2","doi-asserted-by":"crossref","unstructured":"John\u00a0Lorenzo Bautista Yun\u00a0Kyung Lee Seungyoon Nam Chanki Park and Hyun\u00a0Soon Shin. 2023. Utilizing Dimensional Emotion Representations in Speech Emotion Recognition. Human-Centered Design and User Experience (2023). https:\/\/api.semanticscholar.org\/CorpusID:264824797 Accessed via Semantic Scholar.","DOI":"10.54941\/ahfe1004283"},{"key":"e_1_3_3_1_5_2","doi-asserted-by":"publisher","unstructured":"John\u00a0Lorenzo Bautista and Hyun\u00a0Soon Shin. 2025. Speech Emotion Recognition Model Based on Joint Modeling of Discrete and Dimensional Emotion Representation. Appl. Sci. 15 2 (2025) 623. 10.3390\/app15020623","DOI":"10.3390\/app15020623"},{"key":"e_1_3_3_1_6_2","doi-asserted-by":"publisher","unstructured":"Carlos Busso Murtaza Bulut Chi-Chun Lee Abe Kazemzadeh Emily Mower Samuel Kim Jeannette\u00a0N. Chang Sungbok Lee and Shrikanth\u00a0S. Narayanan. 2008. IEMOCAP: Interactive Emotional Dyadic Motion Capture Database. Language Resources and Evaluation 42 4 (2008) 335\u2013359. 10.1007\/s10579-008-9076-6","DOI":"10.1007\/s10579-008-9076-6"},{"key":"e_1_3_3_1_7_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9414540"},{"key":"e_1_3_3_1_8_2","doi-asserted-by":"publisher","unstructured":"Sanyuan Chen Chengyi Wang Zhengyang Chen Yu Wu Shujie Liu Zhuo Chen Jinyu Li Naoyuki Kanda Takuya Yoshioka Xiong Xiao et\u00a0al. 2022. WavLM: Large-scale Self-supervised Pre-training for Full Stack Speech Processing. IEEE Journal of Selected Topics in Signal Processing 16 6 (2022) 1505\u20131518. 10.1109\/JSTSP.2022.3209438","DOI":"10.1109\/JSTSP.2022.3209438"},{"key":"e_1_3_3_1_9_2","unstructured":"Vladimir Chernykh Grigoriy Sterling and Pavel Prihodko. 2017. Emotion Recognition from Speech with Recurrent Neural Networks. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1701.08071 (2017). https:\/\/arxiv.org\/abs\/1701.08071"},{"key":"e_1_3_3_1_10_2","doi-asserted-by":"publisher","unstructured":"R. Cowie E. Douglas-Cowie N. Tsapatsoulis G. Votsis S. Kollias W. Fellenz and J.\u00a0G. Taylor. 2001. Emotion Recognition in Human-Computer Interaction. IEEE Signal Processing Magazine 18 1 (Jan. 2001) 32\u201380. 10.1109\/79.911197","DOI":"10.1109\/79.911197"},{"key":"e_1_3_3_1_11_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9747637"},{"key":"e_1_3_3_1_12_2","doi-asserted-by":"publisher","unstructured":"Wei-Ning Hsu Benjamin Bolte Yao-Hung\u00a0Hubert Tsai Kushal Lakhotia Ruslan Salakhutdinov and Abdel rahman Mohamed. 2021. HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units. IEEE\/ACM Transactions on Audio Speech and Language Processing 29 (2021) 3451\u20133460. 10.1109\/TASLP.2021.3112933","DOI":"10.1109\/TASLP.2021.3112933"},{"key":"e_1_3_3_1_13_2","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-3252"},{"key":"e_1_3_3_1_14_2","unstructured":"Yoonhyung Lee Seunghyun Yoon and Kyomin Jung. 2020. Multimodal Speech Emotion Recognition Using Cross Attention with Aligned Audio and Text. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2207.12895 (2020). https:\/\/arxiv.org\/abs\/2207.12895"},{"key":"e_1_3_3_1_15_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10097135"},{"key":"e_1_3_3_1_16_2","doi-asserted-by":"publisher","unstructured":"Seong-Gyun Leem Daniel Fulford Jukka-Pekka Onnela David Gard and Carlos Busso. 2024. Selective Acoustic Feature Enhancement for Speech Emotion Recognition With Noisy Speech. IEEE\/ACM Transactions on Audio Speech and Language Processing 32 (2024) 917\u2013929. 10.1109\/TASLP.2023.3340603","DOI":"10.1109\/TASLP.2023.3340603"},{"key":"e_1_3_3_1_17_2","doi-asserted-by":"publisher","unstructured":"Wei-Cheng Lin and Carlos Busso. 2023. Chunk-Level Speech Emotion Recognition: A General Framework of Sequence-to-One Dynamic Temporal Modeling. IEEE Transactions on Affective Computing 14 2 (2023) 1215\u20131227. 10.1109\/TAFFC.2021.3083821","DOI":"10.1109\/TAFFC.2021.3083821"},{"key":"e_1_3_3_1_18_2","doi-asserted-by":"publisher","unstructured":"Reza Lotfian and Carlos Busso. 2019. Building Naturalistic Emotionally Balanced Speech Corpus by Retrieving Emotional Speech from Existing Podcast Recordings. IEEE Transactions on Affective Computing 10 4 (2019) 471\u2013483. 10.1109\/TAFFC.2017.2736999","DOI":"10.1109\/TAFFC.2017.2736999"},{"key":"e_1_3_3_1_19_2","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-1968"},{"key":"e_1_3_3_1_20_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9414286"},{"key":"e_1_3_3_1_21_2","volume-title":"Frontiers of Computer Science","author":"Triantafyllopoulos Andreas","year":"2021","unstructured":"Andreas Triantafyllopoulos, Uwe\u00a0D. Reichel, Shuo Liu, Simon Huber, Florian Eyben, and Bj\u00f6rn Schuller. 2021. Multistage Linguistic Conditioning of Convolutional Layers for Speech Emotion Recognition. In Frontiers of Computer Science. https:\/\/api.semanticscholar.org\/CorpusID:238744414"},{"key":"e_1_3_3_1_22_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2016.7472669"},{"key":"e_1_3_3_1_23_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICWT50448.2020.9243622"},{"key":"e_1_3_3_1_24_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9414880"},{"key":"e_1_3_3_1_25_2","doi-asserted-by":"publisher","unstructured":"Huan Zhao Nianxin Huang and Haijiao Chen. 2024. Knowledge Enhancement for Speech Emotion Recognition via Multi-Level Acoustic Feature. Connection Science 36 1 (2024). 10.1080\/09540091.2024.2312103","DOI":"10.1080\/09540091.2024.2312103"},{"key":"e_1_3_3_1_26_2","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-1649"},{"key":"e_1_3_3_1_27_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9747095"}],"event":{"name":"MMAsia '25: ACM Multimedia Asia","location":"Kuala Lumpur Malaysia","acronym":"MMAsia '25","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 7th ACM International Conference on Multimedia in Asia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3743093.3771046","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,6]],"date-time":"2025-12-06T08:11:17Z","timestamp":1765008677000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3743093.3771046"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,12,6]]},"references-count":26,"alternative-id":["10.1145\/3743093.3771046","10.1145\/3743093"],"URL":"https:\/\/doi.org\/10.1145\/3743093.3771046","relation":{},"subject":[],"published":{"date-parts":[[2025,12,6]]},"assertion":[{"value":"2025-12-06","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}