{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,27]],"date-time":"2026-05-27T12:42:54Z","timestamp":1779885774416,"version":"3.53.1"},"publisher-location":"New York, NY, USA","reference-count":37,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,10,26]],"date-time":"2023-10-26T00:00:00Z","timestamp":1698278400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62171427"],"award-info":[{"award-number":["62171427"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,10,26]]},"DOI":"10.1145\/3581783.3612859","type":"proceedings-article","created":{"date-parts":[[2023,10,27]],"date-time":"2023-10-27T07:27:40Z","timestamp":1698391660000},"page":"9531-9535","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":4,"title":["Hierarchical Audio-Visual Information Fusion with Multi-label Joint Decoding for MER 2023"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0005-7621-9724","authenticated-orcid":false,"given":"Haotian","family":"Wang","sequence":"first","affiliation":[{"name":"University of Science and Technology of China, Hefei, Anhui, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-6234-130X","authenticated-orcid":false,"given":"Yuxuan","family":"Xi","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Hefei, Anhui, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0904-8946","authenticated-orcid":false,"given":"Hang","family":"Chen","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Hefei, Anhui, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2387-0389","authenticated-orcid":false,"given":"Jun","family":"Du","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Hefei, Anhui, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5668-9068","authenticated-orcid":false,"given":"Yan","family":"Song","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Hefei, Anhui, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3843-3920","authenticated-orcid":false,"given":"Qing","family":"Wang","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Hefei, Anhui, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7878-6531","authenticated-orcid":false,"given":"Hengshun","family":"Zhou","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Hefei, Anhui, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-1457-5120","authenticated-orcid":false,"given":"Chenxi","family":"Wang","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Hefei, Anhui, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2416-3720","authenticated-orcid":false,"given":"Jiefeng","family":"Ma","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Hefei, Anhui, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-3345-605X","authenticated-orcid":false,"given":"Pengfei","family":"Hu","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Hefei, Anhui, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-8513-3171","authenticated-orcid":false,"given":"Ya","family":"Jiang","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Hefei, Anhui, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1499-0941","authenticated-orcid":false,"given":"Shi","family":"Cheng","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Hefei, Anhui, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1124-0854","authenticated-orcid":false,"given":"Jie","family":"Zhang","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Hefei, Anhui, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7135-5675","authenticated-orcid":false,"given":"Yuzhe","family":"Weng","sequence":"additional","affiliation":[{"name":"Northwestern Polytechnical University, Xi'an, Shaanxi, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2023,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations. arxiv","author":"Baevski Alexei","year":"2006","unstructured":"Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, and Michael Auli. 2020. wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations. arxiv: 2006.11477 [cs.CL]"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV.2016.7477553"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1145\/3133944.3133949"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/JSTSP.2022.3188113"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-11041"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00781"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1145\/3123818.3123852"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.imavis.2012.06.016"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1145\/3462244.3479919"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"crossref","unstructured":"Wei Han Hui Chen and Soujanya Poria. 2021a. Improving Multimodal Fusion with Hierarchical Mutual Information Maximization for Multimodal Sentiment Analysis. arxiv: 2109.00412 [cs.CL]","DOI":"10.18653\/v1\/2021.emnlp-main.723"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1145\/3121050.3121093"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3122291"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1145\/3219819.3219853"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.emnlp-main.534"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1145\/3410566.3410595"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2019.2936124"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3049898"},{"key":"e_1_3_2_1_19_1","volume-title":"MER 2023: Multi-label Learning, Modality Robustness, and Semi-Supervised Learning. arxiv: 2304","author":"Lian Zheng","year":"2023","unstructured":"Zheng Lian, Haiyang Sun, Licai Sun, Jinming Zhao, Ye Liu, Bin Liu, Jiangyan Yi, Meng Wang, Erik Cambria, Guoying Zhao, et al. 2023. MER 2023: Multi-label Learning, Modality Robustness, and Semi-Supervised Learning. arxiv: 2304.08981 [cs.CL]"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-1577"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00406"},{"key":"e_1_3_2_1_22_1","unstructured":"Huaishao Luo Lei Ji Yanyong Huang Bin Wang Shenggong Ji and Tianrui Li. 2021. ScaleVLAD: Improving Multimodal Sentiment Analysis via Multi-Scale Fusion of Locally Descriptors. arxiv: 2112.01368 [cs.CL]"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","unstructured":"Mona Hafez Mahmoud. 2019. A Survey of Some Interdisciplinary Methods and Tools to Measure Learners' Emotions in Intelligent Tutoring Systems. In 2019 6th International Conference on Advanced Control Circuits and Systems (ACCS) and 2019 5th International Conference on New Paradigms in Electronics & information Technology (PEIT). 1--6. https:\/\/doi.org\/10.1109\/ACCS-PEIT48329.2019.9062885","DOI":"10.1109\/ACCS-PEIT48329.2019.9062885"},{"key":"e_1_3_2_1_24_1","first-page":"435","article-title":"A review of emotion regulation in intelligent tutoring systems","volume":"18","author":"Malekzadeh Mehdi","year":"2015","unstructured":"Mehdi Malekzadeh, Mumtaz Begum Mustafa, and Adel Lahsasna. 2015. A review of emotion regulation in intelligent tutoring systems. Journal of Educational Technology & Society, Vol. 18, 4 (2015), 435--445.","journal-title":"Journal of Educational Technology & Society"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/TAFFC.2017.2713783"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2020.2993803"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1145\/2964284.2967306"},{"key":"e_1_3_2_1_28_1","volume-title":"Musan: A music, speech, and noise corpus. arxiv: 1510.08484","author":"Snyder David","year":"2015","unstructured":"David Snyder, Guoguo Chen, and Daniel Povey. 2015. Musan: A music, speech, and noise corpus. arxiv: 1510.08484"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2009-586"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1145\/3551876.3554810"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/BigData55660.2022.10021053"},{"key":"e_1_3_2_1_32_1","unstructured":"Yingzhi Wang Abdelmoumene Boumadane and Abdelwahab Heba. 2022a. A Fine-tuned Wav2vec 2.0\/HuBERT Benchmark For Speech Emotion Recognition Speaker Verification and Spoken Language Understanding. arxiv: 2111.02735 [cs.CL]"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1145\/3057270"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/JSTSP.2022.3184480"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475292"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3096037"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1145\/3340555.3355713"}],"event":{"name":"MM '23: The 31st ACM International Conference on Multimedia","location":"Ottawa ON Canada","acronym":"MM '23","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 31st ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3612859","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3581783.3612859","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T00:12:53Z","timestamp":1755821573000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3612859"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,10,26]]},"references-count":37,"alternative-id":["10.1145\/3581783.3612859","10.1145\/3581783"],"URL":"https:\/\/doi.org\/10.1145\/3581783.3612859","relation":{},"subject":[],"published":{"date-parts":[[2023,10,26]]},"assertion":[{"value":"2023-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}