{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,19]],"date-time":"2026-02-19T11:44:15Z","timestamp":1771501455417,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":29,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,12,3]],"date-time":"2024-12-03T00:00:00Z","timestamp":1733184000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,12,3]]},"DOI":"10.1145\/3696409.3700183","type":"proceedings-article","created":{"date-parts":[[2024,12,28]],"date-time":"2024-12-28T09:55:23Z","timestamp":1735379723000},"page":"1-6","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":9,"title":["HuBERT-CLAP: Contrastive Learning-Based Multimodal Emotion Recognition using Self-Alignment Approach"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0008-4666-7181","authenticated-orcid":false,"given":"Long H.","family":"Nguyen","sequence":"first","affiliation":[{"name":"Ton Duc Thang University, Ho Chi Minh, Vietnam"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8086-6722","authenticated-orcid":false,"given":"Nhat Truong","family":"Pham","sequence":"additional","affiliation":[{"name":"Sungkyunkwan University, Suwon, Republic of Korea"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8020-3590","authenticated-orcid":false,"given":"Mustaqeem","family":"Khan","sequence":"additional","affiliation":[{"name":"Mohamed Bin Zayed University of Artificial Intelligence, Abu Dhabi, UAE"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3442-0578","authenticated-orcid":false,"given":"Alice","family":"Othmani","sequence":"additional","affiliation":[{"name":"Universit\u00e9 Paris-Est Cr\u00e9teil (UPEC), Vitry-sur-Seine, 
France"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7690-8547","authenticated-orcid":false,"given":"Abdulmotaleb","family":"El Saddik","sequence":"additional","affiliation":[{"name":"Mohamed Bin Zayed University of Artificial Intelligence, Abu Dhabi, UAE"}]}],"member":"320","published-online":{"date-parts":[[2024,12,28]]},"reference":[{"key":"e_1_3_3_2_2_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCWAMTIP51612.2020.9317379"},{"key":"e_1_3_3_2_3_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9746679"},{"key":"e_1_3_3_2_4_2","doi-asserted-by":"publisher","unstructured":"Carlos Busso Murtaza Bulut et\u00a0al. 2008. IEMOCAP: interactive emotional dyadic motion capture database. Lang. Resour. Evaluation 42 4 (2008) 335\u2013359. 10.1007\/S10579-008-9076-6","DOI":"10.1007\/S10579-008-9076-6"},{"key":"e_1_3_3_2_5_2","doi-asserted-by":"publisher","DOI":"10.18653\/V1\/N19-1423"},{"key":"e_1_3_3_2_6_2","doi-asserted-by":"publisher","unstructured":"Wei-Ning Hsu Benjamin Bolte et\u00a0al. 2021. HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units. IEEE ACM Trans. Audio Speech Lang. Process. 29 (2021) 3451\u20133460. 10.1109\/TASLP.2021.3122291 https:\/\/dl.acm.org\/doi\/10.1109\/TASLP.2021.3122291","DOI":"10.1109\/TASLP.2021.3122291"},{"key":"e_1_3_3_2_7_2","doi-asserted-by":"publisher","unstructured":"Hemin Ibrahim Chu\u00a0Kiong Loo et\u00a0al. 2022. Bidirectional parallel echo state network for speech emotion recognition. Neural Comput. Appl. 34 20 (2022) 17581\u201317599. 10.1007\/S00521-022-07410-2 https:\/\/dl.acm.org\/doi\/10.1007\/S00521-022-07410-2","DOI":"10.1007\/S00521-022-07410-2"},{"key":"e_1_3_3_2_8_2","first-page":"482","volume-title":"International Conference on Intelligent Systems Design and Applications","author":"Indra J","year":"2022","unstructured":"J Indra, R\u00a0Kiruba Shankar, et\u00a0al. 2022. Speech Emotion Recognition Using Support Vector Machine and Linear Discriminant Analysis. 
In International Conference on Intelligent Systems Design and Applications. Springer, 482\u2013492."},{"key":"e_1_3_3_2_9_2","doi-asserted-by":"publisher","unstructured":"Mustaqeem Khan Abdulmotaleb et\u00a0al. 2023. AAD-Net: Advanced end-to-end signal processing system for human emotion detection & recognition using attention-based deep echo state network. Knowl. Based Syst. 270 (2023) 110525. 10.1016\/J.KNOSYS.2023.110525 https:\/\/dl.acm.org\/doi\/10.1016\/J.KNOSYS.2023.110525","DOI":"10.1016\/J.KNOSYS.2023.110525"},{"key":"e_1_3_3_2_10_2","doi-asserted-by":"crossref","unstructured":"Mustaqeem Khan Wail Gueaieb et\u00a0al. 2024. MSER: Multimodal speech emotion recognition using cross-attention with deep fusion. Expert Systems with Applications 245 (2024) 122946.","DOI":"10.1016\/j.eswa.2023.122946"},{"key":"e_1_3_3_2_11_2","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-1718"},{"key":"e_1_3_3_2_12_2","doi-asserted-by":"publisher","unstructured":"Chi-Chun Lee Emily Mower et\u00a0al. 2011. Emotion recognition using a hierarchical binary decision tree approach. Speech Commun. 53 9-10 (2011) 1162\u20131171. 10.1016\/J.SPECOM.2011.06.004 https:\/\/dl.acm.org\/doi\/10.1016\/J.SPECOM.2011.06.004","DOI":"10.1016\/J.SPECOM.2011.06.004"},{"key":"e_1_3_3_2_13_2","doi-asserted-by":"publisher","unstructured":"Seong-Gyun Leem Daniel Fulford et\u00a0al. 2024. Selective Acoustic Feature Enhancement for Speech Emotion Recognition With Noisy Speech. IEEE ACM Trans. Audio Speech Lang. Process. 32 (2024) 917\u2013929. 10.1109\/TASLP.2023.3340603 https:\/\/dl.acm.org\/doi\/10.1109\/TASLP.2023.3340603","DOI":"10.1109\/TASLP.2023.3340603"},{"key":"e_1_3_3_2_14_2","first-page":"8024","volume-title":"In Proceeding of NeurIPS 2019, December 8-14, Vancouver, BC, Canada","author":"Paszke Adam","year":"2019","unstructured":"Adam Paszke, Sam Gross, Francisco Massa, et\u00a0al. 2019. PyTorch: An Imperative Style, High-Performance Deep Learning Library. 
In In Proceeding of NeurIPS 2019, December 8-14, Vancouver, BC, Canada. 8024\u20138035."},{"key":"e_1_3_3_2_15_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9414286"},{"key":"e_1_3_3_2_16_2","doi-asserted-by":"publisher","unstructured":"Nhat\u00a0Truong Pham Duc Ngoc\u00a0Minh Dang et\u00a0al. 2023. Hybrid data augmentation and deep attention-based dilated convolutional-recurrent neural networks for speech emotion recognition. Expert Syst. Appl. 230 (2023) 120608. 10.1016\/J.ESWA.2023.120608 https:\/\/dl.acm.org\/doi\/10.1016\/J.ESWA.2023.120608","DOI":"10.1016\/J.ESWA.2023.120608"},{"key":"e_1_3_3_2_17_2","first-page":"8748","volume-title":"Proceedings ICML 2021, 18-24 July 2021, Virtual Event","volume":"139","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong\u00a0Wook Kim, et\u00a0al. 2021. Learning Transferable Visual Models From Natural Language Supervision. In Proceedings ICML 2021, 18-24 July 2021, Virtual Event , Vol.\u00a0139. PMLR, 8748\u20138763."},{"key":"e_1_3_3_2_18_2","unstructured":"Victor Sanh Lysandre Debut et\u00a0al. 2019. DistilBERT a distilled version of BERT: smaller faster cheaper and lighter. CoRR abs\/1910.01108 (2019). arXiv:https:\/\/arXiv.org\/abs\/1910.01108"},{"key":"e_1_3_3_2_19_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8461750"},{"key":"e_1_3_3_2_20_2","doi-asserted-by":"publisher","DOI":"10.21437\/INTERSPEECH.2019-2822"},{"key":"e_1_3_3_2_21_2","unstructured":"Laurens Van\u00a0der Maaten et\u00a0al. 2008. Visualizing data using t-SNE. Journal of machine learning research 9 11 (2008)."},{"key":"e_1_3_3_2_22_2","doi-asserted-by":"publisher","DOI":"10.1145\/3475957.3484448"},{"key":"e_1_3_3_2_23_2","unstructured":"Yingzhi Wang Abdelmoumene Boumadane et\u00a0al. 2021. A Fine-tuned Wav2vec 2.0\/HuBERT Benchmark For Speech Emotion Recognition Speaker Verification and Spoken Language Understanding. CoRR abs\/2111.02735 (2021). 
arXiv:https:\/\/arXiv.org\/abs\/2111.02735"},{"key":"e_1_3_3_2_24_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10095969"},{"key":"e_1_3_3_2_25_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICSC56153.2023.00032"},{"key":"e_1_3_3_2_26_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10096370"},{"key":"e_1_3_3_2_27_2","doi-asserted-by":"publisher","DOI":"10.1109\/SLT.2018.8639583"},{"key":"e_1_3_3_2_28_2","doi-asserted-by":"crossref","unstructured":"Shiqing Zhang Yijiao Yang Chen Chen Ruixin Liu Xin Tao Wenping Guo Yicheng Xu and Xiaoming Zhao. 2023. Multimodal emotion recognition based on audio and text by using hybrid attention networks. Biomedical Signal Processing and Control 85 (2023) 105052.","DOI":"10.1016\/j.bspc.2023.105052"},{"key":"e_1_3_3_2_29_2","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2023-413"},{"key":"e_1_3_3_2_30_2","doi-asserted-by":"publisher","DOI":"10.21437\/INTERSPEECH.2020-2408"}],"event":{"name":"MMAsia '24: ACM Multimedia Asia","location":"Auckland New Zealand","acronym":"MMAsia '24","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 6th ACM International Conference on Multimedia in 
Asia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3696409.3700183","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3696409.3700183","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:10:11Z","timestamp":1750295411000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3696409.3700183"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,12,3]]},"references-count":29,"alternative-id":["10.1145\/3696409.3700183","10.1145\/3696409"],"URL":"https:\/\/doi.org\/10.1145\/3696409.3700183","relation":{},"subject":[],"published":{"date-parts":[[2024,12,3]]},"assertion":[{"value":"2024-12-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}