{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,8]],"date-time":"2026-03-08T01:27:04Z","timestamp":1772933224062,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":45,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,6,12]],"date-time":"2023-06-12T00:00:00Z","timestamp":1686528000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,6,12]]},"DOI":"10.1145\/3591106.3592296","type":"proceedings-article","created":{"date-parts":[[2023,6,8]],"date-time":"2023-06-08T22:33:38Z","timestamp":1686263618000},"page":"622-626","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":10,"title":["CLAP: Contrastive Language-Audio Pre-training Model for Multi-modal Sentiment Analysis"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0003-2923-5890","authenticated-orcid":false,"given":"Tianqi","family":"Zhao","sequence":"first","affiliation":[{"name":"Zhejiang University, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6177-3707","authenticated-orcid":false,"given":"Ming","family":"Kong","sequence":"additional","affiliation":[{"name":"Zhejiang University, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-6406-0077","authenticated-orcid":false,"given":"Tian","family":"Liang","sequence":"additional","affiliation":[{"name":"Zhejiang University, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2405-6776","authenticated-orcid":false,"given":"Qiang","family":"Zhu","sequence":"additional","affiliation":[{"name":"Zhejiang University, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5524-5185","authenticated-orcid":false,"given":"Kun","family":"Kuang","sequence":"additional","affiliation":[{"name":"Zhejiang University, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2139-8807","authenticated-orcid":false,"given":"Fei","family":"Wu","sequence":"additional","affiliation":[{"name":"Zhejiang University, China"}]}],"member":"320","published-online":{"date-parts":[[2023,6,12]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"TEASEL: A Transformer-Based Speech-Prefixed Language Model. arXiv preprint arXiv:2109.05522","author":"Arjmand Mehdi","year":"2021","unstructured":"Mehdi Arjmand, Mohammad\u00a0Javad Dousti, and Hadi Moradi. 2021. TEASEL: A Transformer-Based Speech-Prefixed Language Model. arXiv preprint arXiv:2109.05522 (2021)."},{"key":"e_1_3_2_1_2_1","volume-title":"Multimodal machine learning: A survey and taxonomy","author":"Baltru\u0161aitis Tadas","year":"2018","unstructured":"Tadas Baltru\u0161aitis, Chaitanya Ahuja, and Louis-Philippe Morency. 2018. Multimodal machine learning: A survey and taxonomy. IEEE transactions on pattern analysis and machine intelligence 41, 2 (2018), 423\u2013443."},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1145\/3136755.3136801"},{"key":"e_1_3_2_1_4_1","volume-title":"Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805","author":"Devlin Jacob","year":"2018","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2018. Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805 (2018)."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P18-1207"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1145\/3462244.3479919"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.emnlp-main.723"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413678"},{"key":"e_1_3_2_1_9_1","volume-title":"Deep multimodal multilinear fusion with high-order polynomial pooling. Advances in Neural Information Processing Systems 32","author":"Hou Ming","year":"2019","unstructured":"Ming Hou, Jiajia Tang, Jianhai Zhang, Wanzeng Kong, and Qibin Zhao. 2019. Deep multimodal multilinear fusion with high-order polynomial pooling. Advances in Neural Information Processing Systems 32 (2019)."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9414460"},{"key":"e_1_3_2_1_11_1","volume-title":"UniMSE: Towards Unified Multimodal Sentiment Analysis and Emotion Recognition. arXiv preprint arXiv:2211.11256","author":"Hu Guimin","year":"2022","unstructured":"Guimin Hu, Ting-En Lin, Yi Zhao, Guangming Lu, Yuchuan Wu, and Yongbin Li. 2022. UniMSE: Towards Unified Multimodal Sentiment Analysis and Emotion Recognition. arXiv preprint arXiv:2211.11256 (2022)."},{"key":"e_1_3_2_1_12_1","first-page":"1165","article-title":"Summary of Multi-modal Sentiment Analysis Technology","volume":"15","author":"Jiming LIU","year":"2021","unstructured":"LIU Jiming, ZHANG Peixiang, LIU Ying, ZHANG Weidong, and FANG Jie. 2021. Summary of Multi-modal Sentiment Analysis Technology. Journal of Frontiers of Computer Science & Technology 15, 7 (2021), 1165.","journal-title":"Journal of Frontiers of Computer Science & Technology"},{"key":"e_1_3_2_1_13_1","volume-title":"Niels Da\u00a0Vitoria Lobo, and Mubarak Shah","author":"Khan Aisha\u00a0Urooj","year":"2020","unstructured":"Aisha\u00a0Urooj Khan, Amir Mazaheri, Niels Da\u00a0Vitoria Lobo, and Mubarak Shah. 2020. Mmft-bert: Multimodal fusion transformer with bert encodings for visual question answering. arXiv preprint arXiv:2010.14095 (2020)."},{"key":"e_1_3_2_1_14_1","volume-title":"International Conference on Machine Learning. PMLR, 5583\u20135594","author":"Kim Wonjae","year":"2021","unstructured":"Wonjae Kim, Bokyung Son, and Ildoo Kim. 2021. Vilt: Vision-and-language transformer without convolution or region supervision. In International Conference on Machine Learning. PMLR, 5583\u20135594."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11042-018-6445-z"},{"key":"e_1_3_2_1_16_1","volume-title":"Roberta: A robustly optimized bert pretraining approach. arXiv preprint arXiv:1907.11692","author":"Liu Yinhan","year":"2019","unstructured":"Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, and Veselin Stoyanov. 2019. Roberta: A robustly optimized bert pretraining approach. arXiv preprint arXiv:1907.11692 (2019)."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P18-1209"},{"key":"e_1_3_2_1_18_1","volume-title":"Vilbert: Pretraining task-agnostic visiolinguistic representations for vision-and-language tasks. Advances in neural information processing systems 32","author":"Lu Jiasen","year":"2019","unstructured":"Jiasen Lu, Dhruv Batra, Devi Parikh, and Stefan Lee. 2019. Vilbert: Pretraining task-agnostic visiolinguistic representations for vision-and-language tasks. Advances in neural information processing systems 32 (2019)."},{"key":"e_1_3_2_1_19_1","volume-title":"Voice recognition algorithms using mel frequency cepstral coefficient (MFCC) and dynamic time warping (DTW) techniques. arXiv preprint arXiv:1003.4083","author":"Muda Lindasalwa","year":"2010","unstructured":"Lindasalwa Muda, Mumtaj Begam, and Irraivan Elamvazuthi. 2010. Voice recognition algorithms using mel frequency cepstral coefficient (MFCC) and dynamic time warping (DTW) techniques. arXiv preprint arXiv:1003.4083 (2010)."},{"key":"e_1_3_2_1_20_1","unstructured":"Jiquan Ngiam Aditya Khosla Mingyu Kim Juhan Nam Honglak Lee and Andrew\u00a0Y Ng. 2011. Multimodal deep learning. In ICML."},{"key":"e_1_3_2_1_21_1","volume-title":"Representation learning with contrastive predictive coding. arXiv preprint arXiv:1807.03748","author":"van\u00a0den Oord Aaron","year":"2018","unstructured":"Aaron van\u00a0den Oord, Yazhe Li, and Oriol Vinyals. 2018. Representation learning with contrastive predictive coding. arXiv preprint arXiv:1807.03748 (2018)."},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2015.7178964"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/D14-1162"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D15-1303"},{"key":"e_1_3_2_1_25_1","volume-title":"Evaluation: from precision, recall and F-measure to ROC, informedness, markedness and correlation. arXiv preprint arXiv:2010.16061","author":"Powers MW","year":"2020","unstructured":"David\u00a0MW Powers. 2020. Evaluation: from precision, recall and F-measure to ROC, informedness, markedness and correlation. arXiv preprint arXiv:2010.16061 (2020)."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9054441"},{"key":"e_1_3_2_1_27_1","volume-title":"Imagebert: Cross-modal pre-training with large-scale weak-supervised image-text data. arXiv preprint arXiv:2001.07966","author":"Qi Di","year":"2020","unstructured":"Di Qi, Lin Su, Jia Song, Edward Cui, Taroon Bharti, and Arun Sacheti. 2020. Imagebert: Cross-modal pre-training with large-scale weak-supervised image-text data. arXiv preprint arXiv:2001.07966 (2020)."},{"key":"e_1_3_2_1_28_1","volume-title":"International Conference on Machine Learning. PMLR, 8748\u20138763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong\u00a0Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, 2021. Learning transferable visual models from natural language supervision. In International Conference on Machine Learning. PMLR, 8748\u20138763."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-main.214"},{"key":"e_1_3_2_1_30_1","volume-title":"Vl-bert: Pre-training of generic visual-linguistic representations. arXiv preprint arXiv:1908.08530","author":"Su Weijie","year":"2019","unstructured":"Weijie Su, Xizhou Zhu, Yue Cao, Bin Li, Lewei Lu, Furu Wei, and Jifeng Dai. 2019. Vl-bert: Pre-training of generic visual-linguistic representations. arXiv preprint arXiv:1908.08530 (2019)."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i05.6431"},{"key":"e_1_3_2_1_32_1","volume-title":"Lxmert: Learning cross-modality encoder representations from transformers. arXiv preprint arXiv:1908.07490","author":"Tan Hao","year":"2019","unstructured":"Hao Tan and Mohit Bansal. 2019. Lxmert: Learning cross-modality encoder representations from transformers. arXiv preprint arXiv:1908.07490 (2019)."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P19-1656"},{"key":"e_1_3_2_1_34_1","volume-title":"Learning Factorized Multimodal Representations. In International Conference on Representation Learning.","author":"Tsai Hung\u00a0Hubert","year":"2019","unstructured":"Yao-Hung\u00a0Hubert Tsai, Paul\u00a0Pu Liang, Amir Zadeh, Louis-Philippe Morency, and Ruslan Salakhutdinov. 2019. Learning Factorized Multimodal Representations. In International Conference on Representation Learning."},{"key":"e_1_3_2_1_35_1","volume-title":"Attention is all you need. Advances in neural information processing systems 30","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan\u00a0N Gomez, \u0141ukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. Advances in neural information processing systems 30 (2017)."},{"key":"e_1_3_2_1_36_1","volume-title":"Advantages of the mean absolute error (MAE) over the root mean square error (RMSE) in assessing average model performance. Climate research 30, 1","author":"Willmott J","year":"2005","unstructured":"Cort\u00a0J Willmott and Kenji Matsuura. 2005. Advantages of the mean absolute error (MAE) over the root mean square error (RMSE) in assessing average model performance. Climate research 30, 1 (2005), 79\u201382."},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/MIS.2013.34"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"crossref","first-page":"10","DOI":"10.1109\/T-AFFC.2010.16","article-title":"Emotion recognition of affective speech based on multiple classifiers using acoustic-prosodic information and semantic labels","volume":"2","author":"Wu Chung-Hsien","year":"2010","unstructured":"Chung-Hsien Wu and Wei-Bin Liang. 2010. Emotion recognition of affective speech based on multiple classifiers using acoustic-prosodic information and semantic labels. IEEE Transactions on Affective Computing 2, 1 (2010), 10\u201321.","journal-title":"IEEE Transactions on Affective Computing"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.naacl-main.79"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i4.16431"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i12.17289"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"crossref","unstructured":"Amir Zadeh Minghai Chen Soujanya Poria Erik Cambria and Louis-Philippe Morency. 2017. Tensor Fusion Network for Multimodal Sentiment Analysis. In EMNLP.","DOI":"10.18653\/v1\/D17-1115"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P18-1208"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1109\/MIS.2016.94"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1002\/widm.1253"}],"event":{"name":"ICMR '23: International Conference on Multimedia Retrieval","location":"Thessaloniki Greece","acronym":"ICMR '23","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 2023 ACM International Conference on Multimedia Retrieval"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3591106.3592296","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3591106.3592296","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T16:37:30Z","timestamp":1750178250000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3591106.3592296"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,6,12]]},"references-count":45,"alternative-id":["10.1145\/3591106.3592296","10.1145\/3591106"],"URL":"https:\/\/doi.org\/10.1145\/3591106.3592296","relation":{},"subject":[],"published":{"date-parts":[[2023,6,12]]},"assertion":[{"value":"2023-06-12","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}