{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,11]],"date-time":"2025-12-11T07:43:32Z","timestamp":1765439012256,"version":"3.44.0"},"publisher-location":"New York, NY, USA","reference-count":25,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"funder":[{"DOI":"10.13039\/501100006374","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["No.62276259, No.62201572, No.U21B2010, No.62271083"],"award-info":[{"award-number":["No.62276259, No.62201572, No.U21B2010, No.62271083"]}],"id":[{"id":"10.13039\/501100006374","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3689062.3689087","type":"proceedings-article","created":{"date-parts":[[2024,10,23]],"date-time":"2024-10-23T18:29:19Z","timestamp":1729708159000},"page":"52-59","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":2,"title":["Social Perception Prediction for MuSe 2024: Joint Learning of Multiple Perceptions"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0005-3978-9373","authenticated-orcid":false,"given":"Zhuofan","family":"Wen","sequence":"first","affiliation":[{"name":"School of Artificial Intelligence, University of Chinese Academy of Sciences &amp; Institute of Automation, Chinese Academy of Sciences, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-2065-0405","authenticated-orcid":false,"given":"Hailiang","family":"Yao","sequence":"additional","affiliation":[{"name":"Tianjin Normal University &amp; Institute of Automation, Chinese Academy of Sciences, Tianjin, 
China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-8448-5507","authenticated-orcid":false,"given":"Shun","family":"Chen","sequence":"additional","affiliation":[{"name":"Institute of Automation, Chinese Academy of Sciences &amp; School of Artificial Intelligence, University of Chinese Academy of Sciences, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-3485-3869","authenticated-orcid":false,"given":"Haiyang","family":"Sun","sequence":"additional","affiliation":[{"name":"School of Artificial Intelligence, University of Chinese Academy of Sciences &amp; Institute of Automation, Chinese Academy of Sciences, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9378-3806","authenticated-orcid":false,"given":"Mingyu","family":"Xu","sequence":"additional","affiliation":[{"name":"Institute of Automation, Chinese Academy of Sciences &amp; School of Artificial Intelligence, University of Chinese Academy of Sciences, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7944-3458","authenticated-orcid":false,"given":"Licai","family":"Sun","sequence":"additional","affiliation":[{"name":"School of Artificial Intelligence, University of Chinese Academy of Sciences &amp; Institute of Automation, Chinese Academy of Sciences, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9477-0599","authenticated-orcid":false,"given":"Zheng","family":"Lian","sequence":"additional","affiliation":[{"name":"Institute of Automation, Chinese Academy of Sciences, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1529-1552","authenticated-orcid":false,"given":"Bin","family":"Liu","sequence":"additional","affiliation":[{"name":"Institute of Automation, Chinese Academy of Sciences, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-7931-4754","authenticated-orcid":false,"given":"Fengyu","family":"Zhang","sequence":"additional","affiliation":[{"name":"Institute of Automation, Chinese Academy of Sciences &amp; School of Artificial Intelligence, 
University of Chinese Academy of Sciences, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-9043-5065","authenticated-orcid":false,"given":"Siyuan","family":"Zhang","sequence":"additional","affiliation":[{"name":"Institute of Automation, Chinese Academy of Sciences &amp; School of Artificial Intelligence, University of Chinese Academy of Sciences, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9344-6428","authenticated-orcid":false,"given":"Jianhua","family":"Tao","sequence":"additional","affiliation":[{"name":"The Department of Automation, Tsinghua University &amp; Beijing National Research Center for Information Science and Technology, Beijing, China"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"crossref","unstructured":"Shahin Amiriparian Lukas Christ Alexander Kathan Maurice Gerczuk Niklas M\u00fcller Steffen Klug Lukas Stappen Andreas K\u00f6nig Erik Cambria Bj\u00f6rn Schuller and Simone Eulitz. 2024. The MuSe 2024 Multimodal Sentiment Analysis Challenge: Social Perception and Humor Recognition. arXiv:2406.07753 [cs.AI] https:\/\/arxiv.org\/abs\/2406.07753","DOI":"10.1145\/3689062.3689088"},{"key":"e_1_3_2_1_2_1","volume-title":"wav2vec 2.0: A framework for self-supervised learning of speech representations. Advances in neural information processing systems 33","author":"Baevski Alexei","year":"2020","unstructured":"Alexei Baevski, Yuhao Zhou, Abdelrahman Mohamed, and Michael Auli. 2020. wav2vec 2.0: A framework for self-supervised learning of speech representations. 
Advances in neural information processing systems 33 (2020), 12449--12460."},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/JSTSP.2022.3188113"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"crossref","unstructured":"Zhe Chen Weiyun Wang Hao Tian Shenglong Ye Zhangwei Gao Erfei Cui Wenwen Tong Kongzhi Hu Jiapeng Luo Zheng Ma et al. 2024. How Far Are We to GPT-4V? Closing the Gap to Commercial Multimodal Models with Open- Source Suites. arXiv preprint arXiv:2404.16821 (2024).","DOI":"10.1007\/s11432-024-4231-5"},{"key":"e_1_3_2_1_7_1","volume-title":"InternVL: Scaling up Vision Foundation Models and Aligning for Generic Visual-Linguistic Tasks. arXiv preprint arXiv:2312.14238","author":"Chen Zhe","year":"2023","unstructured":"Zhe Chen, Jiannan Wu, Wenhai Wang, Weijie Su, Guo Chen, Sen Xing, Muyan Zhong, Qinglong Zhang, Xizhou Zhu, Lewei Lu, Bin Li, Ping Luo, Tong Lu, Yu Qiao, and Jifeng Dai. 2023. InternVL: Scaling up Vision Foundation Models and Aligning for Generic Visual-Linguistic Tasks. arXiv preprint arXiv:2312.14238 (2023)."},{"key":"e_1_3_2_1_8_1","volume-title":"BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. CoRR abs\/1810.04805","author":"Devlin Jacob","year":"2018","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2018. BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. CoRR abs\/1810.04805 (2018). arXiv:1810.04805 http:\/\/arxiv.org\/abs\/1810.04805"},{"key":"e_1_3_2_1_9_1","volume-title":"Eva-02: A visual representation for neon genesis. Image and Vision Computing","author":"Fang Yuxin","year":"2024","unstructured":"Yuxin Fang, Quan Sun, Xinggang Wang, Tiejun Huang, Xinlong Wang, and Yue Cao. 2024. Eva-02: A visual representation for neon genesis. 
Image and Vision Computing (2024), 105171."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1145\/3606039.3613102"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1145\/3551876.3554811"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1145\/3551876.3554811"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1145\/3606039.3613112"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW59228.2023.00623"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1145\/3475957.3484457"},{"key":"e_1_3_2_1_16_1","unstructured":"Maxime Oquab Timoth\u00e9e Darcet Th\u00e9o Moutakanni Huy Vo Marc Szafraniec Vasil Khalidov Pierre Fernandez Daniel Haziza Francisco Massa Alaaeldin El-Nouby Mahmoud Assran Nicolas Ballas Wojciech Galuba Russell Howes Po-Yao Huang Shang-Wen Li Ishan Misra Michael Rabbat Vasu Sharma Gabriel Synnaeve Hu Xu Herv\u00e9 Jegou Julien Mairal Patrick Labatut Armand Joulin and Piotr Bojanowski. 2023. DINOv2: Learning Robust Visual Features without Supervision. arXiv:2304.07193 [cs.CV]"},{"key":"e_1_3_2_1_17_1","volume-title":"International conference on machine learning. PMLR, 8748--8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PMLR, 8748--8763."},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.2212.04356"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1145\/3423327.3423672"},{"key":"e_1_3_2_1_20_1","volume-title":"Eva-clip: Improved training techniques for clip at scale. arXiv preprint arXiv:2303.15389","author":"Sun Quan","year":"2023","unstructured":"Quan Sun, Yuxin Fang, Ledell Wu, Xinlong Wang, and Yue Cao. 2023. 
Eva-clip: Improved training techniques for clip at scale. arXiv preprint arXiv:2303.15389 (2023)."},{"key":"e_1_3_2_1_21_1","unstructured":"Liang Wang Nan Yang Xiaolong Huang Linjun Yang Rangan Majumder and Furu Wei. 2024. Multilingual E5 Text Embeddings: A Technical Report. arXiv preprint arXiv:2402.05672 (2024)."},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1145\/3606039.3613110"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1145\/3606039.3613107"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1145\/3606039.3613106"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1145\/1322192.1322216"}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Melbourne VIC Australia","acronym":"MM '24"},"container-title":["Proceedings of the 5th on Multimodal Sentiment Analysis Challenge and Workshop: Social Perception and Humor"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3689062.3689087","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3689062.3689087","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,23]],"date-time":"2025-08-23T18:23:40Z","timestamp":1755973420000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3689062.3689087"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":25,"alternative-id":["10.1145\/3689062.3689087","10.1145\/3689062"],"URL":"https:\/\/doi.org\/10.1145\/3689062.3689087","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}