{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,8]],"date-time":"2026-05-08T16:38:49Z","timestamp":1778258329687,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":59,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,10,26]],"date-time":"2023-10-26T00:00:00Z","timestamp":1698278400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,10,26]]},"DOI":"10.1145\/3581783.3611974","type":"proceedings-article","created":{"date-parts":[[2023,10,27]],"date-time":"2023-10-27T07:27:40Z","timestamp":1698391660000},"page":"833-842","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":24,"title":["AcFormer: An Aligned and Compact Transformer for Multimodal Sentiment Analysis"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0004-8109-2943","authenticated-orcid":false,"given":"Daoming","family":"Zong","sequence":"first","affiliation":[{"name":"SenseTime Group Limited, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-0161-4838","authenticated-orcid":false,"given":"Chaoyue","family":"Ding","sequence":"additional","affiliation":[{"name":"SenseTime Group Limited, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-4490-2157","authenticated-orcid":false,"given":"Baoxiang","family":"Li","sequence":"additional","affiliation":[{"name":"SenseTime Group Limited, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-2492-3528","authenticated-orcid":false,"given":"Jiakui","family":"Li","sequence":"additional","affiliation":[{"name":"SenseTime Group Limited, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-2374-8682","authenticated-orcid":false,"given":"Ken","family":"Zheng","sequence":"additional","affiliation":[{"name":"SenseTime Group Limited, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-0746-9249","authenticated-orcid":false,"given":"Qunyan","family":"Zhou","sequence":"additional","affiliation":[{"name":"SenseTime Group Limited, Beijing, China"}]}],"member":"320","published-online":{"date-parts":[[2023,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","first-page":"12449","article-title":"wav2vec 2.0: A framework for self-supervised learning of speech representations","volume":"33","author":"Baevski Alexei","year":"2020","unstructured":"Alexei Baevski, Yuhao Zhou, Abdelrahman Mohamed, and Michael Auli. 2020. wav2vec 2.0: A framework for self-supervised learning of speech representations. NeurIPS, Vol. 33 (2020), 12449--12460.","journal-title":"NeurIPS"},{"key":"e_1_3_2_1_2_1","first-page":"423","article-title":"Multimodal machine learning: A survey and taxonomy","volume":"41","author":"Tadas Baltruvs","year":"2018","unstructured":"Tadas Baltruvs aitis, Chaitanya Ahuja, and Louis-Philippe Morency. 2018. Multimodal machine learning: A survey and taxonomy. IEEE Transactions on Pattern Analysis and Machine Intelligence, Vol. 41, 2 (2018), 423--443.","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"e_1_3_2_1_3_1","volume-title":"Openface: an open source facial behavior analysis toolkit","author":"Tadas Baltruvs","unstructured":"Tadas Baltruvs aitis, Peter Robinson, and Louis-Philippe Morency. 2016. Openface: an open source facial behavior analysis toolkit. In WACV. IEEE, 1--10."},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.knosys.2021.107134"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1007\/s10579-008-9076-6"},{"key":"e_1_3_2_1_6_1","volume-title":"Towards multimodal sarcasm detection (an _obviously_ perfect paper). arXiv preprint arXiv:1906.01815","author":"Castro Santiago","year":"2019","unstructured":"Santiago Castro, Devamanyu Hazarika, Ver\u00f3nica P\u00e9rez-Rosas, Roger Zimmermann, Rada Mihalcea, and Soujanya Poria. 2019. Towards multimodal sarcasm detection (an _obviously_ perfect paper). arXiv preprint arXiv:1906.01815 (2019)."},{"key":"e_1_3_2_1_7_1","volume-title":"Asif Ekbal, and Pushpak Bhattacharyya.","author":"Chauhan Dushyant Singh","year":"2019","unstructured":"Dushyant Singh Chauhan, Md Shad Akhtar, Asif Ekbal, and Pushpak Bhattacharyya. 2019. Context-aware interactive attention for multi-modal sentiment and emotion analysis. In EMNLP-IJCNLP. 5647--5657."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1145\/3136755.3136801"},{"key":"e_1_3_2_1_9_1","volume-title":"Emotionlines: An emotion corpus of multi-party conversations. arXiv preprint arXiv:1802.08379","author":"Chen Sheng-Yeh","year":"2018","unstructured":"Sheng-Yeh Chen, Chao-Chun Hsu, Chuan-Chun Kuo, Lun-Wei Ku, et al. 2018. Emotionlines: An emotion corpus of multi-party conversations. arXiv preprint arXiv:1802.08379 (2018)."},{"key":"e_1_3_2_1_10_1","unstructured":"Ting Chen Simon Kornblith Mohammad Norouzi and Geoffrey Hinton. 2020. A simple framework for contrastive learning of visual representations. In ICML. PMLR 1597--1607."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"crossref","unstructured":"Gilles Degottex John Kane Thomas Drugman Tuomo Raitio and Stefan Scherer. 2014. COVAREP-A collaborative voice analysis repository for speech technologies. In ICASSP. 960--964.","DOI":"10.1109\/ICASSP.2014.6853739"},{"key":"e_1_3_2_1_12_1","volume-title":"Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805","author":"Devlin Jacob","year":"2018","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2018. Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805 (2018)."},{"key":"e_1_3_2_1_13_1","unstructured":"Alexey Dosovitskiy Lucas Beyer Alexander Kolesnikov Dirk Weissenborn Xiaohua Zhai Thomas Unterthiner Mostafa Dehghani Matthias Minderer Georg Heigold Sylvain Gelly et al. 2020. An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv:2010.11929 (2020)."},{"key":"e_1_3_2_1_14_1","volume-title":"Joint robust voicing detection and pitch estimation based on residual harmonics. arXiv preprint arXiv:2001.00459","author":"Drugman Thomas","year":"2019","unstructured":"Thomas Drugman and Abeer Alwan. 2019. Joint robust voicing detection and pitch estimation based on residual harmonics. arXiv preprint arXiv:2001.00459 (2019)."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/TASL.2011.2170835"},{"key":"e_1_3_2_1_16_1","volume-title":"Simcse: Simple contrastive learning of sentence embeddings. arXiv preprint arXiv:2104.08821","author":"Gao Tianyu","year":"2021","unstructured":"Tianyu Gao, Xingcheng Yao, and Danqi Chen. 2021. Simcse: Simple contrastive learning of sentence embeddings. arXiv preprint arXiv:2104.08821 (2021)."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"crossref","unstructured":"Jiwei Guo Jiajia Tang Weichen Dai Yu Ding and Wanzeng Kong. 2022. Dynamically Adjust Word Representations Using Unaligned Multimodal Information. In ACM Multimedia. 3394--3402.","DOI":"10.1145\/3503161.3548137"},{"key":"e_1_3_2_1_18_1","volume-title":"Louis-Philippe Morency, et al.","author":"Hasan Md Kamrul","year":"2019","unstructured":"Md Kamrul Hasan, Wasifur Rahman, Amir Zadeh, Jianyuan Zhong, Md Iftekhar Tanveer, Louis-Philippe Morency, et al. 2019. UR-FUNNY: A multimodal language dataset for understanding humor. arXiv preprint arXiv:1904.06618 (2019)."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413678"},{"key":"e_1_3_2_1_20_1","unstructured":"Kaiming He Haoqi Fan Yuxin Wu Saining Xie and Ross Girshick. 2020. Momentum contrast for unsupervised visual representation learning. In CVPR. 9729--9738."},{"key":"e_1_3_2_1_21_1","unstructured":"Penn Phonetics Laboratory. 2013. p2fa-vislab. https:\/\/github.com\/ucbvislab\/p2fa-vislab\/. A script for audio\/transcript alignment."},{"key":"e_1_3_2_1_22_1","volume-title":"Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation. In ICML. PMLR, 12888--12900.","author":"Li Junnan","year":"2022","unstructured":"Junnan Li, Dongxu Li, Caiming Xiong, and Steven Hoi. 2022. Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation. In ICML. PMLR, 12888--12900."},{"key":"e_1_3_2_1_23_1","first-page":"9694","article-title":"Align before fuse: Vision and language representation learning with momentum distillation","volume":"34","author":"Li Junnan","year":"2021","unstructured":"Junnan Li, Ramprasaath Selvaraju, Akhilesh Gotmare, Shafiq Joty, Caiming Xiong, and Steven Chu Hong Hoi. 2021. Align before fuse: Vision and language representation learning with momentum distillation. NeurIPS, Vol. 34 (2021), 9694--9705.","journal-title":"NeurIPS"},{"key":"e_1_3_2_1_24_1","volume-title":"AmirAli Bagher Zadeh, and Louis-Philippe Morency","author":"Liang Paul Pu","year":"2018","unstructured":"Paul Pu Liang, Ziyin Liu, AmirAli Bagher Zadeh, and Louis-Philippe Morency. 2018. Multimodal Language Analysis with Recurrent Multistage Fusion. In EMNLP. 150--161."},{"key":"e_1_3_2_1_25_1","volume-title":"Umt: Unified multi-modal transformers for joint video moment retrieval and highlight detection. In CVPR. 3042--3051.","author":"Liu Ye","year":"2022","unstructured":"Ye Liu, Siyuan Li, Yang Wu, Chang-Wen Chen, Ying Shan, and Xiaohu Qie. 2022. Umt: Unified multi-modal transformers for joint video moment retrieval and highlight detection. In CVPR. 3042--3051."},{"key":"e_1_3_2_1_26_1","volume-title":"Paul Pu Liang, AmirAli Bagher Zadeh, and Louis-Philippe Morency.","author":"Liu Zhun","year":"2018","unstructured":"Zhun Liu, Ying Shen, Varun Bharadhwaj Lakshminarasimhan, Paul Pu Liang, AmirAli Bagher Zadeh, and Louis-Philippe Morency. 2018. Efficient Low-rank Multimodal Fusion With Modality-Specific Factors. In ACL. 2247--2256."},{"key":"e_1_3_2_1_27_1","volume-title":"Decoupled weight decay regularization. arXiv preprint arXiv:1711.05101","author":"Loshchilov Ilya","year":"2017","unstructured":"Ilya Loshchilov and Frank Hutter. 2017. Decoupled weight decay regularization. arXiv preprint arXiv:1711.05101 (2017)."},{"key":"e_1_3_2_1_28_1","volume-title":"NeurIPS","volume":"32","author":"Lu Jiasen","year":"2019","unstructured":"Jiasen Lu, Dhruv Batra, Devi Parikh, and Stefan Lee. 2019. Vilbert: Pretraining task-agnostic visiolinguistic representations for vision-and-language tasks. NeurIPS, Vol. 32 (2019)."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"crossref","unstructured":"Fengmao Lv Xiang Chen Yanyong Huang Lixin Duan and Guosheng Lin. 2021. Progressive modality reinforcement for human multimodal emotion recognition from unaligned multimodal sequences. In CVPR. 2554--2562.","DOI":"10.1109\/CVPR46437.2021.00258"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"crossref","unstructured":"Sijie Mai Haifeng Hu and Songlong Xing. 2019a. Divide conquer and combine: Hierarchical feature fusion network with local and global perspectives for multimodal affective computing. In ACL. 481--492.","DOI":"10.18653\/v1\/P19-1046"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i01.5347"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2019.2925966"},{"key":"e_1_3_2_1_33_1","first-page":"14200","article-title":"Attention bottlenecks for multimodal fusion","volume":"34","author":"Nagrani Arsha","year":"2021","unstructured":"Arsha Nagrani, Shan Yang, Anurag Arnab, Aren Jansen, Cordelia Schmid, and Chen Sun. 2021. Attention bottlenecks for multimodal fusion. NeurIPS, Vol. 34 (2021), 14200--14213.","journal-title":"NeurIPS"},{"key":"e_1_3_2_1_34_1","volume-title":"Representation learning with contrastive predictive coding. arXiv preprint arXiv:1807.03748","author":"van den Oord Aaron","year":"2018","unstructured":"Aaron van den Oord, Yazhe Li, and Oriol Vinyals. 2018. Representation learning with contrastive predictive coding. arXiv preprint arXiv:1807.03748 (2018)."},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-2680"},{"key":"e_1_3_2_1_36_1","volume-title":"Glove: Global vectors for word representation. In EMNLP. 1532--1543.","author":"Pennington Jeffrey","year":"2014","unstructured":"Jeffrey Pennington, Richard Socher, and Christopher D Manning. 2014. Glove: Global vectors for word representation. In EMNLP. 1532--1543."},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.33016892"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"crossref","unstructured":"Soujanya Poria Erik Cambria and Alexander Gelbukh. 2015. Deep convolutional neural network textual features and multiple kernel learning for utterance-level multimodal sentiment analysis. In EMNLP. 2539--2544.","DOI":"10.18653\/v1\/D15-1303"},{"key":"e_1_3_2_1_39_1","volume-title":"Meld: A multimodal multi-party dataset for emotion recognition in conversations. arXiv preprint arXiv:1810.02508","author":"Poria Soujanya","year":"2018","unstructured":"Soujanya Poria, Devamanyu Hazarika, Navonil Majumder, Gautam Naik, Erik Cambria, and Rada Mihalcea. 2018. Meld: A multimodal multi-party dataset for emotion recognition in conversations. arXiv preprint arXiv:1810.02508 (2018)."},{"key":"e_1_3_2_1_40_1","volume-title":"Extending long short-term memory for multi-view structured learning","author":"Rajagopalan Shyam Sundar","unstructured":"Shyam Sundar Rajagopalan, Louis-Philippe Morency, Tadas Baltrusaitis, and Roland Goecke. 2016. Extending long short-term memory for multi-view structured learning. In ECCV. Springer, 338--353."},{"key":"e_1_3_2_1_41_1","volume-title":"What the face reveals: Basic and applied studies of spontaneous expression using the Facial Action Coding System (FACS)","author":"Rosenberg Erika L","unstructured":"Erika L Rosenberg and Paul Ekman. 2020. What the face reveals: Basic and applied studies of spontaneous expression using the Facial Action Coding System (FACS). Oxford University Press."},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"crossref","unstructured":"Hao Sun Hongyi Wang Jiaqing Liu Yen-Wei Chen and Lanfen Lin. 2022. CubeMLP: An MLP-based model for multimodal sentiment analysis and depression estimation. In ACM Multimedia. 3722--3729.","DOI":"10.1145\/3503161.3548025"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/TAFFC.2023.3274829"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i05.6431"},{"key":"e_1_3_2_1_45_1","unstructured":"Hugo Touvron Matthieu Cord Matthijs Douze Francisco Massa Alexandre Sablayrolles and Herv\u00e9 J\u00e9gou. 2021. Training data-efficient image transformers & distillation through attention. In ICML. PMLR 10347--10357."},{"key":"e_1_3_2_1_46_1","volume-title":"ACL","volume":"2019","author":"Hubert Tsai Yao-Hung","year":"2019","unstructured":"Yao-Hung Hubert Tsai, Shaojie Bai, Paul Pu Liang, J Zico Kolter, Louis-Philippe Morency, and Ruslan Salakhutdinov. 2019. Multimodal transformer for unaligned multimodal language sequences. In ACL, Vol. 2019. NIH Public Access, 6558."},{"key":"e_1_3_2_1_47_1","volume-title":"Amir Zadeh, Louis-Philippe Morency, and Ruslan Salakhutdinov.","author":"Hubert Tsai Yao-Hung","year":"2018","unstructured":"Yao-Hung Hubert Tsai, Paul Pu Liang, Amir Zadeh, Louis-Philippe Morency, and Ruslan Salakhutdinov. 2018. Learning factorized multimodal representations. arXiv preprint arXiv:1806.06176 (2018)."},{"key":"e_1_3_2_1_48_1","article-title":"Visualizing data using t-SNE","volume":"9","author":"der Maaten Laurens Van","year":"2008","unstructured":"Laurens Van der Maaten and Geoffrey Hinton. 2008. Visualizing data using t-SNE. Journal of Machine Learning Research, Vol. 9, 11 (2008).","journal-title":"Journal of Machine Learning Research"},{"key":"e_1_3_2_1_49_1","volume-title":"NeurIPS","volume":"30","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, \u0141ukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. NeurIPS, Vol. 30 (2017)."},{"key":"e_1_3_2_1_50_1","unstructured":"Tongzhou Wang and Phillip Isola. 2020. Understanding contrastive representation learning through alignment and uniformity on the hypersphere. In ICML. PMLR 9929--9939."},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.33017216"},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1007\/s10462-022-10144-1"},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"crossref","unstructured":"Yang Wu Zijie Lin Yanyan Zhao Bing Qin and Li-Nan Zhu. 2021. A text-centered shared-private framework via cross-modal prediction for multimodal sentiment analysis. In Findings of the Association for Computational Linguistics. 4730--4738.","DOI":"10.18653\/v1\/2021.findings-acl.417"},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"crossref","unstructured":"Dingkang Yang Shuai Huang Haopeng Kuang Yangtao Du and Lihua Zhang. 2022. Disentangled Representation Learning for Multimodal Emotion Recognition. In ACM Multimedia. 1642--1651.","DOI":"10.1145\/3503161.3547754"},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"crossref","unstructured":"Amir Zadeh Minghai Chen Soujanya Poria Erik Cambria and Louis-Philippe Morency. 2017. Tensor Fusion Network for Multimodal Sentiment Analysis. In EMNLP. 1103--1114.","DOI":"10.18653\/v1\/D17-1115"},{"key":"e_1_3_2_1_56_1","volume-title":"Navonil Mazumder, Soujanya Poria, Erik Cambria, and Louis-Philippe Morency.","author":"Zadeh Amir","year":"2018","unstructured":"Amir Zadeh, Paul Pu Liang, Navonil Mazumder, Soujanya Poria, Erik Cambria, and Louis-Philippe Morency. 2018a. Memory fusion network for multi-view sequential learning. In AAAI, Vol. 32."},{"key":"e_1_3_2_1_57_1","volume-title":"Soujanya Poria, Prateek Vij, Erik Cambria, and Louis-Philippe Morency.","author":"Zadeh Amir","year":"2018","unstructured":"Amir Zadeh, Paul Pu Liang, Soujanya Poria, Prateek Vij, Erik Cambria, and Louis-Philippe Morency. 2018c. Multi-attention recurrent network for human communication comprehension. In AAAI, Vol. 32."},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"publisher","DOI":"10.1109\/MIS.2016.94"},{"key":"e_1_3_2_1_59_1","volume-title":"Soujanya Poria, Erik Cambria, and Louis-Philippe Morency.","author":"Bagher Zadeh AmirAli","year":"2018","unstructured":"AmirAli Bagher Zadeh, Paul Pu Liang, Soujanya Poria, Erik Cambria, and Louis-Philippe Morency. 2018b. Multimodal language analysis in the wild: Cmu-mosei dataset and interpretable dynamic fusion graph. In ACL. 2236--2246."}],"event":{"name":"MM '23: The 31st ACM International Conference on Multimedia","location":"Ottawa ON Canada","acronym":"MM '23","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 31st ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3611974","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3581783.3611974","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T00:08:43Z","timestamp":1755821323000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3611974"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,10,26]]},"references-count":59,"alternative-id":["10.1145\/3581783.3611974","10.1145\/3581783"],"URL":"https:\/\/doi.org\/10.1145\/3581783.3611974","relation":{},"subject":[],"published":{"date-parts":[[2023,10,26]]},"assertion":[{"value":"2023-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}