{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,25]],"date-time":"2026-02-25T17:52:07Z","timestamp":1772041927373,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":64,"publisher":"ACM","funder":[{"DOI":"10.13039\/501100003169","name":"South China Normal University","doi-asserted-by":"publisher","award":["24KJ01"],"award-info":[{"award-number":["24KJ01"]}],"id":[{"id":"10.13039\/501100003169","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3755591","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T07:30:51Z","timestamp":1761377451000},"page":"1997-2006","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["Towards Explainable Fusion and Balanced Learning in Multimodal Sentiment Analysis"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0002-6756-7602","authenticated-orcid":false,"given":"Miaosen","family":"Luo","sequence":"first","affiliation":[{"name":"School of Computer Science, South China Normal University, GuangZhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0402-5382","authenticated-orcid":false,"given":"Yuncheng","family":"Jiang","sequence":"additional","affiliation":[{"name":"School of Computer Science, South China Normal University, Guangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9763-375X","authenticated-orcid":false,"given":"Sijie","family":"Mai","sequence":"additional","affiliation":[{"name":"School of Computer Science, South China Normal University, Guangzhou, China"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Peeking inside the black-box: a survey on explainable artificial intelligence (XAI)","author":"Adadi Amina","year":"2018","unstructured":"Amina Adadi and Mohammed Berrada. 2018. Peeking inside the black-box: a survey on explainable artificial intelligence (XAI). IEEE access, Vol. 6 (2018), 52138-52160."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/TFUZZ.2023.3243935"},{"key":"e_1_3_2_1_3_1","volume-title":"Deep variational information bottleneck. arXiv preprint arXiv:1612.00410","author":"Alemi Alexander A","year":"2016","unstructured":"Alexander A Alemi, Ian Fischer, Joshua V Dillon, and Kevin Murphy. 2016. Deep variational information bottleneck. arXiv preprint arXiv:1612.00410 (2016)."},{"key":"e_1_3_2_1_4_1","first-page":"4171","volume-title":"Proceedings of the 2019 conference of the North American chapter of the association for computational linguistics: human language technologies","volume":"1","author":"Devlin Jacob","year":"2019","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2019. Bert: Pre-training of deep bidirectional transformers for language understanding. In Proceedings of the 2019 conference of the North American chapter of the association for computational linguistics: human language technologies, volume 1 (long and short papers). 4171-4186."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.eswa.2023.119619"},{"key":"e_1_3_2_1_6_1","volume-title":"Knowledge-Guided Dynamic Modality Attention Fusion Framework for Multimodal Sentiment Analysis. arXiv preprint arXiv:2410.04491","author":"Feng Xinyu","year":"2024","unstructured":"Xinyu Feng, Yuming Lin, Lihua He, You Li, Liang Chang, and Ya Zhou. 2024. Knowledge-Guided Dynamic Modality Attention Fusion Framework for Multimodal Sentiment Analysis. arXiv preprint arXiv:2410.04491 (2024)."},{"key":"e_1_3_2_1_7_1","volume-title":"International conference on artificial intelligence and statistics. PMLR, 1287-1296","author":"Garreau Damien","year":"2020","unstructured":"Damien Garreau and Ulrike Luxburg. 2020. Explaining the explainer: A first theoretical analysis of LIME. In International conference on artificial intelligence and statistics. PMLR, 1287-1296."},{"key":"e_1_3_2_1_8_1","volume-title":"Classifier-guided Gradient Modulation for Enhanced Multimodal Learning. arXiv preprint arXiv:2411.01409","author":"Guo Zirun","year":"2024","unstructured":"Zirun Guo, Tao Jin, Jingyuan Chen, and Zhou Zhao. 2024. Classifier-guided Gradient Modulation for Enhanced Multimodal Learning. arXiv preprint arXiv:2411.01409 (2024)."},{"key":"e_1_3_2_1_9_1","volume-title":"Improving multimodal fusion with hierarchical mutual information maximization for multimodal sentiment analysis. arXiv preprint arXiv:2109.00412","author":"Han Wei","year":"2021","unstructured":"Wei Han, Hui Chen, and Soujanya Poria. 2021. Improving multimodal fusion with hierarchical mutual information maximization for multimodal sentiment analysis. arXiv preprint arXiv:2109.00412 (2021)."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413678"},{"key":"e_1_3_2_1_11_1","volume-title":"Deberta: Decoding-enhanced bert with disentangled attention. arXiv preprint arXiv:2006.03654","author":"He Pengcheng","year":"2020","unstructured":"Pengcheng He, Xiaodong Liu, Jianfeng Gao, and Weizhu Chen. 2020. Deberta: Decoding-enhanced bert with disentangled attention. arXiv preprint arXiv:2006.03654 (2020)."},{"key":"e_1_3_2_1_12_1","volume-title":"TF-BERT: Tensor-based fusion BERT for multimodal sentiment analysis. Neural Networks","author":"Hou Jingming","year":"2025","unstructured":"Jingming Hou, Nazlia Omar, Sabrina Tiun, Saidah Saad, and Qian He. 2025. TF-BERT: Tensor-based fusion BERT for multimodal sentiment analysis. Neural Networks (2025), 107222."},{"key":"e_1_3_2_1_13_1","volume-title":"UniMSE: Towards unified multimodal sentiment analysis and emotion recognition. arXiv preprint arXiv:2211.11256","author":"Hu Guimin","year":"2022","unstructured":"Guimin Hu, Ting-En Lin, Yi Zhao, Guangming Lu, Yuchuan Wu, and Yongbin Li. 2022. UniMSE: Towards unified multimodal sentiment analysis and emotion recognition. arXiv preprint arXiv:2211.11256 (2022)."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.inffus.2024.102725"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1007\/s40747-021-00637-x"},{"key":"e_1_3_2_1_16_1","volume-title":"Sanaz Nikghadam Hojjati, and Jose Barata","author":"Kalateh Sepideh","year":"2024","unstructured":"Sepideh Kalateh, Luis A Estrada-Jimenez, Sanaz Nikghadam Hojjati, and Jose Barata. 2024. A systematic review on multimodal emotion recognition: building blocks, current state, applications, and challenges. IEEE Access (2024)."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1111\/exsy.13403"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.displa.2023.102563"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.inffus.2020.08.006"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1007\/s10489-024-05623-7"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"crossref","unstructured":"Yihe Liu Ziqi Yuan Huisheng Mao Zhiyun Liang Wanqiuyue Yang Yuanzhe Qiu Tie Cheng Xiaoteng Li Hua Xu and Kai Gao. 2022. Make Acoustic and Visual Cues Matter: CH-SIMS v2.0 Dataset and AV-Mixup Consistent Module. arXiv:2209.02604 [cs.MM]","DOI":"10.1145\/3536221.3556630"},{"key":"e_1_3_2_1_22_1","volume-title":"Paul Pu Liang, Amir Zadeh, and Louis-Philippe Morency.","author":"Liu Zhun","year":"2018","unstructured":"Zhun Liu, Ying Shen, Varun Bharadhwaj Lakshminarasimhan, Paul Pu Liang, Amir Zadeh, and Louis-Philippe Morency. 2018. Efficient low-rank multimodal fusion with modality-specific factors. arXiv preprint arXiv:1806.00064 (2018)."},{"key":"e_1_3_2_1_23_1","volume-title":"Kan: Kolmogorov-arnold networks. arXiv preprint arXiv:2404.19756","author":"Liu Ziming","year":"2024","unstructured":"Ziming Liu, Yixuan Wang, Sachin Vaidya, Fabian Ruehle, James Halverson, Marin Solja\u010di\u0107, Thomas Y Hou, and Max Tegmark. 2024b. Kan: Kolmogorov-arnold networks. arXiv preprint arXiv:2404.19756 (2024)."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.asoc.2024.112011"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.inffus.2023.101973"},{"key":"e_1_3_2_1_26_1","volume-title":"A Hypergraph based Contextual Relationship Modeling Method for Multimodal Emotion Recognition in Conversation","author":"Lu Nannan","year":"2024","unstructured":"Nannan Lu, Zhiyuan Han, and Zhen Tan. 2024a. A Hypergraph based Contextual Relationship Modeling Method for Multimodal Emotion Recognition in Conversation. IEEE Transactions on Multimedia (2024)."},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.ipm.2023.103538"},{"key":"e_1_3_2_1_28_1","volume-title":"A unified approach to interpreting model predictions. arXiv preprint arXiv:1705.07874","author":"Lundberg Scott","year":"2017","unstructured":"Scott Lundberg. 2017. A unified approach to interpreting model predictions. arXiv preprint arXiv:1705.07874 (2017)."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1145\/3546577"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P19-1046"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3068598"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2022.3171679"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.inffus.2023.101920"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/TAFFC.2022.3172360"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11042-023-18032-8"},{"key":"e_1_3_2_1_36_1","volume-title":"Interpretable Multimodal Tucker Fusion Model With Information Filtering for Multimodal Sentiment Analysis","author":"Nie Xin","year":"2024","unstructured":"Xin Nie, Laurence T Yang, Zhe Li, Xianjun Deng, Fulan Fan, and Zecan Yang. 2024. Interpretable Multimodal Tucker Fusion Model With Information Filtering for Multimodal Sentiment Analysis. IEEE Transactions on Computational Social Systems (2024)."},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00806"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-main.214"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.cogsys.2024.101243"},{"key":"e_1_3_2_1_40_1","volume-title":"The Kolmogorov-Arnold representation theorem revisited. Neural networks","author":"Schmidt-Hieber Johannes","year":"2021","unstructured":"Johannes Schmidt-Hieber. 2021. The Kolmogorov-Arnold representation theorem revisited. Neural networks, Vol. 137 (2021), 119-126."},{"key":"e_1_3_2_1_41_1","volume-title":"MFM: Multimodal Sentiment Analysis Based on Modal Focusing Model. In 2024 IEEE International Conference on Systems, Man, and Cybernetics (SMC). IEEE, 1524-1529","author":"Sun Shuangyang","year":"2024","unstructured":"Shuangyang Sun, Guoyan Xu, and Sijun Lu. 2024. MFM: Multimodal Sentiment Analysis Based on Modal Focusing Model. In 2024 IEEE International Conference on Systems, Man, and Cybernetics (SMC). IEEE, 1524-1529."},{"key":"e_1_3_2_1_42_1","volume-title":"The information bottleneck method. arXiv preprint physics\/0004057","author":"Tishby Naftali","year":"2000","unstructured":"Naftali Tishby, Fernando C Pereira, and William Bialek. 2000. The information bottleneck method. arXiv preprint physics\/0004057 (2000)."},{"key":"e_1_3_2_1_43_1","volume-title":"Proceedings of the conference. Association for computational linguistics. Meeting","volume":"2019","author":"Hubert Tsai Yao-Hung","year":"2019","unstructured":"Yao-Hung Hubert Tsai, Shaojie Bai, Paul Pu Liang, J Zico Kolter, Louis-Philippe Morency, and Ruslan Salakhutdinov. 2019. Multimodal transformer for unaligned multimodal language sequences. In Proceedings of the conference. Association for computational linguistics. Meeting, Vol. 2019. 6558."},{"key":"e_1_3_2_1_44_1","volume-title":"Proceedings of the conference on empirical methods in natural language processing. Conference on empirical methods in natural language processing","volume":"2020","author":"Hubert Tsai Yao-Hung","year":"2020","unstructured":"Yao-Hung Hubert Tsai, Martin Q Ma, Muqiao Yang, Ruslan Salakhutdinov, and Louis-Philippe Morency. 2020. Multimodal routing: Improving local and global interpretability of multimodal language analysis. In Proceedings of the conference on empirical methods in natural language processing. Conference on empirical methods in natural language processing, Vol. 2020. 1823."},{"key":"e_1_3_2_1_45_1","volume-title":"Yichen Feng, Xianxun Zhu, and Mairie de Compiegne.","author":"Wang Rui","year":"2025","unstructured":"Rui Wang, Chaopeng Guo, Erik Cambria, Imad Rida, Haochen Yuan, Md Jalil Piran, Yichen Feng, Xianxun Zhu, and Mairie de Compiegne. 2025. CIME: Contextual interactionbased multimodal emotion analysis with enhanced semantic information. The Journal of Supercomputing (2025)."},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1109\/TVCG.2020.3030418"},{"key":"e_1_3_2_1_47_1","volume-title":"Mmpareto: boosting multimodal learning with innocent unimodal assistance. arXiv preprint arXiv:2405.17730","author":"Wei Yake","year":"2024","unstructured":"Yake Wei and Di Hu. 2024. Mmpareto: boosting multimodal learning with innocent unimodal assistance. arXiv preprint arXiv:2405.17730 (2024)."},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/W18-3309"},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/W18-3302"},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2017.2716835"},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2022.3178236"},{"key":"e_1_3_2_1_52_1","volume-title":"International Conference on Machine Learning. PMLR, 24043-24055","author":"Wu Nan","year":"2022","unstructured":"Nan Wu, Stanislaw Jastrzebski, Kyunghyun Cho, and Krzysztof J Geras. 2022a. Characterizing and overcoming the greedy nature of learning in multi-modal deep neural networks. In International Conference on Machine Learning. PMLR, 24043-24055."},{"key":"e_1_3_2_1_53_1","volume-title":"Neuro-inspired information-theoretic hierarchical perception for multimodal learning. arXiv preprint arXiv:2404.09403","author":"Xiao Xiongye","year":"2024","unstructured":"Xiongye Xiao, Gengshuo Liu, Gaurav Gupta, Defu Cao, Shixuan Li, Yaxing Li, Tianqing Fang, Mingxi Cheng, and Paul Bogdan. 2024. Neuro-inspired information-theoretic hierarchical perception for multimodal learning. arXiv preprint arXiv:2404.09403 (2024)."},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.acl-long.421"},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i12.17289"},{"key":"e_1_3_2_1_56_1","volume-title":"Tensor fusion network for multimodal sentiment analysis. arXiv preprint arXiv:1707.07250","author":"Zadeh Amir","year":"2017","unstructured":"Amir Zadeh, Minghai Chen, Soujanya Poria, Erik Cambria, and Louis-Philippe Morency. 2017. Tensor fusion network for multimodal sentiment analysis. arXiv preprint arXiv:1707.07250 (2017)."},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v32i1.12021"},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"publisher","DOI":"10.1109\/MIS.2016.94"},{"key":"e_1_3_2_1_59_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P18-1208"},{"key":"e_1_3_2_1_60_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11760-022-02313-0"},{"key":"e_1_3_2_1_61_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.inffus.2023.102031"},{"key":"e_1_3_2_1_62_1","doi-asserted-by":"publisher","DOI":"10.54254\/2977-3903\/12\/2024128"},{"key":"e_1_3_2_1_63_1","volume-title":"Proceedings of the 31st International Conference on Computational Linguistics. 4611-4621","author":"Zhang Xiangmin","year":"2025","unstructured":"Xiangmin Zhang, Wei Wei, and Shihao Zou. 2025. Modal Feature Optimization Network with Prompt for Multimodal Sentiment Analysis. In Proceedings of the 31st International Conference on Computational Linguistics. 4611-4621."},{"key":"e_1_3_2_1_64_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.inffus.2024.102787"}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","location":"Dublin Ireland","acronym":"MM '25","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3755591","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,9]],"date-time":"2025-12-09T20:05:36Z","timestamp":1765310736000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3755591"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":64,"alternative-id":["10.1145\/3746027.3755591","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3755591","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}