{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T05:06:39Z","timestamp":1765343199585,"version":"3.46.0"},"publisher-location":"New York, NY, USA","reference-count":40,"publisher":"ACM","funder":[{"name":"the National Natural Science Foundation of China","award":["U2268203"],"award-info":[{"award-number":["U2268203"]}]},{"name":"National Railway Group's Science and Technology Research and Development Program","award":["K25D00011"],"award-info":[{"award-number":["K25D00011"]}]},{"name":"Beijing Natural Science Foundation","award":["L221011"],"award-info":[{"award-number":["L221011"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3755198","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T07:37:21Z","timestamp":1761377841000},"page":"7922-7930","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["GMML:Gradient-Modulated Robustness for Imbalance-Aware Multimodal Learning"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-0371-1268","authenticated-orcid":false,"given":"Zikai","family":"Zhang","sequence":"first","affiliation":[{"name":"Key Laboratory of Big Data &amp; Artificial Intelligence in Transportation (Beijing Jiaotong University), Ministry of Education, Beijing Jiaotong University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-4844-4107","authenticated-orcid":false,"given":"Xu","family":"Zhang","sequence":"additional","affiliation":[{"name":"Key Laboratory of Big Data &amp; Artificial Intelligence in Transportation (Beijing Jiaotong University), Ministry of Education, Beijing Jiaotong University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-5680-5877","authenticated-orcid":false,"given":"Ziyi","family":"Li","sequence":"additional","affiliation":[{"name":"Key Laboratory of Big Data &amp; Artificial Intelligence in Transportation (Beijing Jiaotong University), Ministry of Education, Beijing Jiaotong University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2965-6196","authenticated-orcid":false,"given":"Yidong","family":"Li","sequence":"additional","affiliation":[{"name":"Key Laboratory of Big Data &amp; Artificial Intelligence in Transportation (Beijing Jiaotong University), Ministry of Education, Beijing Jiaotong University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2064-8336","authenticated-orcid":false,"given":"Yuanzhouhan","family":"Cao","sequence":"additional","affiliation":[{"name":"Key Laboratory of Big Data &amp; Artificial Intelligence in Transportation (Beijing Jiaotong University), Ministry of Education, Beijing Jiaotong University, Beijing, China"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","first-page":"609","volume-title":"Listen and Learn. 2017 IEEE International Conference on Computer Vision (ICCV)","author":"Arandjelovi\u0107 Relja","year":"2017","unstructured":"Relja Arandjelovi\u0107 and Andrew Zisserman. 2017. Look, Listen and Learn. 2017 IEEE International Conference on Computer Vision (ICCV) (2017), 609-617. https:\/\/api.semanticscholar.org\/CorpusID:10769575"},{"key":"e_1_3_2_1_2_1","unstructured":"Tadas Baltru\u0161aitis Chaitanya Ahuja and Louis-Philippe Morency. 2017. 
Multimodal Machine Learning: A Survey and Taxonomy. arXiv:1705.09406 [cs.LG] https:\/\/arxiv.org\/abs\/1705.09406"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1007\/s00371-021-02166-7"},{"key":"e_1_3_2_1_4_1","first-page":"2631","volume-title":"MUTAN: Multimodal Tucker Fusion for Visual Question Answering. 2017 IEEE International Conference on Computer Vision (ICCV)","author":"Hedi","year":"2017","unstructured":"Hedi Ben-younes, R\u00e9mi Cad\u00e8ne, Matthieu Cord, and Nicolas Thome. 2017. MUTAN: Multimodal Tucker Fusion for Visual Question Answering. 2017 IEEE International Conference on Computer Vision (ICCV) (2017), 2631-2639. https:\/\/api.semanticscholar.org\/CorpusID:12913776"},{"key":"e_1_3_2_1_5_1","unstructured":"Liang Chen Zekun Wang Shuhuai Ren Lei Li Haozhe Zhao Yunshui Li Zefan Cai Hongcheng Guo Lei Zhang Yizhe Xiong Yichi Zhang Ruoyu Wu Qingxiu Dong Ge Zhang Jian Yang Lingwei Meng Shujie Hu Yulong Chen Junyang Lin Shuai Bai Andreas Vlachos Xu Tan Minjia Zhang Wen Xiao Aaron Yee Tianyu Liu and Baobao Chang. 2024. Next Token Prediction Towards Multimodal Intelligence: A Comprehensive Survey. arXiv:2412.18619 [cs.CL] https:\/\/arxiv.org\/abs\/2412.18619"},{"key":"e_1_3_2_1_6_1","first-page":"74","volume-title":"Shenzhen","author":"Dorent Reuben","year":"2019","unstructured":"Reuben Dorent, Samuel Joutard, Marc Modat, S\u00e9bastien Ourselin, and Tom Vercauteren. 2019. Hetero-modal variational encoder-decoder for joint modality completion and segmentation. In Medical Image Computing and Computer Assisted Intervention-MICCAI 2019: 22nd International Conference, Shenzhen, China, October 13-17, 2019, Proceedings, Part II 22. Springer, 74-82."},{"key":"e_1_3_2_1_7_1","unstructured":"Xiao Fu Wei Xi Jie Yang Yutao Bai Zhao Yang Rui Jiang LI XIZHE Jiankang Gao and Jizhong Zhao. 2024. Balanced Multimodal Learning: An Integrated Framework for Multi-Task Learning in Audio-Visual Fusion. https:\/\/openreview.net\/forum?id=V7WjTjX7AY"},{"key":"e_1_3_2_1_8_1","first-page":"27092","article-title":"Datacomp: In search of the next generation of multimodal datasets","volume":"36","author":"Gadre Samir Yitzhak","year":"2023","unstructured":"Samir Yitzhak Gadre, Gabriel Ilharco, Alex Fang, Jonathan Hayase, Georgios Smyrnis, Thao Nguyen, Ryan Marten, Mitchell Wortsman, Dhruba Ghosh, Jieyu Zhang, et al., 2023. Datacomp: In search of the next generation of multimodal datasets. Advances in Neural Information Processing Systems, Vol. 36 (2023), 27092-27112.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.inffus.2022.09.025"},{"key":"e_1_3_2_1_10_1","volume-title":"Classifier-guided Gradient Modulation for Enhanced Multimodal Learning. ArXiv","author":"Guo Zirun","year":"2024","unstructured":"Zirun Guo, Tao Jin, Jingyuan Chen, and Zhou Zhao. 2024. Classifier-guided Gradient Modulation for Enhanced Multimodal Learning. ArXiv, Vol. abs\/2411.01409 (2024). https:\/\/api.semanticscholar.org\/CorpusID:273811776"},{"key":"e_1_3_2_1_11_1","first-page":"469","volume-title":"Athens","author":"Havaei Mohammad","year":"2016","unstructured":"Mohammad Havaei, Nicolas Guizard, Nicolas Chapados, and Yoshua Bengio. 2016. Hemis: Hetero-modal image segmentation. In Medical Image Computing and Computer-Assisted Intervention-MICCAI 2016: 19th International Conference, Athens, Greece, October 17-21, 2016, Proceedings, Part II 19. 
Springer, 469-477."},{"key":"e_1_3_2_1_12_1","unstructured":"Matthias Hein and Maksym Andriushchenko. 2017. Formal Guarantees on the Robustness of a Classifier against Adversarial Manipulation. In Neural Information Processing Systems. https:\/\/api.semanticscholar.org\/CorpusID:10490694"},{"key":"e_1_3_2_1_13_1","unstructured":"Yihan Hu Jiazhi Yang Li Chen Keyu Li Chonghao Sima Xizhou Zhu Siqi Chai Senyao Du Tianwei Lin Wenhai Wang Lewei Lu Xiaosong Jia Qiang Liu Jifeng Dai Yu Qiao and Hongyang Li. 2023. Planning-oriented Autonomous Driving. arXiv:2212.10156 [cs.CV] https:\/\/arxiv.org\/abs\/2212.10156"},{"key":"e_1_3_2_1_14_1","volume-title":"Modality Competition: What Makes Joint Training of Multi-modal Network Fail in Deep Learning? (Provably). arXiv:2203.12221 [cs.LG] https:\/\/arxiv.org\/abs\/2203.12221","author":"Huang Yu","year":"2022","unstructured":"Yu Huang, Junyang Lin, Chang Zhou, Hongxia Yang, and Longbo Huang. 2022. Modality Competition: What Makes Joint Training of Multi-modal Network Fail in Deep Learning? (Provably). arXiv:2203.12221 [cs.LG] https:\/\/arxiv.org\/abs\/2203.12221"},{"key":"e_1_3_2_1_15_1","volume-title":"EMMA: End-to-End Multimodal Model for Autonomous Driving. arXiv:2410.23262 [cs.CV] https:\/\/arxiv.org\/abs\/2410.23262","author":"Hwang Jyh-Jing","year":"2024","unstructured":"Jyh-Jing Hwang, Runsheng Xu, Hubert Lin, Wei-Chih Hung, Jingwei Ji, Kristy Choi, Di Huang, Tong He, Paul Covington, Benjamin Sapp, Yin Zhou, James Guo, Dragomir Anguelov, and Mingxing Tan. 2024. EMMA: End-to-End Multimodal Model for Autonomous Driving. arXiv:2410.23262 [cs.CV] https:\/\/arxiv.org\/abs\/2410.23262"},{"key":"e_1_3_2_1_16_1","unstructured":"Cheng Jiang Yihao Chen Jianbo Chang Ming Feng Renzhi Wang and Jianhua Yao. 2021. Fusion of medical imaging and electronic health records with attention and multi-head machanisms. arXiv:2112.11710 [cs.CV] https:\/\/arxiv.org\/abs\/2112.11710"},{"key":"e_1_3_2_1_17_1","first-page":"13286","volume-title":"MMTM: Multimodal Transfer Module for CNN Fusion. 2020 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) (2019","author":"Vaezi Joze Hamid Reza","year":"2019","unstructured":"Hamid Reza Vaezi Joze, Amirreza Shaban, Michael L. Iuzzolino, and Kazuhito Koishida. 2019. MMTM: Multimodal Transfer Module for CNN Fusion. 2020 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) (2019), 13286-13296. https:\/\/api.semanticscholar.org\/CorpusID:208176099"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2023.3306489"},{"key":"e_1_3_2_1_19_1","volume-title":"Meta-Learn Unimodal Signals with Weak Supervision for Multimodal Sentiment Analysis. ArXiv","author":"Mai Sijie","year":"2024","unstructured":"Sijie Mai, Yu Zhao, Ying Zeng, Jianhua Yao, and Haifeng Hu. 2024. Meta-Learn Unimodal Signals with Weak Supervision for Multimodal Sentiment Analysis. ArXiv, Vol. abs\/2408.16029 (2024). https:\/\/api.semanticscholar.org\/CorpusID:272146034"},{"key":"e_1_3_2_1_20_1","unstructured":"Huisheng Mao Baozheng Zhang Hua Xu Ziqi Yuan and Yihe Liu. 2022. Robust-MSA: Understanding the Impact of Modality Noise on Multimodal Sentiment Analysis. arXiv:2211.13484 [cs.MM] https:\/\/arxiv.org\/abs\/2211.13484"},{"key":"e_1_3_2_1_21_1","first-page":"689","volume-title":"ICML","volume":"11","author":"Ngiam Jiquan","year":"2011","unstructured":"Jiquan Ngiam, Aditya Khosla, Mingyu Kim, Juhan Nam, Honglak Lee, Andrew Y Ng, et al., 2011. Multimodal deep learning.. In ICML, Vol. 11. 
689-696."},{"key":"e_1_3_2_1_22_1","unstructured":"Xiaokang Peng Yake Wei Andong Deng Dong Wang and Di Hu. 2022. Balanced Multimodal Learning via On-the-fly Gradient Modulation. arXiv:2203.15332 [cs.CV] https:\/\/arxiv.org\/abs\/2203.15332"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00673"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P18-1238"},{"key":"e_1_3_2_1_25_1","volume-title":"UCF101: A Dataset of 101 Human Actions Classes From Videos in The Wild. ArXiv","author":"Soomro Khurram","year":"2012","unstructured":"Khurram Soomro, Amir Zamir, and Mubarak Shah. 2012. UCF101: A Dataset of 101 Human Actions Classes From Videos in The Wild. ArXiv, Vol. abs\/1212.0402 (2012). https:\/\/api.semanticscholar.org\/CorpusID:7197134"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"crossref","unstructured":"Kai Sun Siyan Xue Fuchun Sun Haoran Sun Yu Luo Ling Wang Siyuan Wang Na Guo Lei Liu Tian Zhao Xinzhou Wang Lei Yang Shuo Jin Jun Yan and Jiahong Dong. 2024. Medical Multimodal Foundation Models in Clinical Diagnosis and Treatment: Applications Challenges and Future Directions. arXiv:2412.02621 [cs.AI] https:\/\/arxiv.org\/abs\/2412.02621","DOI":"10.1016\/j.artmed.2025.103265"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/LSP.2021.3101421"},{"key":"e_1_3_2_1_28_1","first-page":"6558","volume-title":"Proceedings of the conference. Association for Computational Linguistics. Meeting","volume":"2019","author":"Hubert Tsai Yao-Hung","year":"2019","unstructured":"Yao-Hung Hubert Tsai, Shaojie Bai, Paul Pu Liang, J. Zico Kolter, Louis-Philippe Morency, and Ruslan Salakhutdinov. 2019. Multimodal Transformer for Unaligned Multimodal Language Sequences. Proceedings of the conference. Association for Computational Linguistics. Meeting, Vol. 2019 (2019), 6558-6569. https:\/\/api.semanticscholar.org\/CorpusID:173990158"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"crossref","unstructured":"Zifeng Wang Zhenbang Wu Dinesh Agarwal and Jimeng Sun. 2022. MedCLIP: Contrastive Learning from Unpaired Medical Images and Text. arXiv:2210.10163 [cs.CV] https:\/\/arxiv.org\/abs\/2210.10163","DOI":"10.18653\/v1\/2022.emnlp-main.256"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02581"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"crossref","unstructured":"Yake Wei Ruoxuan Feng Zihe Wang and Di Hu. 2024b. Enhancing multimodal cooperation via sample-level modality valuation. arXiv:2309.06255 [cs.CV] https:\/\/arxiv.org\/abs\/2309.06255","DOI":"10.1109\/CVPR52733.2024.02581"},{"key":"e_1_3_2_1_32_1","volume-title":"On-the-fly modulation for balanced multimodal learning","author":"Wei Yake","year":"2024","unstructured":"Yake Wei, Di Hu, Henghui Du, and Ji-Rong Wen. 2024c. On-the-fly modulation for balanced multimodal learning. IEEE Transactions on Pattern Analysis and Machine Intelligence (2024)."},{"key":"e_1_3_2_1_33_1","volume-title":"Diagnosing and Re-learning for Balanced Multimodal Learning. In European Conference on Computer Vision. https:\/\/api.semanticscholar.org\/CorpusID:271212799","author":"Wei Yake","year":"2024","unstructured":"Yake Wei, Siwei Li, Ruoxuan Feng, and Di Hu. 2024d. Diagnosing and Re-learning for Balanced Multimodal Learning. In European Conference on Computer Vision. https:\/\/api.semanticscholar.org\/CorpusID:271212799"},{"key":"e_1_3_2_1_34_1","unstructured":"Renjie Wu Hu Wang Hsiang-Ting Chen and Gustavo Carneiro. 2024b. 
Deep Multimodal Learning with Missing Modality: A Survey. arXiv:2409.07825 [cs.CV] https:\/\/arxiv.org\/abs\/2409.07825"},{"key":"e_1_3_2_1_35_1","unstructured":"Yu-Chang Wu Shen-Huan Lyu Haopu Shang Xiangyu Wang and Chao Qian. 2024a. Confidence-aware Contrastive Learning for Selective Classification. arXiv:2406.04745 [cs.LG] https:\/\/arxiv.org\/abs\/2406.04745"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2017.01.035"},{"key":"e_1_3_2_1_37_1","unstructured":"Qingyang Zhang Yake Wei Zongbo Han Huazhu Fu Xi Peng Cheng Deng Qinghua Hu Cai Xu Jie Wen Di Hu and Changqing Zhang. 2024. Multimodal Fusion on Low-quality Data: A Comprehensive Survey. arXiv:2404.18947 [cs.LG] https:\/\/arxiv.org\/abs\/2404.18947"},{"key":"e_1_3_2_1_38_1","volume-title":"Langlotz","author":"Zhang Yuhao","year":"2022","unstructured":"Yuhao Zhang, Hang Jiang, Yasuhide Miura, Christopher D. Manning, and Curtis P. Langlotz. 2022. Contrastive Learning of Medical Visual Representations from Paired Images and Text. arXiv:2010.00747 [cs.CV] https:\/\/arxiv.org\/abs\/2010.00747"},{"key":"e_1_3_2_1_39_1","first-page":"1","volume-title":"Adaptive Mask Co-Optimization for Modal Dependence in Multimodal Learning. ICASSP 2023 - 2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","author":"Zhou Ying","year":"2023","unstructured":"Ying Zhou, Xuefeng Liang, Shi lian Zheng, Huijun Xuan, and Takatsune Kumada. 2023. Adaptive Mask Co-Optimization for Modal Dependence in Multimodal Learning. ICASSP 2023 - 2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP) (2023), 1-5. https:\/\/api.semanticscholar.org\/CorpusID:258536610"},{"key":"e_1_3_2_1_40_1","volume-title":"Stock Movement Prediction with Multimodal Stable Fusion via Gated Cross-Attention Mechanism. ArXiv","author":"Zong Chang","year":"2024","unstructured":"Chang Zong, Jian Shao, Weiming Lu, and Yueting Zhuang. 2024. Stock Movement Prediction with Multimodal Stable Fusion via Gated Cross-Attention Mechanism. ArXiv, Vol. abs\/2406.06594 (2024). https:\/\/api.semanticscholar.org\/CorpusID:270379967"}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Dublin Ireland","acronym":"MM '25"},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3755198","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T05:03:37Z","timestamp":1765343017000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3755198"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":40,"alternative-id":["10.1145\/3746027.3755198","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3755198","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}
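
The record above follows the standard Crossref REST API "work" schema, so it can be retrieved and parsed programmatically. Below is a minimal sketch, assuming the public endpoint https://api.crossref.org/works/{doi} is reachable; it uses only the Python standard library, and the field names match the record above.

# Minimal sketch: fetch and parse this Crossref work record.
# Assumption: the public Crossref REST API endpoint
#   https://api.crossref.org/works/{doi}
# is available; only the Python standard library is used.
import json
import urllib.request

DOI = "10.1145/3746027.3755198"  # DOI from the record above
url = f"https://api.crossref.org/works/{DOI}"

with urllib.request.urlopen(url) as resp:
    record = json.load(resp)

msg = record["message"]  # same shape as the "message" object above
print(msg["title"][0])             # paper title
print(msg["container-title"][0])   # proceedings name
print(msg["page"], msg["DOI"])     # page range and DOI
for author in msg.get("author", []):
    # "given"/"family" are the Crossref name fields used in the record
    print(author.get("given", ""), author.get("family", ""))

Note that "reference-count"/"references-count" report how many references the publisher deposited; the reference list itself is a separate optional field that Crossref only returns when it has been deposited for public release.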