{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,8]],"date-time":"2026-05-08T10:56:06Z","timestamp":1778237766850,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":45,"publisher":"ACM","funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62222203; 62476201"],"award-info":[{"award-number":["62222203; 62476201"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Meituan"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3755139","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T07:30:51Z","timestamp":1761377451000},"page":"1337-1345","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["Geometric Gradient Divergence Modulation for Imbalanced Multimodal Learning"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0005-7558-4287","authenticated-orcid":false,"given":"Disen","family":"Hu","sequence":"first","affiliation":[{"name":"School of Computer Science and Technology, Tongji University, Shanghai, China and School of Computer Science and Engineering, University of Electronic Science and Technology of China, Chengdu, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2209-651X","authenticated-orcid":false,"given":"Xun","family":"Jiang","sequence":"additional","affiliation":[{"name":"School of Computer Science and Engineering, University of Electronic Science and Technology of China, Chengdu, 
China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6531-0769","authenticated-orcid":false,"given":"Zhe","family":"Sun","sequence":"additional","affiliation":[{"name":"Faculty of Health Data Science, Juntendo University, Tokyo, Japan"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3599-9876","authenticated-orcid":false,"given":"Hao","family":"Yang","sequence":"additional","affiliation":[{"name":"Meituan, Shanghai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6070-3600","authenticated-orcid":false,"given":"Chong","family":"Peng","sequence":"additional","affiliation":[{"name":"Meituan, Shanghai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6211-4543","authenticated-orcid":false,"given":"Peng","family":"Yan","sequence":"additional","affiliation":[{"name":"Meituan, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2999-2088","authenticated-orcid":false,"given":"Heng Tao","family":"Shen","sequence":"additional","affiliation":[{"name":"School of Computer Science and Technology, Tongji University, Shanghai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5685-3123","authenticated-orcid":false,"given":"Xing","family":"Xu","sequence":"additional","affiliation":[{"name":"School of Computer Science and Technology, Tongji University, Shanghai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.73"},{"key":"e_1_3_2_1_2_1","volume-title":"Multimodal machine learning: A survey and taxonomy","author":"Baltru\u0161aitis Tadas","year":"2018","unstructured":"Tadas Baltru\u0161aitis, Chaitanya 
Ahuja, and Louis-Philippe Morency. 2018. Multimodal machine learning: A survey and taxonomy. IEEE transactions on pattern analysis and machine intelligence, Vol. 41, 2 (2018), 423-443."},{"key":"e_1_3_2_1_3_1","volume-title":"Neural Networks: Tricks of the Trade","author":"Bottou L\u00e9on","unstructured":"L\u00e9on Bottou. 2012. Stochastic gradient descent tricks. In Neural Networks: Tricks of the Trade: Second Edition. 421-436."},{"key":"e_1_3_2_1_4_1","volume-title":"Crema-d: Crowd-sourced emotional multimodal actors dataset","author":"Cao Houwei","year":"2014","unstructured":"Houwei Cao, David G Cooper, Michael K Keutmann, Ruben C Gur, Ani Nenkova, and Ragini Verma. 2014. Crema-d: Crowd-sourced emotional multimodal actors dataset. IEEE transactions on affective computing, Vol. 5, 4 (2014), 377-390."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612349"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i8.16877"},{"key":"e_1_3_2_1_7_1","first-page":"4171","volume-title":"Proceedings of the 2019 conference of the North American chapter of the association for computational linguistics: human language technologies","volume":"1","author":"Devlin Jacob","year":"2019","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2019. Bert: Pre-training of deep bidirectional transformers for language understanding. In Proceedings of the 2019 conference of the North American chapter of the association for computational linguistics: human language technologies, volume 1 (long and short papers). 4171-4186."},{"key":"e_1_3_2_1_8_1","volume-title":"Modality Laziness: Everybody's Business is Nobody's Business.","author":"Du Chenzhuang","year":"2021","unstructured":"Chenzhuang Du, Jiaye Teng, Tingle Li, Yichen Liu, Yue Wang, Yang Yuan, and Hang Zhao. 2021. Modality Laziness: Everybody's Business is Nobody's Business. 
(2021)."},{"key":"e_1_3_2_1_9_1","volume-title":"International Conference on Machine Learning. 8632-8656","author":"Du Chenzhuang","year":"2023","unstructured":"Chenzhuang Du, Jiaye Teng, Tingle Li, Yichen Liu, Tianyuan Yuan, Yue Wang, Yang Yuan, and Hang Zhao. 2023. On uni-modal feature learning in supervised multi-modal learning. In International Conference on Machine Learning. 8632-8656."},{"key":"e_1_3_2_1_10_1","article-title":"Adaptive subgradient methods for online learning and stochastic optimization","volume":"12","author":"Duchi John","year":"2011","unstructured":"John Duchi, Elad Hazan, and Yoram Singer. 2011. Adaptive subgradient methods for online learning and stochastic optimization. Journal of machine learning research, Vol. 12, 7 (2011).","journal-title":"Journal of machine learning research"},{"key":"e_1_3_2_1_11_1","volume-title":"Proceedings of the Asian Conference on Computer Vision. 1178-1193","author":"Fan Caoyun","year":"2022","unstructured":"Caoyun Fan, Wenqing Chen, Jidong Tian, Yitian Li, Hao He, and Yaohui Jin. 2022. Maxgnr: A dynamic weight strategy via maximizing gradient-to-noise ratio for multi-task learning. In Proceedings of the Asian Conference on Computer Vision. 1178-1193."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3680697"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01918"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01049"},{"key":"e_1_3_2_1_15_1","first-page":"25661","article-title":"A theory of the distortion-perception tradeoff in wasserstein space","volume":"34","author":"Freirich Dror","year":"2021","unstructured":"Dror Freirich, Tomer Michaeli, and Ron Meir. 2021. A theory of the distortion-perception tradeoff in wasserstein space. Advances in Neural Information Processing Systems, Vol. 
34 (2021), 25661-25672.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_16_1","volume-title":"Hyperbolic neural networks. Advances in neural information processing systems","author":"Ganea Octavian","year":"2018","unstructured":"Octavian Ganea, Gary B\u00e9cigneul, and Thomas Hofmann. 2018. Hyperbolic neural networks. Advances in neural information processing systems, Vol. 31 (2018)."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3680949"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02538"},{"key":"e_1_3_2_1_19_1","first-page":"133328","article-title":"Classifier-guided gradient modulation for enhanced multimodal learning","volume":"37","author":"Guo Zirun","year":"2024","unstructured":"Zirun Guo, Tao Jin, Jingyuan Chen, and Zhou Zhao. 2024. Classifier-guided gradient modulation for enhanced multimodal learning. Advances in Neural Information Processing Systems, Vol. 37 (2024), 133328-133344.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_1_21_1","volume-title":"Reconboost: Boosting can achieve modality reconcilement. arXiv preprint arXiv:2405.09321","author":"Hua Cong","year":"2024","unstructured":"Cong Hua, Qianqian Xu, Shilong Bao, Zhiyong Yang, and Qingming Huang. 2024. Reconboost: Boosting can achieve modality reconcilement. arXiv preprint arXiv:2405.09321 (2024)."},{"key":"e_1_3_2_1_22_1","volume-title":"International conference on machine learning. 9226-9259","author":"Huang Yu","year":"2022","unstructured":"Yu Huang, Junyang Lin, Chang Zhou, Hongxia Yang, and Longbo Huang. 2022. Modality competition: What makes joint training of multi-modal network fail in deep learning?(provably). In International conference on machine learning. 
9226-9259."},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3548309"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/TFUZZ.2024.3405541"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2024.3396272"},{"key":"e_1_3_2_1_26_1","unstructured":"Will Kay Joao Carreira Karen Simonyan Brian Zhang Chloe Hillier Sudheendra Vijayanarasimhan Fabio Viola Tim Green Trevor Back Paul Natsev et al. 2017. The kinetics human action video dataset. arXiv preprint arXiv:1705.06950 (2017)."},{"key":"e_1_3_2_1_27_1","volume-title":"Adam: A method for stochastic optimization. arXiv preprint arXiv:1412.6980","author":"Kingma Diederik P","year":"2014","unstructured":"Diederik P Kingma. 2014. Adam: A method for stochastic optimization. arXiv preprint arXiv:1412.6980 (2014)."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.02030"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11633-022-1398-0"},{"key":"e_1_3_2_1_30_1","volume-title":"Sara Ciucci, Ginestra Bianconi, and Carlo Vittorio Cannistraci.","author":"Muscoloni Alessandro","year":"2017","unstructured":"Alessandro Muscoloni, Josephine Maria Thomas, Sara Ciucci, Ginestra Bianconi, and Carlo Vittorio Cannistraci. 2017. Machine learning meets complex networks via coalescent embedding in the hyperbolic space. Nature communications, Vol. 8, 1 (2017), 1615."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00806"},{"key":"e_1_3_2_1_32_1","volume-title":"International Conference on Machine Learning. PMLR, 9681-9690","author":"Sim Aaron","year":"2021","unstructured":"Aaron Sim, Maciej L Wiatrak, Angus Brayne, P\u00e1id\u00ed Creed, and Saee Paliwal. 2021. Directed graph embeddings in pseudo-riemannian manifolds. In International Conference on Machine Learning. 
PMLR, 9681-9690."},{"key":"e_1_3_2_1_33_1","volume-title":"Amir Roshan Zamir, and Mubarak Shah","author":"Soomro Khurram","year":"2012","unstructured":"Khurram Soomro, Amir Roshan Zamir, and Mubarak Shah. 2012. UCF101: A dataset of 101 human actions classes from videos in the wild. arXiv preprint arXiv:1212.0402 (2012)."},{"key":"e_1_3_2_1_34_1","first-page":"1","article-title":"Scalable Bayes via barycenter in Wasserstein space","volume":"19","author":"Srivastava Sanvesh","year":"2018","unstructured":"Sanvesh Srivastava, Cheng Li, and David B Dunson. 2018. Scalable Bayes via barycenter in Wasserstein space. Journal of Machine Learning Research, Vol. 19, 8 (2018), 1-35.","journal-title":"Journal of Machine Learning Research"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i8.28754"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01216-8_16"},{"key":"e_1_3_2_1_37_1","article-title":"Visualizing data using t-SNE","volume":"9","author":"der Maaten Laurens Van","year":"2008","unstructured":"Laurens Van der Maaten and Geoffrey Hinton. 2008. Visualizing data using t-SNE. Journal of machine learning research, Vol. 9, 11 (2008).","journal-title":"Journal of machine learning research"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01271"},{"key":"e_1_3_2_1_39_1","first-page":"17863","article-title":"Robust large-margin learning in hyperbolic space","volume":"33","author":"Weber Melanie","year":"2020","unstructured":"Melanie Weber, Manzil Zaheer, Ankit Singh Rawat, Aditya K Menon, and Sanjiv Kumar. 2020. Robust large-margin learning in hyperbolic space. Advances in Neural Information Processing Systems, Vol. 33 (2020), 17863-17873.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_40_1","volume-title":"Mmpareto: boosting multimodal learning with innocent unimodal assistance. 
arXiv preprint arXiv:2405.17730","author":"Wei Yake","year":"2024","unstructured":"Yake Wei and Di Hu. 2024. Mmpareto: boosting multimodal learning with innocent unimodal assistance. arXiv preprint arXiv:2405.17730 (2024)."},{"key":"e_1_3_2_1_41_1","volume-title":"International Conference on Machine Learning. 24043-24055","author":"Wu Nan","year":"2022","unstructured":"Nan Wu, Stanislaw Jastrzebski, Kyunghyun Cho, and Krzysztof J Geras. 2022. Characterizing and overcoming the greedy nature of learning in multi-modal deep neural networks. In International Conference on Machine Learning. 24043-24055."},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/TNNLS.2020.2967597"},{"key":"e_1_3_2_1_43_1","first-page":"62108","article-title":"Facilitating multimodal classification via dynamically learning modality gap","volume":"37","author":"Yang Yang","year":"2024","unstructured":"Yang Yang, Fengqiang Wan, Qing-Yuan Jiang, and Yi Xu. 2024. Facilitating multimodal classification via dynamically learning modality gap. Advances in Neural Information Processing Systems, Vol. 37 (2024), 62108-62122.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_44_1","first-page":"1824","article-title":"Modality-specific learning rates for effective multimodal additive late-fusion","volume":"2022","author":"Yao Yiqun","year":"2022","unstructured":"Yiqun Yao and Rada Mihalcea. 2022. Modality-specific learning rates for effective multimodal additive late-fusion. In Findings of the Association for Computational Linguistics 2022. 
1824-1834.","journal-title":"Findings of the Association for Computational Linguistics"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2019\/751"}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","location":"Dublin Ireland","acronym":"MM '25","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3755139","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,9]],"date-time":"2025-12-09T20:03:08Z","timestamp":1765310588000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3755139"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":45,"alternative-id":["10.1145\/3746027.3755139","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3755139","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}