{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,21]],"date-time":"2026-04-21T14:53:33Z","timestamp":1776783213995,"version":"3.51.2"},"publisher-location":"New York, NY, USA","reference-count":53,"publisher":"ACM","funder":[{"name":"Key Research and Development Program of Shaanxi Province","award":["2024GX-ZDCYL-02-15"],"award-info":[{"award-number":["2024GX-ZDCYL-02-15"]}]},{"name":"Natural Science Foundation for Distinguished Young Scholars of Shaanxi Province","award":["2025JC-JCQN-079"],"award-info":[{"award-number":["2025JC-JCQN-079"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3755276","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T07:26:38Z","timestamp":1761377198000},"page":"1588-1597","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":2,"title":["MST-Distill: Mixture of Specialized Teachers for Cross-Modal Knowledge Distillation"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-4001-1161","authenticated-orcid":false,"given":"Hui","family":"Li","sequence":"first","affiliation":[{"name":"Xidian University, Xi'an, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4065-4052","authenticated-orcid":false,"given":"Pengfei","family":"Yang","sequence":"additional","affiliation":[{"name":"Xidian University, Xi'an, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-6143-5339","authenticated-orcid":false,"given":"Juanyang","family":"Chen","sequence":"additional","affiliation":[{"name":"Xidian University, Xi'an, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1410-1534","authenticated-orcid":false,"given":"Le","family":"Dong","sequence":"additional","affiliation":[{"name":"Xidian University, Xi'an, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-5789-563X","authenticated-orcid":false,"given":"Yanxin","family":"Chen","sequence":"additional","affiliation":[{"name":"Xidian University, Xi'an, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6913-8604","authenticated-orcid":false,"given":"Quan","family":"Wang","sequence":"additional","affiliation":[{"name":"Xidian University, Xi'an, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1609\/icwsm.v12i1.14983"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2018.2798607"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.321"},{"key":"e_1_3_2_1_4_1","volume-title":"VGGSound: A Large-Scale Audio-Visual Dataset. In IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). IEEE, 721-725","author":"Chen Honglie","year":"2020","unstructured":"Honglie Chen, Wjournali Xie, Andrea Vedaldi, and Andrew Zisserman. 2020. VGGSound: A Large-Scale Audio-Visual Dataset. In IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). IEEE, 721-725."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.inffus.2019.02.010"},{"key":"e_1_3_2_1_6_1","volume-title":"Feature-Map-Level Online Adversarial Knowledge Distillation. In International Conference on Machine Learning. 2006-2015","author":"Chung Inseop","year":"2020","unstructured":"Inseop Chung, SeongUk Park, Jangho Kim, and Nojun Kwak. 2020. Feature-Map-Level Online Adversarial Knowledge Distillation. In International Conference on Machine Learning. 2006-2015."},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1007\/s10489-022-04095-x"},{"key":"e_1_3_2_1_8_1","first-page":"78674","article-title":"SimMMDG: A Simple and Effective Framework for Multi-Modal Domain Generalization","volume":"36","author":"Dong Hao","year":"2023","unstructured":"Hao Dong, Ismail Nejjar, Han Sun, Eleni Chatzi, and Olga Fink. 2023. SimMMDG: A Simple and Effective Framework for Multi-Modal Domain Generalization. Advances in Neural Information Processing Systems, Vol. 36 (2023), 78674-78695.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_9_1","volume-title":"PMR: Prototypical Modal Rebalance for Multimodal Learning. In 2023 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 20029-20038","author":"Fan Yunfeng","year":"2023","unstructured":"Yunfeng Fan, Wenchao Xu, Haozhao Wang, Junxiao Wang, and Song Guo. 2023. PMR: Prototypical Modal Rebalance for Multimodal Learning. In 2023 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 20029-20038."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1145\/3613904.3642491"},{"key":"e_1_3_2_1_11_1","volume-title":"Proceedings of International Conference on Learning Representations.","author":"Gu Yuxian","year":"2024","unstructured":"Yuxian Gu, Li Dong, Furu Wei, and Minlie Huang. 2024. MiniLLM: Knowledge Distillation of Large Language Models. In Proceedings of International Conference on Learning Representations."},{"key":"e_1_3_2_1_12_1","first-page":"79570","article-title":"One-for-All: Bridge the Gap between Heterogeneous Architectures in Knowledge Distillation","volume":"36","author":"Hao Zhiwei","year":"2023","unstructured":"Zhiwei Hao, Jianyuan Guo, Kai Han, Yehui Tang, Han Hu, Yunhe Wang, and Chang Xu. 2023. One-for-All: Bridge the Gap between Heterogeneous Architectures in Knowledge Distillation. Advances in Neural Information Processing Systems, Vol. 36 (2023), 79570-79582.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_13_1","volume-title":"Asian Conference on Computer Vision. Springer, 213-228","author":"Hazirbas Caner","year":"2016","unstructured":"Caner Hazirbas, Lingni Ma, Csaba Domokos, and Daniel Cremers. 2016. FuseNet: Incorporating Depth into Semantic Segmentation via Fusion-Based CNN Architecture. In Asian Conference on Computer Vision. Springer, 213-228."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612651"},{"key":"e_1_3_2_1_15_1","unstructured":"Geoffrey Hinton Oriol Vinyals and Jeff Dean. 2015. Distilling the Knowledge in a Neural Network. arXiv:1503.02531"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2022.3147813"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01515"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02325"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2022.07.035"},{"key":"e_1_3_2_1_20_1","volume-title":"Critical Learning Periods for Multisensory Integration in Deep Networks. In 2023 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 24296-24305","author":"Kleinman Michael","year":"2023","unstructured":"Michael Kleinman, Alessandro Achille, and Stefano Soatto. 2023. Critical Learning Periods for Multisensory Integration in Deep Networks. In 2023 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 24296-24305."},{"key":"e_1_3_2_1_21_1","unstructured":"Ke Li Fuyu Dong Di Wang Shaofeng Li Quan Wang Xinbo Gao and Tat-Seng Chua. 2024a. Show Me What and Where Has Changed? Question Answering and Grounding for Remote Sensing Change Detection. arXiv:2410.23828"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/TGRS.2024.3423663"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2023.3257546"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3681201"},{"key":"e_1_3_2_1_25_1","volume-title":"Multimodal Material Segmentation. In 2022 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 19800-19808","author":"Liang Yupeng","year":"2022","unstructured":"Yupeng Liang, Ryosuke Wakaki, Shohei Nobuhara, and Ko Nishino. 2022. Multimodal Material Segmentation. In 2022 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 19800-19808."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1371\/journal.pone.0196391"},{"key":"e_1_3_2_1_27_1","volume-title":"2023 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 18177-18186","author":"Ma Mengmeng","year":"2023","unstructured":"Mengmeng Ma, Jian Ren, Long Zhao, Davide Testuggine, and Xi Peng. 2023. Are Multimodal Transformers Robust to Missing Modality?. In 2023 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 18177-18186."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.5555\/3692070.3693442"},{"key":"e_1_3_2_1_29_1","first-page":"689","volume-title":"Multimodal Deep Learning. In International Conference on Machine Learning","volume":"11","author":"Ngiam Jiquan","year":"2011","unstructured":"Jiquan Ngiam, Aditya Khosla, Mingyu Kim, Juhan Nam, Honglak Lee, Andrew Y Ng, et al., 2011. Multimodal Deep Learning. In International Conference on Machine Learning, Vol. 11. 689-696."},{"key":"e_1_3_2_1_30_1","volume-title":"Relational Knowledge Distillation. In 2019 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 3967-3976","author":"Park Wonpyo","year":"2019","unstructured":"Wonpyo Park, Dongju Kim, Yan Lu, and Minsu Cho. 2019. Relational Knowledge Distillation. In 2019 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 3967-3976."},{"key":"e_1_3_2_1_31_1","volume-title":"TimeChat: A Time-Sensitive Multimodal Large Language Model for Long Video Understanding. In 2024 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 14313-14323","author":"Ren Shuhuai","year":"2024","unstructured":"Shuhuai Ren, Linli Yao, Shicheng Li, Xu Sun, and Lu Hou. 2024. TimeChat: A Time-Sensitive Multimodal Large Language Model for Long Video Understanding. In 2024 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 14313-14323."},{"key":"e_1_3_2_1_32_1","volume-title":"Proceedings of International Conference on Learning Representations.","author":"Romero Adriana","year":"2015","unstructured":"Adriana Romero, Nicolas Ballas, Samira Ebrahimi Kahou, Antoine Chassang, Carlo Gatta, and Yoshua Bengio. 2015. FitNets: Hints for Thin Deep Nets. In Proceedings of International Conference on Learning Representations."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.74"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-33715-4_54"},{"key":"e_1_3_2_1_35_1","first-page":"6906","article-title":"Does Knowledge Distillation Really Work?","volume":"34","author":"Stanton Samuel","year":"2021","unstructured":"Samuel Stanton, Pavel Izmailov, Polina Kirichenko, Alexander A Alemi, and Andrew G Wilson. 2021. Does Knowledge Distillation Really Work?. In Advances in Neural Information Processing Systems, Vol. 34. 6906-6919.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_36_1","volume-title":"Layer-Wise Fusion with Modality Independence Modeling for Multi-Modal Emotion Recognition. In Annual Meeting of the Association for Computational Linguistics. 658-670","author":"Sun Jun","year":"2023","unstructured":"Jun Sun, Shoukang Han, Yu-Ping Ruan, Xiaoning Zhang, Shu-Kai Zheng, Yulong Liu, Yuxin Huang, and Taihao Li. 2023. Layer-Wise Fusion with Modality Independence Modeling for Multi-Modal Emotion Recognition. In Annual Meeting of the Association for Computational Linguistics. 658-670."},{"key":"e_1_3_2_1_37_1","volume-title":"Proceedings of International Conference on Learning Representations.","author":"Tian Yonglong","year":"2020","unstructured":"Yonglong Tian, Dilip Krishnan, and Phillip Isola. 2020. Contrastive Representation Distillation. In Proceedings of International Conference on Learning Representations."},{"key":"e_1_3_2_1_38_1","volume-title":"Advances in Neural Information Processing Systems","volume":"30","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, \u0141ukasz Kaiser, and Illia Polosukhin. 2017. Attention Is All You Need. Advances in Neural Information Processing Systems, Vol. 30 (2017)."},{"key":"e_1_3_2_1_39_1","volume-title":"CentralNet: A Multilayer Approach for Multimodal Fusion. In European Conference on Computer Vision Workshops.","author":"Vielzeuf Valentin","year":"2018","unstructured":"Valentin Vielzeuf, Alexis Lechervy, St\u00e9phane Pateux, and Fr\u00e9d\u00e9ric Jurie. 2018. CentralNet: A Multilayer Approach for Multimodal Fusion. In European Conference on Computer Vision Workshops."},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2022.3140656"},{"key":"e_1_3_2_1_41_1","volume-title":"European Conference on Computer Vision. Springer, 396-416","author":"Wang Yi","year":"2024","unstructured":"Yi Wang, Kunchang Li, Xinhao Li, Jiashuo Yu, Yinan He, Guo Chen, Baoqi Pei, Rongkun Zheng, Zun Wang, Yansong Shi, et al., 2024. InternVideo2: Scaling Foundation Models for Multimodal Video Understanding. In European Conference on Computer Vision. Springer, 396-416."},{"key":"e_1_3_2_1_42_1","volume-title":"MMPareto: Boosting Multimodal Learning with Innocent Unimodal Assistance. In International Conference on Machine Learning. PMLR, 52559-52572","author":"Wei Yake","year":"2024","unstructured":"Yake Wei and Di Hu. 2024. MMPareto: Boosting Multimodal Learning with Innocent Unimodal Assistance. In International Conference on Machine Learning. PMLR, 52559-52572."},{"key":"e_1_3_2_1_43_1","volume-title":"Characterizing and Overcoming the Greedy Nature of Learning in Multi-Modal Deep Neural Networks. In International Conference on Machine Learning. PMLR, 24043-24055","author":"Wu Nan","year":"2022","unstructured":"Nan Wu, Stanislaw Jastrzebski, Kyunghyun Cho, and Krzysztof J Geras. 2022. Characterizing and Overcoming the Greedy Nature of Learning in Multi-Modal Deep Neural Networks. In International Conference on Machine Learning. PMLR, 24043-24055."},{"key":"e_1_3_2_1_44_1","volume-title":"Proceedings of International Conference on Learning Representations.","author":"Xue Zihui","year":"2023","unstructured":"Zihui Xue, Zhengqi Gao, Sucheng Ren, and Hang Zhao. 2023. The Modality Focusing Hypothesis: Towards Understanding Crossmodal Knowledge Distillation. In Proceedings of International Conference on Learning Representations."},{"key":"e_1_3_2_1_45_1","volume-title":"Multimodal Knowledge Expansion. In 2021 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 854-863","author":"Xue Zihui","year":"2021","unstructured":"Zihui Xue, Sucheng Ren, Zhengqi Gao, and Hang Zhao. 2021. Multimodal Knowledge Expansion. In 2021 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 854-863."},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01032"},{"key":"e_1_3_2_1_47_1","first-page":"6866","article-title":". Edge-Cloud Polarization and Collaboration: A Comprehensive Survey for AI","volume":"35","author":"Yao Jiangchao","year":"2022","unstructured":"Jiangchao Yao, Shengyu Zhang, Yang Yao, Feng Wang, Jianxin Ma, Jianwei Zhang, Yunfei Chu, Luo Ji, Kunyang Jia, Tao Shen, et al., 2022. Edge-Cloud Polarization and Collaboration: A Comprehensive Survey for AI. IEEE Transactions on Knowledge and Data Engineering, Vol. 35, 7 (2022), 6866-6886.","journal-title":"IEEE Transactions on Knowledge and Data Engineering"},{"key":"e_1_3_2_1_48_1","volume-title":"Network Minimization and Transfer Learning. In 2017 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 4133-4141","author":"Yim Junho","year":"2017","unstructured":"Junho Yim, Donggyu Joo, Jihoon Bae, and Junmo Kim. 2017. A Gift from Knowledge Distillation: Fast Optimization, Network Minimization and Transfer Learning. In 2017 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 4133-4141."},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2022.3220051"},{"key":"e_1_3_2_1_50_1","volume-title":"Understanding Unimodal Bias in Multimodal Deep Linear Networks. In International Conference on Machine Learning","volume":"235","author":"Zhang Yedi","year":"2024","unstructured":"Yedi Zhang, Peter Latham, et al., 2024. Understanding Unimodal Bias in Multimodal Deep Linear Networks. In International Conference on Machine Learning, Vol. 235. PMLR."},{"key":"e_1_3_2_1_51_1","volume-title":"Deep Mutual Learning. In 2018 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 4320-4328","author":"Zhang Ying","year":"2018","unstructured":"Ying Zhang, Tao Xiang, Timothy M Hospedales, and Huchuan Lu. 2018. Deep Mutual Learning. In 2018 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 4320-4328."},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.inffus.2017.02.007"},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2022.3223688"}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","location":"Dublin Ireland","acronym":"MM '25","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3755276","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,9]],"date-time":"2025-12-09T19:45:10Z","timestamp":1765309510000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3755276"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":53,"alternative-id":["10.1145\/3746027.3755276","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3755276","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}